# CS 254 Machine Learning Final Project

We will be using the ***NFL scores and betting data*** dataset to analyze successful betting techniques and winning strategies.

### Overview

1. Import packages 
2. Wrangle and clean
3. Explore data
4. Classify and predict
---

### Part 1 (Import Packages)

In [183]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
%matplotlib inline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import brier_score_loss, roc_auc_score
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.calibration import CalibratedClassifierCV as CCV
from sklearn import model_selection
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.feature_selection import RFE
from sklearn.naive_bayes import GaussianNB
#List of imports will get longer

### Part 2 *(Wrangle and clean)* 

In [168]:
data = pd.read_csv("nfl-scores-and-betting-data/spreadspoke_scores.csv")
teams = pd.read_csv("nfl-scores-and-betting-data/nfl_teams.csv")

# replace blank cells with NaN
data = data.replace(r'^\s*$', np.nan, regex=True)
# Only need data points with over/under and spread
data = data[(data.over_under_line.isnull() == False) & (data.spread_favorite.isnull() == False)]
#realign indicies after gleaning
data.reset_index(drop=True, inplace=True)
#Convert the over under stat to float
data['over_under_line'] = data.over_under_line.astype(float)
# Use team_id from teams for usability
data['team_home'] = data.team_home.map(teams.set_index('team_name')['team_id'].to_dict())
data['team_away'] = data.team_away.map(teams.set_index('team_name')['team_id'].to_dict())
# removing extra columns 
data = data[['schedule_date', 'schedule_season', 'schedule_week', 'team_home',
       'team_away', 'team_favorite_id', 'spread_favorite',
       'over_under_line', 'score_home', 'score_away',
       'stadium_neutral']]
#Home and away favorites to determine favorite results
data['home_fav'] = (data.team_home == data.team_favorite_id).astype(int)
data['away_fav'] = (data.team_away == data.team_favorite_id).astype(int)
#Create a result array for wins and losses based on Home team favoritism
data['result'] = (data.score_home > data.score_away).astype(int)



### Part 3 *(Explore data)* 

In [169]:
data[50:60]

Unnamed: 0,schedule_date,schedule_season,schedule_week,team_home,team_away,team_favorite_id,spread_favorite,over_under_line,score_home,score_away,stadium_neutral,home_fav,away_fav,result
50,09/16/1979,1979,3,LAC,BUF,LAC,-9.0,44.0,27.0,19.0,False,1,0,1
51,09/16/1979,1979,3,SEA,OAK,SEA,-3.0,44.0,27.0,10.0,False,1,0,1
52,09/16/1979,1979,3,ARI,PIT,PIT,-6.0,40.0,21.0,24.0,False,0,1,0
53,09/17/1979,1979,3,WAS,NYG,WAS,-6.0,37.0,27.0,0.0,False,1,0,1
54,09/23/1979,1979,4,BUF,NYJ,BUF,-3.0,42.0,46.0,31.0,False,1,0,1
55,09/23/1979,1979,4,CIN,TEN,TEN,-3.0,37.0,27.0,30.0,False,0,1,0
56,09/23/1979,1979,4,DEN,SEA,DEN,-6.0,37.0,37.0,34.0,False,1,0,1
57,09/23/1979,1979,4,DET,ATL,ATL,-6.0,37.0,24.0,23.0,False,0,1,1
58,09/23/1979,1979,4,KC,OAK,OAK,-3.0,41.0,35.0,7.0,False,0,1,1
59,09/23/1979,1979,4,MIA,CHI,MIA,-6.0,37.0,31.0,16.0,False,1,0,1


In [170]:
data.shape

(9860, 14)

In [171]:
data.describe().round(2)

Unnamed: 0,schedule_season,spread_favorite,over_under_line,score_home,score_away,home_fav,away_fav,result
count,9860.0,9860.0,9860.0,9860.0,9860.0,9860.0,9860.0,9860.0
mean,1999.46,-5.37,41.84,22.64,19.83,0.67,0.32,0.58
std,11.41,3.42,4.67,10.4,10.03,0.47,0.46,0.49
min,1967.0,-26.5,28.0,0.0,0.0,0.0,0.0,0.0
25%,1990.0,-7.0,38.0,16.0,13.0,0.0,0.0,0.0
50%,2000.0,-4.5,41.5,22.0,20.0,1.0,0.0,1.0
75%,2009.0,-3.0,45.0,30.0,27.0,1.0,1.0,1.0
max,2018.0,0.0,63.5,62.0,59.0,1.0,1.0,1.0


In [172]:
corr = data.corr()
corr.style.background_gradient(cmap='coolwarm').set_precision(2)

Unnamed: 0,schedule_season,spread_favorite,over_under_line,score_home,score_away,stadium_neutral,home_fav,away_fav,result
schedule_season,1.0,-0.021,0.33,0.063,0.085,0.012,-0.0088,0.026,-0.0065
spread_favorite,-0.021,1.0,-0.059,-0.15,0.12,-0.014,-0.23,0.18,-0.15
over_under_line,0.33,-0.059,1.0,0.21,0.2,0.05,0.0059,0.0016,0.005
score_home,0.063,-0.15,0.21,1.0,-0.023,0.0012,0.22,-0.22,0.55
score_away,0.085,0.12,0.2,-0.023,1.0,0.023,-0.23,0.23,-0.57
stadium_neutral,0.012,-0.014,0.05,0.0012,0.023,1.0,-0.021,0.024,-0.02
home_fav,-0.0088,-0.23,0.0059,0.22,-0.23,-0.021,1.0,-0.97,0.28
away_fav,0.026,0.18,0.0016,-0.22,0.23,0.024,-0.97,1.0,-0.28
result,-0.0065,-0.15,0.005,0.55,-0.57,-0.02,0.28,-0.28,1.0


In [173]:
win_percentage = round(sum(data.result)/len(data.result),2)
fav_percentage = sum(((data.home_fav == 1) & (data.result == 1)) | ((data.away_fav == 1) & (data.result == 0))) / len(data.result)

print("Home win percentage = %{}".format(round(win_percentage*100, 2)))
print("Away win percentage = %{}".format(round((1 - win_percentage)*100, 2)))
print("Favored win percentage = %{}".format(round((fav_percentage)*100, 2)))

Home win percentage = %58.0
Away win percentage = %42.0
Favored win percentage = %65.16


### Part 4 *(Classify and predict)* 

In [185]:
X = data[['over_under_line','spread_favorite','home_fav']]
y = data['result']
results=[]

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=1)
classifier = DecisionTreeClassifier(min_samples_split=2, max_depth=5)

kfold = model_selection.KFold(n_splits=5, random_state=0)
cv_results = model_selection.cross_val_score(classifier, X, y, cv=kfold, scoring = 'roc_auc')
results.append(cv_results)

msg = "%f (%f)" % (cv_results.mean(), cv_results.std())

print(msg)
classifier.fit(X_train,y_train)
y_predict = classifier.predict(X_test)

print(accuracy_score(y_test, y_predict))

pd.DataFrame(
    confusion_matrix(y_test, y_predict),
    columns=['Predicted Loss', 'Predicted Win'],
    index=['True Loss', 'True Win']
)


0.688052 (0.013731)
0.6369168356997972


Unnamed: 0,Predicted Loss,Predicted Win
True Loss,400,413
True Win,303,856


In [193]:
X = data[['over_under_line','spread_favorite','home_fav']]
y = data[['result']]

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.80, test_size=0.20, random_state=1)

gnb = GaussianNB()
dtc = DecisionTreeClassifier(max_depth=5, criterion='entropy')
lrg = LogisticRegression(solver='liblinear')
vote = VotingClassifier(estimators=[('dtc', dtc), ('lrg', lrg)], voting='soft')

model = CCV(vote, method='isotonic', cv=3)
model.fit(X_train, y_train)

predicted = model.predict_proba(X_test)[:,1]

# ROC AUC Score higher is better while Brier Score the lower the better
print("Metrics" + "\t\t" + "My Model")
print("ROC AUC Score: " +  "\t" + "{:.4f}".format(roc_auc_score(y_test, predicted)))
print("Brier Score: " + "\t" + "{:.4f}".format(brier_score_loss(y_test, predicted)))
print(predicted)


Metrics		My Model
ROC AUC Score: 	0.6912
Brier Score: 	0.2165
[0.71931861 0.67301296 0.60702316 ... 0.85841993 0.79370224 0.43980672]


  y = column_or_1d(y, warn=True)


In [192]:
test = X_test.copy()
test.loc[:,'hm_prob'] = predicted
test.loc[:, 'result'] = y

test['my_bet_won'] = (((test.hm_prob >= 0.55) & (y.result == 1)) | ((test.hm_prob <= 0.40) & (test.result == 0))).astype(int)
test['my_bet_lost'] = (((test.hm_prob >= 0.55) & (y.result == 0)) | ((test.hm_prob <= 0.40) & (test.result == 1))).astype(int)

print("Model Win Percentage: " + "{:.4f}".format(test.my_bet_won.sum() / (test.my_bet_lost.sum() + test.my_bet_won.sum())))

print("Total Number of Bets Won: " + str(test.my_bet_won.sum()))
print("Total Number of Bets Made: " + str((test.my_bet_lost.sum() + test.my_bet_won.sum())))
print("Possible Games: " + str(len(test)))


Model Win Percentage: 0.6924
Total Number of Bets Won: 1067
Total Number of Bets Made: 1541
Possible Games: 1972
