# CS 254 Machine Learning Final Project

We will be using the ***NFL scores and betting data*** dataset to analyze successful betting techniques and winning strategies.

### Overview

1. Import packages 
2. Wrangle and clean
3. Explore data
4. Classify and predict
---

### Part 1 (Import Packages)

In [224]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
#List of imports will get longer

### Part 2 *(Wrangle and clean)* 

In [253]:
data = pd.read_csv("nfl-scores-and-betting-data/spreadspoke_scores.csv")
teams = pd.read_csv("nfl-scores-and-betting-data/nfl_teams.csv")

# Treat empty / whitespace-only cells as missing values
data = data.replace(r'^\s*$', np.nan, regex=True)

# Keep only games that have both an over/under line and a spread, then
# renumber the rows 0..n-1. Building a fresh frame here (instead of filtering
# and resetting the index in place) means the column assignments below never
# write into a slice of the raw frame.
has_betting_lines = data.over_under_line.notna() & data.spread_favorite.notna()
data = data.loc[has_betting_lines].reset_index(drop=True)

# Convert the over/under line to float
data['over_under_line'] = data.over_under_line.astype(float)

# Replace full team names with the team_id from the teams table.
# The lookup is built once and shared by both columns; a name that is not
# present in teams.team_name maps to NaN.
team_name_to_id = teams.set_index('team_name')['team_id'].to_dict()
data['team_home'] = data.team_home.map(team_name_to_id)
data['team_away'] = data.team_away.map(team_name_to_id)

# Keep only the columns used in the rest of the notebook; .copy() makes the
# subset an independent frame so adding columns to it is unambiguous.
data = data[['schedule_date', 'schedule_season', 'schedule_week', 'team_home',
             'team_away', 'team_favorite_id', 'spread_favorite',
             'over_under_line', 'score_home', 'score_away',
             'stadium_neutral']].copy()

# Flag which side is the betting favorite (1 = yes, 0 = no). A favorite id
# that matches neither team leaves both flags at 0.
data['home_fav'] = (data.team_home == data.team_favorite_id).astype(int)
data['away_fav'] = (data.team_away == data.team_favorite_id).astype(int)

# Game outcome from the home team's point of view: 1 when the home team scored
# strictly more points than the away team, 0 otherwise (ties included).
data['result'] = (data.score_home > data.score_away).astype(int)



### Part 3 *(Explore data)* 

In [248]:
# Preview ten cleaned rows (positions 50 through 59)
data.iloc[50:60]

Unnamed: 0,schedule_date,schedule_season,schedule_week,team_home,team_away,team_favorite_id,spread_favorite,over_under_line,score_home,score_away,stadium_neutral,home_fav,away_fav,result
50,09/23/1979,1979,4,NYG,PHI,PHI,-5.0,37.0,13.0,17.0,False,0,1,0
51,09/23/1979,1979,4,PIT,IND,PIT,-14.0,38.0,17.0,13.0,False,1,0,1
52,09/23/1979,1979,4,SF,NO,PICK,0.0,42.0,21.0,30.0,False,0,0,0
53,09/23/1979,1979,4,ARI,WAS,ARI,-3.0,39.0,7.0,17.0,False,1,0,0
54,09/23/1979,1979,4,TB,LAR,LAR,-3.0,35.0,21.0,6.0,False,0,1,1
55,09/24/1979,1979,4,CLE,DAL,DAL,-3.0,40.0,26.0,7.0,False,0,1,1
56,09/30/1979,1979,5,ATL,WAS,ATL,-3.0,37.0,7.0,16.0,False,1,0,0
57,09/30/1979,1979,5,IND,BUF,IND,-2.0,42.0,13.0,31.0,False,1,0,0
58,09/30/1979,1979,5,CHI,TB,CHI,-1.0,34.0,13.0,17.0,False,1,0,0
59,09/30/1979,1979,5,DAL,CIN,DAL,-10.0,40.0,38.0,13.0,False,1,0,1


In [249]:
# (rows, columns) of the cleaned frame
data.shape

(9848, 14)

In [250]:
# Summary statistics from describe(), rounded to two decimals for display
summary_stats = data.describe()
summary_stats.round(2)

Unnamed: 0,schedule_season,spread_favorite,over_under_line,score_home,score_away,home_fav,away_fav,result
count,9848.0,9848.0,9848.0,9848.0,9848.0,9848.0,9848.0,9848.0
mean,1999.49,-5.37,41.85,22.65,19.84,0.67,0.32,0.58
std,11.37,3.42,4.67,10.4,10.03,0.47,0.46,0.49
min,1979.0,-26.5,28.0,0.0,0.0,0.0,0.0,0.0
25%,1990.0,-7.0,38.0,16.0,13.0,0.0,0.0,0.0
50%,2000.0,-4.5,41.5,22.0,20.0,1.0,0.0,1.0
75%,2009.0,-3.0,45.0,30.0,27.0,1.0,1.0,1.0
max,2018.0,0.0,63.5,62.0,59.0,1.0,1.0,1.0


In [251]:
# Pairwise correlations between the numeric and boolean columns. The columns
# are selected explicitly so this keeps working on pandas versions where
# DataFrame.corr() no longer drops string columns on its own.
corr = data.select_dtypes(include=['number', 'bool']).corr()

# Colour the matrix and show two significant digits per cell.
# Styler.set_precision was removed in pandas 2.0; Styler.format with a format
# string is available on both old and new pandas versions.
corr.style.background_gradient(cmap='coolwarm').format('{:.2g}')

Unnamed: 0,schedule_season,spread_favorite,over_under_line,score_home,score_away,stadium_neutral,home_fav,away_fav,result
schedule_season,1.0,-0.023,0.33,0.062,0.083,0.045,-0.0092,0.027,-0.0063
spread_favorite,-0.023,1.0,-0.059,-0.15,0.12,-0.0085,-0.23,0.18,-0.15
over_under_line,0.33,-0.059,1.0,0.2,0.2,0.068,0.0057,0.002,0.005
score_home,0.062,-0.15,0.2,1.0,-0.023,0.0052,0.22,-0.22,0.55
score_away,0.083,0.12,0.2,-0.023,1.0,0.031,-0.23,0.23,-0.57
stadium_neutral,0.045,-0.0085,0.068,0.0052,0.031,1.0,-0.02,0.023,-0.022
home_fav,-0.0092,-0.23,0.0057,0.22,-0.23,-0.02,1.0,-0.97,0.28
away_fav,0.027,0.18,0.002,-0.22,0.23,0.023,-0.97,1.0,-0.28
result,-0.0063,-0.15,0.005,0.55,-0.57,-0.022,0.28,-0.28,1.0


In [252]:
# Share of games won by the home team (result == 1), rounded to two decimals
win_percentage = round(data.result.mean(), 2)

# The favorite "wins" when the home team is favored and wins (result == 1), or
# when the away team is favored and the home team does not win (result == 0).
# This uses the `data` frame and the home_fav / away_fav columns created in the
# wrangling cell, so the cell runs on a fresh kernel.
# NOTE(review): result == 0 also covers tied games, so ties are credited to the
# away side both here and in the "Away win percentage" line below.
favorite_won = (
    ((data.home_fav == 1) & (data.result == 1))
    | ((data.away_fav == 1) & (data.result == 0))
)
fav_percentage = favorite_won.mean()

print("Home win percentage = %{}".format(round(win_percentage*100, 2)))
print("Away win percentage = %{}".format(round((1 - win_percentage)*100, 2)))
print("Favored win percentage = %{}".format(round((fav_percentage)*100, 2)))

Home win percentage = %58.0
Away win percentage = %42.0
Favored win percentage = %77.1
