In [632]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv('wrangled.csv')


In [633]:
df.head()

Unnamed: 0.1,Unnamed: 0,schedule_date,schedule_season,schedule_week,schedule_playoff,team_home,score_home,score_away,team_away,team_favorite_id,spread_favorite,over_under_line,stadium,stadium_neutral,weather_temperature,weather_wind_mph,weather_humidity,weather_detail,home_id,away_id
0,0,09/01/1979,1979,1,False,Tampa Bay Buccaneers,31,16,Detroit Lions,TB,-3.0,30.0,Houlihan's Stadium,False,79.0,9.0,87.0,,TB,DET
1,1,09/02/1979,1979,1,False,Buffalo Bills,7,9,Miami Dolphins,MIA,-5.0,39.0,Ralph Wilson Stadium,False,74.0,15.0,74.0,,BUF,MIA
2,2,09/02/1979,1979,1,False,Chicago Bears,6,3,Green Bay Packers,CHI,-3.0,31.0,Soldier Field,False,78.0,11.0,68.0,,CHI,GB
3,3,09/02/1979,1979,1,False,Denver Broncos,10,0,Cincinnati Bengals,DEN,-3.0,31.5,Mile High Stadium,False,69.0,6.0,38.0,,DEN,CIN
4,4,09/02/1979,1979,1,False,Kansas City Chiefs,14,0,Baltimore Colts,KC,-1.0,37.0,Arrowhead Stadium,False,76.0,8.0,71.0,,KC,IND


In [634]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10177 entries, 0 to 10176
Data columns (total 20 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Unnamed: 0           10177 non-null  int64  
 1   schedule_date        10177 non-null  object 
 2   schedule_season      10177 non-null  int64  
 3   schedule_week        10177 non-null  object 
 4   schedule_playoff     10177 non-null  bool   
 5   team_home            10177 non-null  object 
 6   score_home           10177 non-null  int64  
 7   score_away           10177 non-null  int64  
 8   team_away            10177 non-null  object 
 9   team_favorite_id     10177 non-null  object 
 10  spread_favorite      10177 non-null  float64
 11  over_under_line      10177 non-null  object 
 12  stadium              10177 non-null  object 
 13  stadium_neutral      10177 non-null  bool   
 14  weather_temperature  9665 non-null   float64
 15  weather_wind_mph     9665 non-null  

In [635]:
#double up the data so there is a row for each team in each game.  This seems redundant but the prior
#week info will be different when the home team is in the team position vs opponent position

data1 = [df["schedule_date"], df["schedule_season"], df["schedule_week"], df["schedule_playoff"], df["home_id"],df["score_home"], df["away_id"], df["score_away"], df["team_favorite_id"], df["spread_favorite"], df["over_under_line"]]
data2 = [df["schedule_date"], df["schedule_season"], df["schedule_week"], df["schedule_playoff"], df["away_id"],df["score_away"], df["home_id"], df["score_home"], df["team_favorite_id"], df["spread_favorite"], df["over_under_line"]]



headers = ["date", "season","week","playoff","team_id","team_score","opponent_id","opponent_score","favorite","spread","overunder"]

hometeams = pd.concat(data1, axis=1, keys=headers)
awayteams = pd.concat(data2, axis=1, keys=headers)

hometeams['home'] = 1
awayteams['home'] = 0

In [636]:
awayteams.head()

Unnamed: 0,date,season,week,playoff,team_id,team_score,opponent_id,opponent_score,favorite,spread,overunder,home
0,09/01/1979,1979,1,False,DET,16,TB,31,TB,-3.0,30.0,0
1,09/02/1979,1979,1,False,MIA,9,BUF,7,MIA,-5.0,39.0,0
2,09/02/1979,1979,1,False,GB,3,CHI,6,CHI,-3.0,31.0,0
3,09/02/1979,1979,1,False,CIN,0,DEN,10,DEN,-3.0,31.5,0
4,09/02/1979,1979,1,False,IND,0,KC,14,KC,-1.0,37.0,0


In [637]:
df = pd.concat([hometeams,awayteams],ignore_index=True)

In [638]:
df.shape

(20354, 12)

In [639]:
df.head()

Unnamed: 0,date,season,week,playoff,team_id,team_score,opponent_id,opponent_score,favorite,spread,overunder,home
0,09/01/1979,1979,1,False,TB,31,DET,16,TB,-3.0,30.0,1
1,09/02/1979,1979,1,False,BUF,7,MIA,9,MIA,-5.0,39.0,1
2,09/02/1979,1979,1,False,CHI,6,GB,3,CHI,-3.0,31.0,1
3,09/02/1979,1979,1,False,DEN,10,CIN,0,DEN,-3.0,31.5,1
4,09/02/1979,1979,1,False,KC,14,IND,0,KC,-1.0,37.0,1


In [640]:
df['week'].unique()

array(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
       '13', '14', '15', '16', 'Wildcard', 'Division', 'Conference',
       'Superbowl', '17', '18', 'WildCard', 'SuperBowl'], dtype=object)

In [641]:
#get rid of playoffs
keeps = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12','13', '14', '15', '16','17','18']

df = df[df.week.isin(keeps)]

In [642]:
df['week'].unique()

array(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
       '13', '14', '15', '16', '17', '18'], dtype=object)

In [643]:
df['week'] = df['week'].astype(np.int64)

In [644]:
df.sort_values(by=['season','team_id','week'], inplace=True)

In [645]:
#add previous week - bye weeks mean not necessarily week-1
df['prev_week'] = df.week.shift(1)

In [646]:
#add previous date - bye weeks mean not necessarily week-1
df['prev_date'] = df.date.shift(1)

In [647]:
df.info() 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19484 entries, 11 to 20331
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   date            19484 non-null  object 
 1   season          19484 non-null  int64  
 2   week            19484 non-null  int64  
 3   playoff         19484 non-null  bool   
 4   team_id         19484 non-null  object 
 5   team_score      19484 non-null  int64  
 6   opponent_id     19484 non-null  object 
 7   opponent_score  19484 non-null  int64  
 8   favorite        19484 non-null  object 
 9   spread          19484 non-null  float64
 10  overunder       19484 non-null  object 
 11  home            19484 non-null  int64  
 12  prev_week       19483 non-null  float64
 13  prev_date       19483 non-null  object 
dtypes: bool(1), float64(2), int64(5), object(6)
memory usage: 2.1+ MB


In [648]:
import time
df['date'] = pd.to_datetime(df['date'])
df['prev_date'] = pd.to_datetime(df['prev_date'])

In [649]:
#add days since last game
from datetime import datetime
df['days_since'] = (df['date'] - df['prev_date']).dt.days

In [650]:
df.loc[(df['season'] == 1999) & (df['team_id'] == 'DEN')]

Unnamed: 0,date,season,week,playoff,team_id,team_score,opponent_id,opponent_score,favorite,spread,overunder,home,prev_week,prev_date,days_since
4608,1999-09-13,1999,1,False,DEN,21,MIA,38,DEN,-6.0,43.0,1,17.0,2000-01-02,-111.0
14792,1999-09-19,1999,2,False,DEN,10,KC,26,DEN,-3.5,40.5,0,1.0,1999-09-13,6.0
14813,1999-09-26,1999,3,False,DEN,10,TB,13,TB,-2.5,38.0,0,2.0,1999-09-19,7.0
4643,1999-10-03,1999,4,False,DEN,13,NYJ,21,DEN,-5.5,40.0,1,3.0,1999-09-26,7.0
14838,1999-10-10,1999,5,False,DEN,16,OAK,13,OAK,-7.0,38.0,0,4.0,1999-10-03,7.0
4671,1999-10-17,1999,6,False,DEN,31,GB,10,GB,-3.5,40.0,1,5.0,1999-10-10,7.0
14863,1999-10-24,1999,7,False,DEN,23,NE,24,NE,-3.5,40.5,0,6.0,1999-10-17,7.0
4698,1999-10-31,1999,8,False,DEN,20,MIN,23,MIN,-2.5,45.0,1,7.0,1999-10-24,7.0
14894,1999-11-07,1999,9,False,DEN,33,LAC,17,PICK,0.0,35.5,0,8.0,1999-10-31,7.0
14910,1999-11-14,1999,10,False,DEN,17,SEA,20,SEA,-5.5,41.0,0,9.0,1999-11-07,7.0


In [651]:
#add cumulative point totals
df['points_for'] = df.groupby(['season','team_id'])['team_score'].transform(pd.Series.cumsum)
df['points_against'] = df.groupby(['season','team_id'])['opponent_score'].transform(pd.Series.cumsum)

In [652]:
df.head(450)

Unnamed: 0,date,season,week,playoff,team_id,team_score,opponent_id,opponent_score,favorite,spread,overunder,home,prev_week,prev_date,days_since,points_for,points_against
11,1979-09-02,1979,1,False,ARI,21,DAL,22,DAL,-4.0,37,1,,NaT,,21,22
10200,1979-09-09,1979,2,False,ARI,27,NYG,14,ARI,-3.0,36,0,1.0,1979-09-02,7.0,48,36
40,1979-09-16,1979,3,False,ARI,21,PIT,24,PIT,-6.0,40,1,2.0,1979-09-09,7.0,69,60
53,1979-09-23,1979,4,False,ARI,7,WAS,17,ARI,-3.0,39,1,3.0,1979-09-16,7.0,76,77
10239,1979-09-30,1979,5,False,ARI,0,LAR,21,LAR,-6.0,38,0,4.0,1979-09-23,7.0,76,98
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
194,1979-12-02,1979,14,False,WAS,38,GB,21,WAS,-9.0,34,1,13.0,1979-11-25,7.0,286,246
208,1979-12-09,1979,15,False,WAS,28,CIN,14,WAS,-6.0,42,1,14.0,1979-12-02,7.0,314,260
10392,1979-12-16,1979,16,False,WAS,34,DAL,35,DAL,-8.0,39.5,0,15.0,1979-12-09,7.0,348,295
245,1980-09-07,1980,1,False,ARI,35,NYG,41,ARI,-7.0,42,1,16.0,1979-12-16,266.0,35,41


In [653]:
df['overunder'].unique()

array(['37', '36', '40', '39', '38', '42', '43', '32', '35.5', '35', '45',
       '34', '44', '41', '31', '31.5', '44.5', '39.5', '30', '33', '42.5',
       '36.5', '40.5', '49', '50', '46', '48', '47', '51', '54', '45.5',
       '43.5', '46.5', '52', '52.5', '54.5', '47.5', '38.5', '37.5',
       '34.5', '41.5', '53', '55', '53.5', '48.5', '49.5', '50.5', '51.5',
       '56', '33.5', '28', '32.5', '36.6', '29.5', '58.5', '59.5', '58',
       '55.5', '63', '56.5', '57', '60', '30.5', '57.5', '59', '61.5',
       '63.5'], dtype=object)

In [654]:
df['overunder'] = df['overunder'].astype(np.float)

In [655]:
#add performance against spreads
df['covered_by'] = np.where(df['team_id'] == df['favorite'], df['team_score'] - df['opponent_score'] + df['spread'], df['opponent_score'] - df['team_score'] + df['spread'])
df['covered_over_by'] = ((df['team_score'] + df['opponent_score']) - df['overunder'])


In [656]:
df.prev_week.unique()

array([nan,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12.,
       13., 14., 15., 16., 17., 18.])

In [657]:
df.head()

Unnamed: 0,date,season,week,playoff,team_id,team_score,opponent_id,opponent_score,favorite,spread,overunder,home,prev_week,prev_date,days_since,points_for,points_against,covered_by,covered_over_by
11,1979-09-02,1979,1,False,ARI,21,DAL,22,DAL,-4.0,37.0,1,,NaT,,21,22,-3.0,6.0
10200,1979-09-09,1979,2,False,ARI,27,NYG,14,ARI,-3.0,36.0,0,1.0,1979-09-02,7.0,48,36,10.0,5.0
40,1979-09-16,1979,3,False,ARI,21,PIT,24,PIT,-6.0,40.0,1,2.0,1979-09-09,7.0,69,60,-3.0,5.0
53,1979-09-23,1979,4,False,ARI,7,WAS,17,ARI,-3.0,39.0,1,3.0,1979-09-16,7.0,76,77,-13.0,-15.0
10239,1979-09-30,1979,5,False,ARI,0,LAR,21,LAR,-6.0,38.0,0,4.0,1979-09-23,7.0,76,98,15.0,-17.0


In [658]:
df['covered'] = np.where(df['covered_by'] > 0, 1,0)

In [659]:
df['covered_over'] = np.where(df['covered_over_by'] > 0, 1,0)

In [660]:
df['push'] = np.where(df['covered_by'] == 0, 1,0)

In [661]:
df['push_over'] = np.where(df['covered_over_by'] == 0, 1,0)

In [662]:
df['win_by'] = df['team_score'] - df['opponent_score']

In [663]:
df['win'] = np.where(df['win_by'] > 0, 1,0)

In [664]:
df['loss'] = np.where(df['win_by'] < 0, 1,0)

In [665]:
df['tie'] = np.where(df['win_by'] == 0, 1,0)

In [666]:
df['total_wins'] = df.groupby(['season','team_id'])['win'].transform(pd.Series.cumsum)
df['total_losses'] = df.groupby(['season','team_id'])['loss'].transform(pd.Series.cumsum)
df['win_loss_ratio'] = (df['total_wins'])/np.where(df['total_losses'] == 0, 1,df['total_losses'])
df['pfa_ratio'] = (df['points_for'])/np.where(df['points_against'] == 0, 1,df['points_against'])
df['net_wins'] = df['total_wins']-df['total_losses']
df['net_pfa'] = df['points_for']-df['points_against']
df['net_pfa_avg'] = (df['points_for']-df['points_against'])/df['week']

In [667]:
df['push'].value_counts()

0    18930
1      554
Name: push, dtype: int64

In [668]:
df['favorite'].value_counts()

PIT     850
NE      822
DEN     816
DAL     802
SF      786
GB      746
PHI     732
MIN     712
MIA     680
NO      652
NYG     650
LAC     646
SEA     644
KC      628
WAS     614
LAR     612
OAK     606
CHI     606
TEN     602
BUF     594
ATL     584
IND     580
NYJ     564
CIN     516
TB      476
DET     474
BAL     438
ARI     420
CLE     414
CAR     378
JAX     322
PICK    282
HOU     236
Name: favorite, dtype: int64

In [669]:
#bring previous week results into current week
df.sort_values(by=['season','week','home'], inplace=True)
df = df.merge(df,how='left',suffixes=('', '_p'), left_on=['team_id','season','prev_week'], right_on=['team_id','season','week'])

In [670]:
df.head()

Unnamed: 0,date,season,week,playoff,team_id,team_score,opponent_id,opponent_score,favorite,spread,overunder,home,prev_week,prev_date,days_since,points_for,points_against,covered_by,covered_over_by,covered,covered_over,push,push_over,win_by,win,loss,tie,total_wins,total_losses,win_loss_ratio,pfa_ratio,net_wins,net_pfa,net_pfa_avg,date_p,week_p,playoff_p,team_score_p,opponent_id_p,opponent_score_p,favorite_p,spread_p,overunder_p,home_p,prev_week_p,prev_date_p,days_since_p,points_for_p,points_against_p,covered_by_p,covered_over_by_p,covered_p,covered_over_p,push_p,push_over_p,win_by_p,win_p,loss_p,tie_p,total_wins_p,total_losses_p,win_loss_ratio_p,pfa_ratio_p,net_wins_p,net_pfa_p,net_pfa_avg_p
0,1979-09-02,1979,1,False,ATL,40,NO,34,NO,-5.0,32.0,0,16.0,1979-12-16,-105.0,40,34,-11.0,42.0,0,1,0,0,6,1,0,0,1,0,1.0,1.176471,1,6,6.0,1979-12-16,16.0,False,31.0,SF,21.0,ATL,-3.0,45.0,1.0,15.0,1979-12-09,7.0,300.0,388.0,7.0,7.0,1.0,1.0,0.0,0.0,10.0,1.0,0.0,0.0,6.0,10.0,0.6,0.773196,-4.0,-88.0,-5.5
1,1979-09-02,1979,1,False,CIN,0,DEN,10,DEN,-3.0,31.5,0,16.0,1979-12-16,-105.0,0,10,7.0,-21.5,1,0,0,0,-10,0,1,0,0,1,0.0,0.0,-1,-10,-10.0,1979-12-16,16.0,False,16.0,CLE,12.0,CIN,-1.0,41.0,1.0,15.0,1979-12-09,7.0,337.0,421.0,3.0,-13.0,1.0,0.0,0.0,0.0,4.0,1.0,0.0,0.0,4.0,12.0,0.333333,0.800475,-8.0,-84.0,-5.25
2,1979-09-02,1979,1,False,CLE,25,NYJ,22,NYJ,-2.0,41.0,0,16.0,1979-12-16,-105.0,25,22,-5.0,6.0,0,1,0,0,3,1,0,0,1,0,1.0,1.136364,1,3,3.0,1979-12-16,16.0,False,12.0,CIN,16.0,CIN,-1.0,41.0,0.0,15.0,1979-12-09,7.0,359.0,352.0,3.0,-13.0,1.0,0.0,0.0,0.0,-4.0,0.0,1.0,0.0,9.0,7.0,1.285714,1.019886,2.0,7.0,0.4375
3,1979-09-02,1979,1,False,DAL,22,ARI,21,DAL,-4.0,37.0,0,16.0,1979-12-16,-105.0,22,21,-3.0,6.0,0,1,0,0,1,1,0,0,1,0,1.0,1.047619,1,1,1.0,1979-12-16,16.0,False,35.0,WAS,34.0,DAL,-8.0,39.5,1.0,15.0,1979-12-08,8.0,371.0,313.0,-7.0,29.5,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,11.0,5.0,2.2,1.185304,6.0,58.0,3.625
4,1979-09-01,1979,1,False,DET,16,TB,31,TB,-3.0,30.0,0,16.0,1979-12-17,-107.0,16,31,12.0,17.0,1,1,0,0,-15,0,1,0,0,1,0.0,0.516129,-1,-15,-15.0,1979-12-15,16.0,False,13.0,GB,18.0,DET,-3.0,34.0,1.0,15.0,1979-12-09,6.0,219.0,365.0,-8.0,-3.0,0.0,0.0,0.0,0.0,-5.0,0.0,1.0,0.0,2.0,14.0,0.142857,0.6,-12.0,-146.0,-9.125


In [671]:
#remove zero spread rows
df = df[(df.favorite != "PICK")]
#remove week 1 rows - previous week not valid
df = df[(df.week != 1)]

In [672]:
df_output = df.groupby('home').mean().T
df_output
#confirm averages make sense - home teams score 2.5pts more on average

home,0,1
season,2000.02,2000.02
week,9.58858,9.5904
playoff,False,False
team_score,19.9232,22.6155
opponent_score,22.6226,19.9211
spread,-5.48417,-5.48439
overunder,41.9082,41.9071
prev_week,8.52671,8.52472
days_since,7.46162,7.48661
points_for,188.69,190.348


In [673]:
df.head(20)

Unnamed: 0,date,season,week,playoff,team_id,team_score,opponent_id,opponent_score,favorite,spread,overunder,home,prev_week,prev_date,days_since,points_for,points_against,covered_by,covered_over_by,covered,covered_over,push,push_over,win_by,win,loss,tie,total_wins,total_losses,win_loss_ratio,pfa_ratio,net_wins,net_pfa,net_pfa_avg,date_p,week_p,playoff_p,team_score_p,opponent_id_p,opponent_score_p,favorite_p,spread_p,overunder_p,home_p,prev_week_p,prev_date_p,days_since_p,points_for_p,points_against_p,covered_by_p,covered_over_by_p,covered_p,covered_over_p,push_p,push_over_p,win_by_p,win_p,loss_p,tie_p,total_wins_p,total_losses_p,win_loss_ratio_p,pfa_ratio_p,net_wins_p,net_pfa_p,net_pfa_avg_p
28,1979-09-09,1979,2,False,ARI,27,NYG,14,ARI,-3.0,36.0,0,1.0,1979-09-02,7.0,48,36,10.0,5.0,1,1,0,0,13,1,0,0,1,1,1.0,1.333333,0,12,6.0,1979-09-02,1.0,False,21.0,DAL,22.0,DAL,-4.0,37.0,1.0,,NaT,,21.0,22.0,-3.0,6.0,0.0,1.0,0.0,0.0,-1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.954545,-1.0,-1.0,-1.0
29,1979-09-10,1979,2,False,ATL,14,PHI,10,PHI,-4.0,35.5,0,1.0,1979-09-02,8.0,54,44,-8.0,-11.5,0,0,0,0,4,1,0,0,2,0,2.0,1.227273,2,10,5.0,1979-09-02,1.0,False,40.0,NO,34.0,NO,-5.0,32.0,0.0,16.0,1979-12-16,-105.0,40.0,34.0,-11.0,42.0,0.0,1.0,0.0,0.0,6.0,1.0,0.0,0.0,1.0,0.0,1.0,1.176471,1.0,6.0,6.0
30,1979-09-09,1979,2,False,CIN,24,BUF,51,CIN,-3.0,34.0,0,1.0,1979-09-02,7.0,24,61,-30.0,41.0,0,1,0,0,-27,0,1,0,0,2,0.0,0.393443,-2,-37,-18.5,1979-09-02,1.0,False,0.0,DEN,10.0,DEN,-3.0,31.5,0.0,16.0,1979-12-16,-105.0,0.0,10.0,7.0,-21.5,1.0,0.0,0.0,0.0,-10.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,-1.0,-10.0,-10.0
32,1979-09-09,1979,2,False,DAL,21,SF,13,DAL,-13.0,39.0,0,1.0,1979-09-02,7.0,43,34,-5.0,-5.0,0,0,0,0,8,1,0,0,2,0,2.0,1.264706,2,9,4.5,1979-09-02,1.0,False,22.0,ARI,21.0,DAL,-4.0,37.0,0.0,16.0,1979-12-16,-105.0,22.0,21.0,-3.0,6.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.047619,1.0,1.0,1.0
33,1979-09-06,1979,2,False,LAR,13,DEN,9,DEN,-2.0,31.5,0,1.0,1979-09-02,4.0,30,33,-6.0,-9.5,0,0,0,0,4,1,0,0,1,1,1.0,0.909091,0,-3,-1.5,1979-09-02,1.0,False,17.0,OAK,24.0,LAR,-4.0,36.5,1.0,16.0,1979-12-17,-106.0,17.0,24.0,-11.0,4.5,0.0,1.0,0.0,0.0,-7.0,0.0,1.0,0.0,0.0,1.0,0.0,0.708333,-1.0,-7.0,-7.0
34,1979-09-09,1979,2,False,MIN,7,CHI,26,CHI,-6.0,31.5,0,1.0,1979-09-02,7.0,35,48,13.0,1.5,1,1,0,0,-19,0,1,0,1,1,1.0,0.729167,0,-13,-6.5,1979-09-02,1.0,False,28.0,SF,22.0,MIN,-7.0,32.0,1.0,16.0,1979-12-15,-104.0,28.0,22.0,-1.0,18.0,0.0,1.0,0.0,0.0,6.0,1.0,0.0,0.0,1.0,0.0,1.0,1.272727,1.0,6.0,6.0
35,1979-09-09,1979,2,False,NO,19,GB,28,GB,-3.0,35.0,0,1.0,1979-09-02,7.0,53,68,6.0,12.0,1,1,0,0,-9,0,1,0,0,2,0.0,0.779412,-2,-15,-7.5,1979-09-02,1.0,False,34.0,ATL,40.0,NO,-5.0,32.0,1.0,16.0,1979-12-16,-105.0,34.0,40.0,-11.0,42.0,0.0,1.0,0.0,0.0,-6.0,0.0,1.0,0.0,0.0,1.0,0.0,0.85,-1.0,-6.0,-6.0
36,1979-09-09,1979,2,False,NYJ,3,NE,56,NE,-8.0,42.0,0,1.0,1979-09-02,7.0,25,81,45.0,17.0,1,1,0,0,-53,0,1,0,0,2,0.0,0.308642,-2,-56,-28.0,1979-09-02,1.0,False,22.0,CLE,25.0,NYJ,-2.0,41.0,1.0,16.0,1979-12-16,-105.0,22.0,25.0,-5.0,6.0,0.0,1.0,0.0,0.0,-3.0,0.0,1.0,0.0,0.0,1.0,0.0,0.88,-1.0,-3.0,-3.0
37,1979-09-09,1979,2,False,OAK,10,LAC,30,LAC,-3.0,42.0,0,1.0,1979-09-02,7.0,34,47,17.0,-2.0,1,0,0,0,-20,0,1,0,1,1,1.0,0.723404,0,-13,-6.5,1979-09-02,1.0,False,24.0,LAR,17.0,LAR,-4.0,36.5,0.0,16.0,1979-12-15,-104.0,24.0,17.0,-11.0,4.5,0.0,1.0,0.0,0.0,7.0,1.0,0.0,0.0,1.0,0.0,1.0,1.411765,1.0,7.0,7.0
38,1979-09-09,1979,2,False,SEA,10,MIA,19,MIA,-7.0,40.5,0,1.0,1979-09-02,7.0,26,52,2.0,-11.5,1,0,0,0,-9,0,1,0,0,2,0.0,0.5,-2,-26,-13.0,1979-09-02,1.0,False,16.0,LAC,33.0,SEA,-2.0,42.5,1.0,16.0,1979-12-16,-105.0,16.0,33.0,-19.0,6.5,0.0,1.0,0.0,0.0,-17.0,0.0,1.0,0.0,0.0,1.0,0.0,0.484848,-1.0,-17.0,-17.0


In [674]:
print(df.info(verbose=True))

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18002 entries, 28 to 19489
Data columns (total 66 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   date               18002 non-null  datetime64[ns]
 1   season             18002 non-null  int64         
 2   week               18002 non-null  int64         
 3   playoff            18002 non-null  bool          
 4   team_id            18002 non-null  object        
 5   team_score         18002 non-null  int64         
 6   opponent_id        18002 non-null  object        
 7   opponent_score     18002 non-null  int64         
 8   favorite           18002 non-null  object        
 9   spread             18002 non-null  float64       
 10  overunder          18002 non-null  float64       
 11  home               18002 non-null  int64         
 12  prev_week          18002 non-null  float64       
 13  prev_date          18002 non-null  datetime64[ns]
 14  days_

In [675]:
#identify teams that lost outright in previous week despite being favored
df['bounce_candidate'] = np.where((df['team_id'] == df['favorite_p']) & (df['win_p'] == 0), 1,0)

In [676]:
df['bounce_candidate'].value_counts()

0    14962
1     3040
Name: bounce_candidate, dtype: int64

In [677]:
pd.set_option("display.max.columns", None)
df.loc[(df['season'] == 1999) & (df['team_id'] == 'DEN')]

Unnamed: 0,date,season,week,playoff,team_id,team_score,opponent_id,opponent_score,favorite,spread,overunder,home,prev_week,prev_date,days_since,points_for,points_against,covered_by,covered_over_by,covered,covered_over,push,push_over,win_by,win,loss,tie,total_wins,total_losses,win_loss_ratio,pfa_ratio,net_wins,net_pfa,net_pfa_avg,date_p,week_p,playoff_p,team_score_p,opponent_id_p,opponent_score_p,favorite_p,spread_p,overunder_p,home_p,prev_week_p,prev_date_p,days_since_p,points_for_p,points_against_p,covered_by_p,covered_over_by_p,covered_p,covered_over_p,push_p,push_over_p,win_by_p,win_p,loss_p,tie_p,total_wins_p,total_losses_p,win_loss_ratio_p,pfa_ratio_p,net_wins_p,net_pfa_p,net_pfa_avg_p,bounce_candidate
8819,1999-09-19,1999,2,False,DEN,10,KC,26,DEN,-3.5,40.5,0,1.0,1999-09-13,6.0,31,64,-19.5,-4.5,0,0,0,0,-16,0,1,0,0,2,0.0,0.484375,-2,-33,-16.5,1999-09-13,1.0,False,21.0,MIA,38.0,DEN,-6.0,43.0,1.0,17.0,2000-01-02,-111.0,21.0,38.0,-23.0,16.0,0.0,1.0,0.0,0.0,-17.0,0.0,1.0,0.0,0.0,1.0,0.0,0.552632,-1.0,-17.0,-17.0,1
8850,1999-09-26,1999,3,False,DEN,10,TB,13,TB,-2.5,38.0,0,2.0,1999-09-19,7.0,41,77,0.5,-15.0,1,0,0,0,-3,0,1,0,0,3,0.0,0.532468,-3,-36,-12.0,1999-09-19,2.0,False,10.0,KC,26.0,DEN,-3.5,40.5,0.0,1.0,1999-09-13,6.0,31.0,64.0,-19.5,-4.5,0.0,0.0,0.0,0.0,-16.0,0.0,1.0,0.0,0.0,2.0,0.0,0.484375,-2.0,-33.0,-16.5,1
8893,1999-10-03,1999,4,False,DEN,13,NYJ,21,DEN,-5.5,40.0,1,3.0,1999-09-26,7.0,54,98,-13.5,-6.0,0,0,0,0,-8,0,1,0,0,4,0.0,0.55102,-4,-44,-11.0,1999-09-26,3.0,False,10.0,TB,13.0,TB,-2.5,38.0,0.0,2.0,1999-09-19,7.0,41.0,77.0,0.5,-15.0,1.0,0.0,0.0,0.0,-3.0,0.0,1.0,0.0,0.0,3.0,0.0,0.532468,-3.0,-36.0,-12.0,0
8907,1999-10-10,1999,5,False,DEN,16,OAK,13,OAK,-7.0,38.0,0,4.0,1999-10-03,7.0,70,111,-10.0,-9.0,0,0,0,0,3,1,0,0,1,4,0.25,0.630631,-3,-41,-8.2,1999-10-03,4.0,False,13.0,NYJ,21.0,DEN,-5.5,40.0,1.0,3.0,1999-09-26,7.0,54.0,98.0,-13.5,-6.0,0.0,0.0,0.0,0.0,-8.0,0.0,1.0,0.0,0.0,4.0,0.0,0.55102,-4.0,-44.0,-11.0,1
8949,1999-10-17,1999,6,False,DEN,31,GB,10,GB,-3.5,40.0,1,5.0,1999-10-10,7.0,101,121,-24.5,1.0,0,1,0,0,21,1,0,0,2,4,0.5,0.834711,-2,-20,-3.333333,1999-10-10,5.0,False,16.0,OAK,13.0,OAK,-7.0,38.0,0.0,4.0,1999-10-03,7.0,70.0,111.0,-10.0,-9.0,0.0,0.0,0.0,0.0,3.0,1.0,0.0,0.0,1.0,4.0,0.25,0.630631,-3.0,-41.0,-8.2,0
8963,1999-10-24,1999,7,False,DEN,23,NE,24,NE,-3.5,40.5,0,6.0,1999-10-17,7.0,124,145,-2.5,6.5,0,1,0,0,-1,0,1,0,2,5,0.4,0.855172,-3,-21,-3.0,1999-10-17,6.0,False,31.0,GB,10.0,GB,-3.5,40.0,1.0,5.0,1999-10-10,7.0,101.0,121.0,-24.5,1.0,0.0,1.0,0.0,0.0,21.0,1.0,0.0,0.0,2.0,4.0,0.5,0.834711,-2.0,-20.0,-3.333333,0
9004,1999-10-31,1999,8,False,DEN,20,MIN,23,MIN,-2.5,45.0,1,7.0,1999-10-24,7.0,144,168,0.5,-2.0,1,0,0,0,-3,0,1,0,2,6,0.333333,0.857143,-4,-24,-3.0,1999-10-24,7.0,False,23.0,NE,24.0,NE,-3.5,40.5,0.0,6.0,1999-10-17,7.0,124.0,145.0,-2.5,6.5,0.0,1.0,0.0,0.0,-1.0,0.0,1.0,0.0,2.0,5.0,0.4,0.855172,-3.0,-21.0,-3.0,0
9045,1999-11-14,1999,10,False,DEN,17,SEA,20,SEA,-5.5,41.0,0,9.0,1999-11-07,7.0,194,205,-2.5,-4.0,0,0,0,0,-3,0,1,0,3,7,0.428571,0.946341,-4,-11,-1.1,1999-11-07,9.0,False,33.0,LAC,17.0,PICK,0.0,35.5,0.0,8.0,1999-10-31,7.0,177.0,185.0,-16.0,14.5,0.0,1.0,0.0,0.0,16.0,1.0,0.0,0.0,3.0,6.0,0.5,0.956757,-3.0,-8.0,-0.888889,0
9090,1999-11-22,1999,11,False,DEN,27,OAK,21,DEN,-1.5,37.0,1,10.0,1999-11-14,8.0,221,226,4.5,11.0,1,1,0,0,6,1,0,0,4,7,0.571429,0.977876,-3,-5,-0.454545,1999-11-14,10.0,False,17.0,SEA,20.0,SEA,-5.5,41.0,0.0,9.0,1999-11-07,7.0,194.0,205.0,-2.5,-4.0,0.0,0.0,0.0,0.0,-3.0,0.0,1.0,0.0,3.0,7.0,0.428571,0.946341,-4.0,-11.0,-1.1,0
9153,1999-12-05,1999,13,False,DEN,10,KC,16,DEN,-3.5,38.5,1,11.0,1999-11-22,13.0,231,242,-9.5,-12.5,0,0,0,0,-6,0,1,0,4,8,0.5,0.954545,-4,-11,-0.846154,1999-11-22,11.0,False,27.0,OAK,21.0,DEN,-1.5,37.0,1.0,10.0,1999-11-14,8.0,221.0,226.0,4.5,11.0,1.0,1.0,0.0,0.0,6.0,1.0,0.0,0.0,4.0,7.0,0.571429,0.977876,-3.0,-5.0,-0.454545,0


In [678]:
df = df.dropna()
df.shape

(18001, 67)

In [679]:
print(df[['prev_date','date_p']])

       prev_date     date_p
29    1979-09-02 1979-09-02
30    1979-09-02 1979-09-02
32    1979-09-02 1979-09-02
33    1979-09-02 1979-09-02
34    1979-09-02 1979-09-02
...          ...        ...
19485 2019-12-23 2019-12-23
19486 2019-12-21 2019-12-21
19487 2019-12-22 2019-12-22
19488 2019-12-22 2019-12-22
19489 2019-12-21 2019-12-21

[18001 rows x 2 columns]


In [680]:
df['bye']=df['week']-df['prev_week']

In [681]:
df['bye'].value_counts()

 1.0     16988
 2.0       946
 5.0        28
 9.0        26
-15.0        7
 0.0         6
Name: bye, dtype: int64

In [682]:
#remove unusual cases of teams suspension, etc. Teams should play all weeks except one "bye" week
keeps = [1, 2]

df = df[df.bye.isin(keeps)]


In [683]:
df.columns

Index(['date', 'season', 'week', 'playoff', 'team_id', 'team_score',
       'opponent_id', 'opponent_score', 'favorite', 'spread', 'overunder',
       'home', 'prev_week', 'prev_date', 'days_since', 'points_for',
       'points_against', 'covered_by', 'covered_over_by', 'covered',
       'covered_over', 'push', 'push_over', 'win_by', 'win', 'loss', 'tie',
       'total_wins', 'total_losses', 'win_loss_ratio', 'pfa_ratio', 'net_wins',
       'net_pfa', 'net_pfa_avg', 'date_p', 'week_p', 'playoff_p',
       'team_score_p', 'opponent_id_p', 'opponent_score_p', 'favorite_p',
       'spread_p', 'overunder_p', 'home_p', 'prev_week_p', 'prev_date_p',
       'days_since_p', 'points_for_p', 'points_against_p', 'covered_by_p',
       'covered_over_by_p', 'covered_p', 'covered_over_p', 'push_p',
       'push_over_p', 'win_by_p', 'win_p', 'loss_p', 'tie_p', 'total_wins_p',
       'total_losses_p', 'win_loss_ratio_p', 'pfa_ratio_p', 'net_wins_p',
       'net_pfa_p', 'net_pfa_avg_p', 'bounce_candida

In [684]:
#cull useless columns
df = df[['date', 'season', 'week', 'team_id', 'team_score',
       'opponent_id', 'opponent_score', 'favorite', 'spread', 'overunder',
       'home', 'prev_week','days_since', 'points_for',
       'points_against', 'covered_by', 'covered_over_by', 'covered',
       'covered_over', 'push', 'push_over', 'win_by', 'win', 'loss', 'tie',
       'total_wins', 'total_losses', 'win_loss_ratio', 'pfa_ratio', 'net_wins',
       'net_pfa', 'net_pfa_avg', 'date_p', 'favorite_p', 'home_p', 'covered_by_p',
       'covered_over_by_p', 'covered_p', 'covered_over_p', 'push_p',
       'push_over_p', 'win_by_p', 'win_p', 'loss_p', 'tie_p', 'bounce_candidate', 'bye']]


In [685]:
#break out date info
df['day_of_week'] = df['date'].dt.day_name()
df['month'] = df['date'].dt.month_name()

In [686]:
#scan for strange or missing values
for i in df.columns:
    x = df[i].value_counts()
    print("Column name is:",i,"and it value is:",x)
    print()

Column name is: date and it value is: 2010-01-03    32
2013-12-29    32
2018-12-30    32
2017-01-01    32
2017-12-31    32
              ..
2012-09-27     2
2005-11-21     2
1997-11-24     2
1989-11-27     2
1992-09-14     1
Name: date, Length: 1552, dtype: int64

Column name is: season and it value is: 2018    480
2019    480
2014    480
2008    480
2011    480
2009    480
2002    478
2017    478
2016    476
2013    476
2006    476
2012    476
2007    476
2003    474
2004    472
2015    472
2005    470
2010    470
2001    461
1999    459
2000    457
1995    444
1996    444
1998    444
1997    442
1991    418
1994    416
1993    416
1980    416
1986    414
1989    414
1992    412
1990    412
1988    408
1981    406
1984    406
1985    404
1983    402
1979    401
1987    274
1982    190
Name: season, dtype: int64

Column name is: week and it value is: 16    1222
15    1216
13    1214
14    1212
12    1206
2     1198
11    1148
3     1120
10    1104
9     1068
4     1061
8     1060
5    

In [687]:
df = pd.get_dummies(df, columns = ['day_of_week'], drop_first=True)
df = pd.get_dummies(df, columns = ['month'])

In [688]:
df = pd.get_dummies(df, columns = ['bye'])

In [689]:
df.head()

Unnamed: 0,date,season,week,team_id,team_score,opponent_id,opponent_score,favorite,spread,overunder,home,prev_week,days_since,points_for,points_against,covered_by,covered_over_by,covered,covered_over,push,push_over,win_by,win,loss,tie,total_wins,total_losses,win_loss_ratio,pfa_ratio,net_wins,net_pfa,net_pfa_avg,date_p,favorite_p,home_p,covered_by_p,covered_over_by_p,covered_p,covered_over_p,push_p,push_over_p,win_by_p,win_p,loss_p,tie_p,bounce_candidate,day_of_week_Monday,day_of_week_Saturday,day_of_week_Sunday,day_of_week_Thursday,day_of_week_Tuesday,month_December,month_January,month_November,month_October,month_September,bye_1.0,bye_2.0
29,1979-09-10,1979,2,ATL,14,PHI,10,PHI,-4.0,35.5,0,1.0,8.0,54,44,-8.0,-11.5,0,0,0,0,4,1,0,0,2,0,2.0,1.227273,2,10,5.0,1979-09-02,NO,0.0,-11.0,42.0,0.0,1.0,0.0,0.0,6.0,1.0,0.0,0.0,0,1,0,0,0,0,0,0,0,0,1,1,0
30,1979-09-09,1979,2,CIN,24,BUF,51,CIN,-3.0,34.0,0,1.0,7.0,24,61,-30.0,41.0,0,1,0,0,-27,0,1,0,0,2,0.0,0.393443,-2,-37,-18.5,1979-09-02,DEN,0.0,7.0,-21.5,1.0,0.0,0.0,0.0,-10.0,0.0,1.0,0.0,0,0,0,1,0,0,0,0,0,0,1,1,0
32,1979-09-09,1979,2,DAL,21,SF,13,DAL,-13.0,39.0,0,1.0,7.0,43,34,-5.0,-5.0,0,0,0,0,8,1,0,0,2,0,2.0,1.264706,2,9,4.5,1979-09-02,DAL,0.0,-3.0,6.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0,0,0,1,0,0,0,0,0,0,1,1,0
33,1979-09-06,1979,2,LAR,13,DEN,9,DEN,-2.0,31.5,0,1.0,4.0,30,33,-6.0,-9.5,0,0,0,0,4,1,0,0,1,1,1.0,0.909091,0,-3,-1.5,1979-09-02,LAR,1.0,-11.0,4.5,0.0,1.0,0.0,0.0,-7.0,0.0,1.0,0.0,1,0,0,0,1,0,0,0,0,0,1,1,0
34,1979-09-09,1979,2,MIN,7,CHI,26,CHI,-6.0,31.5,0,1.0,7.0,35,48,13.0,1.5,1,1,0,0,-19,0,1,0,1,1,1.0,0.729167,0,-13,-6.5,1979-09-02,MIN,1.0,-1.0,18.0,0.0,1.0,0.0,0.0,6.0,1.0,0.0,0.0,0,0,0,1,0,0,0,0,0,0,1,1,0


In [690]:
#binarize favorite
#identify teams that lost outright in previous week despite being favored
df['fav'] = np.where(df['team_id'] == df['favorite'], 1,0)
df['fav_p'] = np.where(df['team_id'] == df['favorite_p'], 1,0)

In [691]:
#remove columns we aren't predicting or don't need in the current week
df.columns

Index(['date', 'season', 'week', 'team_id', 'team_score', 'opponent_id',
       'opponent_score', 'favorite', 'spread', 'overunder', 'home',
       'prev_week', 'days_since', 'points_for', 'points_against', 'covered_by',
       'covered_over_by', 'covered', 'covered_over', 'push', 'push_over',
       'win_by', 'win', 'loss', 'tie', 'total_wins', 'total_losses',
       'win_loss_ratio', 'pfa_ratio', 'net_wins', 'net_pfa', 'net_pfa_avg',
       'date_p', 'favorite_p', 'home_p', 'covered_by_p', 'covered_over_by_p',
       'covered_p', 'covered_over_p', 'push_p', 'push_over_p', 'win_by_p',
       'win_p', 'loss_p', 'tie_p', 'bounce_candidate', 'day_of_week_Monday',
       'day_of_week_Saturday', 'day_of_week_Sunday', 'day_of_week_Thursday',
       'day_of_week_Tuesday', 'month_December', 'month_January',
       'month_November', 'month_October', 'month_September', 'bye_1.0',
       'bye_2.0', 'fav', 'fav_p'],
      dtype='object')

In [694]:
df = df[['covered','week','spread', 'overunder', 'home','days_since','win_loss_ratio', 'pfa_ratio', 'net_wins', 'net_pfa', 'net_pfa_avg', 'home_p', 'covered_by_p', 'covered_over_by_p',
       'covered_p', 'covered_over_p','win_by_p',
       'win_p', 'loss_p', 'tie_p', 'bounce_candidate', 'day_of_week_Monday',
       'day_of_week_Saturday', 'day_of_week_Sunday', 'day_of_week_Thursday',
       'day_of_week_Tuesday', 'month_December', 'month_January',
       'month_November', 'month_October', 'month_September',
       'bye_2.0', 'fav', 'fav_p']]

In [695]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17934 entries, 29 to 19489
Data columns (total 34 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   covered               17934 non-null  int32  
 1   week                  17934 non-null  int64  
 2   spread                17934 non-null  float64
 3   overunder             17934 non-null  float64
 4   home                  17934 non-null  int64  
 5   days_since            17934 non-null  float64
 6   win_loss_ratio        17934 non-null  float64
 7   pfa_ratio             17934 non-null  float64
 8   net_wins              17934 non-null  int32  
 9   net_pfa               17934 non-null  int64  
 10  net_pfa_avg           17934 non-null  float64
 11  home_p                17934 non-null  float64
 12  covered_by_p          17934 non-null  float64
 13  covered_over_by_p     17934 non-null  float64
 14  covered_p             17934 non-null  float64
 15  covered_over_p    

In [708]:
df.columns

Index(['covered', 'week', 'spread', 'overunder', 'home', 'days_since',
       'win_loss_ratio', 'pfa_ratio', 'net_wins', 'net_pfa', 'net_pfa_avg',
       'home_p', 'covered_by_p', 'covered_over_by_p', 'covered_p',
       'covered_over_p', 'win_by_p', 'win_p', 'loss_p', 'tie_p',
       'bounce_candidate', 'day_of_week_Monday', 'day_of_week_Saturday',
       'day_of_week_Sunday', 'day_of_week_Thursday', 'day_of_week_Tuesday',
       'month_December', 'month_January', 'month_November', 'month_October',
       'month_September', 'bye_2.0', 'fav', 'fav_p'],
      dtype='object')

In [711]:
df_include = df[['week', 'spread', 'overunder', 'home', 'days_since',
       'win_loss_ratio', 'pfa_ratio', 'net_wins', 'net_pfa', 'net_pfa_avg',
       'home_p', 'covered_by_p', 'covered_over_by_p', 'covered_p',
       'covered_over_p', 'win_by_p', 'win_p', 'loss_p', 'tie_p',
       'bounce_candidate', 'day_of_week_Monday', 'day_of_week_Saturday',
       'day_of_week_Sunday', 'day_of_week_Thursday', 'day_of_week_Tuesday',
       'month_December', 'month_January', 'month_November', 'month_October',
       'month_September', 'bye_2.0', 'fav', 'fav_p']]

In [713]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_df = scaler.fit_transform(df_include)
scaled_df = pd.DataFrame(scaled_df,columns=df_include.columns) 

In [714]:
scaled_df.head()

Unnamed: 0,week,spread,overunder,home,days_since,win_loss_ratio,pfa_ratio,net_wins,net_pfa,net_pfa_avg,home_p,covered_by_p,covered_over_by_p,covered_p,covered_over_p,win_by_p,win_p,loss_p,tie_p,bounce_candidate,day_of_week_Monday,day_of_week_Saturday,day_of_week_Sunday,day_of_week_Thursday,day_of_week_Tuesday,month_December,month_January,month_November,month_October,month_September,bye_2.0,fav,fav_p
0,-1.643606,0.433028,-1.384394,-1.0,0.349187,0.278171,0.30087,0.487772,0.15259,0.7015,-1.000335,-0.810126,3.032757,-0.938121,1.037504,0.405173,1.001339,-0.996771,-0.047869,-0.450262,3.70461,-0.151039,-2.678453,-0.179202,-0.010561,-0.615378,-0.136067,-0.615378,-0.587868,2.16641,-0.23598,-0.999888,-0.986156
1,-1.643606,0.724263,-1.708168,-1.0,-0.207909,-0.906986,-1.319577,-0.490335,-0.570306,-2.598512,-1.000335,0.536283,-1.612355,1.06596,-0.963851,-0.676204,-0.998663,1.003239,-0.047869,-0.450262,-0.269934,-0.151039,0.37335,-0.179202,-0.010561,-0.615378,-0.136067,-0.615378,-0.587868,2.16641,-0.23598,1.000112,-0.986156
2,-1.643606,-2.188086,-0.628922,-1.0,-0.207909,0.278171,0.373617,0.487772,0.137209,0.631287,-1.000335,-0.211722,0.399308,-0.938121,1.037504,0.067243,1.001339,-0.996771,-0.047869,-0.450262,-0.269934,-0.151039,0.37335,-0.179202,-0.010561,-0.615378,-0.136067,-0.615378,-0.587868,2.16641,-0.23598,1.000112,1.014039
3,-1.643606,1.015498,-2.247792,-1.0,-1.879199,-0.314407,-0.317477,-0.001282,-0.04736,-0.21127,0.999665,-0.810126,0.289581,-0.938121,1.037504,-0.473445,-0.998663,1.003239,-0.047869,2.220927,-0.269934,-0.151039,-2.678453,5.580303,-0.010561,-0.615378,-0.136067,-0.615378,-0.587868,2.16641,-0.23598,-0.999888,1.014039
4,-1.643606,-0.149442,-2.247792,-1.0,-0.207909,-0.314407,-0.667138,-0.001282,-0.201168,-0.9134,0.999665,-0.062121,1.277124,-0.938121,1.037504,0.405173,1.001339,-0.996771,-0.047869,-0.450262,-0.269934,-0.151039,0.37335,-0.179202,-0.010561,-0.615378,-0.136067,-0.615378,-0.587868,2.16641,-0.23598,-0.999888,1.014039


In [716]:
from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split(df_include, df['covered'], test_size=0.2)
