In [44]:
#Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [45]:
#Load the combined spreads/ratings table, using the CSV's first column as the row index
source_csv = "combined_edited_spreads_and_ratings_df.csv"
df = pd.read_csv(source_csv, index_col=0)

In [46]:
#List the distinct month numbers present in the data
#Each of them becomes its own dummy column, holding a 1 when the game is played in that month
df['Month'].unique()

array([ 9, 10, 11, 12,  1,  8], dtype=int64)

In [47]:
#One-hot encode Month and append the indicator columns (named by month number) to the frame
month_dummies = pd.get_dummies(df['Month'])
df = pd.concat([df, month_dummies], axis=1)
df.head()

Unnamed: 0,Date,Month,Team1Site,Team2Site,Team1,Team1_Win_Precentage,Final1,Open1,Close1,ML1,...,Spread_Close,Team1_Point_Dif,Team1Cover,Over_Under,1,8,9,10,11,12
0,9/02/2010,9,H,V,SouthCarolina,0.642857,41,13.5,13.0,-500,...,-13.0,28,1,O,0,0,1,0,0,0
1,9/02/2010,9,H,V,OhioState,0.923077,45,28.5,28.0,-5500,...,-28.0,38,1,O,0,0,1,0,0,0
2,9/02/2010,9,H,V,IowaState,0.416667,27,3.0,4.0,-190,...,-4.0,17,1,U,0,0,1,0,0,0
3,9/02/2010,9,H,V,Utah,0.769231,27,3.5,3.5,-160,...,-3.5,3,0,O,0,0,1,0,0,0
4,9/02/2010,9,H,V,Hawaii,0.714286,36,53.5,51.0,752,...,20.5,-13,1,O,0,0,1,0,0,0


In [48]:
#Swap the numeric month headers for month names so the dummy columns are easier to interpret,
#then discard the original Month column now that it has been one-hot encoded
#(index=str also converts the row labels to strings)
month_names = {1: "Jan", 8: "Aug", 9: "Sept", 10: "Oct", 11: "Nov", 12: "Dec"}
df = df.rename(index=str, columns=month_names).drop('Month', axis=1)
df.head()

Unnamed: 0,Date,Team1Site,Team2Site,Team1,Team1_Win_Precentage,Final1,Open1,Close1,ML1,Team1_Conf,...,Spread_Close,Team1_Point_Dif,Team1Cover,Over_Under,Jan,Aug,Sept,Oct,Nov,Dec
0,9/02/2010,H,V,SouthCarolina,0.642857,41,13.5,13.0,-500,SEC,...,-13.0,28,1,O,0,0,1,0,0,0
1,9/02/2010,H,V,OhioState,0.923077,45,28.5,28.0,-5500,Big Ten,...,-28.0,38,1,O,0,0,1,0,0,0
2,9/02/2010,H,V,IowaState,0.416667,27,3.0,4.0,-190,Big 12,...,-4.0,17,1,U,0,0,1,0,0,0
3,9/02/2010,H,V,Utah,0.769231,27,3.5,3.5,-160,MWC,...,-3.5,3,0,O,0,0,1,0,0,0
4,9/02/2010,H,V,Hawaii,0.714286,36,53.5,51.0,752,WAC,...,20.5,-13,1,O,0,0,1,0,0,0


In [49]:
#Convert both Team Site columns to numbers: 'N' -> 0, 'H' -> 1, 'V' -> -1
#(any value outside this mapping becomes NaN)
#Should not matter that much since most of the team1s are home and team2s are away,
#but could have future implications
site_codes = {'N': 0, 'H': 1, 'V': -1}
for site_col in ('Team1Site', 'Team2Site'):
    df[site_col] = df[site_col].map(site_codes)

In [50]:
#One-hot encode Team1's conference, after stripping spaces out of the conference names
df["Team1_Conf"] = df["Team1_Conf"].str.replace(' ', '')
team1_conf_dummies = pd.get_dummies(df['Team1_Conf'])
df = pd.concat([df, team1_conf_dummies], axis=1)

In [51]:
#Prefix the Team1 conference dummy columns with "Team1_"
#The dummies were appended last, one per distinct non-null Team1_Conf value, so the number
#of trailing columns to rename is derived from the data instead of being hard coded as 14
n_conf_cols = df['Team1_Conf'].nunique()
cols = list(df)
#Split on an explicit index: a negative slice would select every column if the count were 0
split_at = len(cols) - n_conf_cols
my_new_list = ['Team1_'+ x for x in cols[split_at:]]
cols = cols[:split_at] + my_new_list
df.columns = cols
df.to_csv('transformation_CSVs/transformed_data1.csv')

In [52]:
#Get dummy columns for Team2's conferences
#The dummies must be built from Team2_Conf; building them from Team1_Conf (copy-paste slip)
#would just duplicate Team1's conference indicators under Team2 names
#NOTE: the following cells assume this appends exactly 14 columns -- re-check their
#hard-coded counts/indices if Team2 has a different number of distinct conferences
df["Team2_Conf"] = df["Team2_Conf"].str.replace(' ', '')
df = pd.concat([df, pd.get_dummies(df['Team2_Conf'])], axis=1)

In [53]:
#Prefix the conference dummy columns that were just appended with "Team2_"
#BEWARE: hard coded -- assumes the last 14 columns are exactly those dummies
column_names = list(df.columns)
untouched, conf_dummies = column_names[:-14], column_names[-14:]
df.columns = untouched + ['Team2_' + name for name in conf_dummies]
#The raw conference columns are no longer needed once both teams are one-hot encoded
df.drop(['Team1_Conf', 'Team2_Conf'], axis=1, inplace=True)
df.to_csv('transformation_CSVs/transformed_data2.csv')

In [54]:
# Reorder the columns of df: build the desired order in a plain list of column
# names, then reindex the frame with .loc and save the result.
# very hard coded: every position and count below is a literal, so any change to
# the upstream column layout silently moves the wrong columns.
# NOTE(review): cols.index(name) finds the first match, so this assumes column
# names are unique -- confirm.
cols = list(df)
# move the column to head of list using index, pop and insert
# Each slice (cols[51:57], cols[-14:]) is a copy taken before its loop starts, so
# the pop/insert calls inside the loop do not change which names are iterated.
# The six names found at positions 51-56 are re-inserted at positions 1-6
# (presumably the month dummy columns -- verify against the current layout).
for ind,month in zip(range(1,7),cols[51:57]):
    cols.insert(ind,cols.pop(cols.index(month)))
    
# The 14 names currently at the end of the list are re-inserted at positions 48-61
# (presumably the Team2 conference dummies -- TODO confirm).
for ind,conf in zip(range(48,62),cols[-14:]):
    cols.insert(ind,cols.pop(cols.index(conf)))

# The 14 names that are at the end after the previous loop are re-inserted at
# positions 30-43 (presumably the Team1 conference dummies -- TODO confirm).
for ind,conf in zip(range(30,44),cols[-14:]):
    cols.insert(ind,cols.pop(cols.index(conf)))
#cols
#use loc to reorder
df = df.loc[:, cols]
df.to_csv('transformation_CSVs/transformed_data3.csv')

In [55]:
#Build the feature matrix X and the label y
#For this particular problem the label is whether or not the team we designate as team 1 covers the spread
#Columns removed from the features (this includes the label column itself)
non_feature_cols = ['Date','Team1','Final1','Open1','Close1','Team2','Final2','Open2','Close2',
                    'Team1_Win','Total_Open','Total_Close','Team1_Point_Dif','Team1Cover','Over_Under']
y = df['Team1Cover']
X = df.drop(columns=non_feature_cols)

In [56]:
#Write the full feature matrix and the label series out as CSV files
outputs = (
    (X, "transformation_CSVs/X_data.csv"),
    (y, "transformation_CSVs/y_cover_yn_data.csv"),
)
for data, csv_path in outputs:
    data.to_csv(csv_path)

In [57]:
#Hold out 30% of the rows as a test set; the fixed random_state makes the split repeatable
#Will use various validation tests as well
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=50,
)

In [58]:
#Save each of the split variables into a CSV
#(one copy under transformation_CSVs/ and one in the working directory)
#The loop variable is deliberately not named df: reusing that name would overwrite the
#notebook-level DataFrame df and leave it bound to y_test after the loop
variable_array = [X_train, X_test, y_train, y_test]
strings = ['X_train', 'X_test', 'y_train', 'y_test']
for split_data, name in zip(variable_array, strings):
    split_data.to_csv('transformation_CSVs/'+name+'.csv')
    split_data.to_csv(name+'.csv')

In [59]:
#Can save a version of scaled data, makes more sense to do this in the prelim model training notebook though
#Fit the scaler on the training features only, then replace X_train with its scaled values
#(X_train is a plain array, not a DataFrame, after this cell)
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)

In [61]:
#Saving to a csv, might need it for something
#Scaling left X_train as a bare array, so re-attach the feature names from X
#Pass the names as a flat Index: wrapping them in an extra list ([list(X)]) makes pandas
#build a one-level MultiIndex header instead of ordinary column labels
X_train = pd.DataFrame(X_train, columns=X.columns)
X_train.to_csv('transformation_CSVs/scale_X_train.csv')