In [1]:
import pandas as pd

from datetime import datetime

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer, StandardScaler

from sklearn_pandas import DataFrameMapper
from sklearn.linear_model import LinearRegression, HuberRegressor, Lasso, Ridge, ElasticNet
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

from tensorflow.keras.models import Sequential
from tensorflow.keras import layers, Input, Model
from tensorflow.keras.layers import Dense, Dropout, PReLU, LeakyReLU

import keras

from keras.callbacks import EarlyStopping

In [2]:
pd.set_option("max.columns", None)

In [3]:
df = pd.read_csv("data/ranks.csv")
del df["Unnamed: 0"]
df1 = pd.read_csv("updated_game_scores.csv")

In [4]:
df_atl = df[(df['team'] == 'Atlanta')]
df_bos = df[(df['team'] == 'Boston')]
df_bkn = df[(df['team'] == 'Brooklyn')]
df_cha = df[(df['team'] == 'Charlotte')]
df_chi = df[(df['team'] == 'Chicago')]
df_cle = df[(df['team'] == 'Cleveland')]
df_dal = df[(df['team'] == 'Dallas')]
df_den = df[(df['team'] == 'Denver')]
df_det = df[(df['team'] == 'Detroit')]
df_gsw = df[(df['team'] == 'Golden State')]
df_hou = df[(df['team'] == 'Houston')]
df_ind = df[(df['team'] == 'Indiana')]
df_lac = df[(df['team'] == 'LA Clippers')]
df_lal = df[(df['team'] == 'LA Lakers')]
df_mem = df[(df['team'] == 'Memphis')]
df_mia = df[(df['team'] == 'Miami')]
df_mil = df[(df['team'] == 'Milwaukee')]
df_min = df[(df['team'] == 'Minnesota')]
df_nop = df[(df['team'] == 'New Orleans')]
df_nyk = df[(df['team'] == 'New York')]
df_okc = df[(df['team'] == 'Okla City')]
df_orl = df[(df['team'] == 'Orlando')]
df_phi = df[(df['team'] == 'Philadelphia')]
df_pho = df[(df['team'] == 'Phoenix')]
df_por = df[(df['team'] == 'Portland')]
df_sac = df[(df['team'] == 'Sacramento')]
df_sas = df[(df['team'] == 'San Antonio')]
df_tor = df[(df['team'] == 'Toronto')]
df_uta = df[(df['team'] == 'Utah')]
df_was = df[(df['team'] == 'Washington')]

In [5]:
df_atl_games = df1[(df1['home'] == 'Atlanta Hawks')| (df1['visitor'] =='Atlanta Hawks')]
df_bos_games = df1[(df1['home'] == 'Boston Celtics')| (df1['visitor'] =='Boston Celtics')]
df_bkn_games = df1[(df1['home'] == 'Brooklyn Nets')| (df1['visitor'] =='Brooklyn Nets')]
df_cha_games = df1[(df1['home'] == 'Charlotte Hornets')| (df1['visitor'] =='Charlotte Hornets')]
df_chi_games = df1[(df1['home'] == 'Chicago Bulls')| (df1['visitor'] =='Chicago Bulls')]
df_cle_games = df1[(df1['home'] == 'Cleveland Cavaliers')| (df1['visitor'] =='Cleveland Cavaliers')]
df_dal_games = df1[(df1['home'] == 'Dallas Mavericks')| (df1['visitor'] =='Dallas Mavericks')]
df_den_games = df1[(df1['home'] == 'Denver Nuggets')| (df1['visitor'] =='Denver Nuggets')]
df_det_games = df1[(df1['home'] == 'Detroit Pistons')| (df1['visitor'] =='Detroit Pistons')]
df_gsw_games = df1[(df1['home'] == 'Golden State Warriors')| (df1['visitor'] =='Golden State Warriors')]
df_hou_games = df1[(df1['home'] == 'Houston Rockets')| (df1['visitor'] =='Houston Rockets')]
df_ind_games = df1[(df1['home'] == 'Indiana Pacers')| (df1['visitor'] =='Indiana Pacers')]
df_lac_games = df1[(df1['home'] == 'Los Angeles Clippers')| (df1['visitor'] =='Los Angeles Clippers')]
df_lal_games = df1[(df1['home'] == 'Los Angeles Lakers')| (df1['visitor'] =='Los Angeles Lakers')]
df_mem_games = df1[(df1['home'] == 'Memphis Grizzlies')| (df1['visitor'] =='Memphis Grizzlies')]
df_mia_games = df1[(df1['home'] == 'Miami Heat')| (df1['visitor'] =='Miami Heat')]
df_mil_games = df1[(df1['home'] == 'Milwaukee Bucks')| (df1['visitor'] =='Milwaukee Bucks')]
df_min_games = df1[(df1['home'] == 'Minnesota Timberwolves')| (df1['visitor'] =='Minnesota Timberwolves')]
df_nop_games = df1[(df1['home'] == 'New Orleans Pelicans')| (df1['visitor'] =='New Orleans Pelicans')]
df_nyk_games = df1[(df1['home'] == 'New York Knicks')| (df1['visitor'] =='New York Knicks')]
df_okc_games = df1[(df1['home'] == 'Oklahoma City Thunder')| (df1['visitor'] =='Oklahoma City Thunder')]
df_orl_games = df1[(df1['home'] == 'Orlando Magic')| (df1['visitor'] =='Orlando Magic')]
df_phi_games = df1[(df1['home'] == 'Philadelphia 76ers')| (df1['visitor'] =='Philadelphia 76ers')]
df_pho_games = df1[(df1['home'] == 'Phoenix Suns')| (df1['visitor'] =='Phoenix Suns')]
df_por_games = df1[(df1['home'] == 'Portland Trail Blazers')| (df1['visitor'] =='Portland Trail Blazers')]
df_sac_games = df1[(df1['home'] == 'Sacramento Kings')| (df1['visitor'] =='Sacramento Kings')]
df_sas_games = df1[(df1['home'] == 'San Antonio Spurs')| (df1['visitor'] =='San Antonio Spurs')]
df_tor_games = df1[(df1['home'] == 'Toronto Raptors')| (df1['visitor'] =='Toronto Raptors')]
df_uta_games = df1[(df1['home'] == 'Utah Jazz')| (df1['visitor'] =='Utah Jazz')] 
df_was_games = df1[(df1['home'] == 'Washington Wizards')| (df1['visitor'] =='Washington Wizards')]

In [6]:
df_merged_lal = pd.merge(df_lal, df_lal_games, on="date")
df_merged_lal["team"] = "Los Angeles Lakers"                       ## Change HERE
df_merged_lal["home_game"] = 0
df_merged_lal["home_game"] = (df_merged_lal['team'] == df_merged_lal["home"])
df_merged_lal["date_time"] = pd.to_datetime(df_merged_lal['date'])
    
days_between = []
days_between.append("0 days")
for i in range(len(df_merged_lal) - 1):
    z = df_merged_lal["date_time"][i+1] - df_merged_lal["date_time"][i]
    days_between.append(z)
        
df_merged_lal["days_between"] = days_between
df_merged_lal["back_to_back"] = (df_merged_lal['days_between'] == "1 days")
    
del df_merged_lal["date_time"], df_merged_lal["days_between"], df_merged_lal["attendance"], df_merged_lal["id"], df_merged_lal["Unnamed: 0"] 

In [7]:
df = df_merged_lal

In [8]:
df.head(1)

Unnamed: 0,date,team,rank_assists,rank_blocks,rank_deffeciency,rank_drebounds,rank_fouls,rank_ft_pct,rank_oeffeciency,rank_opp_assists,rank_opp_blocks,rank_opp_drebounds,rank_opp_fouls,rank_opp_orebounds,rank_opp_ptsfastbreak,rank_opp_ptsfrom2,rank_opp_ptsfrom3,rank_opp_ptsinpaint,rank_opp_ptspergame,rank_opp_steals,rank_opp_threepointpct,rank_opp_turnovers,rank_opp_twopointpct,rank_orebounds,rank_ptsfrom2,rank_ptsfrom3,rank_ptsfromfastbreak,rank_ptsinpaint,rank_scoring,rank_steals,rank_threeptpct,rank_turnovers,rank_twoptpct,start (et),visitor,pts_visitor,home,pts_home,ot,home_game,back_to_back
0,2015-11-03,Los Angeles Lakers,26,4,27,19,23,5,10,29,14,24,10,12,27,30,14,30,29,3,13,30,29,17,25,3,11,26,7,26,23,13,15,10:30p,Denver Nuggets,120,Los Angeles Lakers,109,,True,False


In [9]:
df["score"] = 0
for i in df["home_game"]:
    if i == True:
        df["score"] = df["pts_home"]
    if i == False:
        df["score"] = df["pts_visitor"]

In [10]:
df.head(5)

Unnamed: 0,date,team,rank_assists,rank_blocks,rank_deffeciency,rank_drebounds,rank_fouls,rank_ft_pct,rank_oeffeciency,rank_opp_assists,rank_opp_blocks,rank_opp_drebounds,rank_opp_fouls,rank_opp_orebounds,rank_opp_ptsfastbreak,rank_opp_ptsfrom2,rank_opp_ptsfrom3,rank_opp_ptsinpaint,rank_opp_ptspergame,rank_opp_steals,rank_opp_threepointpct,rank_opp_turnovers,rank_opp_twopointpct,rank_orebounds,rank_ptsfrom2,rank_ptsfrom3,rank_ptsfromfastbreak,rank_ptsinpaint,rank_scoring,rank_steals,rank_threeptpct,rank_turnovers,rank_twoptpct,start (et),visitor,pts_visitor,home,pts_home,ot,home_game,back_to_back,score
0,2015-11-03,Los Angeles Lakers,26,4,27,19,23,5,10,29,14,24,10,12,27,30,14,30,29,3,13,30,29,17,25,3,11,26,7,26,23,13,15,10:30p,Denver Nuggets,120,Los Angeles Lakers,109,,True,False,120
1,2015-11-06,Los Angeles Lakers,27,6,30,20,29,6,8,30,18,28,9,14,25,30,15,30,30,2,12,30,29,15,27,6,10,25,4,22,26,8,12,7:30p,Los Angeles Lakers,104,Brooklyn Nets,98,,False,False,104
2,2015-11-08,Los Angeles Lakers,28,7,29,20,30,6,11,28,20,26,4,17,28,30,4,29,29,1,4,30,29,21,29,11,13,25,4,18,27,8,13,3:30p,Los Angeles Lakers,95,New York Knicks,99,,False,False,95
3,2015-11-10,Los Angeles Lakers,27,9,29,18,30,8,11,28,18,28,10,18,23,30,7,29,29,1,4,30,27,20,29,10,20,29,4,21,27,6,21,7:30p,Los Angeles Lakers,88,Miami Heat,101,,False,False,88
4,2015-11-11,Los Angeles Lakers,29,7,29,17,27,9,18,26,21,27,14,21,20,30,5,28,27,1,1,30,27,21,30,8,15,29,15,22,23,10,25,7:00p,Los Angeles Lakers,99,Orlando Magic,101,,False,True,99


In [11]:
del df["date"], df["team"], df["visitor"], df["pts_visitor"], df["home"], df["pts_home"], df["ot"]

In [12]:
df["start (et)"] = df["start (et)"].str.replace("p", "")

In [13]:
df["start (et)"] = df["start (et)"].astype(str)

In [14]:
df.head(1)

Unnamed: 0,rank_assists,rank_blocks,rank_deffeciency,rank_drebounds,rank_fouls,rank_ft_pct,rank_oeffeciency,rank_opp_assists,rank_opp_blocks,rank_opp_drebounds,rank_opp_fouls,rank_opp_orebounds,rank_opp_ptsfastbreak,rank_opp_ptsfrom2,rank_opp_ptsfrom3,rank_opp_ptsinpaint,rank_opp_ptspergame,rank_opp_steals,rank_opp_threepointpct,rank_opp_turnovers,rank_opp_twopointpct,rank_orebounds,rank_ptsfrom2,rank_ptsfrom3,rank_ptsfromfastbreak,rank_ptsinpaint,rank_scoring,rank_steals,rank_threeptpct,rank_turnovers,rank_twoptpct,start (et),home_game,back_to_back,score
0,26,4,27,19,23,5,10,29,14,24,10,12,27,30,14,30,29,3,13,30,29,17,25,3,11,26,7,26,23,13,15,10:30,True,False,120


In [15]:
target = "score"
X = df.drop(target, axis=1)
y = df[target]

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=13)

In [17]:
mapper = DataFrameMapper([
    (["rank_assists"], StandardScaler()),
    (["rank_blocks"], StandardScaler()),
    (["rank_deffeciency"], StandardScaler()),
    (["rank_drebounds"], StandardScaler()),
    (["rank_fouls"], StandardScaler()),
    (["rank_ft_pct"], StandardScaler()),
    (["rank_oeffeciency"], StandardScaler()),
    (["rank_opp_assists"], StandardScaler()),
    (["rank_opp_blocks"], StandardScaler()),
    (["rank_opp_drebounds"], StandardScaler()),
    (["rank_opp_fouls"], StandardScaler()),
    (["rank_opp_orebounds"], StandardScaler()),
    (["rank_opp_ptsfastbreak"], StandardScaler()),
    (["rank_opp_ptsfrom2"], StandardScaler()),
    (["rank_opp_ptsfrom3"], StandardScaler()),
    (["rank_opp_ptsinpaint"], StandardScaler()),
    (["rank_opp_ptspergame"], StandardScaler()),
    (["rank_opp_steals"], StandardScaler()),
    (["rank_opp_threepointpct"], StandardScaler()),
    (["rank_opp_turnovers"], StandardScaler()),
    (["rank_opp_twopointpct"], StandardScaler()),
    (["rank_orebounds"], StandardScaler()),
    (["rank_ptsfrom2"], StandardScaler()),
    (["rank_ptsfrom3"], StandardScaler()),
    (["rank_ptsfromfastbreak"], StandardScaler()),
    (["rank_ptsinpaint"], StandardScaler()),
    (["rank_scoring"], StandardScaler()),
    (["rank_steals"], StandardScaler()),
    (["rank_threeptpct"], StandardScaler()),
    (["rank_turnovers"], StandardScaler()),
    (["rank_twoptpct"], StandardScaler()),
    ("start (et)", LabelBinarizer()),
    ("home_game", LabelBinarizer()),
    ("back_to_back", LabelBinarizer())],df_out=True)

In [18]:
Z_train = mapper.fit_transform(X_train)
Z_test = mapper.transform(X_test)

In [19]:
model = LinearRegression().fit(Z_train,y_train)
print("LinearRegression train score is " + str(model.score(Z_train,y_train)))
print("LinearRegression test score is " + str(model.score(Z_test,y_test)))

print("Mean squared error is " + str(mean_squared_error(y_test, model.predict(Z_test))**(1/2)))

LinearRegression train score is 0.3113998436876918
LinearRegression test score is -0.45694603132575873
Mean squared error is 12.98776785618503


In [20]:
model = HuberRegressor().fit(Z_train,y_train)
print("HuberRegressor train score is " + str(model.score(Z_train,y_train)))
print("HuberRegressor test score is " + str(model.score(Z_test,y_test)))

print("Mean squared error is " + str(mean_squared_error(y_test, model.predict(Z_test))**(1/2)))

HuberRegressor train score is 0.29949736281177286
HuberRegressor test score is -0.5396528989587543
Mean squared error is 13.35131975413622


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [21]:
model = Lasso().fit(Z_train,y_train)
print("Lasso train score is " + str(model.score(Z_train,y_train)))
print("Lasso test score is " + str(model.score(Z_test,y_test)))

print("Mean squared error is " + str(mean_squared_error(y_test, model.predict(Z_test))**(1/2)))

Lasso train score is 0.09375985585728219
Lasso test score is 0.07589941644805343
Mean squared error is 10.343616428911853


In [22]:
model = Ridge().fit(Z_train,y_train)
print("Ridge train score is " + str(model.score(Z_train,y_train)))
print("Ridge test score is " + str(model.score(Z_test,y_test)))

print("Mean squared error is " + str(mean_squared_error(y_test, model.predict(Z_test))**(1/2)))

Ridge train score is 0.30084828914155715
Ridge test score is -0.3391311539574877
Mean squared error is 12.451576521521083


In [23]:
model = ElasticNet().fit(Z_train,y_train)
print("ElasticNet train score is " + str(model.score(Z_train,y_train)))
print("ElasticNet test score is " + str(model.score(Z_test,y_test)))

print("Mean squared error is " + str(mean_squared_error(y_test, model.predict(Z_test))**(1/2)))

ElasticNet train score is 0.09765130443897108
ElasticNet test score is 0.07066630906414484
Mean squared error is 10.372862617196208


In [24]:
model = RandomForestRegressor().fit(Z_train,y_train)
print("RandomForestRegressor train score is " + str(model.score(Z_train,y_train)))
print("RandomForestRegressor test score is " + str(model.score(Z_test,y_test)))

print("Mean squared error is " + str(mean_squared_error(y_test, model.predict(Z_test))**(1/2)))

RandomForestRegressor train score is 0.8660618360388405
RandomForestRegressor test score is -0.10272470269054157
Mean squared error is 11.299164640300956


In [25]:
model = DecisionTreeRegressor().fit(Z_train,y_train)
print("DecisionTreeRegressor train score is " + str(model.score(Z_train,y_train)))
print("DecisionTreeRegressor test score is " + str(model.score(Z_test,y_test)))

print("Mean squared error is " + str(mean_squared_error(y_test, model.predict(Z_test))**(1/2)))

DecisionTreeRegressor train score is 1.0
DecisionTreeRegressor test score is -1.4621183127683048
Mean squared error is 16.88368512882812


In [26]:
model = AdaBoostRegressor().fit(Z_train,y_train)
print("AdaBoostRegressor train score is " + str(model.score(Z_train,y_train)))
print("AdaBoostRegressor test score is " + str(model.score(Z_test,y_test)))

print("Mean squared error is " + str(mean_squared_error(y_test, model.predict(Z_test))**(1/2)))

AdaBoostRegressor train score is 0.36096795250933855
AdaBoostRegressor test score is -0.039666096876177726
Mean squared error is 10.971341315294097


In [27]:
model = Sequential([
    Input(shape=(Z_train.shape[1],)),
    Dense(8, activation="relu"),
    Dense(4, activation="relu"),
    Dense(1, activation="relu")
])

model.compile(loss="mae", optimizer="adam")

early_stop = EarlyStopping(monitor="val_loss", patience=7, verbose=0, mode="min")

model.fit(Z_train, y_train,
            validation_data=(Z_test, y_test),
            epochs=200, batch_size=8,
            verbose=2, callbacks=[early_stop])

Epoch 1/200
39/39 - 1s - loss: 106.2030 - val_loss: 107.6295
Epoch 2/200
39/39 - 0s - loss: 105.7349 - val_loss: 106.9386
Epoch 3/200
39/39 - 0s - loss: 105.1140 - val_loss: 106.0871
Epoch 4/200
39/39 - 0s - loss: 104.2727 - val_loss: 104.8519
Epoch 5/200
39/39 - 0s - loss: 103.0331 - val_loss: 103.0866
Epoch 6/200
39/39 - 0s - loss: 101.1722 - val_loss: 100.5287
Epoch 7/200
39/39 - 0s - loss: 98.6667 - val_loss: 96.8758
Epoch 8/200
39/39 - 0s - loss: 95.2782 - val_loss: 92.6754
Epoch 9/200
39/39 - 0s - loss: 90.9529 - val_loss: 85.7759
Epoch 10/200
39/39 - 0s - loss: 84.3245 - val_loss: 77.2504
Epoch 11/200
39/39 - 0s - loss: 77.6130 - val_loss: 71.6778
Epoch 12/200
39/39 - 0s - loss: 74.1458 - val_loss: 68.4631
Epoch 13/200
39/39 - 0s - loss: 70.4000 - val_loss: 64.2123
Epoch 14/200
39/39 - 0s - loss: 65.1777 - val_loss: 59.3682
Epoch 15/200
39/39 - 0s - loss: 58.9611 - val_loss: 53.5470
Epoch 16/200
39/39 - 0s - loss: 51.1900 - val_loss: 46.4200
Epoch 17/200
39/39 - 0s - loss: 42.25

<tensorflow.python.keras.callbacks.History at 0x7fb27769a1d0>

In [28]:
X_test.sample(1).to_dict(orient='list')

{'back_to_back': [False],
 'home_game': [True],
 'rank_assists': [30],
 'rank_blocks': [23],
 'rank_deffeciency': [30],
 'rank_drebounds': [23],
 'rank_fouls': [12],
 'rank_ft_pct': [13],
 'rank_oeffeciency': [28],
 'rank_opp_assists': [29],
 'rank_opp_blocks': [27],
 'rank_opp_drebounds': [26],
 'rank_opp_fouls': [25],
 'rank_opp_orebounds': [20],
 'rank_opp_ptsfastbreak': [28],
 'rank_opp_ptsfrom2': [30],
 'rank_opp_ptsfrom3': [15],
 'rank_opp_ptsinpaint': [30],
 'rank_opp_ptspergame': [27],
 'rank_opp_steals': [10],
 'rank_opp_threepointpct': [11],
 'rank_opp_turnovers': [28],
 'rank_opp_twopointpct': [30],
 'rank_orebounds': [12],
 'rank_ptsfrom2': [27],
 'rank_ptsfrom3': [21],
 'rank_ptsfromfastbreak': [18],
 'rank_ptsinpaint': [27],
 'rank_scoring': [28],
 'rank_steals': [22],
 'rank_threeptpct': [30],
 'rank_turnovers': [11],
 'rank_twoptpct': [30],
 'start (et)': ['9:30']}

In [29]:
X_new = pd.DataFrame({
    'back_to_back': [False],
    'home_game': [False],
    'rank_assists': [6],
    'rank_blocks': [1],
    'rank_deffeciency': [3],
    'rank_drebounds': [13],
    'rank_fouls': [14],
    'rank_ft_pct': [25],
    'rank_oeffeciency': [4],
    'rank_opp_assists': [5],
    'rank_opp_blocks': [3],
    'rank_opp_drebounds': [1],
    'rank_opp_fouls': [10],
    'rank_opp_orebounds': [12],
    'rank_opp_ptsfastbreak': [29],
    'rank_opp_ptsfrom2': [9],
    'rank_opp_ptsfrom3': [3],
    'rank_opp_ptsinpaint': [7],
    'rank_opp_ptspergame': [7],
    'rank_opp_steals': [21],
    'rank_opp_threepointpct': [5],
    'rank_opp_turnovers': [7],
    'rank_opp_twopointpct': [8],
    'rank_orebounds': [13],
    'rank_ptsfrom2': [2],
    'rank_ptsfrom3': [21],
    'rank_ptsfromfastbreak': [2],
    'rank_ptsinpaint': [2],
    'rank_scoring': [7],
    'rank_steals': [6],
    'rank_threeptpct': [11],
    'rank_turnovers': [15],
    'rank_twoptpct': [3],
    'start (et)': ['8:00']
})

Z_new = mapper.transform(X_new)
model.predict(Z_new)

array([[114.10031]], dtype=float32)

In [30]:
from sklearn.metrics import r2_score

y_hat_score = model.predict(Z_test)[:, 0]
preds = pd.DataFrame({
    'score': y_test,
    'y_hat_score': y_hat_score
})

print(r2_score(preds['score'], preds['y_hat_score']))

-0.8824999721996474
