In [1]:
import pandas as pd

from datetime import datetime

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer, StandardScaler

from sklearn_pandas import DataFrameMapper
from sklearn.linear_model import LinearRegression, HuberRegressor, Lasso, Ridge, ElasticNet
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

from tensorflow.keras.models import Sequential
from tensorflow.keras import layers, Input, Model
from tensorflow.keras.layers import Dense, Dropout, PReLU, LeakyReLU

import keras

from keras.callbacks import EarlyStopping

In [2]:
pd.set_option("max.columns", None)

In [3]:
df = pd.read_csv("mixed_scores_teams.csv")

In [4]:
df["score"] = 0
for i in df["home_game"]:
    if i == True:
        df["score"] = df["pts_home"]
    if i == False:
        df["score"] = df["pts_visitor"]

In [5]:
del df["Unnamed: 0"], df["date"], df["team"], df["visitor"], df["pts_visitor"], df["home"], df["pts_home"], df["ot"]

In [6]:
df.head(1)

Unnamed: 0,rank_assists,rank_blocks,rank_deffeciency,rank_drebounds,rank_fouls,rank_ft_pct,rank_oeffeciency,rank_opp_assists,rank_opp_blocks,rank_opp_drebounds,rank_opp_fouls,rank_opp_orebounds,rank_opp_ptsfastbreak,rank_opp_ptsfrom2,rank_opp_ptsfrom3,rank_opp_ptsinpaint,rank_opp_ptspergame,rank_opp_steals,rank_opp_threepointpct,rank_opp_turnovers,rank_opp_twopointpct,rank_orebounds,rank_ptsfrom2,rank_ptsfrom3,rank_ptsfromfastbreak,rank_ptsinpaint,rank_scoring,rank_steals,rank_threeptpct,rank_turnovers,rank_twoptpct,start (et),home_game,back_to_back,score
0,7,23,15,16,5,11,15,25,23,18,30,29,18,3,29,2,10,13,21,9,2,26,13,16,4,15,21,5,21,11,5,7:30p,False,False,92


In [7]:
df["start (et)"] = df["start (et)"].str.replace("p", "")

In [8]:
df["start (et)"] = df["start (et)"].astype(str)

In [9]:
target = "score"
X = df.drop(target, axis=1)
y = df[target]

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=13)

In [11]:
mapper = DataFrameMapper([
    (["rank_assists"], StandardScaler()),
    (["rank_blocks"], StandardScaler()),
    (["rank_deffeciency"], StandardScaler()),
    (["rank_drebounds"], StandardScaler()),
    (["rank_fouls"], StandardScaler()),
    (["rank_ft_pct"], StandardScaler()),
    (["rank_oeffeciency"], StandardScaler()),
    (["rank_opp_assists"], StandardScaler()),
    (["rank_opp_blocks"], StandardScaler()),
    (["rank_opp_drebounds"], StandardScaler()),
    (["rank_opp_fouls"], StandardScaler()),
    (["rank_opp_orebounds"], StandardScaler()),
    (["rank_opp_ptsfastbreak"], StandardScaler()),
    (["rank_opp_ptsfrom2"], StandardScaler()),
    (["rank_opp_ptsfrom3"], StandardScaler()),
    (["rank_opp_ptsinpaint"], StandardScaler()),
    (["rank_opp_ptspergame"], StandardScaler()),
    (["rank_opp_steals"], StandardScaler()),
    (["rank_opp_threepointpct"], StandardScaler()),
    (["rank_opp_turnovers"], StandardScaler()),
    (["rank_opp_twopointpct"], StandardScaler()),
    (["rank_orebounds"], StandardScaler()),
    (["rank_ptsfrom2"], StandardScaler()),
    (["rank_ptsfrom3"], StandardScaler()),
    (["rank_ptsfromfastbreak"], StandardScaler()),
    (["rank_ptsinpaint"], StandardScaler()),
    (["rank_scoring"], StandardScaler()),
    (["rank_steals"], StandardScaler()),
    (["rank_threeptpct"], StandardScaler()),
    (["rank_turnovers"], StandardScaler()),
    (["rank_twoptpct"], StandardScaler()),
    ("start (et)", LabelBinarizer()),
    ("home_game", LabelBinarizer()),
    ("back_to_back", LabelBinarizer())],df_out=True)

In [12]:
Z_train = mapper.fit_transform(X_train)
Z_test = mapper.transform(X_test)

In [13]:
model = LinearRegression().fit(Z_train,y_train)
print("LinearRegression train score is " + str(model.score(Z_train,y_train)))
print("LinearRegression test score is " + str(model.score(Z_test,y_test)))

print("Mean squared error is " + str(mean_squared_error(y_test, model.predict(Z_test))**(1/2)))

LinearRegression train score is 0.04228396982617011
LinearRegression test score is 0.03170649640838963
Mean squared error is 12.1888259494245


In [14]:
model = HuberRegressor().fit(Z_train,y_train)
print("HuberRegressor train score is " + str(model.score(Z_train,y_train)))
print("HuberRegressor test score is " + str(model.score(Z_test,y_test)))

print("Mean squared error is " + str(mean_squared_error(y_test, model.predict(Z_test))**(1/2)))

HuberRegressor train score is 0.04076411753415843
HuberRegressor test score is 0.03070203913144476
Mean squared error is 12.19514633751953


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [15]:
model = Lasso().fit(Z_train,y_train)
print("Lasso train score is " + str(model.score(Z_train,y_train)))
print("Lasso test score is " + str(model.score(Z_test,y_test)))

print("Mean squared error is " + str(mean_squared_error(y_test, model.predict(Z_test))**(1/2)))

Lasso train score is 0.02469377106953563
Lasso test score is 0.02044012096992842
Mean squared error is 12.259531135963236


In [16]:
model = Ridge().fit(Z_train,y_train)
print("Ridge train score is " + str(model.score(Z_train,y_train)))
print("Ridge test score is " + str(model.score(Z_test,y_test)))

print("Mean squared error is " + str(mean_squared_error(y_test, model.predict(Z_test))**(1/2)))

Ridge train score is 0.042266653172165713
Ridge test score is 0.03184521643037441
Mean squared error is 12.187952818106378


In [17]:
model = ElasticNet().fit(Z_train,y_train)
print("ElasticNet train score is " + str(model.score(Z_train,y_train)))
print("ElasticNet test score is " + str(model.score(Z_test,y_test)))

print("Mean squared error is " + str(mean_squared_error(y_test, model.predict(Z_test))**(1/2)))

ElasticNet train score is 0.02756003091526915
ElasticNet test score is 0.02310609856709922
Mean squared error is 12.242836953003591


In [18]:
model = RandomForestRegressor().fit(Z_train,y_train)
print("RandomForestRegressor train score is " + str(model.score(Z_train,y_train)))
print("RandomForestRegressor test score is " + str(model.score(Z_test,y_test)))

print("Mean squared error is " + str(mean_squared_error(y_test, model.predict(Z_test))**(1/2)))

RandomForestRegressor train score is 0.8699052272887868
RandomForestRegressor test score is 0.07261370823359214
Mean squared error is 11.92857876398249


In [19]:
model = DecisionTreeRegressor().fit(Z_train,y_train)
print("DecisionTreeRegressor train score is " + str(model.score(Z_train,y_train)))
print("DecisionTreeRegressor test score is " + str(model.score(Z_test,y_test)))

print("Mean squared error is " + str(mean_squared_error(y_test, model.predict(Z_test))**(1/2)))

DecisionTreeRegressor train score is 1.0
DecisionTreeRegressor test score is -0.8044972797601115
Mean squared error is 16.63935480336456


In [20]:
model = AdaBoostRegressor().fit(Z_train,y_train)
print("AdaBoostRegressor train score is " + str(model.score(Z_train,y_train)))
print("AdaBoostRegressor test score is " + str(model.score(Z_test,y_test)))

print("Mean squared error is " + str(mean_squared_error(y_test, model.predict(Z_test))**(1/2)))

AdaBoostRegressor train score is 0.053018544946966006
AdaBoostRegressor test score is 0.01878122067352095
Mean squared error is 12.269907600511875


In [21]:
model = Sequential([
    Input(shape=(Z_train.shape[1],)),
    Dense(32, activation="relu"),
    Dense(16, activation="relu"),
    Dense(8, activation="relu"),
    Dense(4, activation="relu"),
    Dense(1, activation="relu")
])

model.compile(loss="mae", optimizer="adam")

early_stop = EarlyStopping(monitor="val_loss", patience=7, verbose=0, mode="min")

model.fit(Z_train, y_train,
            validation_data=(Z_test, y_test),
            epochs=200, batch_size=8,
            verbose=2, callbacks=[early_stop])

Epoch 1/200
1146/1146 - 2s - loss: 25.5039 - val_loss: 11.5736
Epoch 2/200
1146/1146 - 1s - loss: 11.0529 - val_loss: 10.5879
Epoch 3/200
1146/1146 - 1s - loss: 10.3947 - val_loss: 10.7535
Epoch 4/200
1146/1146 - 1s - loss: 10.1692 - val_loss: 10.4358
Epoch 5/200
1146/1146 - 1s - loss: 10.0263 - val_loss: 10.0590
Epoch 6/200
1146/1146 - 1s - loss: 9.8583 - val_loss: 9.8490
Epoch 7/200
1146/1146 - 1s - loss: 9.8013 - val_loss: 9.8078
Epoch 8/200
1146/1146 - 1s - loss: 9.7443 - val_loss: 9.7284
Epoch 9/200
1146/1146 - 1s - loss: 9.6968 - val_loss: 9.8340
Epoch 10/200
1146/1146 - 1s - loss: 9.6438 - val_loss: 9.7725
Epoch 11/200
1146/1146 - 1s - loss: 9.5637 - val_loss: 9.6845
Epoch 12/200
1146/1146 - 1s - loss: 9.5723 - val_loss: 10.1967
Epoch 13/200
1146/1146 - 1s - loss: 9.5328 - val_loss: 9.8732
Epoch 14/200
1146/1146 - 1s - loss: 9.4904 - val_loss: 9.6571
Epoch 15/200
1146/1146 - 1s - loss: 9.4953 - val_loss: 10.0595
Epoch 16/200
1146/1146 - 1s - loss: 9.4940 - val_loss: 9.7470
Epoch

<tensorflow.python.keras.callbacks.History at 0x7facea5288d0>

In [22]:
X_test.sample(1).to_dict(orient='list')

{'back_to_back': [False],
 'home_game': [True],
 'rank_assists': [16],
 'rank_blocks': [4],
 'rank_deffeciency': [12],
 'rank_drebounds': [5],
 'rank_fouls': [16],
 'rank_ft_pct': [6],
 'rank_oeffeciency': [2],
 'rank_opp_assists': [5],
 'rank_opp_blocks': [14],
 'rank_opp_drebounds': [1],
 'rank_opp_fouls': [20],
 'rank_opp_orebounds': [16],
 'rank_opp_ptsfastbreak': [11],
 'rank_opp_ptsfrom2': [21],
 'rank_opp_ptsfrom3': [10],
 'rank_opp_ptsinpaint': [25],
 'rank_opp_ptspergame': [14],
 'rank_opp_steals': [28],
 'rank_opp_threepointpct': [16],
 'rank_opp_turnovers': [23],
 'rank_opp_twopointpct': [5],
 'rank_orebounds': [2],
 'rank_ptsfrom2': [3],
 'rank_ptsfrom3': [17],
 'rank_ptsfromfastbreak': [5],
 'rank_ptsinpaint': [4],
 'rank_scoring': [2],
 'rank_steals': [18],
 'rank_threeptpct': [5],
 'rank_turnovers': [27],
 'rank_twoptpct': [4],
 'start (et)': ['7:00']}

In [23]:
# Boston - apr. 25, 2021 - scored 104
X_new = pd.DataFrame({
    'back_to_back': [False],
    'home_game': [False],
    'rank_assists': [23],
    'rank_blocks': [10],
    'rank_deffeciency': [12],
    'rank_drebounds': [21],
    'rank_fouls': [22],
    'rank_ft_pct': [17],
    'rank_oeffeciency': [12],
    'rank_opp_assists': [7],
    'rank_opp_blocks': [10],
    'rank_opp_drebounds': [5],
    'rank_opp_fouls': [15],
    'rank_opp_orebounds': [11],
    'rank_opp_ptsfastbreak': [12],
    'rank_opp_ptsfrom2': [5],
    'rank_opp_ptsfrom3': [16],
    'rank_opp_ptsinpaint': [6],
    'rank_opp_ptspergame': [9],
    'rank_opp_steals': [11],
    'rank_opp_threepointpct': [17],
    'rank_opp_turnovers': [14],
    'rank_opp_twopointpct': [13],
    'rank_orebounds': [5],
    'rank_ptsfrom2': [15],
    'rank_ptsfrom3': [11],
    'rank_ptsfromfastbreak': [18],
    'rank_ptsinpaint': [19],
    'rank_scoring': [16],
    'rank_steals': [12],
    'rank_threeptpct': [9],
    'rank_turnovers': [17],
    'rank_twoptpct': [14],
    'start (et)': ['1:00']
})

Z_new = mapper.transform(X_new)
model.predict(Z_new)

array([[94.504684]], dtype=float32)

In [24]:
# Charlotte - apr. 25, 2021 - scored 125
X_new = pd.DataFrame({
    'back_to_back': [False],
    'home_game': [True],
    'rank_assists': [8],
    'rank_blocks': [18],
    'rank_deffeciency': [17],
    'rank_drebounds': [24],
    'rank_fouls': [4],
    'rank_ft_pct': [21],
    'rank_oeffeciency': [18],
    'rank_opp_assists': [30],
    'rank_opp_blocks': [13],
    'rank_opp_drebounds': [17],
    'rank_opp_fouls': [24],
    'rank_opp_orebounds': [27],
    'rank_opp_ptsfastbreak': [24],
    'rank_opp_ptsfrom2': [4],
    'rank_opp_ptsfrom3': [30],
    'rank_opp_ptsinpaint': [13],
    'rank_opp_ptspergame': [13],
    'rank_opp_steals': [24],
    'rank_opp_threepointpct': [16],
    'rank_opp_turnovers': [10],
    'rank_opp_twopointpct': [30],
    'rank_orebounds': [10],
    'rank_ptsfrom2': [25],
    'rank_ptsfrom3': [8],
    'rank_ptsfromfastbreak': [10],
    'rank_ptsinpaint': [21],
    'rank_scoring': [22],
    'rank_steals': [10],
    'rank_threeptpct': [8],
    'rank_turnovers': [26],
    'rank_twoptpct': [21],
    'start (et)': ['1:00']
})

Z_new = mapper.transform(X_new)
model.predict(Z_new)

array([[111.23766]], dtype=float32)

In [25]:
from sklearn.metrics import r2_score

y_hat_score = model.predict(Z_test)[:, 0]
preds = pd.DataFrame({
    'score': y_test,
    'y_hat_score': y_hat_score
})

print(r2_score(preds['score'], preds['y_hat_score']))

0.011054418162669055
