In [1]:
import pandas as pd
import numpy as np
import json
from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report


In [2]:
df = pd.read_csv('nba_elo.csv')
df.shape

(72039, 27)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72039 entries, 0 to 72038
Data columns (total 27 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   date            72039 non-null  object 
 1   season          72039 non-null  int64  
 2   neutral         72039 non-null  int64  
 3   playoff         4626 non-null   object 
 4   team1           72039 non-null  object 
 5   team2           72039 non-null  object 
 6   elo1_pre        72039 non-null  float64
 7   elo2_pre        72039 non-null  float64
 8   elo_prob1       72039 non-null  float64
 9   elo_prob2       72039 non-null  float64
 10  elo1_post       72026 non-null  float64
 11  elo2_post       72026 non-null  float64
 12  carm-elo1_pre   5249 non-null   float64
 13  carm-elo2_pre   5249 non-null   float64
 14  carm-elo_prob1  5249 non-null   float64
 15  carm-elo_prob2  5249 non-null   float64
 16  carm-elo1_post  5249 non-null   float64
 17  carm-elo2_post  5249 non-null  

In [4]:
df = df.drop(df[df['score1'].isna()].index)

changes in league scoring system, from 2010 - only taking games from after then

In [5]:
as_datetime = pd.to_datetime(df['date'])

date_bound = pd.Timestamp('2010-01-01 00:00:00')

df.loc[:, 'date_as_dt'] = as_datetime
df.drop('date', axis=1)

df = df[df['date_as_dt'] > date_bound]
df = df.drop('date', axis=1)

df = df.reset_index().drop('index', axis=1)

In [6]:
df.shape

(16034, 27)

dropping duplicates in case there are any

In [7]:
df = df.drop_duplicates()

dropping features we don't want / need

In [8]:
df.columns

Index(['season', 'neutral', 'playoff', 'team1', 'team2', 'elo1_pre',
       'elo2_pre', 'elo_prob1', 'elo_prob2', 'elo1_post', 'elo2_post',
       'carm-elo1_pre', 'carm-elo2_pre', 'carm-elo_prob1', 'carm-elo_prob2',
       'carm-elo1_post', 'carm-elo2_post', 'raptor1_pre', 'raptor2_pre',
       'raptor_prob1', 'raptor_prob2', 'score1', 'score2', 'quality',
       'importance', 'total_rating', 'date_as_dt'],
      dtype='object')

In [9]:
feats_to_drop = ['season', 'neutral', 'elo_prob1', 'elo_prob2', 'elo1_post', 'elo2_post', 'carm-elo1_pre', 'carm-elo2_pre', 'carm-elo_prob1', 'carm-elo_prob2',
       'carm-elo1_post', 'carm-elo2_post', 'raptor1_pre', 'raptor2_pre',
       'raptor_prob1', 'raptor_prob2', 'quality',
       'importance', 'total_rating']

In [10]:
df = df.drop(feats_to_drop, axis=1)
df.shape

(16034, 8)

feature engineering target variable

In [11]:
win_col_bool = (df.score1 < df.score2)
df['team_won'] = win_col_bool.astype('int') + 1
df = df.drop(['score1', 'score2'], axis=1)
df.shape

(16034, 7)

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16034 entries, 0 to 16033
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   playoff     1082 non-null   object        
 1   team1       16034 non-null  object        
 2   team2       16034 non-null  object        
 3   elo1_pre    16034 non-null  float64       
 4   elo2_pre    16034 non-null  float64       
 5   date_as_dt  16034 non-null  datetime64[ns]
 6   team_won    16034 non-null  int32         
dtypes: datetime64[ns](1), float64(2), int32(1), object(3)
memory usage: 939.5+ KB


In [13]:
df['playoff'].unique()

array([nan, 't', 'q', 's', 'c', 'f', 'p'], dtype=object)

In [14]:
df['enc_playoff'] = df['playoff'].notna().astype(int)

In [15]:
df = df.drop('playoff', axis=1)
df.shape

(16034, 7)

In [16]:
df.head()

Unnamed: 0,team1,team2,elo1_pre,elo2_pre,date_as_dt,team_won,enc_playoff
0,BOS,TOR,1660.5702,1496.2823,2010-01-02,1,0
1,IND,MIN,1374.8478,1304.9703,2010-01-02,1,0
2,NJN,CLE,1302.0889,1742.0925,2010-01-02,2,0
3,UTA,DEN,1560.92,1598.8054,2010-01-02,2,0
4,POR,GSW,1574.6274,1397.9839,2010-01-02,1,0


preparing to add more features

In [17]:
df.date_as_dt = df.date_as_dt.astype(str)

In [18]:
team_abbrev_map = {'BOS': "Boston", 'IND': "Indiana", 'NJN': "Brooklyn", 'UTA': "Utah", 'POR': "Portland", 'PHO': "Phoenix", 'NOP': "New Orleans", 'MIA': "Miami", 'WAS': "Washington",
 'CHI': "Chicago", 'MIL': "Milwaukee", 'SAC': "Sacramento", 'CLE': "Cleveland", 'LAL': "LA Lakers", 'DEN': "Denver", 'NYK': "New York", 'TOR': "Toronto", 'LAC': "LA Clippers",
 'PHI': "Philadelphia", 'CHO': "Charlotte", 'DAL': "Dallas", 'MIN': "Minnesota", 'SAS': "San Antonio", 'OKC': "Okla City", 'ORL': "Orlando", 'ATL': "Atlanta", 'GSW': "Golden State",
 'MEM': "Memphis", 'HOU': "Houston", 'DET': "Detroit", 'BRK': "Brooklyn"}

# for abb in team_abbrev_map:
#     print(f"{abb}: {team_abbrev_map[abb]}")

In [19]:
df['team1'] = df['team1'].map(team_abbrev_map)
df['team2'] = df['team2'].map(team_abbrev_map)
df

Unnamed: 0,team1,team2,elo1_pre,elo2_pre,date_as_dt,team_won,enc_playoff
0,Boston,Toronto,1660.570200,1496.282300,2010-01-02,1,0
1,Indiana,Minnesota,1374.847800,1304.970300,2010-01-02,1,0
2,Brooklyn,Cleveland,1302.088900,1742.092500,2010-01-02,2,0
3,Utah,Denver,1560.920000,1598.805400,2010-01-02,2,0
4,Portland,Golden State,1574.627400,1397.983900,2010-01-02,1,0
...,...,...,...,...,...,...,...
16029,Milwaukee,Boston,1631.037053,1722.739869,2022-05-13,2,1
16030,Golden State,Memphis,1629.192597,1647.657708,2022-05-13,1,1
16031,Boston,Milwaukee,1735.369036,1618.407885,2022-05-15,1,1
16032,Phoenix,Dallas,1650.983771,1628.637291,2022-05-15,2,1


In [20]:
# defensive
df_def = pd.read_csv('def_extra_info.csv')

In [22]:
try_merge = df.merge(df_def, left_on = ['team1', 'date_as_dt'], right_on=['Team', 'date'], how='left')
df = try_merge.merge(df_def, left_on = ['team2', 'date_as_dt'], right_on=['Team', 'date'], how='left', suffixes=('_defend1', '_defend2'))
df

Unnamed: 0,team1,team2,elo1_pre,elo2_pre,date_as_dt,team_won,enc_playoff,Team_defend1,Last 3_defend1,Last 1_defend1,Home_defend1,Away_defend1,date_defend1,Team_defend2,Last 3_defend2,Last 1_defend2,Home_defend2,Away_defend2,date_defend2
0,Boston,Toronto,1660.570200,1496.282300,2010-01-02,1,0,Boston,1.063,1.187,0.983,0.97,2010-01-02,Toronto,0.966,1.083,1.052,1.151,2010-01-02
1,Indiana,Minnesota,1374.847800,1304.970300,2010-01-02,1,0,Indiana,1.105,1.169,1.007,1.054,2010-01-02,Minnesota,1.119,1.125,1.042,1.093,2010-01-02
2,Brooklyn,Cleveland,1302.088900,1742.092500,2010-01-02,2,0,Brooklyn,1.058,0.987,1.049,1.074,2010-01-02,Cleveland,0.94,1.103,1.004,0.994,2010-01-02
3,Utah,Denver,1560.920000,1598.805400,2010-01-02,2,0,Utah,0.918,0.909,1.02,1.024,2010-01-02,Denver,1.09,1.053,1.013,1.048,2010-01-02
4,Portland,Golden State,1574.627400,1397.983900,2010-01-02,1,0,Portland,1.109,1.054,1.037,1.007,2010-01-02,Golden State,1.095,1.179,1.056,1.144,2010-01-02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16029,Milwaukee,Boston,1631.037053,1722.739869,2022-05-13,2,1,Milwaukee,1.072,1.114,1.067,1.081,2022-05-13,Boston,1.062,1.145,1.039,1.038,2022-05-13
16030,Golden State,Memphis,1629.192597,1647.657708,2022-05-13,1,1,Golden State,1.084,1.28,1.01,1.074,2022-05-13,Memphis,1.065,0.907,1.034,1.069,2022-05-13
16031,Boston,Milwaukee,1735.369036,1618.407885,2022-05-15,1,1,Boston,1.056,0.957,1.039,1.036,2022-05-15,Milwaukee,1.117,1.088,1.068,1.081,2022-05-15
16032,Phoenix,Dallas,1650.983771,1628.637291,2022-05-15,2,1,Phoenix,1.059,1.183,1.043,1.059,2022-05-15,Dallas,1.035,0.9,1.035,1.094,2022-05-15


In [23]:
# offensive
df_off = pd.read_csv('off_extra_info.csv')

In [24]:
try_merge = df.merge(df_off, left_on = ['team1', 'date_as_dt'], right_on=['Team', 'date'], how='left')
df = try_merge.merge(df_off, left_on = ['team2', 'date_as_dt'], right_on=['Team', 'date'], how='left', suffixes=('_offense1', '_offense2'))
df

Unnamed: 0,team1,team2,elo1_pre,elo2_pre,date_as_dt,team_won,enc_playoff,Team_defend1,Last 3_defend1,Last 1_defend1,...,Last 1_offense1,Home_offense1,Away_offense1,date_offense1,Team_offense2,Last 3_offense2,Last 1_offense2,Home_offense2,Away_offense2,date_offense2
0,Boston,Toronto,1660.570200,1496.282300,2010-01-02,1,0,Boston,1.063,1.187,...,1.003,1.082,1.038,2010-01-02,Toronto,1.117,1.125,1.082,1.078,2010-01-02
1,Indiana,Minnesota,1374.847800,1304.970300,2010-01-02,1,0,Indiana,1.105,1.169,...,1.062,0.983,0.966,2010-01-02,Minnesota,1.004,0.997,0.956,0.974,2010-01-02
2,Brooklyn,Cleveland,1302.088900,1742.092500,2010-01-02,2,0,Brooklyn,1.058,0.987,...,1.081,0.949,0.949,2010-01-02,Cleveland,1.084,1.157,1.097,1.05,2010-01-02
3,Utah,Denver,1560.920000,1598.805400,2010-01-02,2,0,Utah,0.918,0.909,...,0.899,1.1,0.996,2010-01-02,Denver,1.007,1.003,1.129,1.043,2010-01-02
4,Portland,Golden State,1574.627400,1397.983900,2010-01-02,1,0,Portland,1.109,1.054,...,1.097,1.103,1.019,2010-01-02,Golden State,1.104,1.122,1.055,1.061,2010-01-02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16029,Milwaukee,Boston,1631.037053,1722.739869,2022-05-13,2,1,Milwaukee,1.072,1.114,...,1.145,1.106,1.113,2022-05-13,Boston,1.072,1.114,1.105,1.11,2022-05-13
16030,Golden State,Memphis,1629.192597,1647.657708,2022-05-13,1,1,Golden State,1.084,1.28,...,0.907,1.112,1.075,2022-05-13,Memphis,1.084,1.28,1.123,1.084,2022-05-13
16031,Boston,Milwaukee,1735.369036,1618.407885,2022-05-15,1,1,Boston,1.056,0.957,...,1.088,1.105,1.109,2022-05-15,Milwaukee,1.056,0.957,1.103,1.113,2022-05-15
16032,Phoenix,Dallas,1650.983771,1628.637291,2022-05-15,2,1,Phoenix,1.059,1.183,...,0.9,1.134,1.1,2022-05-15,Dallas,1.059,1.183,1.106,1.086,2022-05-15


In [25]:
df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16024,16025,16026,16027,16028,16029,16030,16031,16032,16033
team1,Boston,Indiana,Brooklyn,Utah,Portland,Phoenix,New Orleans,Miami,Washington,Chicago,...,Phoenix,Boston,Memphis,Philadelphia,Dallas,Milwaukee,Golden State,Boston,Phoenix,Miami
team2,Toronto,Minnesota,Cleveland,Denver,Golden State,Memphis,Houston,Charlotte,San Antonio,Orlando,...,Dallas,Milwaukee,Golden State,Miami,Phoenix,Boston,Memphis,Milwaukee,Dallas,Boston
elo1_pre,1660.5702,1374.8478,1302.0889,1560.92,1574.6274,1584.7948,1455.8378,1496.2817,1386.8387,1456.0247,...,1655.491645,1733.148803,1628.364233,1620.326163,1611.487285,1631.037053,1629.192597,1735.369036,1650.983771,1663.46029
elo2_pre,1496.2823,1304.9703,1742.0925,1598.8054,1397.9839,1483.7445,1605.8527,1474.0052,1605.9158,1652.8103,...,1624.129417,1620.628118,1648.486072,1651.130563,1668.133777,1722.739869,1647.657708,1618.407885,1628.637291,1743.269118
date_as_dt,2010-01-02,2010-01-02,2010-01-02,2010-01-02,2010-01-02,2010-01-02,2010-01-02,2010-01-02,2010-01-02,2010-01-02,...,2022-05-10,2022-05-11,2022-05-11,2022-05-12,2022-05-12,2022-05-13,2022-05-13,2022-05-15,2022-05-15,2022-05-17
team_won,1,1,2,2,1,2,1,2,2,1,...,1,2,1,2,1,2,1,1,2,1
enc_playoff,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
Team_defend1,Boston,Indiana,Brooklyn,Utah,Portland,Phoenix,New Orleans,Miami,Washington,Chicago,...,Phoenix,Boston,Memphis,Philadelphia,Dallas,Milwaukee,Golden State,Boston,Phoenix,Miami
Last 3_defend1,1.063,1.105,1.058,0.918,1.109,1.079,1.05,0.964,1.073,0.914,...,1.115,0.982,1.082,1.067,1.07,1.072,1.084,1.056,1.059,0.988
Last 1_defend1,1.187,1.169,0.987,0.909,1.054,1.003,0.998,1.107,1.143,0.91,...,1.153,1.068,0.955,1.224,1.155,1.114,1.28,0.957,1.183,0.935


In [26]:
new_feats_to_drop = ['Team_defend1', 'Team_defend2', 'Team_offense1', 'Team_offense2', 'date_defend1', 'date_defend2', 'date_offense1', 'date_offense2']

In [27]:
df = df.drop(new_feats_to_drop, axis=1)
df.shape

(16034, 23)

In [28]:
df = df.dropna()
df.shape

(16025, 23)

creating game results for opposing teams

In [29]:
df2 = df.copy()
# switching team names
df2['team1'] = df['team2']
df2['team2'] = df['team1']

# switching defense
df2['Last 3_defend1'] = df['Last 3_defend2']
df2['Last 3_defend2'] = df['Last 3_defend1']
df2['Last 1_defend1'] = df['Last 1_defend2']
df2['Last 1_defend2'] = df['Last 1_defend1']
df2['Home_defend1'] = df['Home_defend2']
df2['Home_defend2'] = df['Home_defend1']
df2['Away_defend1'] = df['Away_defend2']
df2['Away_defend2'] = df['Away_defend1']

# switching offense
df2['Last 3_offense1'] = df['Last 3_offense2']
df2['Last 1_offense1'] = df['Last 1_offense2']
df2['Home_offense1'] = df['Home_offense2']
df2['Away_offense1'] = df['Away_offense2']
df2['Last 3_offense2'] = df['Last 3_offense1']
df2['Last 1_offense2'] = df['Last 1_offense1']
df2['Home_offense2'] = df['Home_offense1']
df2['Away_offense2'] = df['Away_offense1']
# switching team elos
df2['elo_1_pre'] = df['elo2_pre']
df2['elo_2_pre'] = df['elo1_pre']
df2['elo1_pre'] = df2['elo_1_pre']
df2['elo2_pre'] = df2['elo_2_pre']
df2 = df2.drop(['elo_1_pre', 'elo_2_pre'], axis=1)
# switching team win
df2['team_won'] = (df2['team_won'] == 1).astype(int) + 1

In [30]:
df = pd.concat([df, df2], axis=0)
df.shape

In [31]:
df = df.drop_duplicates()

splitting data into train and test according to date

In [32]:
as_datetime = pd.to_datetime(df['date_as_dt'])

df.loc[:, 'date_as_dt'] = as_datetime

date_bound = pd.Timestamp('2018-01-01 00:00:00')
train = df[df['date_as_dt'] < date_bound]
test = df[~(df['date_as_dt'] < date_bound)]

train.shape, test.shape

((10325, 23), (5700, 23))

In [33]:
X_train = train.drop('team_won', axis=1)
y_train = train['team_won']

X_test = test.drop('team_won', axis=1)
y_test = test['team_won']

X_train = X_train.reset_index().drop('index', axis=1)
X_test = X_test.reset_index().drop('index', axis=1)
y_train = y_train.reset_index()['team_won']
y_test = y_test.reset_index()['team_won']

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((10325, 22), (5700, 22), (10325,), (5700,))

In [34]:
X_train.shape[0] / df.shape[0], X_test.shape[0] / df.shape[0]

(0.6443057722308893, 0.35569422776911075)

label encoding team names

In [35]:
cols = ['team1', 'team2']
x_train_team_names = X_train[cols]
x_test_team_names = X_test[cols]

In [36]:
cols = ['team1', 'team2']

le = LabelEncoder()
le.fit(X_train['team1'])

X_test['team1'] = X_test['team1'].map(lambda s: '<unknown>' if s not in le.classes_ else s)
X_test['team2'] = X_test['team2'].map(lambda s: '<unknown>' if s not in le.classes_ else s)

le.classes_ = np.append(le.classes_, '<unknown>')

X_train['enc_team1'] = le.transform(X_train['team1'])
X_train['enc_team2'] = le.transform(X_train['team2'])
X_test['enc_team1'] = le.transform(X_test['team1'])
X_test['enc_team2'] = le.transform(X_test['team2'])

X_train = X_train.drop(cols, axis=1)
X_test = X_test.drop(cols, axis=1)
X_train.shape, X_test.shape

((10325, 22), (5700, 22))

In [37]:
ob_cols = ['Last 3_defend1',
       'Last 1_defend1', 'Home_defend1', 'Away_defend1', 'Last 3_defend2',
       'Last 1_defend2', 'Home_defend2', 'Away_defend2', 'Last 3_offense1',
       'Last 1_offense1', 'Home_offense1', 'Away_offense1', 'Last 3_offense2',
       'Last 1_offense2', 'Home_offense2', 'Away_offense2']
for col in ob_cols:
    train_bad_samps = X_train[X_train[col] == '--'].index
    X_train = X_train.drop(train_bad_samps, axis=0)
    y_train = y_train.drop(train_bad_samps, axis=0)
for col in ob_cols:
    test_bad_samps = X_test[X_test[col] == '--'].index
    X_test = X_test.drop(test_bad_samps, axis=0)
    y_test = y_test.drop(test_bad_samps, axis=0)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((9969, 22), (5532, 22), (9969,), (5532,))

turning datetime into integer in order to put into model

In [38]:
x_train_dates = X_train['date_as_dt']
x_test_dates = X_test['date_as_dt']
X_train['date_as_dt'] = X_train['date_as_dt'].apply(lambda x: x.value) / 10**9
X_test['date_as_dt'] = X_test['date_as_dt'].apply(lambda x: x.value) / 10**9

In [39]:
scaler = StandardScaler()
scaler.fit(X_train)

X_train_ss = scaler.transform(X_train)
X_test_ss = scaler.transform(X_test)


X_train = pd.DataFrame(data=X_train_ss, columns=X_train.columns)
X_test = pd.DataFrame(data=X_test_ss, columns=X_test.columns)
X_train.shape, X_test.shape

((9969, 22), (5532, 22))

baseline model 1

In [40]:
from sklearn.linear_model import LogisticRegression
y_train = y_train.ravel()
lr = LogisticRegression()
lr.fit(X_train, y_train)

LogisticRegression()

In [41]:
y_pred = lr.predict(X_test)

print(classification_report(y_test, y_pred))
y_pred.shape[0] - (y_pred == y_test).sum()

              precision    recall  f1-score   support

           1       0.66      0.78      0.72      3118
           2       0.63      0.48      0.55      2414

    accuracy                           0.65      5532
   macro avg       0.65      0.63      0.63      5532
weighted avg       0.65      0.65      0.64      5532



1933

baseline model using elos

In [42]:
elo_col_bool = (X_test['elo1_pre'] < X_test['elo2_pre']).astype(int)
results = elo_col_bool + 1
(results == y_test.reset_index()['team_won']).sum() / X_test.shape[0]

0.6404555314533622

In [43]:
print(classification_report(y_test, results))

              precision    recall  f1-score   support

           1       0.70      0.62      0.66      3118
           2       0.58      0.66      0.62      2414

    accuracy                           0.64      5532
   macro avg       0.64      0.64      0.64      5532
weighted avg       0.65      0.64      0.64      5532



In [44]:
lr.predict_proba(X_test)

array([[0.68959409, 0.31040591],
       [0.75103607, 0.24896393],
       [0.7781362 , 0.2218638 ],
       ...,
       [0.75430408, 0.24569592],
       [0.65041495, 0.34958505],
       [0.53080169, 0.46919831]])