In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report


In [2]:
# Load the raw game table from a CSV file in the working directory.
df = pd.read_csv('nba_elo.csv')
# Show (rows, columns) as a quick sanity check on the load.
df.shape

(72039, 27)

In [3]:
# Print per-column non-null counts and dtypes to spot sparsely populated columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72039 entries, 0 to 72038
Data columns (total 27 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   date            72039 non-null  object 
 1   season          72039 non-null  int64  
 2   neutral         72039 non-null  int64  
 3   playoff         4626 non-null   object 
 4   team1           72039 non-null  object 
 5   team2           72039 non-null  object 
 6   elo1_pre        72039 non-null  float64
 7   elo2_pre        72039 non-null  float64
 8   elo_prob1       72039 non-null  float64
 9   elo_prob2       72039 non-null  float64
 10  elo1_post       72026 non-null  float64
 11  elo2_post       72026 non-null  float64
 12  carm-elo1_pre   5249 non-null   float64
 13  carm-elo2_pre   5249 non-null   float64
 14  carm-elo_prob1  5249 non-null   float64
 15  carm-elo_prob2  5249 non-null   float64
 16  carm-elo1_post  5249 non-null   float64
 17  carm-elo2_post  5249 non-null  

In [4]:
# Discard rows that have no value in 'score1'.
df = df.dropna(subset=['score1'])

The league's scoring system changed in 2010, so only games played after 2010-01-01 are kept.

In [5]:
# Parse the string dates once; the parsed values replace the raw 'date' column.
as_datetime = pd.to_datetime(df['date'])

# Only games strictly after this instant are kept.
date_bound = pd.Timestamp('2010-01-01 00:00:00')

# Add the parsed column and drop the raw string column in a single step
# (the previous stand-alone `df.drop('date', axis=1)` discarded its result).
df = df.assign(date_as_dt=as_datetime).drop('date', axis=1)

df = df[df['date_as_dt'] > date_bound]

# Renumber rows 0..n-1 without keeping the old index as a column.
df = df.reset_index(drop=True)

In [6]:
# Show (rows, columns) remaining after the date filter.
df.shape

(16034, 27)

dropping duplicates in case there are any

In [7]:
# Remove fully identical rows; note the index is not renumbered by this call.
df = df.drop_duplicates()

dropping features we don't want / need

In [8]:
# List the available columns before choosing which ones to drop.
df.columns

Index(['season', 'neutral', 'playoff', 'team1', 'team2', 'elo1_pre',
       'elo2_pre', 'elo_prob1', 'elo_prob2', 'elo1_post', 'elo2_post',
       'carm-elo1_pre', 'carm-elo2_pre', 'carm-elo_prob1', 'carm-elo_prob2',
       'carm-elo1_post', 'carm-elo2_post', 'raptor1_pre', 'raptor2_pre',
       'raptor_prob1', 'raptor_prob2', 'score1', 'score2', 'quality',
       'importance', 'total_rating', 'date_as_dt'],
      dtype='object')

In [9]:
# Columns removed before modelling, grouped by where they come from.
feats_to_drop = [
    # game context
    'season', 'neutral',
    # Elo win probabilities and post-game Elo ratings
    'elo_prob1', 'elo_prob2', 'elo1_post', 'elo2_post',
    # CARM-Elo columns
    'carm-elo1_pre', 'carm-elo2_pre',
    'carm-elo_prob1', 'carm-elo_prob2',
    'carm-elo1_post', 'carm-elo2_post',
    # RAPTOR columns
    'raptor1_pre', 'raptor2_pre',
    'raptor_prob1', 'raptor_prob2',
    # game-level ratings
    'quality', 'importance', 'total_rating',
]

In [10]:
# Remove every column listed in `feats_to_drop`, then show the new shape.
df = df.drop(columns=feats_to_drop)
df.shape

(16034, 8)

Feature engineering the target variable: `team_won` is 1 when team1 won and 2 when team2 won.

In [11]:
# True where team2 outscored team1.
win_col_bool = df['score2'].gt(df['score1'])
# Encode the winner as 1 (team1) or 2 (team2); equal scores map to 1.
df['team_won'] = win_col_bool.astype('int') + 1
# The raw scores would leak the outcome, so they are removed once the label exists.
df = df.drop(columns=['score1', 'score2'])
df.shape

(16034, 7)

transforming 'playoff' feature into binary feature

In [12]:
# List the distinct values of 'playoff' (missing values included) before encoding it.
df['playoff'].unique()

array([nan, 't', 'q', 's', 'c', 'f', 'p'], dtype=object)

In [13]:
# 1 when 'playoff' holds any value, 0 when it is missing.
df['enc_playoff'] = (~df['playoff'].isna()).astype(int)

In [14]:
# The original column is replaced by its binary encoding, so it can go.
df = df.drop(columns='playoff')
df.shape

(16034, 7)

In [15]:
# Preview the first rows of the cleaned frame.
df.head()

Unnamed: 0,team1,team2,elo1_pre,elo2_pre,date_as_dt,team_won,enc_playoff
0,BOS,TOR,1660.5702,1496.2823,2010-01-02,1,0
1,IND,MIN,1374.8478,1304.9703,2010-01-02,1,0
2,NJN,CLE,1302.0889,1742.0925,2010-01-02,2,0
3,UTA,DEN,1560.92,1598.8054,2010-01-02,2,0
4,POR,GSW,1574.6274,1397.9839,2010-01-02,1,0


Creating a unique id per game now, so that rows can be matched back to their game later.

In [16]:
# Game id = team1 code + team2 code + game date rendered as text.
df['uid'] = df['team1'].str.cat([df['team2'], df['date_as_dt'].astype('str')])

Creating a mirrored copy of every game from the opposing team's perspective (teams, Elo ratings and result swapped).

In [17]:
# Build the mirrored view of every game: the two teams trade places, their
# pre-game Elo ratings follow them, and the winner label is flipped.
# `assign` evaluates every right-hand side against the original `df`, so the
# swap needs no temporary columns, and overwriting existing columns keeps the
# column order unchanged. 'uid', 'date_as_dt' and 'enc_playoff' are carried
# over as-is, so both views of a game share the same uid.
df2 = df.assign(
    team1=df['team2'],
    team2=df['team1'],
    elo1_pre=df['elo2_pre'],
    elo2_pre=df['elo1_pre'],
    # team_won only takes the values 1 and 2, so 3 - x flips 1 <-> 2.
    team_won=3 - df['team_won'],
)

In [18]:
# Stack the original and mirrored games. `ignore_index=True` renumbers the
# rows so that no index label occurs twice (without it every label 0..n-1
# would appear once per view, which makes `.loc` lookups and index-aligned
# operations ambiguous).
df = pd.concat([df, df2], axis=0, ignore_index=True)
df.shape

(32068, 8)

splitting data into train and test according to date

In [19]:
# Chronological split: rows dated before the bound train the model,
# every other row is held out for testing.
date_bound = pd.Timestamp('2018-01-01 00:00:00')
before_bound = df['date_as_dt'] < date_bound
train, test = df[before_bound], df[~before_bound]

train.shape, test.shape

((20668, 8), (11400, 8))

In [20]:
# Separate the label column from the predictors for each split.
X_train, y_train = train.drop(columns='team_won'), train['team_won']
X_test, y_test = test.drop(columns='team_won'), test['team_won']

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((20668, 7), (11400, 7), (20668,), (11400,))

In [21]:
# Share of all rows that landed in the train and test splits.
len(X_train) / len(df), len(X_test) / len(df)

(0.6445054259698142, 0.35549457403018586)

extracting uids for later

In [22]:
# Remember the test-set game ids, then remove the id column from both
# feature frames since it is an identifier rather than a predictor.
x_test_uids = X_test['uid']
X_train, X_test = X_train.drop(columns='uid'), X_test.drop(columns='uid')

label encoding team names

In [23]:
cols = ['team1', 'team2']
# Keep the raw team abbreviations as arrays before they are replaced by integer codes.
x_train_team_names, x_test_team_names = (
    frame[cols].to_numpy() for frame in (X_train, X_test)
)

In [24]:
le = LabelEncoder()
# Fit on the teams seen in either slot of the training games, so a team that
# only ever appears as team2 cannot make `transform` fail below.
le.fit(pd.concat([X_train['team1'], X_train['team2']]))

# Teams that never occur in the training data are collapsed into a single
# placeholder label (column-wise membership test instead of a per-row lambda).
known_teams = le.classes_
for col in cols:
    X_test[col] = X_test[col].where(X_test[col].isin(known_teams), '<unknown>')

# Register the placeholder after fitting so it receives the last integer code
# and the codes of the known teams stay as fitted.
# NOTE(review): this leaves `classes_` unsorted; confirm `transform` still
# resolves string labels correctly after any scikit-learn upgrade.
le.classes_ = np.append(le.classes_, '<unknown>')

for col in cols:
    X_train['enc_' + col] = le.transform(X_train[col])
    X_test['enc_' + col] = le.transform(X_test[col])

# Only the integer codes are kept as model inputs.
X_train = X_train.drop(cols, axis=1)
X_test = X_test.drop(cols, axis=1)

In [25]:
# Preview the training features after label encoding.
X_train.head()

Unnamed: 0,elo1_pre,elo2_pre,date_as_dt,enc_playoff,enc_team1,enc_team2
0,1660.5702,1496.2823,2010-01-02,0,1,28
1,1374.8478,1304.9703,2010-01-02,0,11,17
2,1302.0889,1742.0925,2010-01-02,0,18,5
3,1560.92,1598.8054,2010-01-02,0,29,7
4,1574.6274,1397.9839,2010-01-02,0,25,9


Turning the datetime into a number (seconds since the Unix epoch) so that it can be fed to the model.

In [26]:
# Keep the original datetimes as arrays before the column is overwritten.
x_train_dates = X_train['date_as_dt'].values
x_test_dates = X_test['date_as_dt'].values

# Convert to float seconds since the Unix epoch with column-wise arithmetic
# (no per-row Python call, and independent of the datetime storage unit).
epoch = pd.Timestamp('1970-01-01')
one_second = pd.Timedelta(seconds=1)
X_train['date_as_dt'] = (X_train['date_as_dt'] - epoch) / one_second
X_test['date_as_dt'] = (X_test['date_as_dt'] - epoch) / one_second

In [27]:
# Learn the scaling parameters from the training features only, then apply
# the same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_ss, X_test_ss = scaler.transform(X_train), scaler.transform(X_test)

# Wrap the scaled arrays back into frames with the original column names;
# the rebuilt frames get a fresh 0..n-1 index.
X_train = pd.DataFrame(X_train_ss, columns=X_train.columns)
X_test = pd.DataFrame(X_test_ss, columns=X_test.columns)
X_train.shape, X_test.shape

((20668, 6), (11400, 6))

Baseline model 1: logistic regression on the scaled features

In [28]:
# LogisticRegression is already imported in the notebook's import cell, so
# it is not re-imported here. Fit with default hyper-parameters on the
# scaled training features.
lr = LogisticRegression()
lr.fit(X_train, y_train)

LogisticRegression()

In [29]:
# Predict the winner label for every held-out row.
y_pred = lr.predict(X_test)

print(classification_report(y_test, y_pred))
# Number of misclassified test rows.
(y_pred != y_test).sum()

              precision    recall  f1-score   support

           1       0.64      0.64      0.64      5700
           2       0.64      0.64      0.64      5700

    accuracy                           0.64     11400
   macro avg       0.64      0.64      0.64     11400
weighted avg       0.64      0.64      0.64     11400



4086

In [30]:
# Per-row class probabilities, one column per class.
# NOTE(review): column order presumably follows lr.classes_ (label 1, then 2) — confirm.
proba = lr.predict_proba(X_test)
proba

array([[0.62889229, 0.37110771],
       [0.65038663, 0.34961337],
       [0.72586362, 0.27413638],
       ...,
       [0.34118306, 0.65881694],
       [0.47949783, 0.52050217],
       [0.61227499, 0.38772501]])

In [31]:
# Attach predictions, class probabilities (as percentages), game ids and the
# raw team names to the test features.
# NOTE: this adds non-feature columns to X_test, so the prediction cells
# above cannot be re-run on it without rebuilding X_test first.
X_test['y_pred'] = y_pred
X_test[['prob1', 'prob2']] = proba * 100
X_test['uid'] = x_test_uids.values
X_test[['team1', 'team2']] = x_test_team_names

# Hand pandas the path instead of a text-mode handle: a handle opened without
# newline='' produces blank lines between rows on Windows.
X_test.to_csv('initial_preds.csv', index=False)


Baseline model 2: predicting the team with the higher pre-game Elo rating

In [32]:
# Compare the raw pre-game Elo ratings kept in `test`. The Elo columns in
# X_test were standardised column by column, so comparing them is only valid
# while both columns happen to share the same mean and scale.
raw_elo1 = test['elo1_pre'].to_numpy()
raw_elo2 = test['elo2_pre'].to_numpy()

# 1 where team2 has the higher rating, 0 otherwise (equal ratings favour team1).
elo_col_bool = pd.Series((raw_elo1 < raw_elo2).astype(int))
# Same 1/2 winner encoding as `team_won`.
results = elo_col_bool + 1

# Accuracy of the baseline; both sides are compared position by position.
(results == y_test.reset_index(drop=True)).mean()

0.6382456140350877

In [33]:
# Precision/recall/F1 of the Elo baseline, for comparison with the logistic regression report.
print(classification_report(y_test, results))

              precision    recall  f1-score   support

           1       0.64      0.64      0.64      5700
           2       0.64      0.64      0.64      5700

    accuracy                           0.64     11400
   macro avg       0.64      0.64      0.64     11400
weighted avg       0.64      0.64      0.64     11400

