In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [2]:
games_df = pd.read_csv('final_games.csv')
features = ['HOME_TEAM','AWAY_TEAM','HOME_W_PCT','HOME_FGM_RANK','HOME_FG3M_RANK','HOME_PLUS_MINUS_RANK','HOME_REB_RANK','HOME_AST_RANK','HOME_TOV_RANK','HOME_STL_RANK','HOME_PF_RANK','HOME_BLK_RANK','HOME_PTS_RANK','HOME_DivisionRank','HOME_HOME_WINS','HOME_ROAD_WINS','HOME_CurrentHomeStreak','HOME_CurrentRoadStreak','HOME_CurrentStreak','HOME_HighValueInjured','AWAY_W_PCT','AWAY_FGM_RANK','AWAY_FG3M_RANK','AWAY_PLUS_MINUS_RANK','AWAY_REB_RANK','AWAY_AST_RANK','AWAY_TOV_RANK','AWAY_STL_RANK','AWAY_PF_RANK','AWAY_BLK_RANK','AWAY_PTS_RANK','AWAY_DivisionRank','AWAY_HOME_WINS','AWAY_ROAD_WINS','AWAY_CurrentHomeStreak','AWAY_CurrentRoadStreak','AWAY_CurrentStreak','AWAY_HighValueInjured']
games_df[features].head()

Unnamed: 0,HOME_TEAM,AWAY_TEAM,HOME_W_PCT,HOME_FGM_RANK,HOME_FG3M_RANK,HOME_PLUS_MINUS_RANK,HOME_REB_RANK,HOME_AST_RANK,HOME_TOV_RANK,HOME_STL_RANK,...,AWAY_PF_RANK,AWAY_BLK_RANK,AWAY_PTS_RANK,AWAY_DivisionRank,AWAY_HOME_WINS,AWAY_ROAD_WINS,AWAY_CurrentHomeStreak,AWAY_CurrentRoadStreak,AWAY_CurrentStreak,AWAY_HighValueInjured
0,MIN,UTA,0.598,19,5,4,15,17,18,18,...,18,24,21,5,10,7,-1,-21,-2,0
1,CLE,IND,0.78,4,2,2,6,9,3,14,...,15,6,7,2,29,20,-1,2,1,0
2,BKN,NYK,0.317,30,13,26,29,24,23,20,...,4,29,9,2,27,24,-2,1,1,0
3,SAS,TOR,0.415,14,9,21,19,6,10,12,...,30,28,23,3,18,12,1,-2,-2,0
4,MIA,WAS,0.451,24,12,15,21,14,8,15,...,24,11,27,5,8,10,-2,1,1,0


In [3]:
X,y = games_df.drop(columns=['GAME_ID','GAME_DATE','HOME_W','HOME_L','AWAY_W','AWAY_L','HOME_WIN'],axis=1),games_df['HOME_WIN']
cats = X.select_dtypes(exclude=np.number).columns.tolist()
for col in cats:
    X[col] = X[col].astype('category')
X.columns

Index(['HOME_TEAM', 'AWAY_TEAM', 'HOME_W_PCT', 'HOME_FGM_RANK',
       'HOME_FG3M_RANK', 'HOME_PLUS_MINUS_RANK', 'HOME_REB_RANK',
       'HOME_AST_RANK', 'HOME_TOV_RANK', 'HOME_STL_RANK', 'HOME_BLK_RANK',
       'HOME_PF_RANK', 'HOME_PTS_RANK', 'HOME_DivisionRank', 'HOME_HOME_WINS',
       'HOME_ROAD_WINS', 'HOME_CurrentHomeStreak', 'HOME_CurrentRoadStreak',
       'HOME_CurrentStreak', 'HOME_HighValueInjured', 'AWAY_W_PCT',
       'AWAY_FGM_RANK', 'AWAY_FG3M_RANK', 'AWAY_PLUS_MINUS_RANK',
       'AWAY_REB_RANK', 'AWAY_AST_RANK', 'AWAY_TOV_RANK', 'AWAY_STL_RANK',
       'AWAY_BLK_RANK', 'AWAY_PF_RANK', 'AWAY_PTS_RANK', 'AWAY_DivisionRank',
       'AWAY_HOME_WINS', 'AWAY_ROAD_WINS', 'AWAY_CurrentHomeStreak',
       'AWAY_CurrentRoadStreak', 'AWAY_CurrentStreak',
       'AWAY_HighValueInjured'],
      dtype='object')

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=7)

In [5]:
dtrain_reg = xgb.DMatrix(X_train, y_train,enable_categorical=True)
dtest_reg = xgb.DMatrix(X_test, y_test,enable_categorical=True)

In [6]:
params = {
"objective": "binary:logistic",
"eval_metric":"logloss",
"tree_method":"hist",
"eta":0.01,
"max_depth":4,
"subsample":0.4,
"colsample_bytree":0.4,
"lambda":8.2,
"alpha":2,
"seed":1
}

In [7]:
n=1000
evals = [(dtrain_reg,"train"),(dtest_reg,"validation")]
model = xgb.train(
    params=params,
    dtrain=dtrain_reg,
    num_boost_round=n,
    evals=evals,
    early_stopping_rounds=50,

)

[0]	train-logloss:0.68762	validation-logloss:0.68964
[1]	train-logloss:0.68659	validation-logloss:0.68875
[2]	train-logloss:0.68550	validation-logloss:0.68773
[3]	train-logloss:0.68453	validation-logloss:0.68683
[4]	train-logloss:0.68349	validation-logloss:0.68599
[5]	train-logloss:0.68241	validation-logloss:0.68522
[6]	train-logloss:0.68135	validation-logloss:0.68418
[7]	train-logloss:0.68015	validation-logloss:0.68307
[8]	train-logloss:0.67898	validation-logloss:0.68197
[9]	train-logloss:0.67782	validation-logloss:0.68108
[10]	train-logloss:0.67721	validation-logloss:0.68044
[11]	train-logloss:0.67596	validation-logloss:0.67922
[12]	train-logloss:0.67483	validation-logloss:0.67806
[13]	train-logloss:0.67394	validation-logloss:0.67718
[14]	train-logloss:0.67283	validation-logloss:0.67639
[15]	train-logloss:0.67162	validation-logloss:0.67527
[16]	train-logloss:0.67081	validation-logloss:0.67475
[17]	train-logloss:0.66968	validation-logloss:0.67344
[18]	train-logloss:0.66881	validation-

In [8]:
preds = model.predict(dtest_reg)
rmse = mean_squared_error(y_test,preds)
rmse

0.1981433480978012

In [150]:
model.save_model('predictor.json')