In [1]:
import pandas as pd
from pymongo import MongoClient
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from decimal import Decimal
import odds_data_getter as odg


In [2]:
teams_df = pd.read_json('/Users/ccuulinay/github_proj/scrapy_proj/nba_odds_spider/lab/collection_backup/all_teams.json')

In [3]:
teams_df.drop(['TEAM_CITY', 'TEAM_NAME'],axis=1, inplace=True)

In [4]:
advanced_stat_df = pd.read_json('./advanced_data_slice_201702.json')

In [5]:
def read_n_preprocess_ah_df():
    ah_df = odg.read_mongo_data_to_dataframe()
    ah_df = ah_df[ah_df.away_team != 'Team USA']
    ah_df = ah_df[ah_df.away_team != 'West']
    ah_df = ah_df[ah_df.home_team != 'Team World']
    ah_df = ah_df[ah_df.away_team != 'EAST']
    
    ah_df = ah_df.merge(teams_df, how='left', left_on='away_team', right_on='FULL_TEAM_NAME')
    ah_df.drop(['FULL_TEAM_NAME'],axis=1, inplace=True)
    ah_df.rename(columns={'TEAM_ID': 'away_team_id'}, inplace=True)
    ah_df = ah_df.merge(teams_df, how='left', left_on='home_team', right_on='FULL_TEAM_NAME')
    ah_df.drop(['FULL_TEAM_NAME'],axis=1, inplace=True)
    ah_df.rename(columns={'TEAM_ID': 'home_team_id'}, inplace=True)
    
    ah_df['winner'] = np.where(ah_df['score_home'] - ah_df['score_away'] > 0, 'home', 'away')
    ah_df['ot'] = ah_df['overtime'].apply(lambda x: 1 if x == True else 0)
    ah_df['home_win'] = ah_df['winner'].apply(lambda x: 1 if x == 'home' else 0)

    ah_df['date_time_DT'] = pd.to_datetime(ah_df['date_time'])
    ah_df['year'] = ah_df['date_time_DT'].dt.year.astype(int)
    ah_df['month'] = ah_df['date_time_DT'].dt.month.astype(int)
    ah_df['day'] = ah_df['date_time_DT'].dt.day.astype(int)
    ah_df['weekDay'] = ah_df['date_time_DT'].dt.dayofweek.astype(int)

    for i in range(1, 5):
        ah_df["new_odd_home_" + str(i)] = ah_df["odd_home_" + str(i)].apply(lambda x: odg.to_decimal(x))
        ah_df["new_odd_away_" + str(i)] = ah_df["odd_away_" + str(i)].apply(lambda x: odg.to_decimal(x))
        ah_df["new_odd_home_" + str(i)] = ah_df["new_odd_home_" + str(i)].astype(float)
        ah_df["new_odd_away_" + str(i)] = ah_df["new_odd_away_" + str(i)].astype(float)

    ah_df.drop(
        ['odd_home_1', 'odd_home_2', 'odd_home_3', 'odd_home_4', 'odd_away_1', 'odd_away_2', 'odd_away_3',
         'odd_away_4'],
        axis=1, inplace=True)
    
    one_hot = pd.get_dummies(ah_df[['away_team', 'home_team']], prefix=['away_team_', 'home_team_'])
    ah_df = ah_df.join(one_hot)
    ah_df.drop(['away_team', 'home_team'], axis=1, inplace=True)
    ah_df.drop(['date_time', 'date_time_DT', 'winner', 'overtime'], axis=1, inplace=True)

    ah_df['total_score'] = ah_df['score_home'] + ah_df['score_away']
    ah_df['sub_score'] = ah_df['score_home'] - ah_df['score_away']
    
    # ah_df['bet_winner'] = np.where(float(row['sub_score']) + float(row['ah_'+str(num)]) >= 0, 'home', 'away')

    ah_df.rename(columns=lambda x: x.replace(' ', '_'), inplace=True)
    return ah_df

In [6]:
ah_df = read_n_preprocess_ah_df()

In [7]:
COLUMNS = ['away_team__Atlanta_Hawks',
           'away_team__Boston_Celtics',
           'away_team__Brooklyn_Nets',
           'away_team__Charlotte_Hornets',
           'away_team__Chicago_Bulls',
           'away_team__Cleveland_Cavaliers',
           'away_team__Dallas_Mavericks',
           'away_team__Denver_Nuggets',
           'away_team__Detroit_Pistons',
           'away_team__Golden_State_Warriors',
           'away_team__Houston_Rockets',
           'away_team__Indiana_Pacers',
           'away_team__Los_Angeles_Clippers',
           'away_team__Los_Angeles_Lakers',
           'away_team__Memphis_Grizzlies',
           'away_team__Miami_Heat',
           'away_team__Milwaukee_Bucks',
           'away_team__Minnesota_Timberwolves',
           'away_team__New_Orleans_Pelicans',
           'away_team__New_York_Knicks',
           'away_team__Oklahoma_City_Thunder',
           'away_team__Orlando_Magic',
           'away_team__Philadelphia_76ers',
           'away_team__Phoenix_Suns',
           'away_team__Portland_Trail_Blazers',
           'away_team__Sacramento_Kings',
           'away_team__San_Antonio_Spurs',
           'away_team__Toronto_Raptors',
           'away_team__Utah_Jazz',
           'away_team__Washington_Wizards',
           'home_team__Atlanta_Hawks',
           'home_team__Boston_Celtics',
           'home_team__Brooklyn_Nets',
           'home_team__Charlotte_Hornets',
           'home_team__Chicago_Bulls',
           'home_team__Cleveland_Cavaliers',
           'home_team__Dallas_Mavericks',
           'home_team__Denver_Nuggets',
           'home_team__Detroit_Pistons',
           'home_team__Golden_State_Warriors',
           'home_team__Houston_Rockets',
           'home_team__Indiana_Pacers',
           'home_team__Los_Angeles_Clippers',
           'home_team__Los_Angeles_Lakers',
           'home_team__Memphis_Grizzlies',
           'home_team__Miami_Heat',
           'home_team__Milwaukee_Bucks',
           'home_team__Minnesota_Timberwolves',
           'home_team__New_Orleans_Pelicans',
           'home_team__New_York_Knicks',
           'home_team__Oklahoma_City_Thunder',
           'home_team__Orlando_Magic',
           'home_team__Philadelphia_76ers',
           'home_team__Phoenix_Suns',
           'home_team__Portland_Trail_Blazers',
           'home_team__Sacramento_Kings',
           'home_team__San_Antonio_Spurs',
           'home_team__Toronto_Raptors',
           'home_team__Utah_Jazz',
           'home_team__Washington_Wizards',
           'away_team_id',
           'home_team_id',
           'ah_1',
#            'odd_cnt_1',
           'new_odd_home_1',
           'new_odd_away_1',
#            'ah_2',
#            'odd_cnt_2',
#            'new_odd_home_2',
#            'new_odd_away_2',
#            'ah_3',
#            'odd_cnt_3',
#            'new_odd_home_3',
#            'new_odd_away_3',
#            'ah_4',
#            'odd_cnt_4',
#            'new_odd_home_4',
#            'new_odd_away_4',
           'year',
           'month',
           'day',
           'weekDay',
#            'ot',
           'home_win',
           'score_away',
           'score_home',
           'total_score',
           'sub_score']

FEATURES = COLUMNS[:-5]

In [8]:
ah_df = ah_df[COLUMNS]
home_win = ah_df['home_win']
sub_score = ah_df['sub_score']
ah_df.sort_values(by=['year', 'month','day'], ascending=[1, 1, 1], inplace=True)

In [9]:
ah_df.head()

Unnamed: 0,away_team__Atlanta_Hawks,away_team__Boston_Celtics,away_team__Brooklyn_Nets,away_team__Charlotte_Hornets,away_team__Chicago_Bulls,away_team__Cleveland_Cavaliers,away_team__Dallas_Mavericks,away_team__Denver_Nuggets,away_team__Detroit_Pistons,away_team__Golden_State_Warriors,...,new_odd_away_1,year,month,day,weekDay,home_win,score_away,score_home,total_score,sub_score
1185,0,0,0,0,0,0,0,0,0,0,...,1.89,2014,10,5,6,0,98,86,184,-12
1186,0,0,0,0,0,0,0,0,0,0,...,1.92,2014,10,6,0,1,94,99,193,5
1187,0,0,0,0,0,0,0,0,0,0,...,1.91,2014,10,7,1,1,78,98,176,20
1188,0,0,0,0,0,0,0,0,0,0,...,1.89,2014,10,7,1,1,87,93,180,6
1190,0,0,0,0,0,0,0,1,0,0,...,1.9,2014,10,7,1,1,95,98,193,3


In [11]:
result_df = pd.merge(advanced_stat_df, ah_df, left_index=True, right_index=True, how='inner')

In [17]:
COLUMNS = result_df.columns.tolist()
FEATURES = COLUMNS[:-5]

In [13]:
result_data = result_df.as_matrix()
result_data.shape

(3591, 152)

In [14]:
train_data = result_data[:,:-5]
train_label = result_data[:,-5:].astype(int)

In [42]:
from sklearn.ensemble import RandomForestRegressor
x_train, x_test, y_train, y_test = train_test_split(train_data,train_label[:,4], test_size=0.3)
# clf = RandomForestClassifier(n_estimators=200, criterion='entropy', max_depth=4)

clf = Pipeline([
        ('ss', StandardScaler()),
        ('DTC', RandomForestRegressor(n_estimators=200, criterion='mse', max_depth=4))])
rf_clf = clf.fit(x_train, y_train)
y_hat = rf_clf.predict(x_test)

In [43]:
payout_df = pd.DataFrame(x_test, columns=FEATURES)
payout_df['predict_subscore'] = y_hat
payout_df['home_win'] = home_win
payout_df['sub_score'] = sub_score
# payout_df.head()
payout_df['payout_1'] = payout_df.apply(lambda x : odg.calculate_payout(x), axis=1)
# payout_df['payout_2'] = payout_df.apply(lambda x : calculate_payout(x, num=2), axis=1)
# payout_df['payout_3'] = payout_df.apply(lambda x : calculate_payout(x, num=3), axis=1)
# payout_df['payout_4'] = payout_df.apply(lambda x : calculate_payout(x, num=4), axis=1)
payout_df.payout_1.sum() # ,payout_df.payout_2.sum(),payout_df.payout_3.sum(),payout_df.payout_4.sum()

67.479935695607438

In [40]:
# Try XGBoost on subscore
import xgboost as xgb
x_train, x_test, y_train, y_test = train_test_split(train_data,train_label[:,4], test_size=0.3)

data_train = xgb.DMatrix(x_train, label=y_train)
data_test = xgb.DMatrix(x_test, label=y_test)
watch_list = [(data_test, 'eval'), (data_train, 'train')]
param = {'max_depth': 6, 'eta': 1, 'silent': 1, 'objective': 'reg:linear'}
bst = xgb.train(param, data_train, num_boost_round=6, evals=watch_list)
y_hat = bst.predict(data_test)

[0]	eval-rmse:12.7101	train-rmse:11.0091
[1]	eval-rmse:13.1057	train-rmse:10.3104
[2]	eval-rmse:13.3924	train-rmse:9.69237
[3]	eval-rmse:13.6263	train-rmse:9.18781
[4]	eval-rmse:13.8232	train-rmse:8.83794
[5]	eval-rmse:13.9848	train-rmse:8.51979


In [41]:
payout_df = pd.DataFrame(x_test, columns=FEATURES)
payout_df['predict_subscore'] = y_hat
payout_df['home_win'] = home_win
payout_df['sub_score'] = sub_score
payout_df.head()
payout_df['payout_1'] = payout_df.apply(lambda x : odg.calculate_payout(x), axis=1)
#payout_df['payout_2'] = payout_df.apply(lambda x : calculate_payout(x, num=2), axis=1)
#payout_df['payout_3'] = payout_df.apply(lambda x : calculate_payout(x, num=3), axis=1)
#payout_df['payout_4'] = payout_df.apply(lambda x : calculate_payout(x, num=4), axis=1)
payout_df.payout_1.sum()#, payout_df.payout_2.sum(),payout_df.payout_3.sum(),payout_df.payout_4.sum()

-47.999460566455255

In [47]:
# Build a linear regressor for subscore
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso, Ridge
from sklearn.model_selection import GridSearchCV

x_train, x_test, y_train, y_test = train_test_split(train_data,train_label[:,4], test_size=0.3)
# clf = RandomForestClassifier(n_estimators=200, criterion='entropy', max_depth=4)

linreg = LinearRegression()
lr_model = linreg.fit(x_train, y_train)
y_hat = linreg.predict(x_test)

In [48]:
payout_df = pd.DataFrame(x_test, columns=FEATURES)
payout_df['predict_subscore'] = y_hat
payout_df['home_win'] = home_win
payout_df['sub_score'] = sub_score
payout_df.head()
payout_df['payout_1'] = payout_df.apply(lambda x : odg.calculate_payout(x), axis=1)
# payout_df['payout_2'] = payout_df.apply(lambda x : calculate_payout(x, num=2), axis=1)
# payout_df['payout_3'] = payout_df.apply(lambda x : calculate_payout(x, num=3), axis=1)
# payout_df['payout_4'] = payout_df.apply(lambda x : calculate_payout(x, num=4), axis=1)
payout_df.payout_1.sum()# , payout_df.payout_2.sum(),payout_df.payout_3.sum(),payout_df.payout_4.sum()

27.660216624571657