In [1]:
import pandas as pd
from pymongo import MongoClient
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline


MONGODB_SERVER = "localhost"
MONGODB_PORT = 27017

MONGODB_DB = "nba_odds_n_predict"
MONGODB_COLLECTION = "games_ah_odds"

In [18]:
def _connect_to_mongo(host, port, username, password, db):
    """ A util for making a connection to mongo """

    if username and password:
        mongo_uri = 'mongodb://%s:%s@%s:%s/%s' % (username, password, host, port, db)
        conn = MongoClient(mongo_uri)
    else:
        conn = MongoClient(host, port)

    return conn[db]


def read_mongo_data_to_dataframe(db=MONGODB_DB, collection=MONGODB_COLLECTION, query={}, host=MONGODB_SERVER, port=MONGODB_PORT, username=None, password=None, no_id=True):
    """ Read from Mongo and Store into DataFrame """

    # Connect to MongoDB
    db = _connect_to_mongo(host=host, port=port, username=username, password=password, db=db)

    # Make a query to the specific DB and Collection
    cursor = db[collection].find(query)

    # Expand the cursor and construct the DataFrame
    df = pd.DataFrame(list(cursor))

    # Delete the _id
    if no_id:
        del df['_id']

    return df

def map_team_to_id(teams_df, team_name):
    return teams_df[teams_df.FULL_TEAM_NAME == team_name].TEAM_ID.values[0]

def fraction2decimal(f):
    if f.find("/") != -1:
        return float(Decimal(f.split('/')[0]) / Decimal(f.split('/')[1])) + 1
    else:
        return f


def american2decimal(a):
    if a.find("+") != -1:
        return (float(a)/100) + 1
    elif a.find("-") != -1:
        return (100/abs(float(a))) + 1
    else:
        return a


def to_decimal(x):
    if x.find(".") != -1:
        return x
    elif x.find("/") != -1:
        return fraction2decimal(x)
    else:
        return american2decimal(x)


def calculate_payout(row, num=1):
    if float(row['sub_score']) >= (-1) * float(row['ah_'+str(num)]):
        home_win_with_ah = 1.0
    else:
        home_win_with_ah = 0.0
    if float(row['predict_subscore']) >= (-1) * float(row['ah_'+str(num)]):
        return (float(row['new_odd_home_'+str(num)]) * home_win_with_ah) - 1.0
    else:
        return (float(row['new_odd_away_'+str(num)]) * (1.0 - home_win_with_ah)) - 1.0


In [116]:
teams_df = pd.read_json('/Users/ccuulinay/github_proj/scrapy_proj/nba_odds_spider/lab/collection_backup/all_teams.json')
ah_df = read_mongo_data_to_dataframe()

In [117]:
ah_df.head()

Unnamed: 0,ah_1,ah_2,ah_3,ah_4,away_team,date_time,home_team,odd_away_1,odd_away_2,odd_away_3,...,odd_cnt_2,odd_cnt_3,odd_cnt_4,odd_home_1,odd_home_2,odd_home_3,odd_home_4,overtime,score_away,score_home
0,-5.0,-4.5,-6.0,-5.5,Miami Heat,2015-03-07 02:00:00,Washington Wizards,1.98,1.96,1.91,...,10,8,7,1.86,1.88,1.95,1.89,False,97,99
1,-7.0,-7.5,-6.5,-10.5,Milwaukee Bucks,2015-03-13 01:00:00,Indiana Pacers,1.92,1.83,1.98,...,7,7,4,1.94,2.00,1.86,2.45,True,103,109
2,9.0,8.5,9.5,6.5,Los Angeles Clippers,2015-03-19 04:00:00,Sacramento Kings,1.94,1.86,1.98,...,7,7,4,1.92,1.97,1.83,2.28,False,116,105
3,14.5,15.0,13.5,15.5,Los Angeles Clippers,2015-03-26 01:00:00,New York Knicks,17/20,93/100,7/10,...,8,4,4,97/100,47/50,23/20,43/50,False,111,80
4,3.0,3.5,2.5,2.0,Brooklyn Nets,2015-02-21 05:30:00,Los Angeles Lakers,1.92,1.94,1.83,...,8,5,4,1.94,1.87,2.03,2.12,False,114,105


In [118]:
ah_df = ah_df[ah_df.away_team != 'Team USA']
ah_df = ah_df[ah_df.away_team != 'West']
ah_df = ah_df[ah_df.home_team != 'Team World']
ah_df = ah_df[ah_df.away_team != 'EAST']
ah_df['winner'] = np.where(ah_df['score_home'] - ah_df['score_away']>0, 'home','away')
ah_df['ot'] = ah_df['overtime'].apply(lambda x : 1 if x == True else 0)
ah_df['home_win'] = ah_df['winner'].apply(lambda x : 1 if x == 'home' else 0)

ah_df['date_time_DT'] = pd.to_datetime(ah_df['date_time'])
ah_df['year'] = ah_df['date_time_DT'].dt.year.astype(str)
ah_df['month'] = ah_df['date_time_DT'].dt.month.astype(str)
ah_df['day'] = ah_df['date_time_DT'].dt.day.astype(str)
ah_df['weekDay'] = ah_df['date_time_DT'].dt.dayofweek.astype(str)

In [119]:
from fractions import Fraction
from decimal import Decimal

# ah_df["new_odd_home_1"]= ah_df.odd_home_1.apply(lambda x: to_decimal(x))
# ah_df["new_odd_home_2"]= ah_df.odd_home_2.apply(lambda x: to_decimal(x))
# ah_df["new_odd_home_3"]= ah_df.odd_home_3.apply(lambda x: to_decimal(x))
# ah_df["new_odd_home_4"]= ah_df.odd_home_4.apply(lambda x: to_decimal(x))
# ah_df["new_odd_away_1"]= ah_df.odd_away_1.apply(lambda x: to_decimal(x))
# ah_df["new_odd_away_2"]= ah_df.odd_away_2.apply(lambda x: to_decimal(x))
# ah_df["new_odd_away_3"]= ah_df.odd_away_3.apply(lambda x: to_decimal(x))
# ah_df["new_odd_away_4"]= ah_df.odd_away_4.apply(lambda x: to_decimal(x))

for i in range(1, 5):
    ah_df["new_odd_home_" + str(i)] = ah_df["odd_home_" + str(i)].apply(lambda x: to_decimal(x))
    ah_df["new_odd_away_" + str(i)] = ah_df["odd_away_" + str(i)].apply(lambda x: to_decimal(x))
    ah_df["new_odd_home_" + str(i)] = ah_df["new_odd_home_" + str(i)].astype(float)
    ah_df["new_odd_away_" + str(i)] = ah_df["new_odd_away_" + str(i)].astype(float)

In [120]:
ah_df.head()

Unnamed: 0,ah_1,ah_2,ah_3,ah_4,away_team,date_time,home_team,odd_away_1,odd_away_2,odd_away_3,...,day,weekDay,new_odd_home_1,new_odd_away_1,new_odd_home_2,new_odd_away_2,new_odd_home_3,new_odd_away_3,new_odd_home_4,new_odd_away_4
0,-5.0,-4.5,-6.0,-5.5,Miami Heat,2015-03-07 02:00:00,Washington Wizards,1.98,1.96,1.91,...,7,5,1.86,1.98,1.88,1.96,1.95,1.91,1.89,1.93
1,-7.0,-7.5,-6.5,-10.5,Milwaukee Bucks,2015-03-13 01:00:00,Indiana Pacers,1.92,1.83,1.98,...,13,4,1.94,1.92,2.0,1.83,1.86,1.98,2.45,1.51
2,9.0,8.5,9.5,6.5,Los Angeles Clippers,2015-03-19 04:00:00,Sacramento Kings,1.94,1.86,1.98,...,19,3,1.92,1.94,1.97,1.86,1.83,1.98,2.28,1.6
3,14.5,15.0,13.5,15.5,Los Angeles Clippers,2015-03-26 01:00:00,New York Knicks,17/20,93/100,7/10,...,26,3,1.97,1.85,1.94,1.93,2.15,1.7,1.86,1.95
4,3.0,3.5,2.5,2.0,Brooklyn Nets,2015-02-21 05:30:00,Los Angeles Lakers,1.92,1.94,1.83,...,21,5,1.94,1.92,1.87,1.94,2.03,1.83,2.12,1.75


In [121]:
# ah_df.drop("odd_home_1", axis=1, inplace=True)
# ah_df.drop("odd_home_2", axis=1, inplace=True)
# ah_df.drop("odd_home_3", axis=1, inplace=True)
# ah_df.drop("odd_home_4", axis=1, inplace=True)
# ah_df.drop("odd_away_1", axis=1, inplace=True)
# ah_df.drop("odd_away_2", axis=1, inplace=True)
# ah_df.drop("odd_away_3", axis=1, inplace=True)
# ah_df.drop("odd_away_4", axis=1, inplace=True)
ah_df.drop(['odd_home_1', 'odd_home_2', 'odd_home_3', 'odd_home_4', 'odd_away_1', 'odd_away_2', 'odd_away_3', 'odd_away_4'], axis=1, inplace=True)

In [122]:
ah_df.head()

Unnamed: 0,ah_1,ah_2,ah_3,ah_4,away_team,date_time,home_team,odd_cnt_1,odd_cnt_2,odd_cnt_3,...,day,weekDay,new_odd_home_1,new_odd_away_1,new_odd_home_2,new_odd_away_2,new_odd_home_3,new_odd_away_3,new_odd_home_4,new_odd_away_4
0,-5.0,-4.5,-6.0,-5.5,Miami Heat,2015-03-07 02:00:00,Washington Wizards,10,10,8,...,7,5,1.86,1.98,1.88,1.96,1.95,1.91,1.89,1.93
1,-7.0,-7.5,-6.5,-10.5,Milwaukee Bucks,2015-03-13 01:00:00,Indiana Pacers,9,7,7,...,13,4,1.94,1.92,2.0,1.83,1.86,1.98,2.45,1.51
2,9.0,8.5,9.5,6.5,Los Angeles Clippers,2015-03-19 04:00:00,Sacramento Kings,9,7,7,...,19,3,1.92,1.94,1.97,1.86,1.83,1.98,2.28,1.6
3,14.5,15.0,13.5,15.5,Los Angeles Clippers,2015-03-26 01:00:00,New York Knicks,9,8,4,...,26,3,1.97,1.85,1.94,1.93,2.15,1.7,1.86,1.95
4,3.0,3.5,2.5,2.0,Brooklyn Nets,2015-02-21 05:30:00,Los Angeles Lakers,10,8,5,...,21,5,1.94,1.92,1.87,1.94,2.03,1.83,2.12,1.75


In [123]:
one_hot = pd.get_dummies(ah_df[['away_team','home_team']], prefix=['away_team_', 'home_team_'])

In [124]:
ah_df = ah_df.join(one_hot)
ah_df.drop(['away_team','home_team'], axis=1, inplace=True)

In [125]:
ah_df.drop(['date_time', 'date_time_DT', 'winner', 'overtime'], axis=1, inplace=True)

In [126]:
ah_df['total_score'] = ah_df['score_home'] + ah_df['score_away']
ah_df['sub_score'] = ah_df['score_home'] - ah_df['score_away']

In [127]:
ah_df.head()

Unnamed: 0,ah_1,ah_2,ah_3,ah_4,odd_cnt_1,odd_cnt_2,odd_cnt_3,odd_cnt_4,score_away,score_home,...,home_team__Philadelphia 76ers,home_team__Phoenix Suns,home_team__Portland Trail Blazers,home_team__Sacramento Kings,home_team__San Antonio Spurs,home_team__Toronto Raptors,home_team__Utah Jazz,home_team__Washington Wizards,total_score,sub_score
0,-5.0,-4.5,-6.0,-5.5,10,10,8,7,97,99,...,0,0,0,0,0,0,0,1,196,2
1,-7.0,-7.5,-6.5,-10.5,9,7,7,4,103,109,...,0,0,0,0,0,0,0,0,212,6
2,9.0,8.5,9.5,6.5,9,7,7,4,116,105,...,0,0,0,1,0,0,0,0,221,-11
3,14.5,15.0,13.5,15.5,9,8,4,4,111,80,...,0,0,0,0,0,0,0,0,191,-31
4,3.0,3.5,2.5,2.0,10,8,5,4,114,105,...,0,0,0,0,0,0,0,0,219,-9


In [128]:
ah_df = ah_df[[u'away_team__Atlanta Hawks',
 u'away_team__Boston Celtics',
 u'away_team__Brooklyn Nets',
 u'away_team__Charlotte Hornets',
 u'away_team__Chicago Bulls',
 u'away_team__Cleveland Cavaliers',
 u'away_team__Dallas Mavericks',
 u'away_team__Denver Nuggets',
 u'away_team__Detroit Pistons',
 u'away_team__Golden State Warriors',
 u'away_team__Houston Rockets',
 u'away_team__Indiana Pacers',
 u'away_team__Los Angeles Clippers',
 u'away_team__Los Angeles Lakers',
 u'away_team__Memphis Grizzlies',
 u'away_team__Miami Heat',
 u'away_team__Milwaukee Bucks',
 u'away_team__Minnesota Timberwolves',
 u'away_team__New Orleans Pelicans',
 u'away_team__New York Knicks',
 u'away_team__Oklahoma City Thunder',
 u'away_team__Orlando Magic',
 u'away_team__Philadelphia 76ers',
 u'away_team__Phoenix Suns',
 u'away_team__Portland Trail Blazers',
 u'away_team__Sacramento Kings',
 u'away_team__San Antonio Spurs',
 u'away_team__Toronto Raptors',
 u'away_team__Utah Jazz',
 u'away_team__Washington Wizards',
 u'home_team__Atlanta Hawks',
 u'home_team__Boston Celtics',
 u'home_team__Brooklyn Nets',
 u'home_team__Charlotte Hornets',
 u'home_team__Chicago Bulls',
 u'home_team__Cleveland Cavaliers',
 u'home_team__Dallas Mavericks',
 u'home_team__Denver Nuggets',
 u'home_team__Detroit Pistons',
 u'home_team__Golden State Warriors',
 u'home_team__Houston Rockets',
 u'home_team__Indiana Pacers',
 u'home_team__Los Angeles Clippers',
 u'home_team__Los Angeles Lakers',
 u'home_team__Memphis Grizzlies',
 u'home_team__Miami Heat',
 u'home_team__Milwaukee Bucks',
 u'home_team__Minnesota Timberwolves',
 u'home_team__New Orleans Pelicans',
 u'home_team__New York Knicks',
 u'home_team__Oklahoma City Thunder',
 u'home_team__Orlando Magic',
 u'home_team__Philadelphia 76ers',
 u'home_team__Phoenix Suns',
 u'home_team__Portland Trail Blazers',
 u'home_team__Sacramento Kings',
 u'home_team__San Antonio Spurs',
 u'home_team__Toronto Raptors',
 u'home_team__Utah Jazz',
 u'home_team__Washington Wizards',

 u'ah_1',
 u'odd_cnt_1',
 'new_odd_home_1',
 'new_odd_away_1',
 u'ah_2',
 u'odd_cnt_2',
 'new_odd_home_2',
 'new_odd_away_2',
 u'ah_3',
 u'odd_cnt_3',
 'new_odd_home_3',
 'new_odd_away_3',
 u'ah_4',
 u'odd_cnt_4',
 'new_odd_home_4',
 'new_odd_away_4',
 
 'year',
 'month',
 'day',
 'weekDay',
 'ot',
 'home_win',
 u'score_away',
 u'score_home',
 'total_score',
 'sub_score']]
ah_df.rename(columns=lambda x: x.replace(' ', '_'), inplace=True)
cols = list(ah_df.columns.values)

In [129]:
cols

[u'away_team__Atlanta_Hawks',
 u'away_team__Boston_Celtics',
 u'away_team__Brooklyn_Nets',
 u'away_team__Charlotte_Hornets',
 u'away_team__Chicago_Bulls',
 u'away_team__Cleveland_Cavaliers',
 u'away_team__Dallas_Mavericks',
 u'away_team__Denver_Nuggets',
 u'away_team__Detroit_Pistons',
 u'away_team__Golden_State_Warriors',
 u'away_team__Houston_Rockets',
 u'away_team__Indiana_Pacers',
 u'away_team__Los_Angeles_Clippers',
 u'away_team__Los_Angeles_Lakers',
 u'away_team__Memphis_Grizzlies',
 u'away_team__Miami_Heat',
 u'away_team__Milwaukee_Bucks',
 u'away_team__Minnesota_Timberwolves',
 u'away_team__New_Orleans_Pelicans',
 u'away_team__New_York_Knicks',
 u'away_team__Oklahoma_City_Thunder',
 u'away_team__Orlando_Magic',
 u'away_team__Philadelphia_76ers',
 u'away_team__Phoenix_Suns',
 u'away_team__Portland_Trail_Blazers',
 u'away_team__Sacramento_Kings',
 u'away_team__San_Antonio_Spurs',
 u'away_team__Toronto_Raptors',
 u'away_team__Utah_Jazz',
 u'away_team__Washington_Wizards',
 u'home_

In [130]:
ah_df.to_json("ah_df_bk20170120.json", orient='records')

In [131]:
ah_data = ah_df.as_matrix()

In [132]:
ah_data.shape

(3091, 86)

In [133]:
train_data = ah_data[:,:-5]
train_label = ah_data[:,-5:].astype(int)

In [134]:
# First build a simple RFC for home_win
x_train, x_test, y_train, y_test = train_test_split(train_data,train_label[:,0], test_size=0.3)
# clf = RandomForestClassifier(n_estimators=200, criterion='entropy', max_depth=4)

clf = Pipeline([
        ('ss', StandardScaler()),
        ('DTC', RandomForestClassifier(n_estimators=200, criterion='entropy', max_depth=4))])
rf_clf = clf.fit(x_train, y_train)
y_hat = rf_clf.predict(x_test)
result = (y_hat == y_test)

In [135]:
acc = np.mean(result)
acc

0.69612068965517238

In [136]:
# Try some other clf on home_win
from sklearn.ensemble import GradientBoostingClassifier
x_train, x_test, y_train, y_test = train_test_split(train_data,train_label[:,0], test_size=0.3)

clf = Pipeline([
        ('ss', StandardScaler()),
        ('DTC', GradientBoostingClassifier(n_estimators=200, max_depth=4))])
rf_clf = clf.fit(x_train, y_train)
y_hat = rf_clf.predict(x_test)
result = (y_hat == y_test)

In [137]:
acc = np.mean(result)
acc

0.70905172413793105

In [138]:
# Try XGBoost on home_win
import xgboost as xgb
x_train, x_test, y_train, y_test = train_test_split(train_data,train_label[:,0], test_size=0.3)

data_train = xgb.DMatrix(x_train, label=y_train)
data_test = xgb.DMatrix(x_test, label=y_test)
watch_list = [(data_test, 'eval'), (data_train, 'train')]
param = {'max_depth': 3, 'eta': 1, 'silent': 1, 'objective': 'multi:softmax', 'num_class': 2}
bst = xgb.train(param, data_train, num_boost_round=6, evals=watch_list)
y_hat = bst.predict(data_test)
result = (y_hat == y_test)

[0]	eval-merror:0.320043	train-merror:0.294961
[1]	eval-merror:0.321121	train-merror:0.285252
[2]	eval-merror:0.327586	train-merror:0.277393
[3]	eval-merror:0.337284	train-merror:0.271382
[4]	eval-merror:0.335129	train-merror:0.268146
[5]	eval-merror:0.346983	train-merror:0.250578


In [139]:
acc = np.mean(result)
acc

0.65301724137931039

### Get the data cols and result cols and have a look to regressors on subscore.

In [140]:
adjusted_x_cols = [u'away_team__Atlanta_Hawks',
 u'away_team__Boston_Celtics',
 u'away_team__Brooklyn_Nets',
 u'away_team__Charlotte_Hornets',
 u'away_team__Chicago_Bulls',
 u'away_team__Cleveland_Cavaliers',
 u'away_team__Dallas_Mavericks',
 u'away_team__Denver_Nuggets',
 u'away_team__Detroit_Pistons',
 u'away_team__Golden_State_Warriors',
 u'away_team__Houston_Rockets',
 u'away_team__Indiana_Pacers',
 u'away_team__Los_Angeles_Clippers',
 u'away_team__Los_Angeles_Lakers',
 u'away_team__Memphis_Grizzlies',
 u'away_team__Miami_Heat',
 u'away_team__Milwaukee_Bucks',
 u'away_team__Minnesota_Timberwolves',
 u'away_team__New_Orleans_Pelicans',
 u'away_team__New_York_Knicks',
 u'away_team__Oklahoma_City_Thunder',
 u'away_team__Orlando_Magic',
 u'away_team__Philadelphia_76ers',
 u'away_team__Phoenix_Suns',
 u'away_team__Portland_Trail_Blazers',
 u'away_team__Sacramento_Kings',
 u'away_team__San_Antonio_Spurs',
 u'away_team__Toronto_Raptors',
 u'away_team__Utah_Jazz',
 u'away_team__Washington_Wizards',
 u'home_team__Atlanta_Hawks',
 u'home_team__Boston_Celtics',
 u'home_team__Brooklyn_Nets',
 u'home_team__Charlotte_Hornets',
 u'home_team__Chicago_Bulls',
 u'home_team__Cleveland_Cavaliers',
 u'home_team__Dallas_Mavericks',
 u'home_team__Denver_Nuggets',
 u'home_team__Detroit_Pistons',
 u'home_team__Golden_State_Warriors',
 u'home_team__Houston_Rockets',
 u'home_team__Indiana_Pacers',
 u'home_team__Los_Angeles_Clippers',
 u'home_team__Los_Angeles_Lakers',
 u'home_team__Memphis_Grizzlies',
 u'home_team__Miami_Heat',
 u'home_team__Milwaukee_Bucks',
 u'home_team__Minnesota_Timberwolves',
 u'home_team__New_Orleans_Pelicans',
 u'home_team__New_York_Knicks',
 u'home_team__Oklahoma_City_Thunder',
 u'home_team__Orlando_Magic',
 u'home_team__Philadelphia_76ers',
 u'home_team__Phoenix_Suns',
 u'home_team__Portland_Trail_Blazers',
 u'home_team__Sacramento_Kings',
 u'home_team__San_Antonio_Spurs',
 u'home_team__Toronto_Raptors',
 u'home_team__Utah_Jazz',
 u'home_team__Washington_Wizards',

 u'ah_1',
 u'odd_cnt_1',
 'new_odd_home_1',
 'new_odd_away_1',
 u'ah_2',
 u'odd_cnt_2',
 'new_odd_home_2',
 'new_odd_away_2',
 u'ah_3',
 u'odd_cnt_3',
 'new_odd_home_3',
 'new_odd_away_3',
 u'ah_4',
 u'odd_cnt_4',
 'new_odd_home_4',
 'new_odd_away_4',
 
 'year',
 'month',
 'day',
 'weekDay',
 'ot']
adjusted_y_cols = ['home_win',
 u'score_away',
 u'score_home',
 'total_score',
 'sub_score']

In [141]:
home_win = ah_df['home_win']
sub_score = ah_df['sub_score']

In [142]:
# Second build a simple RFR for subscore
from sklearn.ensemble import RandomForestRegressor
x_train, x_test, y_train, y_test = train_test_split(train_data,train_label[:,4], test_size=0.3)
# clf = RandomForestClassifier(n_estimators=200, criterion='entropy', max_depth=4)

clf = Pipeline([
        ('ss', StandardScaler()),
        ('DTC', RandomForestRegressor(n_estimators=200, criterion='mse', max_depth=4))])
rf_clf = clf.fit(x_train, y_train)
y_hat = rf_clf.predict(x_test)

In [143]:
payout_df = pd.DataFrame(x_test, columns=adjusted_x_cols)
payout_df['predict_subscore'] = y_hat
payout_df['home_win'] = home_win
payout_df['sub_score'] = sub_score
# payout_df.head()
payout_df['payout_1'] = payout_df.apply(lambda x : calculate_payout(x), axis=1)
payout_df['payout_2'] = payout_df.apply(lambda x : calculate_payout(x, num=2), axis=1)
payout_df['payout_3'] = payout_df.apply(lambda x : calculate_payout(x, num=3), axis=1)
payout_df['payout_4'] = payout_df.apply(lambda x : calculate_payout(x, num=4), axis=1)
payout_df.payout_1.sum(), payout_df.payout_2.sum(),payout_df.payout_3.sum(),payout_df.payout_4.sum()

(9.4936258962509648,
 9.1793328647870105,
 1.0671070702629066,
 -27.653761584435955)

In [144]:
# Try XGBoost on subscore
import xgboost as xgb
x_train, x_test, y_train, y_test = train_test_split(train_data,train_label[:,4], test_size=0.3)

data_train = xgb.DMatrix(x_train, label=y_train)
data_test = xgb.DMatrix(x_test, label=y_test)
watch_list = [(data_test, 'eval'), (data_train, 'train')]
param = {'max_depth': 6, 'eta': 1, 'silent': 1, 'objective': 'reg:linear'}
bst = xgb.train(param, data_train, num_boost_round=6, evals=watch_list)
y_hat = bst.predict(data_test)

[0]	eval-rmse:11.9361	train-rmse:11.1196
[1]	eval-rmse:12.1235	train-rmse:10.7184
[2]	eval-rmse:12.4634	train-rmse:10.3236
[3]	eval-rmse:12.5757	train-rmse:9.89626
[4]	eval-rmse:12.6393	train-rmse:9.55327
[5]	eval-rmse:12.6449	train-rmse:9.14246


In [145]:
payout_df = pd.DataFrame(x_test, columns=adjusted_x_cols)
payout_df['predict_subscore'] = y_hat
payout_df['home_win'] = home_win
payout_df['sub_score'] = sub_score
payout_df.head()
payout_df['payout_1'] = payout_df.apply(lambda x : calculate_payout(x), axis=1)
payout_df['payout_2'] = payout_df.apply(lambda x : calculate_payout(x, num=2), axis=1)
payout_df['payout_3'] = payout_df.apply(lambda x : calculate_payout(x, num=3), axis=1)
payout_df['payout_4'] = payout_df.apply(lambda x : calculate_payout(x, num=4), axis=1)
payout_df.payout_1.sum(), payout_df.payout_2.sum(),payout_df.payout_3.sum(),payout_df.payout_4.sum()

(-7.8823393893751401,
 -6.0030076833588106,
 15.885967389896928,
 -20.800497285313419)

In [146]:
# Build a linear regressor for subscore
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso, Ridge
from sklearn.model_selection import GridSearchCV

x_train, x_test, y_train, y_test = train_test_split(train_data,train_label[:,4], test_size=0.3)
# clf = RandomForestClassifier(n_estimators=200, criterion='entropy', max_depth=4)

linreg = LinearRegression()
lr_model = linreg.fit(x_train, y_train)
y_hat = linreg.predict(x_test)

In [147]:
payout_df = pd.DataFrame(x_test, columns=adjusted_x_cols)
payout_df['predict_subscore'] = y_hat
payout_df['home_win'] = home_win
payout_df['sub_score'] = sub_score
payout_df.head()
payout_df['payout_1'] = payout_df.apply(lambda x : calculate_payout(x), axis=1)
payout_df['payout_2'] = payout_df.apply(lambda x : calculate_payout(x, num=2), axis=1)
payout_df['payout_3'] = payout_df.apply(lambda x : calculate_payout(x, num=3), axis=1)
payout_df['payout_4'] = payout_df.apply(lambda x : calculate_payout(x, num=4), axis=1)
payout_df.payout_1.sum(), payout_df.payout_2.sum(),payout_df.payout_3.sum(),payout_df.payout_4.sum()

(4.6514591247754673, 15.74171707010693, 4.5554094065373221, 23.195956012261998)

### Have a look on the cnt and ah

In [148]:
ah_df[[u'ah_1',
 u'odd_cnt_1',
 'new_odd_home_1',
 'new_odd_away_1',
 u'ah_2',
 u'odd_cnt_2',
 'new_odd_home_2',
 'new_odd_away_2',
 u'ah_3',
 u'odd_cnt_3',
 'new_odd_home_3',
 'new_odd_away_3',
 u'ah_4',
 u'odd_cnt_4',
 'new_odd_home_4',
 'new_odd_away_4']].head()

Unnamed: 0,ah_1,odd_cnt_1,new_odd_home_1,new_odd_away_1,ah_2,odd_cnt_2,new_odd_home_2,new_odd_away_2,ah_3,odd_cnt_3,new_odd_home_3,new_odd_away_3,ah_4,odd_cnt_4,new_odd_home_4,new_odd_away_4
0,-5.0,10,1.86,1.98,-4.5,10,1.88,1.96,-6.0,8,1.95,1.91,-5.5,7,1.89,1.93
1,-7.0,9,1.94,1.92,-7.5,7,2.0,1.83,-6.5,7,1.86,1.98,-10.5,4,2.45,1.51
2,9.0,9,1.92,1.94,8.5,7,1.97,1.86,9.5,7,1.83,1.98,6.5,4,2.28,1.6
3,14.5,9,1.97,1.85,15.0,8,1.94,1.93,13.5,4,2.15,1.7,15.5,4,1.86,1.95
4,3.0,10,1.94,1.92,3.5,8,1.87,1.94,2.5,5,2.03,1.83,2.0,4,2.12,1.75


In [149]:
def get_total_odds_count(row):
    total_cnt = 0
    for i in range(1, 5):
        total_cnt += float(row['odd_cnt_' + str(i)])
    return total_cnt


def normalize_odds_count(row, num):
    total_cnt = get_total_odds_count(row)
    return (float(row['odd_cnt_'+str(num)])) / total_cnt


def normalize_odd(row, num):
    total = (float(row['new_odd_home_'+str(num)])) + (float(row['new_odd_away_'+str(num)]))
    norm_home = (float(row['new_odd_home_'+str(num)])) / total
    norm_away = (float(row['new_odd_away_'+str(num)])) / total
    return norm_home, norm_away


def normalize_ah_odds(row, num):
    i = num
    norm_cnt = normalize_odds_count(row, i)
    ah = row['ah_'+str(i)]
    norm_home, norm_away = normalize_odd(row, i)
    no_home = ah * norm_cnt * norm_home
    no_away = ah * norm_cnt * norm_away
    return no_home, no_away


In [150]:
for i in range(1, 5):
    odds = ah_df.apply(lambda x: normalize_ah_odds(x, i), axis=1)
    ah_df['norm_odd_home_'+str(i)], ah_df['norm_odd_away_'+str(i)] = zip(*odds)
# odds = ah_df.apply(lambda x: normalize_ah_odds(x, 1), axis=1)
# ah_df['norm_odd_home_1'], ah_df['norm_odd_away_1'] = zip(*odds)
norm_ah_df = ah_df.copy()

In [151]:
adjusted_x_cols = [u'away_team__Atlanta_Hawks',
 u'away_team__Boston_Celtics',
 u'away_team__Brooklyn_Nets',
 u'away_team__Charlotte_Hornets',
 u'away_team__Chicago_Bulls',
 u'away_team__Cleveland_Cavaliers',
 u'away_team__Dallas_Mavericks',
 u'away_team__Denver_Nuggets',
 u'away_team__Detroit_Pistons',
 u'away_team__Golden_State_Warriors',
 u'away_team__Houston_Rockets',
 u'away_team__Indiana_Pacers',
 u'away_team__Los_Angeles_Clippers',
 u'away_team__Los_Angeles_Lakers',
 u'away_team__Memphis_Grizzlies',
 u'away_team__Miami_Heat',
 u'away_team__Milwaukee_Bucks',
 u'away_team__Minnesota_Timberwolves',
 u'away_team__New_Orleans_Pelicans',
 u'away_team__New_York_Knicks',
 u'away_team__Oklahoma_City_Thunder',
 u'away_team__Orlando_Magic',
 u'away_team__Philadelphia_76ers',
 u'away_team__Phoenix_Suns',
 u'away_team__Portland_Trail_Blazers',
 u'away_team__Sacramento_Kings',
 u'away_team__San_Antonio_Spurs',
 u'away_team__Toronto_Raptors',
 u'away_team__Utah_Jazz',
 u'away_team__Washington_Wizards',
 u'home_team__Atlanta_Hawks',
 u'home_team__Boston_Celtics',
 u'home_team__Brooklyn_Nets',
 u'home_team__Charlotte_Hornets',
 u'home_team__Chicago_Bulls',
 u'home_team__Cleveland_Cavaliers',
 u'home_team__Dallas_Mavericks',
 u'home_team__Denver_Nuggets',
 u'home_team__Detroit_Pistons',
 u'home_team__Golden_State_Warriors',
 u'home_team__Houston_Rockets',
 u'home_team__Indiana_Pacers',
 u'home_team__Los_Angeles_Clippers',
 u'home_team__Los_Angeles_Lakers',
 u'home_team__Memphis_Grizzlies',
 u'home_team__Miami_Heat',
 u'home_team__Milwaukee_Bucks',
 u'home_team__Minnesota_Timberwolves',
 u'home_team__New_Orleans_Pelicans',
 u'home_team__New_York_Knicks',
 u'home_team__Oklahoma_City_Thunder',
 u'home_team__Orlando_Magic',
 u'home_team__Philadelphia_76ers',
 u'home_team__Phoenix_Suns',
 u'home_team__Portland_Trail_Blazers',
 u'home_team__Sacramento_Kings',
 u'home_team__San_Antonio_Spurs',
 u'home_team__Toronto_Raptors',
 u'home_team__Utah_Jazz',
 u'home_team__Washington_Wizards',
 u'norm_odd_home_1',
 u'norm_odd_away_1',
 u'norm_odd_home_2',
 u'norm_odd_away_2',
 u'norm_odd_home_3',
 u'norm_odd_away_3',
 u'norm_odd_home_4',
 u'norm_odd_away_4',
 
 'year',
 'month',
 'day',
 'weekDay',
 'ot',

 u'ah_1',
 u'odd_cnt_1',
 'new_odd_home_1',
 'new_odd_away_1',
 u'ah_2',
 u'odd_cnt_2',
 'new_odd_home_2',
 'new_odd_away_2',
 u'ah_3',
 u'odd_cnt_3',
 'new_odd_home_3',
 'new_odd_away_3',
 u'ah_4',
 u'odd_cnt_4',
 'new_odd_home_4',
 'new_odd_away_4']
adjusted_y_cols = ['home_win',
 u'score_away',
 u'score_home',
 'total_score',
 'sub_score']

In [152]:
norm_ah_data_df = norm_ah_df[adjusted_x_cols]
norm_ah_label_df = norm_ah_df[adjusted_y_cols]

In [153]:
norm_ah_data = norm_ah_data_df.as_matrix()
norm_ah_label = norm_ah_label_df.as_matrix()

In [154]:
x_train, x_test, y_train, y_test = train_test_split(norm_ah_data,norm_ah_label[:,4], test_size=0.3)
# clf = RandomForestClassifier(n_estimators=200, criterion='entropy', max_depth=4)

clf = Pipeline([
        ('ss', StandardScaler()),
        ('DTC', RandomForestRegressor(n_estimators=200, criterion='mse', max_depth=4))])
rf_clf = clf.fit(x_train[:, :-16], y_train)
y_hat = rf_clf.predict(x_test[:, :-16])

In [155]:
payout_df = pd.DataFrame(x_test, columns=adjusted_x_cols)
payout_df['predict_subscore'] = y_hat
payout_df['home_win'] = home_win
payout_df['sub_score'] = sub_score
payout_df.head()
payout_df['payout_1'] = payout_df.apply(lambda x : calculate_payout(x), axis=1)
payout_df['payout_2'] = payout_df.apply(lambda x : calculate_payout(x, num=2), axis=1)
payout_df['payout_3'] = payout_df.apply(lambda x : calculate_payout(x, num=3), axis=1)
payout_df['payout_4'] = payout_df.apply(lambda x : calculate_payout(x, num=4), axis=1)
payout_df.payout_1.sum(), payout_df.payout_2.sum(),payout_df.payout_3.sum(),payout_df.payout_4.sum()

(7.2594869389410874,
 30.8265070574041,
 -2.3148218727626038,
 -26.729155124050813)

In [156]:
# Try XGBoost on subscore
import xgboost as xgb
x_train, x_test, y_train, y_test = train_test_split(norm_ah_data,norm_ah_label[:,4], test_size=0.3)

data_train = xgb.DMatrix(x_train[:, :-16], label=y_train)
data_test = xgb.DMatrix(x_test[:, :-16], label=y_test)
watch_list = [(data_test, 'eval'), (data_train, 'train')]
param = {'max_depth': 6, 'eta': 1, 'silent': 1, 'objective': 'reg:linear'}
bst = xgb.train(param, data_train, num_boost_round=6, evals=watch_list)
y_hat = bst.predict(data_test)

[0]	eval-rmse:12.2765	train-rmse:10.8293
[1]	eval-rmse:12.4171	train-rmse:10.4712
[2]	eval-rmse:12.7375	train-rmse:9.87373
[3]	eval-rmse:12.9237	train-rmse:9.50402
[4]	eval-rmse:13.0333	train-rmse:9.20791
[5]	eval-rmse:13.1178	train-rmse:8.85069


In [157]:
payout_df = pd.DataFrame(x_test, columns=adjusted_x_cols)
payout_df['predict_subscore'] = y_hat
payout_df['home_win'] = home_win
payout_df['sub_score'] = sub_score
payout_df.head()
payout_df['payout_1'] = payout_df.apply(lambda x : calculate_payout(x), axis=1)
payout_df['payout_2'] = payout_df.apply(lambda x : calculate_payout(x, num=2), axis=1)
payout_df['payout_3'] = payout_df.apply(lambda x : calculate_payout(x, num=3), axis=1)
payout_df['payout_4'] = payout_df.apply(lambda x : calculate_payout(x, num=4), axis=1)
payout_df.payout_1.sum(), payout_df.payout_2.sum(),payout_df.payout_3.sum(),payout_df.payout_4.sum()

(-1.9039068024801793,
 -7.7751088351679591,
 1.5066148541845159,
 -8.6256426730600726)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3091 entries, 0 to 3093
Data columns (total 94 columns):
away_team__Atlanta_Hawks             3091 non-null float64
away_team__Boston_Celtics            3091 non-null float64
away_team__Brooklyn_Nets             3091 non-null float64
away_team__Charlotte_Hornets         3091 non-null float64
away_team__Chicago_Bulls             3091 non-null float64
away_team__Cleveland_Cavaliers       3091 non-null float64
away_team__Dallas_Mavericks          3091 non-null float64
away_team__Denver_Nuggets            3091 non-null float64
away_team__Detroit_Pistons           3091 non-null float64
away_team__Golden_State_Warriors     3091 non-null float64
away_team__Houston_Rockets           3091 non-null float64
away_team__Indiana_Pacers            3091 non-null float64
away_team__Los_Angeles_Clippers      3091 non-null float64
away_team__Los_Angeles_Lakers        3091 non-null float64
away_team__Memphis_Grizzlies         3091 non-null float64
away_tea