In [1]:
import pandas as pd
from pymongo import MongoClient
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline


MONGODB_SERVER = "localhost"
MONGODB_PORT = 27017

MONGODB_DB = "nba_odds_n_predict"
MONGODB_COLLECTION = "games_ah_odds"

In [46]:
def _connect_to_mongo(host, port, username, password, db):
    """ A util for making a connection to mongo """

    if username and password:
        mongo_uri = 'mongodb://%s:%s@%s:%s/%s' % (username, password, host, port, db)
        conn = MongoClient(mongo_uri)
    else:
        conn = MongoClient(host, port)

    return conn[db]


def read_mongo_data_to_dataframe(db=MONGODB_DB, collection=MONGODB_COLLECTION, query={}, host=MONGODB_SERVER, port=MONGODB_PORT, username=None, password=None, no_id=True):
    """ Read from Mongo and Store into DataFrame """

    # Connect to MongoDB
    db = _connect_to_mongo(host=host, port=port, username=username, password=password, db=db)

    # Make a query to the specific DB and Collection
    cursor = db[collection].find(query)

    # Expand the cursor and construct the DataFrame
    df = pd.DataFrame(list(cursor))

    # Delete the _id
    if no_id:
        del df['_id']

    return df

def map_team_to_id(teams_df, team_name):
    return teams_df[teams_df.FULL_TEAM_NAME == team_name].TEAM_ID.values[0]

def fraction2decimal(f):
    if f.find("/") != -1:
        return float(Decimal(f.split('/')[0]) / Decimal(f.split('/')[1])) + 1
    else:
        return f


def american2decimal(a):
    if a.find("+") != -1:
        return (float(a)/100) + 1
    elif a.find("-") != -1:
        return (100/abs(float(a))) + 1
    else:
        return a


def to_decimal(x):
    if x.find(".") != -1:
        return x
    elif x.find("/") != -1:
        return fraction2decimal(x)
    else:
        return american2decimal(x)


def calculat_payout(row, num=1):
    if float(row['sub_score']) >= (-1) * float(row['ah_'+str(num)]):
        bet_win = 1.0
    else:
        bet_win = 0.0
    if float(row['predict_subscore']) >= (-1) * float(row['ah_'+str(num)]):
        return (float(row['new_odd_home_'+str(num)]) * bet_win) - 1.0
    else:
        return (float(row['new_odd_away_'+str(num)]) * bet_win) - 1.0

In [3]:
teams_df = pd.read_json('/Users/ccuulinay/github_proj/scrapy_proj/nba_odds_spider/lab/collection_backup/all_teams.json')
ah_df = read_mongo_data_to_dataframe()

In [4]:
ah_df.head()

Unnamed: 0,ah_1,ah_2,ah_3,ah_4,away_team,date_time,home_team,odd_away_1,odd_away_2,odd_away_3,...,odd_cnt_2,odd_cnt_3,odd_cnt_4,odd_home_1,odd_home_2,odd_home_3,odd_home_4,overtime,score_away,score_home
0,-5.0,-4.5,-6.0,-5.5,Miami Heat,2015-03-07 02:00:00,Washington Wizards,1.98,1.96,1.91,...,10,8,7,1.86,1.88,1.95,1.89,False,97,99
1,-7.0,-7.5,-6.5,-10.5,Milwaukee Bucks,2015-03-13 01:00:00,Indiana Pacers,1.92,1.83,1.98,...,7,7,4,1.94,2.00,1.86,2.45,True,103,109
2,9.0,8.5,9.5,6.5,Los Angeles Clippers,2015-03-19 04:00:00,Sacramento Kings,1.94,1.86,1.98,...,7,7,4,1.92,1.97,1.83,2.28,False,116,105
3,14.5,15.0,13.5,15.5,Los Angeles Clippers,2015-03-26 01:00:00,New York Knicks,17/20,93/100,7/10,...,8,4,4,97/100,47/50,23/20,43/50,False,111,80
4,3.0,3.5,2.5,2.0,Brooklyn Nets,2015-02-21 05:30:00,Los Angeles Lakers,1.92,1.94,1.83,...,8,5,4,1.94,1.87,2.03,2.12,False,114,105


In [5]:
ah_df = ah_df[ah_df.away_team != 'Team USA']
ah_df = ah_df[ah_df.away_team != 'West']
ah_df = ah_df[ah_df.home_team != 'Team World']
ah_df = ah_df[ah_df.away_team != 'EAST']
ah_df['winner'] = np.where(ah_df['score_home'] - ah_df['score_away']>0, 'home','away')
ah_df['ot'] = ah_df['overtime'].apply(lambda x : 1 if x == True else 0)
ah_df['home_win'] = ah_df['winner'].apply(lambda x : 1 if x == 'home' else 0)

ah_df['date_time_DT'] = pd.to_datetime(ah_df['date_time'])
ah_df['year'] = ah_df['date_time_DT'].dt.year.astype(str)
ah_df['month'] = ah_df['date_time_DT'].dt.month.astype(str)
ah_df['day'] = ah_df['date_time_DT'].dt.day.astype(str)
ah_df['weekDay'] = ah_df['date_time_DT'].dt.dayofweek.astype(str)

In [6]:
from fractions import Fraction
from decimal import Decimal

ah_df["new_odd_home_1"]= ah_df.odd_home_1.apply(lambda x: to_decimal(x))
ah_df["new_odd_home_2"]= ah_df.odd_home_2.apply(lambda x: to_decimal(x))
ah_df["new_odd_home_3"]= ah_df.odd_home_3.apply(lambda x: to_decimal(x))
ah_df["new_odd_home_4"]= ah_df.odd_home_4.apply(lambda x: to_decimal(x))
ah_df["new_odd_away_1"]= ah_df.odd_away_1.apply(lambda x: to_decimal(x))
ah_df["new_odd_away_2"]= ah_df.odd_away_2.apply(lambda x: to_decimal(x))
ah_df["new_odd_away_3"]= ah_df.odd_away_3.apply(lambda x: to_decimal(x))
ah_df["new_odd_away_4"]= ah_df.odd_away_4.apply(lambda x: to_decimal(x))

In [7]:
ah_df.head()

Unnamed: 0,ah_1,ah_2,ah_3,ah_4,away_team,date_time,home_team,odd_away_1,odd_away_2,odd_away_3,...,day,weekDay,new_odd_home_1,new_odd_home_2,new_odd_home_3,new_odd_home_4,new_odd_away_1,new_odd_away_2,new_odd_away_3,new_odd_away_4
0,-5.0,-4.5,-6.0,-5.5,Miami Heat,2015-03-07 02:00:00,Washington Wizards,1.98,1.96,1.91,...,7,5,1.86,1.88,1.95,1.89,1.98,1.96,1.91,1.93
1,-7.0,-7.5,-6.5,-10.5,Milwaukee Bucks,2015-03-13 01:00:00,Indiana Pacers,1.92,1.83,1.98,...,13,4,1.94,2.0,1.86,2.45,1.92,1.83,1.98,1.51
2,9.0,8.5,9.5,6.5,Los Angeles Clippers,2015-03-19 04:00:00,Sacramento Kings,1.94,1.86,1.98,...,19,3,1.92,1.97,1.83,2.28,1.94,1.86,1.98,1.6
3,14.5,15.0,13.5,15.5,Los Angeles Clippers,2015-03-26 01:00:00,New York Knicks,17/20,93/100,7/10,...,26,3,1.97,1.94,2.15,1.86,1.85,1.93,1.7,1.95
4,3.0,3.5,2.5,2.0,Brooklyn Nets,2015-02-21 05:30:00,Los Angeles Lakers,1.92,1.94,1.83,...,21,5,1.94,1.87,2.03,2.12,1.92,1.94,1.83,1.75


In [8]:
ah_df.drop("odd_home_1", axis=1, inplace=True)
ah_df.drop("odd_home_2", axis=1, inplace=True)
ah_df.drop("odd_home_3", axis=1, inplace=True)
ah_df.drop("odd_home_4", axis=1, inplace=True)
ah_df.drop("odd_away_1", axis=1, inplace=True)
ah_df.drop("odd_away_2", axis=1, inplace=True)
ah_df.drop("odd_away_3", axis=1, inplace=True)
ah_df.drop("odd_away_4", axis=1, inplace=True)

In [9]:
ah_df.head()

Unnamed: 0,ah_1,ah_2,ah_3,ah_4,away_team,date_time,home_team,odd_cnt_1,odd_cnt_2,odd_cnt_3,...,day,weekDay,new_odd_home_1,new_odd_home_2,new_odd_home_3,new_odd_home_4,new_odd_away_1,new_odd_away_2,new_odd_away_3,new_odd_away_4
0,-5.0,-4.5,-6.0,-5.5,Miami Heat,2015-03-07 02:00:00,Washington Wizards,10,10,8,...,7,5,1.86,1.88,1.95,1.89,1.98,1.96,1.91,1.93
1,-7.0,-7.5,-6.5,-10.5,Milwaukee Bucks,2015-03-13 01:00:00,Indiana Pacers,9,7,7,...,13,4,1.94,2.0,1.86,2.45,1.92,1.83,1.98,1.51
2,9.0,8.5,9.5,6.5,Los Angeles Clippers,2015-03-19 04:00:00,Sacramento Kings,9,7,7,...,19,3,1.92,1.97,1.83,2.28,1.94,1.86,1.98,1.6
3,14.5,15.0,13.5,15.5,Los Angeles Clippers,2015-03-26 01:00:00,New York Knicks,9,8,4,...,26,3,1.97,1.94,2.15,1.86,1.85,1.93,1.7,1.95
4,3.0,3.5,2.5,2.0,Brooklyn Nets,2015-02-21 05:30:00,Los Angeles Lakers,10,8,5,...,21,5,1.94,1.87,2.03,2.12,1.92,1.94,1.83,1.75


In [10]:
one_hot = pd.get_dummies(ah_df[['away_team','home_team']], prefix=['away_team_', 'home_team_'])

In [11]:
ah_df = ah_df.join(one_hot)
ah_df.drop(['away_team','home_team'], axis=1, inplace=True)

In [12]:
ah_df.drop(['date_time', 'date_time_DT', 'winner', 'overtime'], axis=1, inplace=True)

In [13]:
ah_df['total_score'] = ah_df['score_home'] + ah_df['score_away']
ah_df['sub_score'] = ah_df['score_home'] - ah_df['score_away']

In [14]:
ah_df.head()

Unnamed: 0,ah_1,ah_2,ah_3,ah_4,odd_cnt_1,odd_cnt_2,odd_cnt_3,odd_cnt_4,score_away,score_home,...,home_team__Philadelphia 76ers,home_team__Phoenix Suns,home_team__Portland Trail Blazers,home_team__Sacramento Kings,home_team__San Antonio Spurs,home_team__Toronto Raptors,home_team__Utah Jazz,home_team__Washington Wizards,total_score,sub_score
0,-5.0,-4.5,-6.0,-5.5,10,10,8,7,97,99,...,0,0,0,0,0,0,0,1,196,2
1,-7.0,-7.5,-6.5,-10.5,9,7,7,4,103,109,...,0,0,0,0,0,0,0,0,212,6
2,9.0,8.5,9.5,6.5,9,7,7,4,116,105,...,0,0,0,1,0,0,0,0,221,-11
3,14.5,15.0,13.5,15.5,9,8,4,4,111,80,...,0,0,0,0,0,0,0,0,191,-31
4,3.0,3.5,2.5,2.0,10,8,5,4,114,105,...,0,0,0,0,0,0,0,0,219,-9


In [15]:
cols = list(ah_df.columns.values)
ah_df = ah_df[[u'away_team__Atlanta Hawks',
 u'away_team__Boston Celtics',
 u'away_team__Brooklyn Nets',
 u'away_team__Charlotte Hornets',
 u'away_team__Chicago Bulls',
 u'away_team__Cleveland Cavaliers',
 u'away_team__Dallas Mavericks',
 u'away_team__Denver Nuggets',
 u'away_team__Detroit Pistons',
 u'away_team__Golden State Warriors',
 u'away_team__Houston Rockets',
 u'away_team__Indiana Pacers',
 u'away_team__Los Angeles Clippers',
 u'away_team__Los Angeles Lakers',
 u'away_team__Memphis Grizzlies',
 u'away_team__Miami Heat',
 u'away_team__Milwaukee Bucks',
 u'away_team__Minnesota Timberwolves',
 u'away_team__New Orleans Pelicans',
 u'away_team__New York Knicks',
 u'away_team__Oklahoma City Thunder',
 u'away_team__Orlando Magic',
 u'away_team__Philadelphia 76ers',
 u'away_team__Phoenix Suns',
 u'away_team__Portland Trail Blazers',
 u'away_team__Sacramento Kings',
 u'away_team__San Antonio Spurs',
 u'away_team__Toronto Raptors',
 u'away_team__Utah Jazz',
 u'away_team__Washington Wizards',
 u'home_team__Atlanta Hawks',
 u'home_team__Boston Celtics',
 u'home_team__Brooklyn Nets',
 u'home_team__Charlotte Hornets',
 u'home_team__Chicago Bulls',
 u'home_team__Cleveland Cavaliers',
 u'home_team__Dallas Mavericks',
 u'home_team__Denver Nuggets',
 u'home_team__Detroit Pistons',
 u'home_team__Golden State Warriors',
 u'home_team__Houston Rockets',
 u'home_team__Indiana Pacers',
 u'home_team__Los Angeles Clippers',
 u'home_team__Los Angeles Lakers',
 u'home_team__Memphis Grizzlies',
 u'home_team__Miami Heat',
 u'home_team__Milwaukee Bucks',
 u'home_team__Minnesota Timberwolves',
 u'home_team__New Orleans Pelicans',
 u'home_team__New York Knicks',
 u'home_team__Oklahoma City Thunder',
 u'home_team__Orlando Magic',
 u'home_team__Philadelphia 76ers',
 u'home_team__Phoenix Suns',
 u'home_team__Portland Trail Blazers',
 u'home_team__Sacramento Kings',
 u'home_team__San Antonio Spurs',
 u'home_team__Toronto Raptors',
 u'home_team__Utah Jazz',
 u'home_team__Washington Wizards',

 u'ah_1',
 u'odd_cnt_1',
 'new_odd_home_1',
 'new_odd_away_1',
 u'ah_2',
 u'odd_cnt_2',
 'new_odd_home_2',
 'new_odd_away_2',
 u'ah_3',
 u'odd_cnt_3',
 'new_odd_home_3',
 'new_odd_away_3',
 u'ah_4',
 u'odd_cnt_4',
 'new_odd_home_4',
 'new_odd_away_4',
 
 'year',
 'month',
 'day',
 'weekDay',
 'ot',
 'home_win',
 u'score_away',
 u'score_home',
 'total_score',
 'sub_score']]

In [58]:
ah_df.to_json("ah_df_bk20170120.json", orient='records')

In [16]:
ah_data = ah_df.as_matrix()

In [17]:
ah_data.shape

(3091, 86)

In [18]:
train_data = ah_data[:,:-5]
train_label = ah_data[:,-5:].astype(int)

In [19]:
# First build a simple RFC for home_win
x_train, x_test, y_train, y_test = train_test_split(train_data,train_label[:,0], test_size=0.3)
# clf = RandomForestClassifier(n_estimators=200, criterion='entropy', max_depth=4)

clf = Pipeline([
        ('ss', StandardScaler()),
        ('DTC', RandomForestClassifier(n_estimators=200, criterion='entropy', max_depth=4))])
rf_clf = clf.fit(x_train, y_train)
y_hat = rf_clf.predict(x_test)
result = (y_hat == y_test)



In [20]:
acc = np.mean(result)
acc

0.69719827586206895

In [21]:
# Try some other clf on home_win
from sklearn.ensemble import GradientBoostingClassifier
x_train, x_test, y_train, y_test = train_test_split(train_data,train_label[:,0], test_size=0.3)

clf = Pipeline([
        ('ss', StandardScaler()),
        ('DTC', GradientBoostingClassifier(n_estimators=200, max_depth=4))])
rf_clf = clf.fit(x_train, y_train)
y_hat = rf_clf.predict(x_test)
result = (y_hat == y_test)

In [22]:
acc = np.mean(result)
acc

0.68318965517241381

In [23]:
# Try XGBoost on home_win
import xgboost as xgb
x_train, x_test, y_train, y_test = train_test_split(train_data,train_label[:,0], test_size=0.3)

data_train = xgb.DMatrix(x_train, label=y_train)
data_test = xgb.DMatrix(x_test, label=y_test)
watch_list = [(data_test, 'eval'), (data_train, 'train')]
param = {'max_depth': 3, 'eta': 1, 'silent': 1, 'objective': 'multi:softmax', 'num_class': 2}
bst = xgb.train(param, data_train, num_boost_round=6, evals=watch_list)
y_hat = bst.predict(data_test)
result = (y_hat == y_test)

[0]	eval-merror:0.324353	train-merror:0.294036
[1]	eval-merror:0.325431	train-merror:0.288488
[2]	eval-merror:0.329741	train-merror:0.287564
[3]	eval-merror:0.324353	train-merror:0.275543
[4]	eval-merror:0.315733	train-merror:0.263985
[5]	eval-merror:0.318966	train-merror:0.256588




In [24]:
acc = np.mean(result)
acc

0.68103448275862066

In [25]:
adjusted_x_cols = [u'away_team__Atlanta Hawks',
 u'away_team__Boston Celtics',
 u'away_team__Brooklyn Nets',
 u'away_team__Charlotte Hornets',
 u'away_team__Chicago Bulls',
 u'away_team__Cleveland Cavaliers',
 u'away_team__Dallas Mavericks',
 u'away_team__Denver Nuggets',
 u'away_team__Detroit Pistons',
 u'away_team__Golden State Warriors',
 u'away_team__Houston Rockets',
 u'away_team__Indiana Pacers',
 u'away_team__Los Angeles Clippers',
 u'away_team__Los Angeles Lakers',
 u'away_team__Memphis Grizzlies',
 u'away_team__Miami Heat',
 u'away_team__Milwaukee Bucks',
 u'away_team__Minnesota Timberwolves',
 u'away_team__New Orleans Pelicans',
 u'away_team__New York Knicks',
 u'away_team__Oklahoma City Thunder',
 u'away_team__Orlando Magic',
 u'away_team__Philadelphia 76ers',
 u'away_team__Phoenix Suns',
 u'away_team__Portland Trail Blazers',
 u'away_team__Sacramento Kings',
 u'away_team__San Antonio Spurs',
 u'away_team__Toronto Raptors',
 u'away_team__Utah Jazz',
 u'away_team__Washington Wizards',
 u'home_team__Atlanta Hawks',
 u'home_team__Boston Celtics',
 u'home_team__Brooklyn Nets',
 u'home_team__Charlotte Hornets',
 u'home_team__Chicago Bulls',
 u'home_team__Cleveland Cavaliers',
 u'home_team__Dallas Mavericks',
 u'home_team__Denver Nuggets',
 u'home_team__Detroit Pistons',
 u'home_team__Golden State Warriors',
 u'home_team__Houston Rockets',
 u'home_team__Indiana Pacers',
 u'home_team__Los Angeles Clippers',
 u'home_team__Los Angeles Lakers',
 u'home_team__Memphis Grizzlies',
 u'home_team__Miami Heat',
 u'home_team__Milwaukee Bucks',
 u'home_team__Minnesota Timberwolves',
 u'home_team__New Orleans Pelicans',
 u'home_team__New York Knicks',
 u'home_team__Oklahoma City Thunder',
 u'home_team__Orlando Magic',
 u'home_team__Philadelphia 76ers',
 u'home_team__Phoenix Suns',
 u'home_team__Portland Trail Blazers',
 u'home_team__Sacramento Kings',
 u'home_team__San Antonio Spurs',
 u'home_team__Toronto Raptors',
 u'home_team__Utah Jazz',
 u'home_team__Washington Wizards',

 u'ah_1',
 u'odd_cnt_1',
 'new_odd_home_1',
 'new_odd_away_1',
 u'ah_2',
 u'odd_cnt_2',
 'new_odd_home_2',
 'new_odd_away_2',
 u'ah_3',
 u'odd_cnt_3',
 'new_odd_home_3',
 'new_odd_away_3',
 u'ah_4',
 u'odd_cnt_4',
 'new_odd_home_4',
 'new_odd_away_4',
 
 'year',
 'month',
 'day',
 'weekDay',
 'ot']
adjusted_y_cols = ['home_win',
 u'score_away',
 u'score_home',
 'total_score',
 'sub_score']

In [38]:
home_win = ah_df['home_win']
sub_score = ah_df['sub_score']

In [39]:
# Second build a simple RFR for subscore
from sklearn.ensemble import RandomForestRegressor
x_train, x_test, y_train, y_test = train_test_split(train_data,train_label[:,4], test_size=0.3)
# clf = RandomForestClassifier(n_estimators=200, criterion='entropy', max_depth=4)

clf = Pipeline([
        ('ss', StandardScaler()),
        ('DTC', RandomForestRegressor(n_estimators=200, criterion='mse', max_depth=4))])
rf_clf = clf.fit(x_train, y_train)
y_hat = rf_clf.predict(x_test)

In [40]:
payout_df = pd.DataFrame(x_test, columns=adjusted_x_cols)
payout_df['predict_subscore'] = y_hat
payout_df['home_win'] = home_win
payout_df['sub_score'] = sub_score
payout_df.head()
payout_df['payout_1'] = payout_df.apply(lambda x : calculat_payout(x), axis=1)
payout_df.payout_1.sum()

-20.726285943805628

In [47]:
# Try XGBoost on subscore
import xgboost as xgb
x_train, x_test, y_train, y_test = train_test_split(train_data,train_label[:,4], test_size=0.3)

data_train = xgb.DMatrix(x_train, label=y_train)
data_test = xgb.DMatrix(x_test, label=y_test)
watch_list = [(data_test, 'eval'), (data_train, 'train')]
param = {'max_depth': 3, 'eta': 1, 'silent': 1, 'objective': 'reg:linear'}
bst = xgb.train(param, data_train, num_boost_round=6, evals=watch_list)
y_hat = bst.predict(data_test)

[0]	eval-rmse:11.8986	train-rmse:11.7159
[1]	eval-rmse:11.9464	train-rmse:11.5472
[2]	eval-rmse:12.0135	train-rmse:11.4103
[3]	eval-rmse:12.1157	train-rmse:11.2615
[4]	eval-rmse:12.2269	train-rmse:11.1443
[5]	eval-rmse:12.2004	train-rmse:11.0133


In [48]:
payout_df = pd.DataFrame(x_test, columns=adjusted_x_cols)
payout_df['predict_subscore'] = y_hat
payout_df['home_win'] = home_win
payout_df['sub_score'] = sub_score
payout_df.head()
payout_df['payout_1'] = payout_df.apply(lambda x : calculat_payout(x), axis=1)
payout_df.payout_1.sum()

-46.151602151502189