In [1]:
import pandas as pd
import sys
import os
import logging
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
import pickle
from scipy.stats import norm

In [2]:
sys.path.append('{}/mmml'.format(os.path.dirname(os.getcwd())))
from mmml.config import data_folder
from mmml.game_results import *
from mmml.utils import *

<b> Model Objects </b>

In [3]:
columns_key = getFeatureDict(pd.read_csv('{}/mmml/mmml/feature_list2.csv'.format(os.path.dirname(os.getcwd()))))

In [4]:
with open("{}/Model_Objects/{}.pkl".format(os.path.dirname(os.getcwd()), 'xgboost_regression_reverse'), 'rb') as file:
    model = pickle.load(file)

<b> Scoring Data </b>

In [5]:
base_oot = pd.read_pickle('{}/Data/Processed/base_oot.pkl'.format(os.path.dirname(os.getcwd())))
base_oot.head()

Unnamed: 0,HTeamID,ATeamID,Season,DayNum,HWin,HScore,AScore,GameRound,Seed_H,Seed_A,GameSlot
943,1314,1411,2017,137,1,103,64,1,Z01,Z16,R1Z1
949,1455,1173,2017,137,1,64,58,1,Z10,Z07,R1Z7
1013,1344,1401,2018,137,0,69,73,1,Z10,Z07,R1Z7
1066,1388,1437,2019,136,0,57,61,1,Z11,Z06,R1Z6
1005,1199,1281,2018,137,1,67,54,1,Z09,Z08,R1Z8


In [6]:
scaled_x_features_oot = pd.read_pickle('{}/Data/Processed/{}.pkl'.format(os.path.dirname(os.getcwd()), 'scaled_x_features_oot'))
scaled_x_features_oot.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Ast,Blk,DR,FGA,FGA3,FGM,FGM3,FTA,FTM,NLoc,...,possessions,o_eff,d_eff,net_eff,elo,last_elo,MOR,POM,SAG,Avg_Rank
TeamID,Season,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1427,2019,362,70,756,1801,752,762,258,472,348,5,...,2070.025,102.897308,102.027753,0.869555,"[1500.0, 1488.4820238179843, 1480.415603408158...",1511.939254,248.0,151.0,153.0,0.522857
1287,2019,400,131,753,1844,714,772,240,625,442,4,...,2202.9,101.048618,109.446639,-8.398021,"[1500.0, 1491.8493246116866, 1484.147568908605...",1452.968527,262.0,255.0,251.0,0.728571
1354,2019,359,53,699,1776,552,748,179,705,524,2,...,2230.925,98.568979,110.85088,-12.281901,"[1500.0, 1492.0748768264345, 1483.911603995309...",1368.327533,330.0,338.0,341.0,0.958095
1159,2018,394,101,719,1766,804,770,292,510,391,1,...,2077.2125,107.01842,105.477894,1.540526,"[1500.0, 1491.8493246116866, 1500.240321356997...",1538.256763,191.0,210.0,200.0,0.569524
1181,2019,541,232,965,2173,818,1037,247,751,518,8,...,2517.1,112.788526,91.25581,21.532716,"[1500.0, 1510.0, 1518.099318978226, 1525.97497...",1680.401035,1.0,3.0,1.0,0.001905


### Test Inidividual Functions

<b> Baseline Chalk Model </b>
- Predict winner as higher seed
- Output: Prediction, Probability (Indexed by Base Index)

In [7]:
### HELPERS FOR CHALK PREDICTION ALG
def getNumericSeed(seed):
    """
    Helper function for convering Region/Seed into strictly numeric value
    Example: 'W01' -> 1
    """
    seed = seed[1:]

    if seed[-1] in ['a', 'b']:
        seed = seed[:-1]

    return int(seed)

def chalk_predictions(df, seed=42):
    """
    Baseline prediciton algorithm. Predict higher seeds will win every time
    Output = DF with Pred and Prob columns, indexed by input DF.
    """
    score_df = df.copy()

    # Add Random for if
    np.random.seed(seed=seed)
    score_df['rand'] = np.random.rand(score_df.shape[0])

    # Get Numeric Seeds
    score_df['Seed_H_Numeric'] = score_df['Seed_H'].apply(lambda x: getNumericSeed(x))
    score_df['Seed_A_Numeric'] = score_df['Seed_A'].apply(lambda x: getNumericSeed(x))

    score_df['Pred'] = np.where(score_df['Seed_H_Numeric'] < score_df['Seed_A_Numeric'], 1,
                                np.where(score_df['Seed_H_Numeric'] > score_df['Seed_A_Numeric'], 0,
                                        np.where(score_df['rand'] < 0.5, 1, 0)))
    score_df['Prob'] = np.where(score_df['Pred'] == 1, 1.0, 0.0)

    prediction_df = score_df[['Pred','Prob']].set_index(score_df.index)
    return prediction_df

In [8]:
score_r1 = createModelData(base=base_oot, x_features=scaled_x_features_oot, columns_key=columns_key)
pred = chalk_predictions(score_r1)
pred.head(3)

Unnamed: 0,Pred,Prob
943,1,1.0
972,1,1.0
919,0,0.0


<b> Score Using Model Object </b>
- Output: Prediction, Probability (Indexed by Base Index)

In [9]:
def model_predictions(df, model, features):
    """
    Helper to score dataframe
    Output = DF with Pred and Prob columns, indexed by input DF.
    """
    # Produce Scores
    y_pred = model['clf'].predict(df[features])
    y_pred = pd.DataFrame(y_pred, columns=['Pred'], index=df.index)

    y_pred['Prob'] = y_pred['Pred'].apply(lambda x: norm(model['mean'], model['std']).cdf(x))

    y_pred['Pred'] = np.where(y_pred['Pred'] > 0, 1, 0)

    return y_pred

In [10]:
score_r1 = createModelData(base=base_oot, x_features=scaled_x_features_oot, columns_key=columns_key)
pred = model_predictions(score_r1, model, columns_key['features'])
pred.head(3)

Unnamed: 0,Pred,Prob
943,1,[0.9089902483536632]
972,1,[0.625647665635849]
919,0,[0.2062551717381958]


<b> Score Round of Matchups </b>
- Given Base of matchups and X-features, score round and produce matchups for the next round

In [11]:
def score_round(matchups_r1, x_features, columns_key, scorer='chalk'):
    """
    Score matchups for round n and create matchups for round n+1
    """
    base_path = os.path.dirname(os.getcwd())
    
    # Merge Current Round Matchups with X_Features
    score_r1 = createModelData(base=matchups_r1, x_features=x_features, columns_key=columns_key)

    # Score round matchups
    if scorer=='chalk':
        predictions = chalk_predictions(df=score_r1)
    else:
        predictions = model_predictions(df=score_r1, model=scorer, features=columns_key['features']) # ToDo - pass features?

    # Merge Predictions/Probabilites back to Score DF
    pred_r1 = score_r1.merge(predictions, left_index=True, right_index=True)

    # Identify Seed/TeamID of predicted Winner
    pred_r1['WTeamID_pred'] = np.where(pred_r1['Pred']==1, pred_r1['HTeamID'], pred_r1['ATeamID'])
    pred_r1['WSeed_pred'] = np.where(pred_r1['Pred']==1, pred_r1['Seed_H'], pred_r1['Seed_A'])

    # Get Slots
    slots_simple = pd.read_csv('{}/Data/Raw/{}/MDataFiles_Stage1/MNCAATourneySeedRoundSlots.csv'.format(base_path, data_folder))
    slots_simple.drop('EarlyDayNum', axis=1, inplace=True)
    slots_simple.drop('LateDayNum', axis=1, inplace=True)
    slots_simple = slots_simple.set_index(['Seed', 'GameRound'])

    # Combine Next Round Slots and Current Round Predictions
    r2_slots = pred_r1[['Season', 'GameRound', 'WSeed_pred', 'WTeamID_pred']].copy()
    #r2_slots['GameRound'] = round_nbr + 1
    r2_slots['GameRound'] = r2_slots['GameRound'] + 1
    r2_slots = r2_slots.merge(slots_simple, left_on=['WSeed_pred', 'GameRound'], right_index=True, how='left')
    r2_slots = r2_slots.rename(columns={'WSeed_pred':'Seed', 'WTeamID_pred':'TeamID'})

    # Create H/A Team Matchups based on GameSlot
    r2_slots.sort_values(['Season', 'GameSlot', 'Seed'], inplace=True)
    r2_slots_H = r2_slots.drop_duplicates(subset=['Season','GameSlot'], keep='first')
    r2_slots_A = r2_slots.drop_duplicates(subset=['Season','GameSlot'], keep='last')

    base_r2 = r2_slots_H.merge(r2_slots_A,
                      left_on=['Season', 'GameRound', 'GameSlot'],
                      right_on=['Season', 'GameRound', 'GameSlot'],
                     how='inner', suffixes=['_H', '_A'])

    base_r2 = base_r2.rename(columns={'TeamID_H':'HTeamID', 'TeamID_A':'ATeamID'})

    return base_r2, pred_r1

In [12]:
base_r1 = base_oot.query('GameRound==1')
base_r2, pred_r1 = score_round(matchups_r1=base_r1, x_features=scaled_x_features_oot, columns_key=columns_key, scorer='chalk')
pred_r1.head(3)

Unnamed: 0,HTeamID,ATeamID,Season,GameRound,Seed_H,Seed_A,GameSlot,HScore_diff,possessions_diff,o_eff_diff,d_eff_diff,net_eff_diff,last_elo_diff,Avg_Rank_diff,Pred,Prob,WTeamID_pred,WSeed_pred
943,1314,1411,2017,1,Z01,Z16,R1Z1,39,11.7375,10.109038,-4.732006,14.841044,56.063338,-0.57619,1,1.0,1314,Z01
949,1455,1173,2017,1,Z10,Z07,R1Z7,6,210.4375,8.454454,-6.039917,14.494371,59.559292,-0.088571,0,0.0,1173,Z07
1013,1344,1401,2018,1,Z10,Z07,R1Z7,-4,145.1875,-2.003137,3.886741,-5.889879,17.363832,0.07619,0,0.0,1401,Z07


<b> Overall Wrapper Scoring Function </b>
- Intergrate individual functions together
- Loop through round scoring to produce true scoring and predictions for bracket-style tournament 

In [13]:
def fnScore(base, x_features, scorer='chalk'):
    """
    docstring
    """

    # ToDo - Split for simple scoring method?

    ## READ FEATURE DICT
    columns_key = getFeatureDict(pd.read_csv('{}/mmml/mmml/feature_list2.csv'.format(os.path.dirname(os.getcwd()))))

    #### Get Initial Set of True Results
    true_outcome = base.copy()
    true_outcome['WTeamID_true'] = np.where(true_outcome['HWin']==1, true_outcome['HTeamID'], true_outcome['ATeamID'])
    true_outcome = true_outcome[['Season', 'GameRound', 'GameSlot', 'WTeamID_true']]

    #### Create dictionary of base matchups and predictions for each round
    round_dict = {i:{'base':None,'pred':None} for i in range(1,7)}
    round_dict[1]={'base':base.query('GameRound==1')}

    #### Score each round and create next round matchups
    for round_num in range(1,7):
        logging.info("Getting predictions for Round {}...".format(round_num))
        base_r, pred_r0 = score_round(round_dict[round_num]['base'], x_features, columns_key, scorer=scorer)

        round_dict[round_num]['pred'] = pred_r0

        if round_num != 6:
            round_dict[round_num+1]['base'] = base_r

    #### CREATE OVERALL RESULTS DF
    logging.info("Creating DataFrame of all results...")
    results_df = round_dict[1]['pred']

    for i in range(2,7):
        results_df = results_df.append(round_dict[i]['pred'])

    results_df = results_df.merge(true_outcome, left_on=['Season', 'GameRound', 'GameSlot'], right_on=['Season', 'GameRound', 'GameSlot'], how='left')

    #### SCORE VS TRUE RESULTS
    logging.info("Scoring...")

    # Flag if prediction correct
    #results_df['Correct'] = np.where(results_df['WTeamID_true'] == results_df['WTeamID_pred'], 1, 0)

    # Convert results to ESPN type score
    #points_dict = {1:10, 2:20, 3:40, 4:80, 5:160, 6:320}
    #results_df['Points'] = np.where(results_df['Correct']==1, results_df['GameRound'].apply(lambda x: points_dict[x]), 0)

    return round_dict, results_df

In [14]:
round_dict, results_df = fnScore(base=base_oot, x_features=scaled_x_features_oot, scorer=model)

In [15]:
results_df.head(5)

Unnamed: 0,HTeamID,ATeamID,Season,GameRound,Seed_H,Seed_A,GameSlot,HScore_diff,possessions_diff,o_eff_diff,d_eff_diff,net_eff_diff,last_elo_diff,Avg_Rank_diff,Pred,Prob,WTeamID_pred,WSeed_pred,WTeamID_true
0,1314,1411,2017,1,Z01,Z16,R1Z1,39,11.7375,10.109038,-4.732006,14.841044,56.063338,-0.57619,1,[0.9089902483536632],1314,Z01,1314
1,1455,1173,2017,1,Z10,Z07,R1Z7,6,210.4375,8.454454,-6.039917,14.494371,59.559292,-0.088571,1,[0.664916321934604],1455,Z10,1455
2,1344,1401,2018,1,Z10,Z07,R1Z7,-4,145.1875,-2.003137,3.886741,-5.889879,17.363832,0.07619,0,[0.39418241096581874],1401,Z07,1401
3,1388,1437,2019,1,Z11,Z06,R1Z6,-4,-119.675,0.388529,-1.685512,2.07404,-31.06791,0.027619,0,[0.4569758654392564],1437,Z06,1437
4,1199,1281,2018,1,Z09,Z08,R1Z8,13,183.9375,2.520309,0.29267,2.227639,7.204241,-0.02,1,[0.5701009349260717],1199,Z09,1199


<b> Evaulate Function </b>
- Given predictions, score vs. True results

In [16]:
def evaluate(results_df):
    # Create Flag if prediction correct
    results_df['Correct'] = np.where(results_df['WTeamID_true'] == results_df['WTeamID_pred'], 1, 0)

    # GameRound: Point value from ESPN bracket challenge
    points_dict = {1:10, 2:20, 3:40, 4:80, 5:160, 6:320}
    results_df['Points'] = np.where(results_df['Correct']==1, results_df['GameRound'].apply(lambda x: points_dict[x]), 0)

    year_list = list(set(results_df['Season']))
    year_list.sort()

    # Results for each year
    results_dict = {}
    for year in year_list: # for year in year_list:
        acc = results_df.query('Season=={}'.format(year))['Correct'].sum() / results_df.query('Season=={}'.format(year))['Correct'].count()
        pts = results_df.query('Season=={}'.format(year))['Points'].sum()

        by_round_correct = results_df.query('Season=={}'.format(year)).groupby('GameRound').sum()['Correct']
        by_round_total = results_df.query('Season=={}'.format(year)).groupby('GameRound').count()['Correct']
        by_round_pts = results_df.query('Season=={}'.format(year)).groupby('GameRound').sum()['Points']
        by_round_results = pd.DataFrame(by_round_pts).merge(pd.DataFrame(by_round_correct / by_round_total), left_index=True, right_index=True).transpose()

        # print("")
        print("{year}: {acc}, {pts}".format(year=year, acc=acc, pts=pts))
        print("{} \n".format(by_round_results))
        results_dict[year] = {'acc':acc, 'pts':pts}
    return results_dict

In [17]:
evaluate(results_df)

2017: 0.6507936507936508, 910
GameRound          1         2      3      4      5    6
Points     250.00000  180.0000  160.0  160.0  160.0  0.0
Correct      0.78125    0.5625    0.5    0.5    0.5  0.0 

2018: 0.6349206349206349, 1120
GameRound       1        2        3      4      5      6
Points     240.00  200.000  120.000  80.00  160.0  320.0
Correct      0.75    0.625    0.375   0.25    0.5    1.0 

2019: 0.6825396825396826, 950
GameRound          1         2        3      4      5    6
Points     210.00000  300.0000  200.000  80.00  160.0  0.0
Correct      0.65625    0.9375    0.625   0.25    0.5  0.0 



{2017: {'acc': 0.6507936507936508, 'pts': 910},
 2018: {'acc': 0.6349206349206349, 'pts': 1120},
 2019: {'acc': 0.6825396825396826, 'pts': 950}}

<b> Create DF of Bracket Predictions </b>

In [18]:
def getBracket(results_df):
    base_path = os.path.dirname(os.getcwd())
    teams = pd.read_csv('{}/Data/Raw/{}/MDataFiles_Stage1/MTeams.csv'.format(base_path, data_folder))
    
    
    merged = results_df.merge(teams, left_on='HTeamID', right_on='TeamID')\
                        .merge(teams, left_on='ATeamID', right_on='TeamID', suffixes=['_H', '_A'])
    
    bracket = merged[['Season', 'GameRound', 'Seed_H', 'Seed_A', 'TeamName_H', 'TeamName_A', 'Pred', 'Prob']].copy()
    
    bracket.sort_values(by=['Season', 'GameRound', 'Seed_H'], inplace=True)
    #bracket=merged
    return bracket

In [19]:
bracket = getBracket(results_df)

In [20]:
bracket.query('Season==2017 and GameRound==2')

Unnamed: 0,Season,GameRound,Seed_H,Seed_A,TeamName_H,TeamName_A,Pred,Prob
134,2017,2,W01,W08,Villanova,Wisconsin,1,[0.7214097365150929]
50,2017,2,W02,W10,Duke,Marquette,1,[0.7760584683261542]
116,2017,2,W03,W06,Baylor,SMU,0,[0.33668901516256217]
29,2017,2,W04,W05,Florida,Virginia,0,[0.4500216517797819]
60,2017,2,X01,X09,Gonzaga,Vanderbilt,1,[0.8957006429478908]
52,2017,2,X02,X07,Arizona,St Mary's CA,0,[0.4932666184107198]
47,2017,2,X03,X11,Florida St,Xavier,1,[0.6351500552827891]
178,2017,2,X04,X05,West Virginia,Notre Dame,1,[0.6665976614202224]
81,2017,2,Y01,Y08,Kansas,Miami FL,1,[0.6823318365956402]
13,2017,2,Y02,Y07,Louisville,Michigan,1,[0.5748134140579426]
