In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_validate

from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.externals import joblib

### Read in data

In [2]:
# Read in DF of historical stats.
# Later cells use this frame with a two-level (Season, TeamID) row index:
# it is merged on ['Season', 'Team1ID'/'Team2ID'] against its index and
# looked up with .loc[season, team_id].
# NOTE(review): read_pickle unpickles the file, and unpickling can execute
# arbitrary code - only load "all_stats.pkl" if it comes from a trusted source.
all_stats = pd.read_pickle("all_stats.pkl")
all_stats.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Ast,Blk,DR,FGA,FGA3,FGM,FGM3,FTA,FTM,OR,...,max_FGA,min_Score,min_OppScore,min_TO,min_OppTO,min_FGA3,min_FGA,FTpct,FG3pct,FGpct
Season,TeamID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2003,1102,13.0,1.785714,16.821429,39.785714,20.821429,19.142857,7.821429,17.107143,11.142857,4.178571,...,51,33,33,5,6,9,32,0.651357,0.375643,0.481149
2003,1103,15.222222,2.333333,19.925926,55.851852,16.074074,27.148148,5.444444,25.851852,19.037037,9.777778,...,75,52,55,6,9,7,36,0.73639,0.33871,0.486074
2003,1104,12.107143,3.785714,23.928571,57.178571,19.857143,24.035714,6.357143,20.928571,14.857143,13.571429,...,73,46,48,7,7,11,42,0.709898,0.320144,0.420362
2003,1105,14.538462,2.076923,23.115385,61.615385,20.769231,24.384615,7.576923,21.846154,15.423077,13.5,...,73,40,53,7,10,10,45,0.705986,0.364815,0.395755
2003,1106,11.678571,3.142857,23.857143,55.285714,17.642857,23.428571,6.107143,16.464286,10.642857,12.285714,...,71,43,33,8,7,10,43,0.646421,0.346154,0.423773


In [3]:
# Features to find difference between teams.
# Each box-score column below appears twice in the feature list:
# once with an 'Opp' prefix and once unprefixed, in this same order.
box_score_stats = ['FGM', 'FGA', 'FGM3', 'FGA3', 'FTM', 'FTA', 'OR', 'DR', 'Ast', 'TO', 'Stl', 'Blk', 'PF']

stat_features = (
    ['OppScore', 'Score']
    + ['Opp' + stat for stat in box_score_stats]
    + box_score_stats
    + ['FGpct', 'FG3pct', 'FTpct']
)

In [4]:
# Load the first-round matchups exactly as stored on disk
matchups_raw = pd.read_csv('2018/TournamentMatchups_R1_verify.csv')
print(matchups_raw.shape)

# Discard the 'Unnamed: 0' column (presumably an index saved along with the CSV)
NCAA_2018 = matchups_raw.drop(columns=['Unnamed: 0'])
NCAA_2018.head()

(32, 6)


Unnamed: 0,Season,Team1,Team2,Team1ID,Team2ID
0,2018,Virginia,UMBC,1438,1420
1,2018,Creighton,Kansas St,1166,1243
2,2018,Kentucky,Davidson,1246,1172
3,2018,Arizona,Buffalo,1112,1138
4,2018,Miami FL,Loyola-Chicago,1274,1260


In [5]:
# DF with key linking TeamName and TeamID.
# The prediction loop merges on TeamName to turn each predicted winner's name
# back into a TeamID, so the names here must match the Team1/Team2 spellings
# used in the matchups file.
teams = pd.read_csv('Teams.csv')
teams.head()

Unnamed: 0,TeamID,TeamName,FirstD1Season,LastD1Season
0,1101,Abilene Chr,2014,2018
1,1102,Air Force,1985,2018
2,1103,Akron,1985,2018
3,1104,Alabama,1985,2018
4,1105,Alabama A&M,2000,2018


In [6]:
# Read in fitted RF model.
# The loaded object is only used through .predict() on the four *_diff columns
# selected in the prediction loop.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code - only load 'fitted_rf.pkl' if it comes from a trusted source.
# NOTE(review): sklearn.externals.joblib (see the import cell) was removed in
# scikit-learn 0.23; on newer versions use `import joblib` instead.
fitted_rf = joblib.load('fitted_rf.pkl') 

### Get predictions for tournament matchups

In [7]:
# Create function to add features to matchupsDF
def add_features(matchupsDF, stats_df, stat_features):
    """Attach both teams' season stats, and their differences, to each matchup.

    Parameters
    ----------
    matchupsDF : pandas.DataFrame
        One row per game; must contain the columns 'Season', 'Team1ID' and
        'Team2ID'.
    stats_df : pandas.DataFrame
        Season statistics with a two-level (Season, TeamID) row index; must
        contain every column named in ``stat_features``.
    stat_features : list of str
        Names of the stat columns to attach.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the columns of ``matchupsDF`` followed by
        'Team1<stat>' columns, 'Team2<stat>' columns and '<stat>_diff'
        columns (Team1 minus Team2), each group in ``stat_features`` order.
        Teams with no row in ``stats_df`` get NaN stats (left join).
        ``matchupsDF`` itself is not modified.
    """
    stat_features = list(stat_features)
    selected_stats = stats_df[stat_features]

    predict = matchupsDF
    for side in ('Team1', 'Team2'):
        # Prefix before merging so the stat columns can never collide with
        # columns already present in the matchup frame.
        # Both sides read from the stats_df argument, never from a global.
        predict = pd.merge(
            predict,
            selected_stats.add_prefix(side),
            how='left',
            left_on=['Season', side + 'ID'],
            right_index=True,
        )

    for x in stat_features:
        predict[x + '_diff'] = predict['Team1' + x] - predict['Team2' + x]

    return predict

<b> Make predictions for each round </b>

In [8]:
# Initial matchups - first round
tournamentMatchups = NCAA_2018

# Initialize dict of picks; its insertion order is the order the rounds are run in
predictionDict = {'Round1':None, 'Round2': None, 'Sweet16':None, 'Elite8':None, 'Final4':None, 'Champ': None}

# List of rounds to iterate through
roundList = list(predictionDict.keys())

# Season written into every generated matchup and used for the final stats lookup
season = 2018

# Lookup used to turn predicted winners' names back into TeamIDs
teamIDs = teams[['TeamID', 'TeamName']]

for tournamentRound in roundList:
    print(tournamentRound)

    # Get season stats for each team in round
    predict = add_features(tournamentMatchups, all_stats, stat_features)
    print(predict.shape)

    # Get columns for model
    predict_small = predict[['Score_diff', 'OppScore_diff', 'FTpct_diff', 'FTA_diff']]

    # Generate picks; a prediction of 1 is read as "Team1 wins", anything else as "Team2 wins"
    picks = pd.DataFrame(fitted_rf.predict(predict_small)).rename(columns={0: "Winner"})

    # Append Winner picks to matchup DF (both frames are aligned on their 0..n-1 index)
    roundPicks = pd.concat([tournamentMatchups, picks], axis=1)
    roundPicks['WinnerTeam'] = roundPicks['Team1'].where(roundPicks['Winner'] == 1, roundPicks['Team2'])
    roundPicks = roundPicks.drop('Winner', axis=1)
    predictionDict[tournamentRound] = roundPicks

    print(roundPicks)

    if tournamentRound != 'Champ':
        # Winners of consecutive games (rows 0 & 1, 2 & 3, ...) meet in the next round.
        # The frame is built in one go rather than grown with DataFrame.append, which
        # was removed in pandas 2.0. An odd number of winners makes the two lists
        # unequal in length, so the DataFrame constructor raises instead of
        # silently dropping a team.
        winners = roundPicks['WinnerTeam'].tolist()
        nextRound = pd.DataFrame({'Season': season, 'Team1': winners[0::2], 'Team2': winners[1::2]})

        # Format matchups DF for next round: look up each side's TeamID by name
        matchups = (
            nextRound
            .merge(teamIDs, left_on='Team1', right_on="TeamName")
            .drop('TeamName', axis=1)
            .rename(columns={'TeamID': 'Team1ID'})
            .merge(teamIDs, left_on='Team2', right_on="TeamName")
            .drop('TeamName', axis=1)
            .rename(columns={'TeamID': 'Team2ID'})
        )

        # The merges are inner joins: a name missing from (or duplicated in) `teams`
        # would silently remove or add games and shift every later pairing.
        if len(matchups) != len(nextRound):
            raise ValueError(
                'TeamName lookup changed the number of matchups for the round after '
                + tournamentRound + ': expected ' + str(len(nextRound))
                + ', got ' + str(len(matchups))
            )

        tournamentMatchups = matchups
    else:
        # Championship game: show the scoring profile of both finalists
        scoringCols = ['Score','OppScore','max_Score','max_OppScore']
        for side in ('Team1', 'Team2'):
            print()
            print(roundPicks[side][0], 'Scoring Stats')
            print(all_stats.loc[season,roundPicks[side + 'ID'][0]][scoringCols])
        

Round1
(32, 98)
    Season           Team1           Team2  Team1ID  Team2ID      WinnerTeam
0     2018        Virginia            UMBC     1438     1420        Virginia
1     2018       Creighton       Kansas St     1166     1243       Creighton
2     2018        Kentucky        Davidson     1246     1172        Kentucky
3     2018         Arizona         Buffalo     1112     1138         Arizona
4     2018        Miami FL  Loyola-Chicago     1274     1260  Loyola-Chicago
5     2018       Tennessee       Wright St     1397     1460       Wright St
6     2018          Nevada           Texas     1305     1400          Nevada
7     2018      Cincinnati      Georgia St     1153     1209      Cincinnati
8     2018          Xavier      NC Central     1462     1300          Xavier
9     2018        Missouri      Florida St     1281     1199        Missouri
10    2018         Ohio St    South Dakota     1326     1377    South Dakota
11    2018         Gonzaga  UNC Greensboro     1211     1422