In [11]:
import pandas as pd
import numpy as np
import glob

import xgboost as xgb
from sklearn.metrics import log_loss
from sklearn.ensemble import GradientBoostingClassifier

In [12]:
files = glob.glob('../mens-march-mania-2022/MDataFiles_Stage2/*')
[print(i,'\m/',f) for i,f in enumerate(files)];

0 \m/ ../mens-march-mania-2022/MDataFiles_Stage2/MNCAATourneyDetailedResults.csv
1 \m/ ../mens-march-mania-2022/MDataFiles_Stage2/MNCAATourneyCompactResults.csv
2 \m/ ../mens-march-mania-2022/MDataFiles_Stage2/MSeasons.csv
3 \m/ ../mens-march-mania-2022/MDataFiles_Stage2/MMasseyOrdinals_thruDay128.csv
4 \m/ ../mens-march-mania-2022/MDataFiles_Stage2/MRegularSeasonDetailedResults.csv
5 \m/ ../mens-march-mania-2022/MDataFiles_Stage2/MNCAATourneySlots.csv
6 \m/ ../mens-march-mania-2022/MDataFiles_Stage2/MGameCities.csv
7 \m/ ../mens-march-mania-2022/MDataFiles_Stage2/MConferenceTourneyGames.csv
8 \m/ ../mens-march-mania-2022/MDataFiles_Stage2/Cities.csv
9 \m/ ../mens-march-mania-2022/MDataFiles_Stage2/MRegularSeasonCompactResults.csv
10 \m/ ../mens-march-mania-2022/MDataFiles_Stage2/MNCAATourneySeedRoundSlots.csv
11 \m/ ../mens-march-mania-2022/MDataFiles_Stage2/MTeamConferences.csv
12 \m/ ../mens-march-mania-2022/MDataFiles_Stage2/MSampleSubmissionStage2.csv
13 \m/ ../mens-march-mania-20

In [13]:
SeasonResults = pd.read_csv(files[4])
SeasonResults = SeasonResults.copy()
SeasonResults = SeasonResults[SeasonResults['Season'] >= 2003].reset_index(drop=True)  
#SeasonResults

winning_cols = ['Season', 'WTeamID', 'WLoc', 'WFGM', 'WFGA', 'WFGM3', 'WFGA3', 'WFTM', 'WFTA', 'WOR', 'WDR', 'WAst', 'WTO', 'WStl', 'WBlk', 'WPF']
losing_cols = ['Season', 'LTeamID', 'LFGM', 'LFGA', 'LFGM3', 'LFGA3', 'LFTM', 'LFTA', 'LOR', 'LDR', 'LAst', 'LTO', 'LStl', 'LBlk', 'LPF']
columns = ['Season', 'TeamID', 'FGM', 'FGA', 'FGM3', 'FGA3', 'FTM', 'FTA', 'OR', 'DR', 'Ast', 'TO', 'Stl', 'Blk', 'PF']

winning_stats = SeasonResults[winning_cols].groupby(['Season', 'WTeamID']).mean().reset_index()
winning_stats.columns = columns

losing_stats = SeasonResults[losing_cols].groupby(['Season', 'LTeamID']).mean().reset_index()
losing_stats.columns = columns

reg_season_stats = pd.concat([winning_stats, losing_stats]).groupby(['Season', 'TeamID']).mean().reset_index()

reg_season_stats['id'] = reg_season_stats.Season.astype(str) + reg_season_stats.TeamID.astype(str)


reg_season_stats.head()

Unnamed: 0,Season,TeamID,FGM,FGA,FGM3,FGA3,FTM,FTA,OR,DR,Ast,TO,Stl,Blk,PF,id
0,2003,1102,19.572917,39.8125,8.09375,20.916667,11.447917,17.5625,4.135417,17.135417,13.489583,11.385417,6.135417,1.916667,18.416667,20031102
1,2003,1103,27.25,55.835165,5.445055,16.013736,19.153846,26.032967,9.763736,19.980769,15.31044,12.626374,7.260989,2.315934,19.873626,20031103
2,2003,1104,23.548128,56.858289,6.165775,19.593583,14.545455,20.491979,13.582888,23.251337,11.590909,13.347594,6.435829,3.679144,18.462567,20031104
3,2003,1105,24.759398,61.691729,8.071429,21.293233,16.597744,23.067669,13.838346,23.981203,14.954887,18.447368,9.932331,2.052632,19.977444,20031105
4,2003,1106,23.517949,55.189744,6.089744,17.528205,10.794872,16.751282,12.317949,24.133333,11.766667,17.079487,8.425641,3.184615,18.192308,20031106


In [14]:
# Read in Massey Rankings up to day 128 - Shows Teams and Massey Rankings by day in the 2003-2022 seasons

ranks = pd.read_csv(files[3])

# ['RankingDayNum'] == 133 - Because we only want Rankings leading up to the tournament

ranks = ranks[ranks['Season'] >= 2003].reset_index(drop=True)  
#ranks = ranks[ranks['RankingDayNum'] == 133].reset_index(drop=True)  

aggregate_ranks = ranks.groupby(['Season', 'TeamID']).agg({'OrdinalRank': ['mean', 'min', 'max']})
aggregate_ranks.columns = ['_'.join(col) for col in aggregate_ranks.columns]

massey_Ranks = aggregate_ranks.reset_index()
massey_Ranks['id'] = massey_Ranks.Season.astype(str) + massey_Ranks.TeamID.astype(str)
massey_Ranks

Unnamed: 0,Season,TeamID,OrdinalRank_mean,OrdinalRank_min,OrdinalRank_max,id
0,2003,1102,144.287500,52,219,20031102
1,2003,1103,182.205000,99,284,20031103
2,2003,1104,27.655502,1,106,20031104
3,2003,1105,305.377500,185,323,20031105
4,2003,1106,243.265000,166,307,20031106
...,...,...,...,...,...,...
6897,2022,1468,206.520604,113,332,20221468
6898,2022,1469,271.579670,175,342,20221469
6899,2022,1470,220.846154,122,345,20221470
6900,2022,1471,238.221154,34,313,20221471


In [15]:
def is_winning(wteam, lteam):
    if wteam < lteam:
        return 1
    else:
        return 0
    
    
march_tourney = pd.read_csv(files[1])
march_tourney = march_tourney[march_tourney['Season'] >= 2021].reset_index(drop=True) 

train = march_tourney

train['is_win'] = train.apply(lambda x: is_winning(x['WTeamID'], x['LTeamID']), axis=1)
train['team_a'] = train.Season.astype(str) + train.WTeamID.astype(str)
train['team_b'] = train.Season.astype(str) + train.LTeamID.astype(str)

train = train.drop(['WScore', 'LScore'], axis=1)
train = pd.merge(train, massey_Ranks, left_on='team_a', right_on='id').merge(massey_Ranks, left_on='team_b', right_on='id', suffixes=('_teama', '_teamb'))

train = train.drop(['Season_x', 'Season_y', 'id_teama', 'id_teamb', 'TeamID_teama', 'TeamID_teamb'], axis=1)
train = pd.merge(train, reg_season_stats, left_on='team_a', right_on='id').merge(reg_season_stats, left_on='team_b', right_on='id', suffixes=('_teama', '_teamb'))

train = train.drop(['Season_x', 'Season_y', 'team_a', 'team_b', 'TeamID_teama', 'TeamID_teamb', 'id_teama', 'id_teamb'], axis=1)
train = train.drop(['DayNum', 'WTeamID', 'LTeamID', 'NumOT', 'Season', 'WLoc'], axis=1)

print(train.shape)
train.count()

(66, 33)


is_win                    66
OrdinalRank_mean_teama    66
OrdinalRank_min_teama     66
OrdinalRank_max_teama     66
OrdinalRank_mean_teamb    66
OrdinalRank_min_teamb     66
OrdinalRank_max_teamb     66
FGM_teama                 66
FGA_teama                 66
FGM3_teama                66
FGA3_teama                66
FTM_teama                 66
FTA_teama                 66
OR_teama                  66
DR_teama                  66
Ast_teama                 66
TO_teama                  66
Stl_teama                 66
Blk_teama                 66
PF_teama                  66
FGM_teamb                 66
FGA_teamb                 66
FGM3_teamb                66
FGA3_teamb                66
FTM_teamb                 66
FTA_teamb                 66
OR_teamb                  66
DR_teamb                  66
Ast_teamb                 66
TO_teamb                  66
Stl_teamb                 66
Blk_teamb                 66
PF_teamb                  66
dtype: int64

In [25]:
#X_train = train
X_train = X_train = train.drop(['is_win'], axis=1)

y_train = train.is_win

X_test = train

In [24]:
gbc = GradientBoostingClassifier(loss='deviance', 
                             learning_rate=0.1,
                             n_estimators=100, 
                             subsample=0.5, 
                             criterion='friedman_mse',
                             min_samples_split=2, 
                             min_samples_leaf=1,
                             min_weight_fraction_leaf=0.0, 
                             max_depth=5, 
                             max_features=None, 
                             verbose=1, 
                             max_leaf_nodes=None)

gbc.fit(X_train, np.ravel(y_train.values))
y_pred = gbc.predict_proba(X_test)

#y_pred = y_pred[:,1]

      Iter       Train Loss      OOB Improve   Remaining Time 
         1           1.1983           0.0208            0.19s
         2           1.1007          -0.0503            0.20s
         3           1.0791           0.0141            0.23s
         4           0.9550           0.0027            0.22s
         5           0.9409           0.0082            0.24s
         6           0.8418          -0.0071            0.27s
         7           0.8355           0.0100            0.26s
         8           0.7752          -0.0225            0.25s
         9           0.7830          -0.0289            0.25s
        10           0.7078           0.0148            0.24s
        20           0.4013          -0.0119            0.16s
        30           0.2810          -0.0063            0.12s
        40           0.1681          -0.0021            0.09s
        50           0.1333          -0.0091            0.07s
        60           0.0726          -0.0040            0.05s
       

Feature names unseen at fit time:
- is_win
Feature names must be in the same order as they were in fit.



ValueError: X has 33 features, but GradientBoostingClassifier is expecting 32 features as input.

In [10]:
# Final output need - teamid's merged before

train['Prob_of_TeamID_teama_Win'] = y_pred[:,1]
train[['TeamID_teama', 'TeamID_teamb', 'Prob_of_TeamID_teama-Win']].head()

KeyError: "None of [Index(['TeamID_teama', 'TeamID_teamb', 'Prob_of_TeamID_teama-Win'], dtype='object')] are in the [columns]"

In [None]:
## 2022 Bracket

matchups = ['Gonzaga', 'Georgia St',
            "Boise St", 'Oklahoma',
            'Butler', 'Akron', 
            'Maryland', 'New Mexico St', 
            'Iowa', 'Cincinnati',  
            'Duke', 'Ark Little Rock', 
            'Michigan', 'Utah St', 
            'Villanova', 'N Dakota St', 
            'Kansas', 'Siena', 
            'Florida', 'USC', 
            'Auburn', 'ETSU', 
            'Wisconsin', 'Vermont', 
            'West Virginia', 'Wichita St', 
            'Creighton', 'Belmont', 
            'Virginia', 'Texas Tech', 
            'Michigan St', 'UC Irvine', 
            'Baylor', 'Boston Univ',  
            'Arizona', 'LSU', 
            'Ohio St', 'SF Austin', 
            'Louisville', 'Yale', 
            'Penn St', 'UCLA', 
            'Seton Hall', 'Hofstra', 
            'Providence', 'Rutgers', 
            'Florida St', 'N Kentucky', 
            'Gonzaga', 'Robert Morris', 
            'Colorado', 'Marquette', 
            'BYU', 'Liberty', 
            'Oregon', 'North Texas', 
            'Houston', 'Indiana', 
            'Kentucky', 'E Washington', 
            'Illinois', 'Arizona St',
            'San Diego St', 'Bradley']

In [None]:
 gbc_df = Reg_season(matchups,teams_df,reg_season_stats,massey_Ranks)