In [1]:
# Step 1: Load libraries
import pandas as pd
import numpy
import math
import csv
import random
from sklearn import linear_model, model_selection
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, ExtraTreesClassifier, VotingClassifier

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier


In [14]:
# Step 2: Load data
season_data = pd.read_csv('MRegularSeasonDetailedResults.csv')
tourney_data = pd.read_csv('MNCAATourneyDetailedResults.csv')
seeds = pd.read_csv('MNCAATourneySeeds.csv')
frames = [season_data, tourney_data]
all_data = pd.concat(frames)
stat_fields = [
    # offensive statistics
    'fgm',              # field goal made
    'fga',              # field goal attempted
    'fgp',              # field goal percentage
    'fgm3',             # 3pt field goal made
    'fga3',             # 3 points field goal attempted
    '3pp',              # 3 points field goal percentage
    'ftm',              # free throw made
    'fta',              # free throw attempted
    'ftp',              # free throw percentage
    'ef_fg_perc',       # effective field goal percentage
    'f_throw_factor',   # free throw factor
    # defensive statistics
    'totalreb_perc',    # total rebound percentage
    'or',               # offensive rebound
    'offreb_perc',      # offensive rebound percentage
    'dr',               # defensive rebound
    'defreb_perc',      # defensive rebound percentage
    'to',               # turnover
    'ast',              # assist
    'ast_ratio',        # assist ratio 
    'to_ratio',         # turnover ratio
    'to_factor',        # turnover factor both defense & offense
    'stl',              # steal
    'blk',              # block
    'pf',               # personal foul
]

prediction_year = 2022
base_elo = 1600
team_elos = {}
team_stats = {}
X = []
y = []
submission_data = []
def initialize_data():
    for i in range(1985, prediction_year+1):
        team_elos[i] = {}
        team_stats[i] = {}
initialize_data()

In [15]:
# Step 3: Explore the data
all_data.head(10)

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF
0,2003,10,1104,68,1328,62,N,0,27,58,...,10,16,22,10,22,8,18,9,2,20
1,2003,10,1272,70,1393,63,N,0,26,62,...,24,9,20,20,25,7,12,8,6,16
2,2003,11,1266,73,1437,61,N,0,24,58,...,26,14,23,31,22,9,12,2,5,23
3,2003,11,1296,56,1457,50,N,0,18,38,...,22,8,15,17,20,9,19,4,3,23
4,2003,11,1400,77,1208,71,N,0,30,61,...,16,17,27,21,15,12,10,7,1,14
5,2003,11,1458,81,1186,55,H,0,26,57,...,11,12,17,6,22,8,19,4,3,25
6,2003,12,1161,80,1236,62,H,0,23,55,...,15,20,28,9,21,11,30,10,4,28
7,2003,12,1186,75,1457,61,N,0,28,62,...,17,17,23,8,25,10,15,14,8,18
8,2003,12,1194,71,1156,66,N,0,28,58,...,18,12,27,13,26,13,25,8,2,18
9,2003,12,1458,84,1296,56,H,0,32,67,...,14,7,12,9,23,10,18,1,3,18


In [16]:
all_data = all_data[all_data.Season<2020]

In [17]:
# Step 4: Define Helper functions
def get_elo(season, team):
    try:
        return team_elos[season][team]
    except:
        try:
            # Get the previous season's ending value.
            i = 2 if season == 2021 else 1
            team_elos[season][team] = team_elos[season-i][team]
            return team_elos[season][team]
        except:
            # Get the starter elo.
            team_elos[season][team] = base_elo
            return team_elos[season][team]

def calc_elo(win_team, lose_team, season):
    winner_rank = get_elo(season, win_team)
    loser_rank = get_elo(season, lose_team)
    rank_diff = winner_rank - loser_rank
    exp = (rank_diff * -1) / 400
    odds = 1 / (1 + math.pow(10, exp))
    if winner_rank < 2100:
        k = 32
    elif winner_rank >= 2100 and winner_rank < 2400:
        k = 24
    else:
        k = 16
    new_winner_rank = round(winner_rank + (k * (1 - odds)))
    new_rank_diff = new_winner_rank - winner_rank
    new_loser_rank = loser_rank - new_rank_diff
    return new_winner_rank, new_loser_rank

def get_stat(season, team, field):
    try:
        l = team_stats[season][team][field]
        return sum(l) / float(len(l))
    except:
        return 0
    
def update_stats(season, team, fields):
    if team not in team_stats[season]:
        team_stats[season][team] = {}
    for key, value in fields.items():
        # Make sure we have the field.
        if key not in team_stats[season][team]:
            team_stats[season][team][key] = []
        if len(team_stats[season][team][key]) >= 9:
            team_stats[season][team][key].pop()
        team_stats[season][team][key].append(value)
        
def predict_winner(team_1, team_2, model, season, stat_fields):
    features = []
    # Team 1
    features.append(get_elo(season, team_1))
    for stat in stat_fields:
        features.append(get_stat(season, team_1, stat))
    # Team 2
    features.append(get_elo(season, team_2))
    for stat in stat_fields:
        features.append(get_stat(season, team_2, stat))
    #return model.predict_proba([features]).clip(0.025, 0.975)
    return model.predict_proba([features])

In [18]:
# Step 5: Feature Selection and Feature Engineering
## Our classifier will make its decision based off of the values for 25 features. One important feature is 
## a ranking metric called ELO while the remaining 24 features are traditional basketball metrics as described below:

### Features
"""
	wfgm: field goals made
	wfga: field goals attempted
	wfgm3: three pointers made
	wfga3: three pointers attempted
	wftm: free throws made
	wfta: free throws attempted
	wor: offensive rebounds
	wdr: defensive rebounds
	wast: assists
	wto: turnovers
	wstl: steals
	wblk: blocks
	wpf: personal fouls
"""
### Engineered Features
"""
    fgp: field goal percentage
    3pp: three point percentage
    ftp: free throw percentage
    ef_fg_perc: Effective Field Goal Percentage
    f_throw_factor: Free throw Factor
    totalreb_perc: Total Rebound Percentage	
    offreb_perc: Offensive Rebound Percentage    
    defreb_perc: Defensive Rebound Percentage
    ast_ratio: Assist Ratio
    to_ratio: Turnover Ratio    
    to_factor: Turnover Factor
"""
def build_season_data(all_data):
    # Calculate the elo for every game for every team, each season.
    # Store the elo per season so we can retrieve their end elo
    # later in order to predict the tournaments without having to
    # inject the prediction into this loop.
    for index, row in all_data.iterrows():
        if row['Season'] != 2020:
            # Used to skip matchups where we don't have usable stats yet.
            skip = 0
            # Get starter or previous elos.
            team_1_elo = get_elo(row['Season'], row['WTeamID'])
            team_2_elo = get_elo(row['Season'], row['LTeamID'])
            # Add 100 to the home team (# taken from Nate Silver analysis.)
            if row['WLoc'] == 'H':
                team_1_elo += 100
            elif row['WLoc'] == 'A':
                team_2_elo += 100         
            # We'll create some arrays to use later.
            team_1_features = [team_1_elo]
            team_2_features = [team_2_elo]
            # Build arrays out of the stats we're tracking..
            for field in stat_fields:
                team_1_stat = get_stat(row['Season'], row['WTeamID'], field)
                team_2_stat = get_stat(row['Season'], row['LTeamID'], field)
                if team_1_stat != 0 and team_2_stat != 0:
                    team_1_features.append(team_1_stat)
                    team_2_features.append(team_2_stat)
                else:
                    skip = 1
            if skip == 0:  # Make sure we have stats.
                # Randomly select left and right and 0 or 1 so we can train
                # for multiple classes.
                if random.random() > 0.5:
                    X.append(team_1_features + team_2_features)
                    y.append(0)
                else:
                    X.append(team_2_features + team_1_features)
                    y.append(1)
            # AFTER we add the current stuff to the prediction, update for
            # next time. Order here is key so we don't fit on data from the
            # same game we're trying to predict.
            if row['WFTA'] != 0 and row['LFTA'] != 0:
                stat_1_fields = {
                    # offense statistics
                    'fgm': row['WFGM'],
                    'fga': row['WFGA'],
                    'fgp': (row['WFGM'] / row['WFGA']) * 100,
                    'fgm3': row['WFGM3'],
                    'fga3': row['WFGA3'],
                    '3pp': (row['WFGM3'] / row['WFGA3']) * 100,
                    'ftm': row['WFTM'],
                    'fta': row['WFTA'],
                    'ftp': (row['WFTM'] / row['WFTA']) * 100,
                    'ef_fg_perc': 100 * (row['WFGM'] + (0.5 * row['WFGM3'])) / row['WFGA'],
                    'f_throw_factor': (row['WFTM'] / row['WFGM']) / (row['WFTA'] / row['WFGA']),
                    # defense statistics
                    'totalreb_perc': (row['WDR'] + row['WOR']) / (row['WDR'] + row['WOR'] + row['LDR'] + row['LOR']),
                    'or': row['WOR'],
                    'offreb_perc': 100 * (row['WOR'] / (row['WOR'] + row['LDR'])),
                    'dr': row['WDR'],
                    'defreb_perc': 100 * (row['WDR'] / (row['WDR'] + row['LOR'])),
                    'to': row['WTO'],
                    'ast': row['WAst'],
                    'ast_ratio': 100 * (row['WAst'] / row['WFGA'] + (0.475 * row['WFTA']) + row['WAst'] + row['WTO']),
                    'to_ratio': 100 * (row['WTO'] / row['WFGA'] + (0.475 * row['WFTA']) + row['WAst'] + row['WTO']),
                    'to_factor': row['WTO'] / (row['WFGA'] + (0.475 * row['WFTA']) + row['WTO']),
                    'stl': row['WStl'],
                    'blk': row['WBlk'],
                    'pf': row['WPF'],
                }         
                stat_2_fields = {
                    # offense statistics
                    'fgm': row['LFGM'],
                    'fga': row['LFGA'],
                    'fgp': (row['LFGM'] / row['LFGA']) * 100,
                    'fgm3': row['LFGM3'],
                    'fga3': row['LFGA3'],
                    '3pp': (row['LFGM3'] / row['LFGA3']) * 100,
                    'ftm': row['LFTM'],
                    'fta': row['LFTA'],
                    'ftp': row['LFTM'] / row['LFTA'] * 100,
                    'ef_fg_perc': 100 * (row['LFGM'] + (0.5 * row['LFGM3'])) / row['LFGA'],
                    'f_throw_factor': (row['LFTM'] / row['LFGM']) / (row['LFTA'] / row['LFGA']),
                    # defense statistics
                    'totalreb_perc': (row['LDR'] + row['LOR']) / (row['WDR'] + row['WOR'] + row['LDR'] + row['LOR']),
                    'or': row['LOR'],
                    'offreb_perc': 100 * (row['LOR'] / (row['LOR'] + row['WDR'])),
                    'dr': row['LDR'],
                    'defreb_perc': 100 * (row['LDR'] / (row['LDR'] + row['WOR'])),
                    'to': row['LTO'],
                    'ast': row['LAst'],
                    'ast_ratio': 100 * (row['LAst'] / row['LFGA'] + (0.475 * row['LFTA']) + row['LAst'] + row['LTO']),
                    'to_ratio': 100 * (row['LTO'] / row['LFGA'] + (0.475 * row['LFTA']) + row['LAst'] + row['LTO']),
                    'to_factor': row['LTO'] / (row['LFGA'] + (0.475 * row['LFTA']) + row['LTO']),
                    'stl': row['LStl'],
                    'blk': row['LBlk'],
                    'pf': row['LPF'],
                }
                update_stats(row['Season'], row['WTeamID'], stat_1_fields)
                update_stats(row['Season'], row['LTeamID'], stat_2_fields)
            # Now that we've added them, calc the new elo.
            new_winner_rank, new_loser_rank = calc_elo(
                row['WTeamID'], row['LTeamID'], row['Season'])
            team_elos[row['Season']][row['WTeamID']] = new_winner_rank
            team_elos[row['Season']][row['LTeamID']] = new_loser_rank
    return X, y
X, y = build_season_data(all_data)

In [19]:
# Step 6: Use Logistic Regression to Predict Game Outcomes
#model = linear_model.LogisticRegressionCV()
print("Let's hope to be correct 75% of the time")
#print(cross_validation.cross_val_score(model, numpy.array(X), numpy.array(y), cv=10, scoring='accuracy', n_jobs=-1).mean())
clf1 = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=80, max_features=2, max_leaf_nodes=None, min_samples_leaf=4,
            min_samples_split=12, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=1, oob_score=False, random_state=0,
            verbose=0, warm_start=False)


clf2 = linear_model.LogisticRegressionCV(cv = 5)

clf3 = BaggingClassifier(base_estimator=None, n_estimators = 1000, max_samples=20, random_state=0)
clf4 = ExtraTreesClassifier(n_estimators = 100, random_state=0)
clf5 = SVC(probability=True)
clf6 = KNeighborsClassifier(n_neighbors=5)
eclf = VotingClassifier(estimators = [('RFC', clf1),
                                      ('logit',clf2),
                                      ('Bag',clf3),
                                      ('ETC', clf4),
                                      ('KNN',clf6)],
                       voting = 'soft')
clfs = [clf1,clf2,clf3,clf4,clf6]
print('here')
for clf in clfs:
    clf.fit(X, y)
    print(clf)
eclf = eclf.fit(X, y)

#print(model_selection.cross_val_score(eclf, numpy.array(X), numpy.array(y), cv=10, scoring='accuracy', n_jobs=-1).mean())
print("eclf fit")


Let's hope to be correct 75% of the time
here
RandomForestClassifier(max_depth=80, max_features=2, min_samples_leaf=4,
                       min_samples_split=12, n_jobs=1, random_state=0)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

LogisticRegressionCV(cv=5)
BaggingClassifier(max_samples=20, n_estimators=1000, random_state=0)
ExtraTreesClassifier(random_state=0)
KNeighborsClassifier()


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

eclf fit


In [29]:
submission_data = []
for p in [2016, 2017, 2018, 2019, 2021]:
    tourney_teams = []
    for index, row in seeds.iterrows():
        if row['Season'] == p:
            tourney_teams.append(row['TeamID'])
    tourney_teams = sorted(tourney_teams)
    for team_1 in tourney_teams:
        for team_2 in tourney_teams:
            if team_1 < team_2:
                prediction = predict_winner(
                    team_1, team_2, eclf, p, stat_fields)
                label = str(prediction_year) + '_' + str(team_1) + '_' + \
                    str(team_2)
                submission_data.append([label, prediction[0][0]])

	1112	1114
	1112	1122
	1112	1124
	1112	1138
	1112	1139
	1112	1143
	1112	1151
	1112	1153
	1112	1160
	1112	1163
	1112	1167
	1112	1173
	1112	1181
	1112	1192
	1112	1195
	1112	1201
	1112	1211
	1112	1214
	1112	1218
	1112	1221
	1112	1231
	1112	1233
	1112	1234
	1112	1235
	1112	1242
	1112	1246
	1112	1268
	1112	1274
	1112	1276
	1112	1277
	1112	1292
	1112	1314
	1112	1320
	1112	1323
	1112	1328
	1112	1332
	1112	1333
	1112	1338
	1112	1344
	1112	1345
	1112	1355
	1112	1371
	1112	1372
	1112	1380
	1112	1386
	1112	1392
	1112	1393
	1112	1396
	1112	1400
	1112	1401
	1112	1403
	1112	1409
	1112	1421
	1112	1423
	1112	1425
	1112	1428
	1112	1433
	1112	1435
	1112	1437
	1112	1438
	1112	1451
	1112	1452
	1112	1453
	1112	1455
	1112	1458
	1112	1462
	1112	1463
	1114	1122
	1114	1124
	1114	1138
	1114	1139
	1114	1143
	1114	1151
	1114	1153
	1114	1160
	1114	1163
	1114	1167
	1114	1173
	1114	1181
	1114	1192
	1114	1195
	1114	1201
	1114	1211
	1114	1214
	1114	1218
	1114	1221
	1114	1231
	1114	1233
	1114	1234
	1114	1235
	1114	1242

In [31]:
# Step 7: Submit Results
print("Writing %d results." % len(submission_data))
d = {'ID': [row[0] for row in submission_data], 'Pred': [row[1] for row in submission_data]}
#submission_data2=pd.DataFrame(submission_data)
submission_data2=pd.DataFrame(data=d)
submission_data2.to_csv("submission_1.csv", index=False)
# submission_data2.shape

Writing 11390 results.


In [None]:
def build_team_dict():
    team_ids = pd.read_csv('MTeams.csv')
    team_id_map = {}
    for index, row in team_ids.iterrows():
        team_id_map[row['TeamID']] = row['TeamName']
    return team_id_map
team_id_map = build_team_dict()
readable = []
less_readable = []  # A version that's easy to look up.
readable_pd = pd.DataFrame(columns=['Winner','Loser','Probability'])
print("to csv")
for pred in submission_data:
    parts = pred[0].split('_')
    less_readable.append(
        [team_id_map[int(parts[1])], team_id_map[int(parts[2])], pred[1]])
    # Order them properly.
    if pred[1] > 0.5:
        winning = int(parts[1])
        losing = int(parts[2])
        proba = pred[1]
    else:
        winning = int(parts[2])
        losing = int(parts[1])
        proba = 1 - pred[1]
    readable_pd = readable_pd.append({'Winner':team_id_map[winning], 'Loser':team_id_map[losing], 'Probability':proba}, ignore_index=True)
readable_pd.to_csv('readable.csv', index=False)

In [9]:
readable_pd.append({'Winner':team_id_map[winning], 'Loser':team_id_map[losing], 'Probability':proba}, ignore_index=True)

Unnamed: 0,Winner,Loser,Probability
0,Arizona St,Abilene Chr,0.792476
1,Auburn,Abilene Chr,0.835211
2,Baylor,Abilene Chr,0.771986
3,Belmont,Abilene Chr,0.718686
4,Bradley,Abilene Chr,0.652286
5,Buffalo,Abilene Chr,0.725514
6,Cincinnati,Abilene Chr,0.782456
7,Colgate,Abilene Chr,0.667455
8,Duke,Abilene Chr,0.878546
9,Abilene Chr,F Dickinson,0.607069


In [10]:
readable_pd

Unnamed: 0,Winner,Loser,Probability
0,Arizona St,Abilene Chr,0.792476
1,Auburn,Abilene Chr,0.835211
2,Baylor,Abilene Chr,0.771986
3,Belmont,Abilene Chr,0.718686
4,Bradley,Abilene Chr,0.652286
5,Buffalo,Abilene Chr,0.725514
6,Cincinnati,Abilene Chr,0.782456
7,Colgate,Abilene Chr,0.667455
8,Duke,Abilene Chr,0.878546
9,Abilene Chr,F Dickinson,0.607069


In [11]:
# # Step 8: Get Relative Feature Importances
# def get_feature_importances(estimator, norm_order=1):
#     """Retrieve or aggregate feature importances from estimator"""
#     importances = getattr(estimator, "feature_importances_", None)

#     if importances is None and hasattr(estimator, "coef_"):
#         if estimator.coef_.ndim == 1:
#             importances = np.abs(estimator.coef_)

#         else:
#             importances = np.linalg.norm(estimator.coef_, axis=0,
#                                          ord=norm_order)

#     elif importances is None:
#         raise ValueError(
#             "The underlying estimator %s has no `coef_` or "
#             "`feature_importances_` attribute. Either pass a fitted estimator"
#             " to SelectFromModel or call fit before calling transform."
#             % estimator.__class__.__name__)

#     return importances
    
# import numpy as np
# import matplotlib.pyplot as plt
# %matplotlib inline

# importances = np.array(get_feature_importances(eclf))[:]
# statdict = np.array([
#             'elo',
#             # offense
#             'field goals made',
#             'field goals attempted',
#             'field goal percentage',
#             'three point field goal made',
#             'three point field goal attempted',
#             'three point field goal percentage',
#             'free throw made',
#             'free throw attempted',
#             'free throw percentage',
#             'effective field goal percentage',
#             'free throw factor',
#             # defense
#             'total rebound percentage',
#             'offensive rebounds',
#             'offensive rebound percentage',
#             'defensive rebounds',
#             'defensive rebound percentage',
#             'turnovers',
#             'assist',
#             'assist ratio',
#             'turnover ratio',
#             'turnover factor',
#             'steals',
#             'blocks',
#             'personal fouls',
#            ],dtype='str')
    
#  feature_importance = abs(model.coef_[0][:25])
# feature_importance = 100.0 * (feature_importance / feature_importance.max())
# sorted_idx = np.argsort(feature_importance)
# pos = np.arange(sorted_idx.shape[0]) + .5


# featfig = plt.figure(figsize=(16,12))
# featax = featfig.add_subplot(1, 1, 1)
# featax.barh(pos, feature_importance[sorted_idx], align='center')
# featax.set_yticks(pos)
# featax.set_yticklabels(statdict[sorted_idx], fontsize=28)
# featax.set_xlabel('Relative Feature Importance',fontsize=24)

# plt.tight_layout()   
# plt.show()
# print(feature_importance)