In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
aggregate = pd.read_csv('./GeneratedDatasets/NCAA-Aggregate.csv')
differential = pd.read_csv('./GeneratedDatasets/NCAA-Differential.csv')
aggdiff = pd.read_csv('./GeneratedDatasets/NCAA-AggDiff.csv')
targagg = pd.read_csv('./GeneratedDatasets/2018Target-Aggregate.csv')
targdiff = pd.read_csv('./GeneratedDatasets/2018Target-Differential.csv')
targagdf = pd.read_csv('./GeneratedDatasets/2018Target-AggDiff.csv')

In [3]:
X_a = aggregate.drop(['ID', 'FavoriteID', 'UnderdogID', 'ExpectedWin'], axis=1)
y_a = aggregate['ExpectedWin']

In [4]:
X_d = differential.drop(['ID', 'FavoriteID', 'UnderdogID', 'ExpectedWin'], axis=1)
y_d = differential['ExpectedWin']

In [5]:
X_ad = aggdiff.drop(['ID', 'FavoriteID', 'UnderdogID', 'ExpectedWin'], axis=1)
y_ad = aggdiff['ExpectedWin']

In [6]:
Xa_train, Xa_test, ya_train, ya_test = train_test_split(X_a, y_a, random_state=2018, stratify=y_a)

In [7]:
Xd_train, Xd_test, yd_train, yd_test = train_test_split(X_d, y_d, random_state=317, stratify=y_d)

In [8]:
Xad_train, Xad_test, yad_train, yad_test = train_test_split(X_ad, y_ad, random_state=442, stratify = y_ad)

In [9]:
rf = RandomForestClassifier(random_state=1234)
rf_params = {'n_estimators': [int(n) for n in np.linspace(20, 50, 4)],
            'max_depth': [None, 2, 4, 6, 8],
            'max_features': ['auto', 'log2', 'sqrt'],
            'criterion': ['gini', 'entropy']}
gs_rfa = GridSearchCV(rf, rf_params)
gs_rfa.fit(Xa_train, ya_train)
print(gs_rfa.best_score_)
gs_rfa.best_params_

0.7224489795918367


{'criterion': 'entropy',
 'max_depth': 8,
 'max_features': 'log2',
 'n_estimators': 50}

In [10]:
gs_rfa.score(Xa_test, ya_test)

0.6910569105691057

In [11]:
rf = RandomForestClassifier(random_state=456)
rf_params = {'n_estimators': [int(n) for n in np.linspace(20, 50, 4)],
            'max_depth': [None, 2, 4, 6, 8],
            'max_features': ['auto', 'log2', 'sqrt'],
            'criterion': ['gini', 'entropy']}
gs_rfd = GridSearchCV(rf, rf_params)
gs_rfd.fit(Xd_train, yd_train)
print(gs_rfd.best_score_)
gs_rfd.best_params_

0.708843537414966


{'criterion': 'gini',
 'max_depth': 4,
 'max_features': 'auto',
 'n_estimators': 50}

In [12]:
gs_rfd.score(Xd_test, yd_test)

0.7154471544715447

In [54]:
rf = RandomForestClassifier(random_state=789)
rf_params = {'n_estimators': [int(n) for n in np.linspace(20, 50, 4)],
            'max_depth': [None, 2, 4, 6, 8],
            'max_features': ['auto', 'log2', 'sqrt'],
            'criterion': ['gini', 'entropy']}
gs_rfad = GridSearchCV(rf, rf_params)
gs_rfad.fit(Xad_train, yad_train)
print(gs_rfad.best_score_)
gs_rfad.best_params_

0.7251700680272108


{'criterion': 'gini',
 'max_depth': 8,
 'max_features': 'log2',
 'n_estimators': 40}

In [55]:
gs_rfad.score(Xad_test, yad_test)

0.7073170731707317

In [30]:
y_a.mean()

0.7064220183486238

In [56]:
gs_rfd.predict_proba(targdiff.drop('ID', axis=1))

array([[0.49894016, 0.50105984],
       [0.4255358 , 0.5744642 ],
       [0.49918283, 0.50081717],
       ...,
       [0.17129167, 0.82870833],
       [0.47104572, 0.52895428],
       [0.4545172 , 0.5454828 ]])

In [82]:
predictions = pd.DataFrame(columns=['ID', 'Pred'])
for n in range(len(gs_rfd.predict_proba(targdiff.drop('ID', axis=1)))):
    predictions.loc[n, 'ID'] = targdiff.loc[n, 'ID']
    predictions.loc[n, 'Pred'] = gs_rfd.predict_proba(targdiff.drop('ID', axis=1))[n][1]

In [89]:
predictions.head(10)

Unnamed: 0,ID,Pred
0,2018_1104_1112,0.50106
1,2018_1104_1113,0.574464
2,2018_1104_1116,0.500817
3,2018_1104_1120,0.50943
4,2018_1104_1137,0.656441
5,2018_1104_1138,0.558563
6,2018_1104_1139,0.549669
7,2018_1104_1153,0.420564
8,2018_1104_1155,0.522485
9,2018_1104_1158,0.69141


In [59]:
targdiff.tail()

Unnamed: 0,ID,WinsDiff,LossesDiff,PercentageDiff,MeanRankDiff,AbsMeanRankDiff,SeedDiff,coach_expDiff,made_tournamentDiff,first_roundDiff,...,AgstOppOffAvgOppFGA3Diff,AgstOppOffAvgOppFTMDiff,AgstOppOffAvgOppFTADiff,AgstOppOffAvgOppORDiff,AgstOppOffAvgOppDRDiff,AgstOppOffAvgOppAstDiff,AgstOppOffAvgOppTODiff,AgstOppOffAvgOppStlDiff,AgstOppOffAvgOppBlkDiff,AgstOppOffAvgOppPFDiff
2273,2018_1452_1460,1,1,-0.013,-113.475,-112,-9,20,20,20,...,-1.662,4.249,6.262,1.103,-0.945,-0.137,2.107,-0.467,0.736,-0.759
2274,2018_1452_1462,-4,5,-0.142,6.492,8,4,24,16,17,...,-5.452,5.632,8.578,1.7,2.004,-1.696,4.962,-1.17,0.19,-1.907
2275,2018_1455_1460,2,-2,0.062,-113.312,-111,-10,7,10,9,...,1.167,1.444,1.686,-1.387,-2.565,-0.759,-2.698,0.209,0.598,0.422
2276,2018_1455_1462,-3,2,-0.067,6.656,9,3,11,6,6,...,-2.623,2.827,4.002,-0.79,0.384,-2.318,0.157,-0.494,0.052,-0.726
2277,2018_1460_1462,-5,4,-0.129,119.967,120,13,4,-4,-3,...,-3.79,1.383,2.316,0.597,2.949,-1.559,2.855,-0.703,-0.546,-1.148


In [73]:
teams = pd.read_csv('./GeneratedDatasets/generated2018.csv')
teams[teams['TeamID'] == 1455]['TeamName']

65    Wichita St
Name: TeamName, dtype: object

In [74]:
teams[teams['TeamID'] == 1460]['TeamName']

66    Wright St
Name: TeamName, dtype: object

In [75]:
teams['TeamName']

0            Alabama
1            Arizona
2         Arizona St
3           Arkansas
4             Auburn
5           Bucknell
6            Buffalo
7             Butler
8         Cincinnati
9            Clemson
10    Col Charleston
11         Creighton
12      CS Fullerton
13          Davidson
14              Duke
15           Florida
16        Florida St
17        Georgia St
18           Gonzaga
19           Houston
20              Iona
21            Kansas
22         Kansas St
23          Kentucky
24          Lipscomb
25       Long Island
26    Loyola-Chicago
27          Marshall
28          Miami FL
29          Michigan
           ...      
38    North Carolina
39           Ohio St
40          Oklahoma
41              Penn
42        Providence
43            Purdue
44           Radford
45      Rhode Island
46       S Dakota St
47      San Diego St
48        Seton Hall
49         SF Austin
50    St Bonaventure
51          Syracuse
52               TCU
53         Tennessee
54           

In [76]:
teams[teams['TeamName'] == 'Long Island']

Unnamed: 0,TeamID,TeamName,Wins,Losses,Percentage,MeanRank,AbsMeanRank,Seed,coach_exp,made_tournament,...,AgstOppOffAvgOppFGA3,AgstOppOffAvgOppFTM,AgstOppOffAvgOppFTA,AgstOppOffAvgOppOR,AgstOppOffAvgOppDR,AgstOppOffAvgOppAst,AgstOppOffAvgOppTO,AgstOppOffAvgOppStl,AgstOppOffAvgOppBlk,AgstOppOffAvgOppPF
25,1254,Long Island,17,16,0.515,256.9344,257,16,10,1,...,-1.101,1.072,1.868,1.65,0.697,0.279,-1.408,0.121,-0.284,1.649


In [77]:
teams[teams['TeamName'] == 'Radford']

Unnamed: 0,TeamID,TeamName,Wins,Losses,Percentage,MeanRank,AbsMeanRank,Seed,coach_exp,made_tournament,...,AgstOppOffAvgOppFGA3,AgstOppOffAvgOppFTM,AgstOppOffAvgOppFTA,AgstOppOffAvgOppOR,AgstOppOffAvgOppDR,AgstOppOffAvgOppAst,AgstOppOffAvgOppTO,AgstOppOffAvgOppStl,AgstOppOffAvgOppBlk,AgstOppOffAvgOppPF
44,1347,Radford,20,12,0.625,166.4262,166,16,7,0,...,-1.573,-0.781,-1.905,0.406,-2.043,-1.615,-0.051,-0.691,0.241,-0.93


In [78]:
teams[teams['TeamName'] == 'St Bonaventure']

Unnamed: 0,TeamID,TeamName,Wins,Losses,Percentage,MeanRank,AbsMeanRank,Seed,coach_exp,made_tournament,...,AgstOppOffAvgOppFGA3,AgstOppOffAvgOppFTM,AgstOppOffAvgOppFTA,AgstOppOffAvgOppOR,AgstOppOffAvgOppDR,AgstOppOffAvgOppAst,AgstOppOffAvgOppTO,AgstOppOffAvgOppStl,AgstOppOffAvgOppBlk,AgstOppOffAvgOppPF
50,1382,St Bonaventure,25,7,0.781,51.4918,51,11,17,1,...,1.867,2.701,3.697,0.829,-0.682,-0.622,2.384,-0.79,0.652,2.398


In [79]:
teams[teams['TeamName'] == 'UCLA']

Unnamed: 0,TeamID,TeamName,Wins,Losses,Percentage,MeanRank,AbsMeanRank,Seed,coach_exp,made_tournament,...,AgstOppOffAvgOppFGA3,AgstOppOffAvgOppFTM,AgstOppOffAvgOppFTA,AgstOppOffAvgOppOR,AgstOppOffAvgOppDR,AgstOppOffAvgOppAst,AgstOppOffAvgOppTO,AgstOppOffAvgOppStl,AgstOppOffAvgOppBlk,AgstOppOffAvgOppPF
58,1417,UCLA,21,11,0.656,46.459,47,11,23,10,...,4.696,-1.433,-1.493,0.324,0.448,0.082,-2.119,-0.028,-1.336,0.994


In [84]:
predictions[predictions['ID'] == '2018_1254_1347']

Unnamed: 0,ID,Pred
1393,2018_1254_1347,0.487431


In [85]:
predictions[predictions['ID'] == '2018_1382_1417']

Unnamed: 0,ID,Pred
2132,2018_1382_1417,0.604193


In [86]:
predictions.to_csv('predictions-rfd.csv', index=False)

In [90]:
predictions_a = pd.DataFrame(columns=['ID', 'Pred'])
for n in range(len(gs_rfa.predict_proba(targagg.drop('ID', axis=1)))):
    predictions_a.loc[n, 'ID'] = targagg.loc[n, 'ID']
    predictions_a.loc[n, 'Pred'] = gs_rfa.predict_proba(targagg.drop('ID', axis=1))[n][1]

In [91]:
predictions_a.head(10)

Unnamed: 0,ID,Pred
0,2018_1104_1112,0.384016
1,2018_1104_1113,0.64479
2,2018_1104_1116,0.534664
3,2018_1104_1120,0.442356
4,2018_1104_1137,0.617004
5,2018_1104_1138,0.692896
6,2018_1104_1139,0.488097
7,2018_1104_1153,0.436746
8,2018_1104_1155,0.437414
9,2018_1104_1158,0.511926


In [92]:
predictions_a.to_csv('predictions-rfa.csv', index=False)

In [107]:
predictions_ad = pd.DataFrame(columns=['ID', 'Pred'])
for n in range(len(gs_rfad.predict_proba(targagdf.drop('ID', axis=1)))):
    predictions_ad.loc[n, 'ID'] = targagdf.loc[n, 'ID']
    predictions_ad.loc[n, 'Pred'] = gs_rfad.predict_proba(targagdf.drop('ID', axis=1))[n][1]

In [108]:
predictions_ad.head(10)

Unnamed: 0,ID,Pred
0,2018_1104_1112,0.449973
1,2018_1104_1113,0.654197
2,2018_1104_1116,0.625729
3,2018_1104_1120,0.65478
4,2018_1104_1137,0.848554
5,2018_1104_1138,0.704483
6,2018_1104_1139,0.565392
7,2018_1104_1153,0.409309
8,2018_1104_1155,0.517792
9,2018_1104_1158,0.72617


In [109]:
predictions_ad.to_csv('predictions-rfad.csv', index=False)

In [110]:
predictions_a[predictions_a['ID'] == '2018_1254_1347']

Unnamed: 0,ID,Pred
1393,2018_1254_1347,0.591774


In [111]:
predictions_a[predictions_a['ID'] == '2018_1382_1417']

Unnamed: 0,ID,Pred
2132,2018_1382_1417,0.761471


In [112]:
predictions_ad[predictions_ad['ID'] == '2018_1254_1347']

Unnamed: 0,ID,Pred
1393,2018_1254_1347,0.525648


In [113]:
predictions_ad[predictions_ad['ID'] == '2018_1382_1417']

Unnamed: 0,ID,Pred
2132,2018_1382_1417,0.554241


In [2]:
pred1 = pd.read_csv('predictions-rfd.csv')
pred2 = pd.read_csv('predictions-rfa.csv')
pred3 = pd.read_csv('predictions-rfad.csv')

In [3]:
df = pd.DataFrame(columns =['ID', 'Pred'])
for p in range(len(pred1)):
    df.loc[p, 'ID'] = pred1.loc[p, 'ID']
    df.loc[p, 'Pred'] = (pred1.loc[p, 'Pred'] + pred2.loc[p, 'Pred'] + pred3.loc[p, 'Pred']) / 3
df

Unnamed: 0,ID,Pred
0,2018_1104_1112,0.445016
1,2018_1104_1113,0.624484
2,2018_1104_1116,0.553737
3,2018_1104_1120,0.535522
4,2018_1104_1137,0.707333
5,2018_1104_1138,0.651981
6,2018_1104_1139,0.534386
7,2018_1104_1153,0.422206
8,2018_1104_1155,0.492564
9,2018_1104_1158,0.643169


In [4]:
df.to_csv('predictions-rfens.csv', index=False)

In [15]:
rf = RandomForestClassifier(n_estimators=50, max_depth=4, random_state=456)
rf.fit(Xd_train, yd_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=4, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
            oob_score=False, random_state=456, verbose=0, warm_start=False)

In [16]:
rf.feature_importances_

array([0.03705568, 0.01670858, 0.03297958, 0.09481977, 0.08783773,
       0.0578696 , 0.01046326, 0.00491572, 0.00628386, 0.01156073,
       0.01196237, 0.01609396, 0.00616816, 0.00561126, 0.00035908,
       0.01351707, 0.03971462, 0.0036145 , 0.00561317, 0.00305416,
       0.00249374, 0.00339897, 0.00956999, 0.0130863 , 0.01879811,
       0.01749487, 0.0120138 , 0.01226549, 0.0159566 , 0.02074517,
       0.0072751 , 0.00714237, 0.00828024, 0.01122802, 0.00193703,
       0.00811965, 0.016241  , 0.01079158, 0.0201164 , 0.01142992,
       0.00354408, 0.00067235, 0.00366281, 0.01512926, 0.02179671,
       0.00853862, 0.00850946, 0.00395959, 0.00270588, 0.00945157,
       0.01259426, 0.00260412, 0.00665082, 0.00472458, 0.01013255,
       0.01340319, 0.00187739, 0.02481193, 0.02020685, 0.00744892,
       0.01047426, 0.00581577, 0.00745573, 0.0099296 , 0.00474564,
       0.0124173 , 0.01785171, 0.02594311, 0.00451797, 0.01499682,
       0.00883994])

In [17]:
targdiff

Unnamed: 0,ID,WinsDiff,LossesDiff,PercentageDiff,MeanRankDiff,AbsMeanRankDiff,SeedDiff,coach_expDiff,made_tournamentDiff,first_roundDiff,...,AgstOppOffAvgOppFGA3Diff,AgstOppOffAvgOppFTMDiff,AgstOppOffAvgOppFTADiff,AgstOppOffAvgOppORDiff,AgstOppOffAvgOppDRDiff,AgstOppOffAvgOppAstDiff,AgstOppOffAvgOppTODiff,AgstOppOffAvgOppStlDiff,AgstOppOffAvgOppBlkDiff,AgstOppOffAvgOppPFDiff
0,2018_1104_1112,-8,8,-0.235,31.590,31,5,-11,-10,-10,...,1.272,0.740,2.578,2.202,4.259,-1.182,1.893,0.419,0.678,0.607
1,2018_1104_1113,-1,4,-0.086,12.705,12,-2,-2,-1,-1,...,-3.018,-0.222,0.845,-0.649,-1.274,-4.160,-1.445,1.113,-0.310,-1.174
2,2018_1104_1116,-4,4,-0.117,24.492,26,2,-13,-8,-8,...,-1.684,-3.040,-3.028,0.508,-0.081,-2.089,-0.904,1.922,-0.046,1.035
3,2018_1104_1120,-6,8,-0.222,42.738,43,5,-11,-8,-8,...,-1.209,-2.197,-1.415,1.085,-0.258,-1.724,-2.032,0.267,0.141,-0.246
4,2018_1104_1137,-6,6,-0.176,-43.902,-41,-5,0,-1,-1,...,2.387,-2.432,-2.661,-0.527,-1.479,-1.384,0.907,-0.028,-0.859,-1.382
5,2018_1104_1138,-6,7,-0.199,-17.492,-17,-4,0,-1,-1,...,1.703,-4.511,-4.479,0.443,-1.245,-0.711,-0.544,1.061,-1.005,1.380
6,2018_1104_1139,-1,2,-0.047,22.935,24,-1,1,0,0,...,1.698,-0.871,0.286,2.741,-0.153,-0.126,-1.138,1.742,-0.277,2.944
7,2018_1104_1153,-11,11,-0.323,48.885,49,7,-12,-9,-9,...,1.874,3.245,5.290,0.815,2.561,1.267,-1.552,1.883,-0.079,1.675
8,2018_1104_1155,-4,6,-0.160,35.295,36,4,-13,-4,-4,...,0.296,2.711,4.509,1.651,0.366,0.827,0.869,1.061,-0.325,1.522
9,2018_1104_1158,-5,8,-0.215,-51.590,-49,-4,-1,0,0,...,1.613,0.171,2.390,1.366,-0.439,-0.928,1.184,2.261,-0.867,2.416
