# Model Evaluation Script

In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from time import localtime
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, BatchNormalization
from keras import regularizers, optimizers
from keras.callbacks import EarlyStopping, TensorBoard, ProgbarLogger
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV

from sklearn import ensemble
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression


from keras import backend as K
from sklearn.metrics import log_loss




# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output
print(check_output(["ls", "../NCAA"]).decode("utf8"))

# Any results you write to the current directory are saved as output.

Using TensorFlow backend.


Data
Data_Organization.ipynb
Final
Final MLP.ipynb
Final_Data_Organization.ipynb
GiantKillerInfo.txt
MarchMadnessAdvStats.csv
MarchMadnessFeatureDifferences.csv
MarchMadnessFeatures.csv
MarchMadnessTest.csv
ModelEvaluation.ipynb
OldNbs
SubmissionData.py
Submissions
logs
tENSORFLOW-TB.ipynb
tENSORFLOW.ipynb
tENSORFLOW.py



Lock Random Seed

In [3]:
# seed = 13
# np.random.seed(seed)

## Load pre-organized data

In [31]:
data_dir = '../NCAA/'
df_features = pd.read_csv(data_dir + 'MarchMadnessFeatures.csv')

Format and scale the data

In [45]:
X = df_features.iloc[:,1:]
xDim = np.shape(X)[1]
X_train = X.values.reshape(-1,xDim)
y_train = df_features.Result.values

scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)


print('Feature vector dimension is: %.2f' % xDim)
# print(X[:5, :])

Feature vector dimension is: 34.00


In [46]:
# from sklearn.decomposition import PCA
# pca = PCA(n_components=10)
# X_train = pca.fit_transform(X_train)

In [47]:
# xDim = np.shape(X_train)[1]
# xDim

# Load Training Data


In [48]:
df_test = pd.read_csv(data_dir+'MarchMadnessTest.csv')

X_test = df_test.iloc[:,1:]
xDimTest = np.shape(X_test)[1]
X_test = X_test.values.reshape(-1,xDimTest)
y_test = df_test.Result.values

In [49]:
# X_test = pca.transform(X_test)

In [50]:
np.shape(X_test)

(402, 34)

Create Kfold splits

In [51]:
kfold = StratifiedKFold(n_splits=5, shuffle=True)

## Logisitic Regression

In [52]:
logreg = LogisticRegression()
params = {'C': np.logspace(start=-5, stop=3, num=9)}
clf = GridSearchCV(logreg, params, scoring='neg_log_loss', refit=True)
clf.fit(X_train, y_train)
print('Best log_loss: {:.4}, with best C: {} '.format(clf.best_score_, 
                                                            clf.best_params_['C']))

cvresults = cross_val_score(logreg, X_train, y_train, cv=kfold ,scoring='neg_log_loss', verbose=0)
print('Log-Loss Mean :{:.3} ({:.3})'.format(np.mean(cvresults), np.std(cvresults)))

print("Training set score: %f" % clf.score(X_train, y_train))
print("Validation set score: %f" % clf.score(X_test, y_test))

Best log_loss: -0.576, with best C: 0.01 
Log-Loss Mean :-0.596 (0.0448)
Training set score: -0.557588
Validation set score: -0.576590


## Random Forest Classifier

In [53]:
fore = RandomForestClassifier(criterion='entropy', n_estimators = 50)

fore.fit(X_train, y_train)

cvresults = cross_val_score(fore, X_train, y_train, cv=kfold ,scoring='neg_log_loss', verbose=0)
print('Log-Loss Mean :{:.3} ({:.3})'.format(np.mean(cvresults), np.std(cvresults)))

y_pred =  fore.predict_proba(X_train)[:,1].reshape(-1,1)
LL = log_loss( y_train, y_pred)
print("Training set score: {:4}" .format(LL))
y_pred =  fore.predict_proba(X_test)[:,1].reshape(-1,1)
LL = log_loss( y_test, y_pred)
print("Validation set score: {:4}".format(LL))

Log-Loss Mean :-0.621 (0.0178)
Training set score: 0.17179165755257292
Validation set score: 0.5805600482463246


## MLP

Various properties

In [54]:
dropRate = 0.3
numBatch = 50
numEpoch = 120
learningRate = 1e-4

Base Single Layer Model

In [64]:
# MLP model
MLP = Sequential()
MLP.name = 'MLP'
MLP.add(Dense(50, input_dim=xDim, kernel_initializer='random_normal',activation = 'tanh'))
MLP.add(Dropout(dropRate))
# MLP.add(BatchNormalization())
MLP.add(Dense(500,kernel_initializer='normal',activation = 'tanh'))
MLP.add(Dropout(dropRate))
# # MLP.add(BatchNormalization())
MLP.add(Dense(100, kernel_initializer='normal',activation = 'tanh'))
MLP.add(Dropout(dropRate))
# # MLP.add(BatchNormalization())
MLP.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))

# Compile model
adam = optimizers.Adam(lr=learningRate, amsgrad=True)
MLP.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])

In [65]:
# TB = TensorBoard(log_dir="logs/{}_{}{}".format(MLP.name, localtime().tm_hour, localtime().tm_min))
# MLP.fit(X, y, validation_split = 0.2, epochs=numEpoch, batch_size=numBatch, verbose=0, callbacks = [TB])
MLP.fit(X_train, y_train, epochs=numEpoch, batch_size=numBatch, verbose=1)


Epoch 1/120
Epoch 2/120
Epoch 3/120
Epoch 4/120
Epoch 5/120
Epoch 6/120
Epoch 7/120
Epoch 8/120
Epoch 9/120
Epoch 10/120
Epoch 11/120
Epoch 12/120
Epoch 13/120
Epoch 14/120
Epoch 15/120
Epoch 16/120
Epoch 17/120
Epoch 18/120
Epoch 19/120
Epoch 20/120
Epoch 21/120
Epoch 22/120
Epoch 23/120
Epoch 24/120
Epoch 25/120
Epoch 26/120
Epoch 27/120
Epoch 28/120
Epoch 29/120
Epoch 30/120
Epoch 31/120
Epoch 32/120
Epoch 33/120
Epoch 34/120
Epoch 35/120
Epoch 36/120
Epoch 37/120
Epoch 38/120
Epoch 39/120
Epoch 40/120
Epoch 41/120
Epoch 42/120
Epoch 43/120
Epoch 44/120
Epoch 45/120
Epoch 46/120
Epoch 47/120
Epoch 48/120
Epoch 49/120
Epoch 50/120
Epoch 51/120
Epoch 52/120
Epoch 53/120
Epoch 54/120
Epoch 55/120
Epoch 56/120
Epoch 57/120
Epoch 58/120
Epoch 59/120
Epoch 60/120
Epoch 61/120
Epoch 62/120
Epoch 63/120
Epoch 64/120
Epoch 65/120
Epoch 66/120
Epoch 67/120
Epoch 68/120
Epoch 69/120
Epoch 70/120
Epoch 71/120
Epoch 72/120
Epoch 73/120
Epoch 74/120
Epoch 75/120
Epoch 76/120
Epoch 77/120
Epoch 78

Epoch 84/120
Epoch 85/120
Epoch 86/120
Epoch 87/120
Epoch 88/120
Epoch 89/120
Epoch 90/120
Epoch 91/120
Epoch 92/120
Epoch 93/120
Epoch 94/120
Epoch 95/120
Epoch 96/120
Epoch 97/120
Epoch 98/120
Epoch 99/120
Epoch 100/120
Epoch 101/120
Epoch 102/120
Epoch 103/120
Epoch 104/120
Epoch 105/120
Epoch 106/120
Epoch 107/120
Epoch 108/120
Epoch 109/120
Epoch 110/120
Epoch 111/120
Epoch 112/120
Epoch 113/120
Epoch 114/120
Epoch 115/120
Epoch 116/120
Epoch 117/120
Epoch 118/120
Epoch 119/120
Epoch 120/120


<keras.callbacks.History at 0x2633f4a8ac8>

In [66]:
MLP.evaluate(X_test, y_test)



[0.5473488124448862, 0.71393034796216592]

In [28]:
# cvresults = cross_val_score(MLP, X, y, cv=kfold ,scoring='neg_log_loss', verbose=0)
# print('Log-Loss Mean :{:.3} ({:.3})'.format(np.mean(cvresults), np.std(cvresults)))

## SVM

In [29]:

# svc = SVC(C = .01, kernel = 'linear', probability = True)
svm = SVC(probability=True)
svc = GridSearchCV(svm, param_grid={
     'C': [0.001, 0.01, 0.1, 1, 10],
      'gamma': [0.001, 0.01, 0.1, 1, 10]},
      scoring ='neg_log_loss', refit=True)

svc.fit(X_train,y_train)
print('Best MSE: {:.4}, with best C: {} and best gamma: {}'.format(svc.best_score_, 
                                                                        svc.best_params_['C'], svc.best_params_['gamma']))
cvresults = cross_val_score(svc, X_train, y_train, cv=kfold ,scoring='neg_log_loss', verbose=0)
print('Log-Loss Mean :{:.3} ({:.3})'.format(np.mean(cvresults), np.std(cvresults)))

print("Training set score: %f" % svc.score(X_train, y_train))
print("Validation set score: %f" % svc.score(X_test, y_test))

Best MSE: -0.6222, with best C: 1 and best gamma: 0.001
Log-Loss Mean :-0.625 (0.0208)
Training set score: -0.594015
Validation set score: -0.584290


## Making predictions with desired model

### Extract data desired

In [30]:
data_dir = '../NCAA/Data/'
df_sample_sub = pd.read_csv(data_dir + 'SampleSubmissionStage1.csv')
data_file = data_dir + 'MarchMadnessAdvStats.csv'
df_adv = pd.read_csv(data_file)
df_seeds = pd.read_csv(data_dir + 'NCAATourneySeeds.csv')


n_test_games = len(df_sample_sub)

def get_year_t1_t2(ID):
    """Return a tuple with ints `year`, `team1` and `team2`."""
    return (int(x) for x in ID.split('_'))

def seed_to_int(seed):
    '''Get just the digits from the seeding. Return as int'''
    s_int = int(seed[1:3])
    return s_int

In [None]:
print('Loading data for submission test')

# Make the seeding an integer
df_seeds['seed_int'] = df_seeds.Seed.apply(seed_to_int)
df_seeds.drop(columns=['Seed'], inplace=True) # This is the string label
df_seeds.head()


T1_seed = []
T1_adv = []
T2_adv = []
T2_seed = []
for ii, row in df_sample_sub.iterrows():
    year, t1, t2 = get_year_t1_t2(row.ID)
    t1_seed = df_seeds[(df_seeds.TeamID == t1) & (df_seeds.Season == year)].seed_int.values[0]
    t2_seed = df_seeds[(df_seeds.TeamID == t2) & (df_seeds.Season == year)].seed_int.values[0]
    t1_adv = df_adv[(df_adv.TeamID == t1) & (df_adv.Season == year)].values[0]
    t2_adv = df_adv[(df_adv.TeamID == t2) & (df_adv.Season == year)].values[0]
    T1_seed.append(t1_seed)
    T1_adv.append(t1_adv)
    T2_seed.append(t2_seed)
    T2_adv.append(t2_adv)

T1_adv = [row[2:] for row in T1_adv]
T2_adv = [row[2:] for row in T2_adv]
T1_seed = np.reshape(T1_seed, [n_test_games,-1]).tolist()
T2_seed = np.reshape(T2_seed, [n_test_games, -1]).tolist()
X_pred = np.concatenate((T1_seed, T1_adv, T2_seed, T2_adv), axis=1)

df_subData = pd.DataFrame(np.array(X_pred).reshape(np.shape(X_pred)[0], np.shape(X_pred)[1]))

xDim = np.shape(df_subData)[1]
X_pred = df_subData.values.reshape(-1,xDim)

In [36]:
np.shape(T1_seed)

(9112, 1)

In [33]:
# X_test = scaler.fit_transform(X_test)

#s*************Check if this works!!!
preds = MLP.predict_proba(X_pred)

# df_sample_sub = pd.DataFrame()
# clipped_preds = np.clip(preds, 0.05, 0.95)
df_sample_sub.Pred = preds
df_sample_sub.shape

(9112, 2)

In [26]:
filename = 'MLP'
save_dir = '../NCAA/Submissions/'
c=0
ext = '.csv'
if os.path.exists(save_dir+filename+ext):
    while os.path.exists(filename+ext):
        c+=1
    filename = filename+'_'+str(c)
    df_sample_sub.to_csv(save_dir+filename+ext, index=False)
else:
    df_sample_sub.to_csv(save_dir+filename+ext, index=False)