# Keras MLP -> best Log Loss 0.555
## Overview ##
This kernel uses each team's tournament seed and the seed difference between the two teams (relative seeding) as inputs to an MLP network, evaluated with k-fold cross-validation.

In [25]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 
import os
from time import localtime
from time import time

import matplotlib.pyplot as plt
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from keras.callbacks import TensorBoard
from keras.layers import Dense, Dropout
from keras.models import Sequential
from keras.wrappers.scikit_learn import KerasClassifier
from mpl_toolkits.mplot3d import Axes3D
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output
# List the project directory with os.listdir so the cell does not depend on an
# external `ls` executable being available on the host.
print("\n".join(sorted(os.listdir("../NCAA"), key=str.lower)))

# Any results you write to the current directory are saved as output.


Data
GiantKillerInfo.txt
KaggleKernelwSeeds.ipynb
KerasBinaryClassifier_Intro.ipynb
KerasMLP_only2003.csv
KerasMLP_onlyBPI_2009-2013.csv
KerasMLP_onlyseed.csv
KerasMLP_RankAndSeeds-Copy1.ipynb
KerasMLP_SeedsandYear.csv
KerasMLP_SeedsOnly.ipynb
KerasMLP_SeedswithTB.ipynb
logreg_withseedtoo.csv
logs
MasseyOrdinalsExp.ipynb
MLP_first_go.csv
MLP_with_seeds.csv
sonar.csv
SVM_first_go.csv
Tensorflow.ipynb



### Lock Random Seed

In [2]:
# Fixed seed, reused below as `random_state` in the (commented-out) StratifiedKFold calls.
seed = 13
# Seeds NumPy's global RNG only.
# NOTE(review): any RNG owned by the Keras backend is not seeded here, so model
# training may still vary between runs -- confirm if exact reproducibility matters.
np.random.seed(seed)

### Load Training data


In [3]:
# Directory holding the competition CSV files (kept as a string with a trailing
# slash because later cells concatenate file names onto it).
data_dir = '../NCAA/Data/'

# Read the tournament seeds and the compact tournament results, in that order.
df_seeds, df_tour = (
    pd.read_csv(os.path.join(data_dir, csv_name))
    for csv_name in ('NCAATourneySeeds.csv', 'NCAATourneyCompactResults.csv')
)

Cut off the region identifier from the seed number

In [4]:
def seed_to_int(seed):
    """Return the numeric part of a seed label as an int.

    The characters at positions 1 and 2 of ``seed`` (i.e. right after the
    first character) are parsed as a base-10 integer; everything before and
    after them is ignored.
    """
    return int(seed[1:3])
# Derive the numeric seed for every row, then discard the original string label.
df_seeds['seed_int'] = df_seeds['Seed'].map(seed_to_int)
df_seeds.drop(labels='Seed', axis=1, inplace=True)
df_seeds.head()

Unnamed: 0,Season,TeamID,seed_int
0,1985,1207,1
1,1985,1210,2
2,1985,1228,3
3,1985,1260,4
4,1985,1374,5


In [5]:
# Remove the result columns this notebook does not use as features.
df_tour.drop(
    columns=['DayNum', 'WScore', 'LScore', 'WLoc', 'NumOT'],
    inplace=True,
)
df_tour.head()

Unnamed: 0,Season,WTeamID,LTeamID
0,1985,1116,1234
1,1985,1120,1345
2,1985,1207,1250
3,1985,1229,1425
4,1985,1242,1325


### Merge seed for each team 
Merge the Seeds with their corresponding TeamIDs in the compact results dataframe.

In [6]:
# Two views of the seed table: one keyed for the winning team, one for the losing team.
df_winseeds = df_seeds.rename(columns=dict(TeamID='WTeamID', seed_int='WSeed'))
df_lossseeds = df_seeds.rename(columns=dict(TeamID='LTeamID', seed_int='LSeed'))

# Attach the winner's seed to every game; a left join keeps every game row.
df_dummy = df_tour.merge(df_winseeds, how='left', on=['Season', 'WTeamID'])
df_dummy.head()

Unnamed: 0,Season,WTeamID,LTeamID,WSeed
0,1985,1116,1234,9
1,1985,1120,1345,11
2,1985,1207,1250,1
3,1985,1229,1425,9
4,1985,1242,1325,3


In [7]:
# Attach the loser's seed. No `how` was given originally, so this is pandas'
# default inner join, spelled out here.
df_concat = df_dummy.merge(df_lossseeds, how='inner', on=['Season', 'LTeamID'])
# Winner's seed minus loser's seed (negative when the winner had the lower seed number).
df_concat['SeedDiff'] = df_concat['WSeed'] - df_concat['LSeed']
df_concat.head()

Unnamed: 0,Season,WTeamID,LTeamID,WSeed,LSeed,SeedDiff
0,1985,1116,1234,9,8,1
1,1985,1120,1345,11,6,5
2,1985,1207,1250,1,16,-15
3,1985,1229,1425,9,8,1
4,1985,1242,1325,3,14,-11


Dataframe with seed, relative seed, and win/loss. Use for training

In [8]:
# One row per game from the winner's point of view, labelled Result = 1.
df_wins = pd.DataFrame(
    {
        'Seed': df_concat['WSeed'],
        'SeedDiff': df_concat['SeedDiff'],
        'Result': 1,
    },
    columns=['Seed', 'SeedDiff', 'Result'],  # pin the column order explicitly
)
df_wins.size

6351

In [9]:
# The same games from the loser's point of view: the loser's seed, the seed
# difference with its sign flipped, labelled Result = 0.
df_losses = pd.DataFrame(
    {
        'Seed': df_concat['LSeed'],
        'SeedDiff': -df_concat['SeedDiff'],
        'Result': 0,
    },
    columns=['Seed', 'SeedDiff', 'Result'],  # pin the column order explicitly
)

# Stack winner rows on top of loser rows (original index labels are kept).
df_predictions = pd.concat([df_wins, df_losses])
df_predictions.head()

Unnamed: 0,Seed,SeedDiff,Result
0,9,1,1
1,11,5,1
2,1,-15,1
3,9,1,1
4,3,-11,1


In [10]:
# Feature matrix with the two columns (Seed, SeedDiff) and the 0/1 target vector.
X = df_predictions[['Seed', 'SeedDiff']].values
y = df_predictions['Result'].values

# Number of input features; the model builders read this as `input_dim`.
xDim = X.shape[1]

## Kfold Classifier - base model, no standardization ##
Set up cross validation, and pass inputs to model
- 100 epochs (full passes over the training data), with a batch size of 5 (samples processed per gradient update)
- k-fold cross-validation splits the data into `n_splits` folds (3 or 5 in the cells below); each fold is held out once for evaluation
- build_fn => callable model instance
- StratifiedKFold creates the cross-validation folds while keeping the class proportions similar in every fold
- cross_val_score evaluates across the kfold sets


In [11]:
# base model
def create_base():
    """Build and compile the baseline MLP.

    Architecture: ``xDim`` inputs -> 10 ReLU units -> 1 sigmoid unit, trained
    with binary cross-entropy and the Adam optimizer, tracking accuracy.
    """
    layers = [
        Dense(10, input_dim=xDim, kernel_initializer='normal', activation='relu'),
        Dense(1, kernel_initializer='normal', activation='sigmoid'),
    ]
    network = Sequential(layers)
    network.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return network

In [12]:
# evaluate the base model on the raw (unstandardized) dataset
# estimator = KerasClassifier(build_fn=create_base, epochs=100, batch_size=5, verbose=1)
# kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=seed)
# results = cross_val_score(estimator, X, y, cv=kfold, scoring = 'neg_log_loss')
# print("Results: %.4f%% (%.4f%%)" % (results.mean(), results.std()))

### Now run the model with standardized data using StandardScaler
- Rather than performing the scaling beforehand and possibly leaking distribution information from unseen data, a pipeline is used
    - The scaling is then performed on each fold as it is processed
- A pipeline is made of a sequence of named steps, each paired with an action
    - In this case the steps are 'standardize' then 'mlp' and the actions are StandardScaler and KerasClassifier

In [13]:
# Baseline model wrapped in a standardize -> MLP pipeline.

# TensorBoard callback writing to a per-run log directory named after the current
# Unix timestamp. `time` must be imported in the notebook's import cell
# (`from time import time`); previously only `localtime` was imported, so this
# line raised NameError on a fresh kernel.
# NOTE(review): TB is created here but is not passed to any fit call in this
# notebook, so no TensorBoard logs are written as things stand.
TB = TensorBoard(log_dir="logs/{}".format(time()))

# Step 1 scales the features; step 2 trains the baseline MLP
# (100 epochs, batch size 5, silent).
estimators = [
    ('standardize', StandardScaler()),
    ('mlp', KerasClassifier(build_fn=create_base, epochs=100, batch_size=5, verbose=0)),
]
pipeline = Pipeline(estimators)

In [14]:
# kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
# resultsScaled = cross_val_score(pipeline, X, y, cv=kfold, scoring = 'neg_log_loss')
# print("Standardized: %.4f%% (%.4f%%)" % (resultsScaled.mean(), resultsScaled.std()))

### Reduce the Number of Hidden Nodes

In [15]:
# smaller model
def create_smaller():
    """Build and compile a narrower MLP.

    Architecture: ``xDim`` inputs -> 2 ReLU units -> 1 sigmoid unit, trained
    with binary cross-entropy and the Adam optimizer, tracking accuracy.
    """
    layers = [
        Dense(2, input_dim=xDim, kernel_initializer='normal', activation='relu'),
        Dense(1, kernel_initializer='normal', activation='sigmoid'),
    ]
    network = Sequential(layers)
    network.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return network
# Standardize -> small MLP (100 epochs, batch size 5, silent).
estimatorsSmall = [
    ('standardize', StandardScaler()),
    ('mlp', KerasClassifier(build_fn=create_smaller, epochs=100, batch_size=5, verbose=0)),
]
pipelineSmall = Pipeline(estimatorsSmall)

In [16]:
# kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
# resultsS = cross_val_score(pipelineSmall, X, y, cv=kfold, scoring = 'neg_log_loss')
# print("Standardized: %.4f%% (%.4f%%)" % (resultsS.mean(), resultsS.std()))

### Increase Number of Hidden Nodes and Layers

In [17]:
# larger model
def create_larger():
    """Build and compile a deeper MLP.

    Architecture: ``xDim`` inputs -> 10 ReLU units -> 5 ReLU units -> 1 sigmoid
    unit, trained with binary cross-entropy and the Adam optimizer, tracking
    accuracy.
    """
    layers = [
        Dense(10, input_dim=xDim, kernel_initializer='normal', activation='relu'),
        Dense(5, kernel_initializer='normal', activation='relu'),
        Dense(1, kernel_initializer='normal', activation='sigmoid'),
    ]
    network = Sequential(layers)
    network.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return network
# Standardize -> larger MLP (100 epochs, batch size 5, silent).
estimatorsBig = [
    ('standardize', StandardScaler()),
    ('mlp', KerasClassifier(build_fn=create_larger, epochs=100, batch_size=5, verbose=0)),
]
pipelineBig = Pipeline(estimatorsBig)

In [18]:
# kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
# resultsB = cross_val_score(pipelineBig, X, y, cv=kfold, scoring = 'neg_log_loss')
# print("Standardized: %.4f%% (%.4f%%)" % (resultsB.mean(), resultsB.std()))

## Now fit model with best log-loss

In [19]:
# Train the standardize -> baseline-MLP pipeline on every row of X, y (no hold-out split here).
pipeline.fit(X,y)



Pipeline(memory=None,
     steps=[('standardize', StandardScaler(copy=True, with_mean=True, with_std=True)), ('mlp', <keras.wrappers.scikit_learn.KerasClassifier object at 0x7efc98770128>)])

In [20]:
# Scored on the same X, y the pipeline was just fitted on, so this is a
# training-set figure rather than a held-out estimate.
pipeline.score(X,y)



0.712564961994432

### Extract the data desired for the submission

In [21]:
# The sample submission file supplies the game IDs to predict, one row per game.
df_sample_sub = pd.read_csv(os.path.join(data_dir, 'SampleSubmissionStage1.csv'))
n_test_games = df_sample_sub.shape[0]

def get_year_t1_t2(ID):
    """Return a tuple with ints `year`, `team1` and `team2`.

    ``ID`` is an underscore-separated string such as ``'2014_1107_1110'``;
    each field is converted with ``int``.

    Raises:
        ValueError: if any underscore-separated field is not an integer.
    """
    # Build a real tuple (the previous version returned a one-shot generator,
    # which contradicted the docstring and could only be consumed once).
    return tuple(int(field) for field in ID.split('_'))

In [22]:
# Build the (season, team) -> numeric seed lookup once, instead of filtering the
# whole df_seeds frame with a boolean mask twice for every game.
seed_lookup = {}
for season, team_id, seed_val in zip(df_seeds.Season, df_seeds.TeamID, df_seeds.seed_int):
    # setdefault keeps the first row seen for a key, matching the previous
    # `.values[0]` selection.
    seed_lookup.setdefault((season, team_id), seed_val)

# Feature matrix for the submission games: column 0 = team 1's seed,
# column 1 = team 1's seed minus team 2's seed (same layout as the training X).
X_test = np.zeros(shape=(n_test_games, 2))
# enumerate yields a positional row number, so the fill does not rely on
# df_sample_sub's index labels being 0..n-1.
for ii, game_id in enumerate(df_sample_sub['ID']):
    year, t1, t2 = get_year_t1_t2(game_id)
    # A (season, team) pair with no seed raises KeyError naming the missing pair.
    t1_seed = seed_lookup[(year, t1)]
    t2_seed = seed_lookup[(year, t2)]
    diff_seed = t1_seed - t2_seed
    X_test[ii, 0] = t1_seed
    X_test[ii, 1] = diff_seed


## Make Predictions ##
Create predictions using the Keras MLP pipeline we trained.

In [23]:
# Second column of predict_proba -- presumably the probability of Result == 1
# (i.e. the first-listed team wins); verify against the KerasClassifier wrapper.
preds = pipeline.predict_proba(X_test)[:, 1]

# Keep predictions inside [0.05, 0.95] so no single game is predicted with
# near-certainty.
clipped_preds = np.clip(preds, 0.05, 0.95)
# Bracket assignment always writes the 'Pred' column; attribute assignment
# (`df.Pred = ...`) only does so when that column already exists and otherwise
# just sets a Python attribute on the frame.
df_sample_sub['Pred'] = clipped_preds
df_sample_sub.head(100)

Unnamed: 0,ID,Pred
0,2014_1107_1110,0.457501
1,2014_1107_1112,0.050000
2,2014_1107_1113,0.310956
3,2014_1107_1124,0.225349
4,2014_1107_1140,0.310956
5,2014_1107_1142,0.498293
6,2014_1107_1153,0.187487
7,2014_1107_1157,0.498293
8,2014_1107_1160,0.265962
9,2014_1107_1163,0.245087


Lastly, create your submission file!

In [26]:
# Write the submission without overwriting an earlier one: try
# 'KerasMLP_onlyseeds.csv', then 'KerasMLP_onlyseeds_1.csv', '_2', ... until an
# unused name is found.
base_name = 'KerasMLP_onlyseeds'
ext = '.csv'
filename = base_name
c = 0
while os.path.exists(filename + ext):
    c += 1
    # Always derive the candidate from base_name; appending to the previous
    # candidate produced names like '..._1_2_3'.
    filename = '{}_{}'.format(base_name, c)
df_sample_sub.to_csv(filename + ext, index=False)