# Setup

In [212]:
import numpy as np
import pandas as pd
import pickle

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss

In [206]:
def display_df(df, n=1):
    """Show the first ``n`` rows, the last ``n`` rows, and the shape of ``df``.

    Parameters
    ----------
    df : object exposing ``head``, ``tail`` and ``shape`` (e.g. a DataFrame)
    n : int, default 1
        Number of rows taken from each end.

    All three pieces are sent to the notebook's ``display`` in a single call.
    """
    first_rows = df.head(n)
    last_rows = df.tail(n)
    display(first_rows, last_rows, df.shape)

In [207]:
# Directory prefix for the interim CSV inputs. File paths are built by plain
# string concatenation (data_dir + filename), so the trailing slash is required.
data_dir = '../data/interim/'

# Read Data

In [250]:
# Resolve both input paths first, then load each CSV into its own frame.
model_csv = data_dir + 'model_dataset.csv'
submission_csv = data_dir + 'submission_dataset.csv'

df_interim = pd.read_csv(model_csv)          # rows used for training / validation
df_interim_sub = pd.read_csv(submission_csv) # rows to be scored for the submission

# Train / Validation Split

In [210]:
# Split by season. np.arange excludes its stop value, so training covers
# seasons 1985-2013 and validation covers seasons 2014-2018.
train_seasons = np.arange(1985, 2014)
valid_seasons = np.arange(2014, 2019)

in_train = df_interim.Season.isin(train_seasons)
in_valid = df_interim.Season.isin(valid_seasons)

df_train = df_interim.loc[in_train]
df_valid = df_interim.loc[in_valid]

# Train Model

In [247]:
# Columns selected from the train, validation and submission frames as the
# classifier's input matrix.
features = ['TeamOneSeed', 'TeamTwoSeed', 'SeedDiff']

In [248]:
# Pull the training inputs and targets out of the frame as plain arrays.
X_train = df_train[features].values
y_train = df_train['Label'].values

# Fixed random_state so refitting yields the same forest; fit() returns the
# estimator itself, so construction and fitting can be chained.
clf = RandomForestClassifier(n_estimators=100, random_state=3).fit(X_train, y_train)

# In-sample log loss, scored on column 1 of predict_proba
# (i.e. clf.classes_[1] -- presumably Label == 1; confirm against the data).
yp_train = clf.predict_proba(X_train)[:, 1]
score_train = log_loss(y_true=y_train, y_pred=yp_train, normalize=True)
print(score_train)

0.498012189154


# Validation Score

In [249]:
# Hold-out inputs and targets as plain arrays.
X_valid = df_valid[features].values
y_valid = df_valid['Label'].values

# Out-of-sample log loss, scored on column 1 of predict_proba
# (i.e. clf.classes_[1] -- presumably Label == 1; confirm against the data).
yp_valid = clf.predict_proba(X_valid)[:, 1]
score_valid = log_loss(y_true=y_valid, y_pred=yp_valid, normalize=True)
print(score_valid)

0.688018578408


# Save Model

In [220]:
# Persist the fitted classifier to disk. The context manager guarantees the
# file handle is flushed and closed even if pickling raises part-way through,
# which the previous open()/close() pair did not.
with open('../models/clf.model', 'wb') as outfile:
    pickle.dump(clf, outfile)

# Create Submission File

In [251]:
def create_ID(row):
    """Build the submission ID ``<Season>_<TeamOneID>_<TeamTwoID>`` for one row.

    Parameters
    ----------
    row : object with ``Season``, ``TeamOneID`` and ``TeamTwoID`` attributes
        (e.g. a row passed by ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    str
        The three fields converted with ``str`` and joined by underscores.

    NOTE(review): values are stringified as-is, so a float-typed field would
    render with a trailing ``.0`` -- confirm these columns stay integer when
    rows are produced by a row-wise apply over mixed-dtype columns.
    """
    id_parts = (row.Season, row.TeamOneID, row.TeamTwoID)
    return '_'.join(str(part) for part in id_parts)

In [257]:
X_sub = df_interim_sub[features].values

# Column 1 of predict_proba for every submission row.
sub_pred = clf.predict_proba(X_sub)[:, 1]
# One ID string per row, built by create_ID.
sub_ids = df_interim_sub.apply(create_ID, axis=1)

# Attach both new columns, then keep only the two submission columns.
df_final_sub = (
    df_interim_sub
    .assign(Pred=sub_pred, ID=sub_ids)
    .loc[:, ['ID', 'Pred']]
)

In [262]:
# Write the ID/Pred table without the index column.
submission_path = '../models/clf_submission.csv'
df_final_sub.to_csv(submission_path, index=False)