# Setup

In [2]:
import numpy as np
import pandas as pd
import pickle

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss

In [3]:
def display_df(df, n=1):
    """Render the first `n` rows, the last `n` rows and the shape of `df`."""
    top_rows = df.head(n)
    bottom_rows = df.tail(n)
    display(top_rows, bottom_rows, df.shape)

In [4]:
# Relative path prefix (with trailing slash) prepended to the interim CSV
# file names that are read further down in this notebook.
data_dir = '../data/interim/'

# Read Data

In [5]:
# Load the modelling table and the submission table from the interim directory.
df_interim, df_interim_sub = (
    pd.read_csv(data_dir + csv_name)
    for csv_name in ('model_dataset.csv', 'submission_dataset.csv')
)

# Train / Validation Split

In [6]:
# Seasons 1985-2013 are used for fitting; seasons 2014-2018 are kept for validation.
train_seasons = np.arange(1985, 2014)
valid_seasons = np.arange(2014, 2019)

df_train = df_interim.loc[df_interim['Season'].isin(train_seasons)]
df_valid = df_interim.loc[df_interim['Season'].isin(valid_seasons)]

# Train Model

In [7]:
# Column names selected from the interim frames to build the model input matrices.
features = ['TeamOneSeed', 'TeamTwoSeed', 'SeedDiff']

In [8]:
# Build the training matrix and label vector, then fit a seeded random forest.
X_train = df_train[features].values
y_train = df_train['Label'].values

clf = RandomForestClassifier(n_estimators=100, random_state=3)
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=3, verbose=0, warm_start=False)

In [12]:
def clip_preds(y, lower=0.05, upper=0.95):
    """Clip predictions into the closed interval ``[lower, upper]``.

    Parameters
    ----------
    y : array-like or scalar
        Predicted values to clip.
    lower : float, optional
        Smallest value allowed in the result (default 0.05).
    upper : float, optional
        Largest value allowed in the result (default 0.95).

    Returns
    -------
    numpy.ndarray or scalar
        The result of ``np.clip(y, lower, upper)``.

    Raises
    ------
    ValueError
        If ``lower`` is greater than ``upper``.
    """
    # Inverted bounds would silently produce meaningless output from np.clip,
    # so reject them up front.
    if lower > upper:
        raise ValueError(
            'lower bound ({}) must not exceed upper bound ({})'.format(lower, upper)
        )
    return np.clip(y, lower, upper)

# Produce Scores

In [13]:
# Train score: log loss of the clipped second predict_proba column on the training rows
proba_train = clf.predict_proba(X_train)
yp_train = clip_preds(proba_train[:, 1])
score_train = log_loss(y_train, yp_train, normalize=True)
print(score_train)

0.502631235042


In [14]:
# Validation inputs and labels come from the held-out frame
X_valid = df_valid[features].values
y_valid = df_valid['Label'].values

# Validation score: same scoring as the training rows, applied to the held-out rows
proba_valid = clf.predict_proba(X_valid)
yp_valid = clip_preds(proba_valid[:, 1])
score_valid = log_loss(y_valid, yp_valid, normalize=True)
print(score_valid)

0.584818153863


# Save Model

In [15]:
# Serialize the fitted classifier to disk. The context manager guarantees the
# file handle is closed even if pickle.dump raises part-way through.
with open('../models/clf.model', 'wb') as outfile:
    pickle.dump(clf, outfile)

# Create Submission File

In [17]:
def create_ID(row):
    """Build the submission ID string ``<Season>_<TeamOneID>_<TeamTwoID>``.

    Parameters
    ----------
    row : object with ``Season``, ``TeamOneID`` and ``TeamTwoID`` attributes
        Typically one row of a DataFrame, as passed by ``df.apply(..., axis=1)``.

    Returns
    -------
    str
        The three values joined with underscores.

    Notes
    -----
    Row-wise ``apply`` hands each row over as a single Series, so integer
    columns can arrive as floats when the frame also holds float columns.
    Floats with no fractional part are therefore rendered as integers
    (``2019.0`` -> ``'2019'``); every other value is passed through ``str``
    unchanged.
    """
    def _as_text(value):
        # Undo the int -> float upcast; NaN/inf and true fractions fall through.
        if isinstance(value, (float, np.floating)) and float(value).is_integer():
            return str(int(value))
        return str(value)

    return '_'.join(_as_text(v) for v in (row.Season, row.TeamOneID, row.TeamTwoID))

In [18]:
# Score the submission rows with the fitted model.
X_sub = df_interim_sub[features].values
sub_pred = clip_preds(clf.predict_proba(X_sub)[:, 1])

# IDs are built from the submission frame as loaded, before Pred is attached.
sub_ids = df_interim_sub.apply(create_ID, axis=1)

# Keep only the two columns that go into the submission file.
df_final_sub = df_interim_sub.assign(Pred=sub_pred).assign(ID=sub_ids)[['ID', 'Pred']]

In [19]:
# Write the ID/Pred table to CSV; index=False keeps the DataFrame index out of the file.
df_final_sub.to_csv('../models/clf_submission.csv', index=False)