In [None]:
"""These are the variables for the subsequent models"""
# Run configuration consumed by the model cells that follow.
classifier = "series"  # which label to predict: "series" or "game"
stats_df = "hockey_stats_train.txt"  # training table with all the stats
test_df = "hockey_stats_2015.txt"  # stats for the 2015 test season
playoff = "2015_playoffs.txt"  # playoff structure of 2015
year = "2015"  # season used as the test year

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

def model_parameters(stats_df, classifier, test_df, playoff):
    """Load the training and test tables and return scaled features and labels.

    Both statistics files are read as tab-separated tables. Columns from
    index 4 onward are used as features. The label column is index 2 when
    ``classifier`` is ``"series"`` and index 3 when it is ``"game"``. A
    ``StandardScaler`` is fitted on the training features only and then
    applied to both the training and the test features, so the test data is
    scaled with the training statistics.

    Args:
        stats_df: Source of the training table (anything ``pd.read_csv``
            accepts, e.g. a file path).
        classifier: ``"series"`` or ``"game"``; selects the label column.
        test_df: Source of the test table, laid out like the training table.
        playoff: Source of the playoff table; read tab-separated with no
            header row.

    Returns:
        A tuple ``(train_df, train_class, test_df, test_class, teams,
        playoff_df)`` holding the scaled training features, the training
        labels, the scaled test features, the test labels, the first column
        of the test table, and the playoff table.

    Raises:
        ValueError: If ``classifier`` is neither ``"series"`` nor ``"game"``.
    """
    # Validate before touching any file: previously an unknown value only
    # surfaced as an UnboundLocalError at the return statement.
    if classifier == "series":
        label_col = 2
    elif classifier == "game":
        label_col = 3
    else:
        raise ValueError(
            f"classifier must be 'series' or 'game', got {classifier!r}"
        )

    train_table = pd.read_csv(stats_df, sep="\t")
    test_table = pd.read_csv(test_df, sep="\t")
    playoff_df = pd.read_csv(playoff, sep="\t", header=None)

    teams = test_table.iloc[:, 0]
    train_class = train_table.iloc[:, label_col]
    test_class = test_table.iloc[:, label_col]

    # The test slice is bounded by the training table's column count, so both
    # feature matrices cover the same column positions.
    n_columns = len(train_table.columns)
    scaler = StandardScaler()
    train_features = scaler.fit_transform(train_table.iloc[:, 4:n_columns])
    test_features = scaler.transform(test_table.iloc[:, 4:n_columns])

    return (train_features, train_class, test_features, test_class, teams, playoff_df)

# NOTE: this rebinds the global ``test_df`` from the input file name to the
# scaled feature array, so the cell cannot be re-run without first re-running
# the configuration assignments.
(
    train_df,
    train_class,
    test_df,
    test_class,
    teams,
    playoff_df,
) = model_parameters(stats_df, classifier, test_df, playoff)

# Show the scaled test features followed by the test labels.
print(test_df, test_class, sep="\n")

In [None]:
from sklearn.neural_network import MLPClassifier
def ML_neural_network(train_df, train_class, test_df, test_class, *, random_state=None):
    """Tune a multi-layer perceptron with a grid search and score it on the test set.

    An exhaustive ``GridSearchCV`` is run over the activation function and the
    ``alpha`` regularisation strength, using repeated stratified k-fold cross
    validation (4 splits, 5 repeats) and accuracy as the selection metric.
    The fitted search is then used to predict the test set.

    Only the ``lbfgs`` solver is searched: ``adam`` and ``sgd`` were left out
    because they did not converge on the training data, and ``lbfgs`` is the
    solver recommended for small datasets such as this one.

    Args:
        train_df: Training feature matrix.
        train_class: Training labels.
        test_df: Test feature matrix.
        test_class: Test labels.
        random_state: Optional seed passed to both the cross-validation
            splitter and the ``MLPClassifier`` so that runs are repeatable.
            The default ``None`` keeps the previous unseeded behaviour.

    Returns:
        A tuple ``(predict_df, probability_df, ml_score, params, bestscore)``:
        the predicted test classes, the predicted class probabilities for the
        test set, the accuracy on the test set, the best parameter
        combination, and the best mean cross-validated accuracy.
    """
    parameters = {
        'solver': ['lbfgs'],
        'activation': ['identity', 'logistic', 'tanh', 'relu'],
        'alpha': [.1, .01, .001],
        'max_iter': [500],
    }
    # 4 splits x 5 repeats = 20 fits per parameter combination.
    kfold = RepeatedStratifiedKFold(n_splits=4, n_repeats=5, random_state=random_state)
    ml = MLPClassifier(random_state=random_state)
    clf = GridSearchCV(ml, parameters, cv=kfold, scoring='accuracy', n_jobs=-1)
    clf.fit(train_df, train_class)  # runs the search on the training data

    predict_df = clf.predict(test_df)
    probability_df = clf.predict_proba(test_df)
    # With scoring='accuracy' this is the fraction of correct test
    # predictions, not a count.
    ml_score = clf.score(test_df, test_class)
    params = clf.best_params_
    bestscore = clf.best_score_
    return (predict_df, probability_df, ml_score, params, bestscore)

# Tune the neural network, evaluate it on the test season and report results.
(
    predict_df,
    probability_df,
    ml_score,
    params,
    bestscore,
) = ML_neural_network(train_df, train_class, test_df, test_class)

print("The best parameters are:", "\n", params, "\n")
print("The score of the 2015 playoffs are: ", ml_score, "\n")
print("The best score from the CV are: ", bestscore, "\n")

In [None]:
from sklearn.linear_model import LogisticRegression
def ML_logistic_regression(train_df, train_class, test_df, test_class, *, random_state=None):
    """Tune a logistic regression with a grid search and score it on the test set.

    An exhaustive ``GridSearchCV`` is run using repeated stratified k-fold
    cross validation (4 splits, 5 repeats) and accuracy as the selection
    metric. The fitted search is then used to predict the test set.

    The settings that distinguish this search from the other logistic
    regression cells are:

        'multi_class': ['ovr', 'multinomial']
        'penalty': ['l2']
        'solver': ['lbfgs', 'newton-cg']

    The ``sag`` and ``saga`` solvers were left out because they did not
    converge on the training data and are not recommended for small datasets
    such as this one. The solvers used here only support the ``l2`` penalty.

    Args:
        train_df: Training feature matrix.
        train_class: Training labels.
        test_df: Test feature matrix.
        test_class: Test labels.
        random_state: Optional seed for the cross-validation splitter so that
            the fold assignment is repeatable. The default ``None`` keeps the
            previous unseeded behaviour.

    Returns:
        A tuple ``(predict_df, probability_df, ml_score, params, bestscore)``:
        the predicted test classes, the predicted class probabilities for the
        test set, the accuracy on the test set, the best parameter
        combination, and the best mean cross-validated accuracy.
    """
    # NOTE(review): newer scikit-learn releases appear to deprecate
    # ``multi_class``; confirm this grid still runs on the installed version.
    parameters = {
        'multi_class': ['ovr', 'multinomial'],
        'penalty': ['l2'],
        'C': [.1, 1.0, 10],
        'solver': ['lbfgs', 'newton-cg'],
        'class_weight': ['balanced'],
        'max_iter': [500],
    }
    # 4 splits x 5 repeats = 20 fits per parameter combination.
    kfold = RepeatedStratifiedKFold(n_splits=4, n_repeats=5, random_state=random_state)
    ml = LogisticRegression()
    clf = GridSearchCV(ml, parameters, cv=kfold, scoring='accuracy', n_jobs=-1)
    clf.fit(train_df, train_class)  # runs the search on the training data

    predict_df = clf.predict(test_df)
    probability_df = clf.predict_proba(test_df)
    # With scoring='accuracy' this is the fraction of correct test
    # predictions, not a count.
    ml_score = clf.score(test_df, test_class)
    params = clf.best_params_
    bestscore = clf.best_score_
    return (predict_df, probability_df, ml_score, params, bestscore)

# Tune the logistic regression, evaluate it on the test season and report results.
(
    predict_df,
    probability_df,
    ml_score,
    params,
    bestscore,
) = ML_logistic_regression(train_df, train_class, test_df, test_class)

print("The best parameters are:", "\n", params, "\n")
print("The score of the 2015 playoffs are: ", ml_score, "\n")
print("The best score from the CV are: ", bestscore, "\n")