In [36]:
import csv
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.feature_selection import SelectKBest
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

In [27]:
class NHL_Period_Data:
    """Load a comma-separated period-stats file and prepare it for sklearn models.

    Attributes:
        data_x: 2-D integer array of feature columns (replaced by the scaled
            array once transformData() has been called).
        data_y: 1-D integer array taken from the last column of the file.
        train_x, test_x, train_y, test_y: created by trainTestSplit(); the two
            ``*_x`` arrays are narrowed by selectKBest().
    """

    def __init__(self, path="data/periodStats.txt"):
        """Parse ``path`` and store the result in ``data_x`` / ``data_y``.

        Args:
            path: location of the CSV file; defaults to the path this class
                has always read, so ``NHL_Period_Data()`` behaves as before.
        """
        self.data_x, self.data_y = self.__parseData__(path)

    def __parseData__(self, path="data/periodStats.txt"):
        """Read the file and split it into features and labels.

        The first line is skipped as a header and every value is parsed as int.

        Returns:
            (features, labels): ``features`` is columns 1 .. -3 inclusive,
            ``labels`` is the last column.
        """
        data = np.genfromtxt(path, delimiter=",", dtype=int, skip_header=1)
        # genfromtxt returns a 1-D array for a single data row; keep the
        # (rows, columns) layout the other methods rely on.
        data = np.atleast_2d(data)
        # NOTE(review): the 1:-2 slice excludes column 0 AND the last two
        # columns, so the second-to-last column is neither a feature nor the
        # label. Confirm that dropping it is intentional (1:-1 would keep it).
        return data[..., 1:-2], data[..., -1]

    def transformData(self, encoder="StandardScaler"):
        """Rescale every feature column, replacing ``data_x`` with the result.

        Args:
            encoder: name of a class in ``sklearn.preprocessing``; it is looked
                up with getattr and instantiated with no arguments.

        Raises:
            AttributeError: if ``sklearn.preprocessing`` has no such attribute.
        """
        # NOTE(review): the transformer is fitted on the whole data set. When
        # trainTestSplit() is called afterwards, the held-out rows have already
        # contributed to the fitted statistics.
        all_columns = list(range(self.data_x.shape[1]))
        col_tr = ColumnTransformer([(encoder, getattr(preprocessing, encoder)(), all_columns)])
        self.data_x = col_tr.fit_transform(self.data_x, self.data_y)

    def trainTestSplit(self, split=0.2, random_state=None):
        """Split ``data_x`` / ``data_y`` into train and test attributes.

        Args:
            split: passed to ``train_test_split`` as ``test_size``.
            random_state: passed through to ``train_test_split``; give an int
                for a repeatable split. The default ``None`` keeps the
                previous unseeded behaviour.
        """
        self.train_x, self.test_x, self.train_y, self.test_y = train_test_split(
            self.data_x, self.data_y, test_size=split, random_state=random_state)
        # Remember the full-width split so selectKBest() can always start from
        # every feature column, however often it is called.
        self._full_train_x, self._full_test_x = self.train_x, self.test_x

    def selectKBest(self, k=10):
        """Keep the ``k`` best-scoring feature columns in ``train_x`` / ``test_x``.

        The selector is fitted on the training rows only. Each call selects
        from the full-width split rather than from the output of a previous
        call, so calls can be repeated in any order of ``k``.

        Args:
            k: number of columns to keep, forwarded to ``SelectKBest``.
        """
        if getattr(self, "_full_train_x", None) is None:
            # train_x / test_x were assigned without trainTestSplit();
            # treat their current contents as the full-width split.
            self._full_train_x, self._full_test_x = self.train_x, self.test_x
        selector = SelectKBest(k=k).fit(self._full_train_x, self.train_y)
        self.train_x = selector.transform(self._full_train_x)
        self.test_x = selector.transform(self._full_test_x)

In [3]:
# Load the data set and show the first feature row before and after scaling.
data = NHL_Period_Data()
print(data.data_x[0])
# Rescale every feature column with sklearn.preprocessing.MinMaxScaler;
# this replaces data.data_x with the transformed array.
data.transformData("MinMaxScaler")
print(data.data_x[0])
# Create data.train_x / test_x / train_y / test_y with the default test share (0.2).
data.trainTestSplit()

[ 0  7  6  9  5  4  6 15 11  3  4  2  0  6 10  4]
[0.46666667 0.31818182 0.33333333 0.40909091 0.2173913  0.26666667
 0.35294118 0.48387097 0.34375    0.2        0.21052632 0.08333333
 0.         0.2        0.35714286 0.30769231]


## KNN Classifier

In [12]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

# Search only the hyper-parameters that define the KNN model itself.
# 'algorithm' and 'leaf_size' choose how the neighbours are looked up (speed and
# memory), not which neighbours count -- apart from tie-breaking between
# equidistant points -- so listing them multiplied the grid 120-fold
# (2,880 candidates instead of 24) without tuning anything.
params = {
    'n_neighbors': list(range(3, 15, 2)),
    'weights': ['uniform', 'distance'],
    'p': [1, 2],
}
knn = KNeighborsClassifier()
# n_jobs is an execution setting, not something to search over: give it to the
# grid search so candidates and folds are evaluated in parallel.
model = GridSearchCV(knn, params, n_jobs=-1)
model.fit(data.train_x, data.train_y)

# Score the refitted best estimator on the held-out split.
pred_vals = model.predict(data.test_x)
print("Predicted values: " + str(pred_vals[0:10]))
print("Actual Values:    " + str(data.test_y[0:10]))
acc = accuracy_score(data.test_y, pred_vals)
print("Validation accuracy: " + str(acc))



Predicted values: [ 0  0  0 -1  1  0 -1 -1  0  0]
Actual Values:    [ 1  0  1  0 -1 -2  1 -1  0 -1]
Validation accuracy: 0.3179036977919659


In [30]:
# Fresh copy of the data set, scaled with transformData()'s default encoder
# ("StandardScaler") and split with the default test share.
stand_data = NHL_Period_Data()
stand_data.transformData()
stand_data.trainTestSplit()
# Optional feature selection, currently disabled:
#stand_data.selectKBest(15)

In [31]:
# Recorded result of print(model.best_params_) from the grid search:
# {'algorithm': 'auto', 'leaf_size': 20, 'n_jobs': -1, 'n_neighbors': 13, 'p': 2, 'weights': 'distance'}
# The classifier below uses those settings except leaf_size, which is set to 8.

best_knn = KNeighborsClassifier(
    n_neighbors=13,
    weights='distance',
    algorithm='auto',
    leaf_size=8,
    p=2,
    n_jobs=-1,
)
# fit() returns the estimator itself, so training and prediction can be chained.
best_pred = best_knn.fit(stand_data.train_x, stand_data.train_y).predict(stand_data.test_x)
b_acc = accuracy_score(stand_data.test_y, best_pred)
print(f"Acc for leaf_size 8: {b_acc}")

Acc for leaf_size 8: 0.33160415003990423


In [33]:
# Re-evaluate the tuned KNN while keeping fewer and fewer features.
# The starting count comes from the data set's own feature matrix instead of a
# hard-coded 16, so the sweep still fits if the number of columns changes.
n_features = stand_data.data_x.shape[1]

# Going from most to fewest features keeps every requested k within the number
# of columns currently held in stand_data.train_x.
for i in range(n_features, 0, -1):
    stand_data.selectKBest(i)
    best_knn.fit(stand_data.train_x, stand_data.train_y)
    best_pred = best_knn.predict(stand_data.test_x)
    b_acc = accuracy_score(stand_data.test_y, best_pred)
    print("Acc for " + str(i) + " features: " + str(b_acc))

Acc for 16 features: 0.33160415003990423
Acc for 15 features: 0.3187017823889332
Acc for 14 features: 0.31670657089651505
Acc for 13 features: 0.3163075285980314
Acc for 12 features: 0.31683958499600956
Acc for 11 features: 0.31683958499600956
Acc for 10 features: 0.3120510774142059
Acc for 9 features: 0.30274009044958766
Acc for 8 features: 0.3026070763500931
Acc for 7 features: 0.3055333865389731
Acc for 6 features: 0.30593242883745675
Acc for 5 features: 0.28757648310720935
Acc for 4 features: 0.29755254056930036
Acc for 3 features: 0.30207501995211494
Acc for 2 features: 0.2870444267092312
Acc for 1 features: 0.30061186485767494


## Random Forest Classifier (ensemble of decision trees)

In [34]:
# Rebuild the data set from the file so this section starts from all feature
# columns, scaled with the default encoder and freshly split.
stand_data = NHL_Period_Data()
stand_data.transformData()
stand_data.trainTestSplit()

In [38]:
# Random forest with hand-picked size limits (71 trees, depth <= 16, <= 61 leaves).
rf_class = RandomForestClassifier(
    n_estimators=71,
    criterion='gini',
    max_depth=16,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    # 'auto' meant sqrt(n_features) for classifiers; it was deprecated in
    # scikit-learn 1.1 and removed in 1.3, so spell out the equivalent value.
    max_features='sqrt',
    max_leaf_nodes=61,
    min_impurity_decrease=0.0,
)

rf_class.fit(stand_data.train_x, stand_data.train_y)
rf_pred = rf_class.predict(stand_data.test_x)
rf_acc = accuracy_score(stand_data.test_y, rf_pred)
print("Acc for random forest: " + str(rf_acc))

Acc for random forest: 0.3538175046554935
