In [11]:
import pandas as pd 
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import RidgeClassifier 
import seaborn as sns
from sklearn.metrics import accuracy_score
import pickle

In [2]:
#upload data 
data = pd.read_csv(r'C:\Users\akobe\lighthouse-data-notes\Final-Data\final_data\all_merged_data.csv', index_col = [0])

In [3]:
data.head(2)

Unnamed: 0,nhl_id,elite_id,name,draft_year,draft_season,prospect_gp,prospect_g,prospect_a,prospect_pts,prospect_pim,...,weight,shoots,position,prospect_category,amateur_league,amateur_team,nhl_games_played,200+games,birth_month,height_cm
0,8475166,9223,John Tavares,2009,2008-2009,56,58,46,104,54,...,209,L,C,North American Skater,OHL,London,1017,1,9,185.42
1,8475167,6007,Victor Hedman,2009,2008-2009,45,7,16,23,62,...,220,L,D,European Skater,SWEDEN,Modo,964,1,12,198.12


In [4]:
data = data.drop(['nhl_id', 'elite_id', 'name', 'draft_year', 'draft_season', 'prospect_pts', 'nhl_games_played', 'team', 'height', 'pick_no'], axis=1)

In [12]:
data.head(2)

Unnamed: 0,prospect_gp,prospect_g,prospect_a,prospect_pim,prospect_pm,birth_country,weight,shoots,position,prospect_category,amateur_league,amateur_team,200+games,birth_month,height_cm
0,56,58,46,54,10,CAN,209,L,C,North American Skater,OHL,London,1,9,185.42
1,45,7,16,62,21,SWE,220,L,D,European Skater,SWEDEN,Modo,1,12,198.12


In [25]:
data['prospect_gp'].value_counts()

60    15
61    13
65    12
71    12
67    11
62    11
64    11
66    10
57     9
55     9
63     9
68     9
49     9
56     8
58     8
69     7
42     7
54     7
70     7
37     7
52     6
53     6
72     6
59     6
50     5
51     5
48     5
40     4
46     4
43     4
44     4
45     4
36     3
33     3
31     3
18     2
85     2
80     2
47     2
86     2
84     2
88     2
35     2
75     2
38     2
74     1
28     1
34     1
91     1
16     1
90     1
2      1
39     1
26     1
87     1
83     1
78     1
41     1
Name: prospect_gp, dtype: int64

In [6]:
#split into target variables and features 
X = data.drop(columns=['200+games']) #features 
y = data['200+games'] #target variables 

In [7]:
#creating test-train-split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)

In [50]:
## Logistic regression without parameter tuning 
## tester
#create a pipeline for each numerical and categorical column 
numeric_transform = Pipeline([('scaling', StandardScaler())])
categorical_transform = Pipeline([('one-hot-encode', OneHotEncoder(handle_unknown='ignore', sparse=False))])

#indicating numerica/categorical column 
preprocessing = ColumnTransformer([('numeric', numeric_transform, ['prospect_gp', 'prospect_g', 'prospect_a', 'prospect_pim', 'prospect_pm', 'weight', 'height_cm']),
                                        ('categorical', categorical_transform, ['birth_country', 'shoots', 'position', 'prospect_category', 'amateur_league', 'birth_month'])])

#final pipeline
pipeline = Pipeline(steps= [('preprocessing', preprocessing),
                            ('model', LogisticRegression())])
pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f'Test set accuracy: {acc}')


Test set accuracy: 0.5423728813559322


In [9]:
#creating a pipeline 
#model 1
#how to handle numerical and categorical variables 
numeric_transform = Pipeline([('scaling', StandardScaler())])
categorical_transform = Pipeline([('one-hot-encode', OneHotEncoder(handle_unknown='ignore', sparse=False))])

#indicating numerica/categorical column 
preprocessing_tips = ColumnTransformer([('numeric', numeric_transform, ['prospect_gp', 'prospect_g', 'prospect_a', 'prospect_pim', 'prospect_pm', 'weight', 'height_cm']),
                                        ('categorical', categorical_transform, ['birth_country', 'shoots', 'position', 'prospect_category', 'amateur_league',  'amateur_team', 'birth_month'])])

#pipeline
pipeline = Pipeline(steps= [('preprocessing', preprocessing_tips),
                            ('select_best', SelectKBest()),
                            ('classifier', LogisticRegression())])


param_grid = {'classifier': [LogisticRegression(), SVC(), RandomForestClassifier(), RidgeClassifier()],
              'select_best__k': [3,4,5,6,7]}

grid = GridSearchCV(pipeline, param_grid=param_grid, cv=5)
grid.fit(X_train, y_train)

best_model = grid.best_estimator_
best_hyperparams = grid.best_params_
best_acc = grid.score(X_test, y_test)
print(f'Best test set accuracy: {best_acc}\nAchieved with hyperparameters: {best_hyperparams}')

Best test set accuracy: 0.5254237288135594
Achieved with hyperparameters: {'classifier': RandomForestClassifier(), 'select_best__k': 7}


In [43]:
# Save the model
#with open('model1-Mar23', 'wb') as f:
 #   pickle.dump(grid, f)

In [None]:
#creating a pipeline 
#model 2
#how to handle numerical and categorical variables 
numeric_transform = Pipeline([('scaling', StandardScaler())])
categorical_transform = Pipeline([('one-hot-encode', OneHotEncoder(handle_unknown='ignore', sparse=False))])

#indicating numerica/categorical column 
preprocessing_tips = ColumnTransformer([('numeric', numeric_transform, ['prospect_gp', 'prospect_g', 'prospect_a', 'prospect_pim', 'prospect_pm', 'weight', 'height_cm']),
                                        ('categorical', categorical_transform, ['birth_country', 'shoots', 'position', 'prospect_category', 'amateur_league',  'amateur_team', 'birth_month'])])

#pipeline
pipeline = Pipeline(steps= [('preprocessing', preprocessing_tips),
                            ('select_best', SelectKBest()),
                            ('classifier', LogisticRegression())])


param_grid = {'classifier': [LogisticRegression(), SVC(), RandomForestClassifier(), RidgeClassifier()],
              'select_best__k': [3,4,5,6,7]}

grid = GridSearchCV(pipeline, param_grid=param_grid, cv=5)
grid.fit(X_train, y_train)

best_model = grid.best_estimator_
best_hyperparams = grid.best_params_
best_acc = grid.score(X_test, y_test)
print(f'Best test set accuracy: {best_acc}\nAchieved with hyperparameters: {best_hyperparams}')