In [1]:
import pandas as pd 
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import RidgeClassifier 
import seaborn as sns
from sklearn.metrics import accuracy_score
import pickle
from matplotlib import pyplot as plt
sns.set_theme(context = 'talk', style = 'darkgrid')

In [5]:
data = pd.read_csv(r'C:\Users\akobe\lighthouse-data-notes\Final-Data\final_data\all_transformed_data.csv', index_col = [0])

In [6]:
data = data.drop(['dob', 'draft_year', 'translation_factor', 'eq_pts'], axis=1)

In [8]:
data.head(2)

Unnamed: 0,prospect_gp,prospect_pim,prospect_pm,birth_country,weight,shoots,position,prospect_category,amateur_league,amateur_team,200+games,birth_month,height_cm,eq_g,eq_a,oGVT,dGVT,overall_GVT
0,56,54,10,CAN,209,L,C,North American Skater,OHL,London,1,9,185.42,21.33,16.91,0.37,0.06,0.43
1,45,62,21,SWE,220,L,D,European Skater,SWEDEN,Modo,1,12,198.12,6.49,14.84,0.58,0.17,0.75


In [9]:
#split into target variables and features 
X = data.drop(columns=['200+games']) #features 
y = data['200+games'] #target variables 

In [12]:
#creating test-train-split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)

In [17]:
#creating a pipeline 
#model 1
#how to handle numerical and categorical variables 
numeric_transform = Pipeline([('scaling', StandardScaler())])
categorical_transform = Pipeline([('one-hot-encode', OneHotEncoder(handle_unknown='ignore', sparse=False))])

#indicating numerica/categorical column 
preprocessing_tips = ColumnTransformer([('numeric', numeric_transform, ['prospect_gp', 'prospect_pim', 'prospect_pm', 'weight', 'height_cm', 'eq_g', 'eq_a', 'oGVT', 'dGVT', 'overall_GVT']),
                                        ('categorical', categorical_transform, ['birth_country', 'shoots', 'position', 'prospect_category', 'amateur_league',  'amateur_team', 'birth_month'])])

#pipeline
pipeline = Pipeline(steps= [('preprocessing', preprocessing_tips),
                            ('select_best', SelectKBest()),
                            ('classifier', LogisticRegression())])


param_grid = {'classifier': [LogisticRegression(), SVC(), RandomForestClassifier(), RidgeClassifier()],
              'select_best__k': [3,4,5,6,7]}

grid = GridSearchCV(pipeline, param_grid=param_grid, cv=5)
grid.fit(X_train, y_train)

best_model = grid.best_estimator_
best_hyperparams = grid.best_params_
best_acc = grid.score(X_test, y_test)
print(f'Best test set accuracy: {best_acc}\nAchieved with hyperparameters: {best_hyperparams}')

Best test set accuracy: 0.6440677966101694
Achieved with hyperparameters: {'classifier': RandomForestClassifier(), 'select_best__k': 7}


In [15]:
# Save the model
with open('model1_transformed_data', 'wb') as f:
    pickle.dump(grid, f)