In [1]:
import pandas as pd 
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import RidgeClassifier 
import seaborn as sns
from sklearn.metrics import accuracy_score
import pickle
from matplotlib import pyplot as plt
sns.set_theme(context = 'talk', style = 'darkgrid')

In [2]:
data = pd.read_csv(r'C:\Users\akobe\lighthouse-data-notes\Final-Data\final_data\all_transformed_data.csv', index_col = [0])

In [3]:
data = data.drop(['prospect_gp', 'dob', 'draft_year', 'translation_factor', 'eq_pts', 'amateur_team', 'birth_country', 'amateur_league'], axis=1)

In [4]:
def positions(df):
    position_2 = []
    
    for i in df['position']: 
        if i == 'D':
            position_2.append('D')
        else:
            position_2.append('F')
    
    df['position_2'] = position_2

In [5]:
positions(data)

In [6]:
data.head(2)

Unnamed: 0,prospect_pim,prospect_pm,weight,shoots,position,prospect_category,200+games,birth_month,height_cm,eq_g,eq_a,oGVT,dGVT,overall_GVT,position_2
0,54,10,209,L,C,North American Skater,1,9,185.42,21.33,16.91,0.37,0.06,0.43,F
1,62,21,220,L,D,European Skater,1,12,198.12,6.49,14.84,0.58,0.17,0.75,D


In [7]:
#split into target variables and features 
X = data.drop(columns=['200+games', 'position']) #features 
y = data['200+games'] #target variables 

In [8]:
#creating test-train-split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)

In [11]:
#creating a pipeline 
#model 1 - to determine best hyperparameters 
#how to handle numerical and categorical variables 
numeric_transform = Pipeline([('scaling', StandardScaler())])
categorical_transform = Pipeline([('one-hot-encode', OneHotEncoder(handle_unknown='ignore', sparse=False))])

#indicating numerica/categorical column 
preprocessing_tips = ColumnTransformer([('numeric', numeric_transform, ['prospect_pim', 'prospect_pm', 'weight', 'height_cm', 'eq_g', 'eq_a', 'oGVT', 'dGVT', 'overall_GVT']),
                                        ('categorical', categorical_transform, ['shoots', 'prospect_category',  'birth_month', 'position_2'])])

#pipeline
pipeline = Pipeline(steps= [('preprocessing', preprocessing_tips),
                            ('select_best', SelectKBest()),
                            ('classifier', LogisticRegression())])


param_grid = {'classifier': [LogisticRegression(), SVC(), RandomForestClassifier(), RidgeClassifier()],
              'select_best__k': [3,4,5,6]}

grid = GridSearchCV(pipeline, param_grid=param_grid, cv=5)
grid.fit(X_train, y_train)

best_model = grid.best_estimator_
best_hyperparams = grid.best_params_
best_acc = grid.score(X_test, y_test)

print(f'Best test set accuracy: {best_acc}\nAchieved with hyperparameters: {best_hyperparams}')



Best test set accuracy: 0.6271186440677966
Achieved with hyperparameters: {'classifier': RandomForestClassifier(), 'select_best__k': 6}


In [22]:
#creating a pipeline 
#how to handle numerical and categorical variables 
numeric_transform = Pipeline([('scaling', StandardScaler())])
categorical_transform = Pipeline([('one-hot-encode', OneHotEncoder(handle_unknown='ignore', sparse=False))])

#indicating numerica/categorical column 
preprocessing = ColumnTransformer([('numeric', numeric_transform, ['prospect_pim', 'prospect_pm', 'weight', 'height_cm', 'eq_g', 'eq_a', 'oGVT', 'dGVT', 'overall_GVT']),
                                        ('categorical', categorical_transform, ['shoots', 'prospect_category',  'birth_month', 'position_2'])])

#pipeline
pipeline = Pipeline(steps= [('preprocessing', preprocessing),
                            ('select_best', SelectKBest(k=5)),
                            ('classifier', RandomForestClassifier())])


pipeline.fit(X_train, y_train)

best_acc = round(pipeline.score(X_test, y_test), 2)
best_features = pipeline.named_steps['select_best'].get_support()

print(f'best test accuracy: {best_acc}.')
print(f'The best features are: {best_features}')

best test accuracy: 0.68.
The best features are: [False False False False  True  True False  True  True False False False
 False False False False False False False False False False  True False
 False False False]


In [23]:
# Save the model
with open('model21_transformed_data.pkl', 'wb') as f:
    pickle.dump(pipeline, f)

In [33]:
test_data = {'prospect_pim': [54],  'prospect_pm': [10], 'weight': [209], 'shoots': ['L'], 'prospect_category': ['North American Skater'], 'birth_month': [9], 'height_cm': [185.42], 
                 'eq_g': [21.33], 'eq_a': [19.91], 'oGVT': [0.37], 'dGVT': [0.06], 'overall_GVT': [0.43], 'position_2': ['F']}

In [44]:
test_data_df =  pd.DataFrame.from_dict(test_data)

In [46]:
pipeline.predict(test_data_df)

array([1], dtype=int64)

In [45]:
test_data_df.head()

Unnamed: 0,prospect_pim,prospect_pm,weight,shoots,prospect_category,birth_month,height_cm,eq_g,eq_a,oGVT,dGVT,overall_GVT,position_2
0,54,10,209,L,North American Skater,9,185.42,21.33,19.91,0.37,0.06,0.43,F


In [15]:
X_train.head(1)

Unnamed: 0,prospect_pim,prospect_pm,weight,shoots,prospect_category,birth_month,height_cm,eq_g,eq_a,oGVT,dGVT,overall_GVT,position_2
232,20,11,185,L,North American Skater,11,187.96,2.72,10.61,-0.56,-1.02,-1.58,D


In [18]:
X_train_cat = data.drop(['prospect_pim', 'prospect_pm', 'weight', 'height_cm', 'eq_g', 'eq_a', 'oGVT', 'dGVT', 'overall_GVT', 'position'], axis=1)

In [19]:
X_train_cat.head(1)

Unnamed: 0,shoots,prospect_category,200+games,birth_month,position_2
0,L,North American Skater,1,9,F


In [20]:
ohe = OneHotEncoder()

In [21]:
transformed = ohe.fit_transform(X_train_cat)

In [22]:
print(ohe.categories_)

[array(['L', 'R'], dtype=object), array(['European Skater', 'North American Skater'], dtype=object), array([0, 1], dtype=int64), array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12], dtype=int64), array(['D', 'F'], dtype=object)]
