In [1]:
import pandas as pd 
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import RidgeClassifier 
import seaborn as sns
from sklearn.metrics import accuracy_score
import pickle

In [3]:
# index_col=[0] reuses the CSV's first column as the index, so no extra index column is created
# NOTE(review): absolute, machine-specific path - confirm where the file lives before running elsewhere
DATA_PATH = r'C:\Users\akobe\lighthouse-data-notes\Final-Data\final_data\all_merged_data_full_dob.csv'
data = pd.read_csv(DATA_PATH, index_col=[0])

In [4]:
# preview the first two rows of the freshly loaded frame
data.head(2)

Unnamed: 0,nhl_id,elite_id,name,dob,draft_year,draft_season,prospect_gp,prospect_g,prospect_a,prospect_pts,...,position,prospect_category,amateur_league,amateur_team,nhl_games_played,200+games,birth_year,birth_month,birth_day,height_cm
0,8475166,9223,John Tavares,1990-09-20,2009,2008-2009,56,58,46,104,...,C,North American Skater,OHL,London,1017,1,1990,9,20,185.42
1,8475167,6007,Victor Hedman,1990-12-18,2009,2008-2009,45,7,16,23,...,D,European Skater,SWEDEN,Modo,964,1,1990,12,18,198.12


In [5]:
#columns that are not wanted for the rest of the notebook
unwanted_columns = [
    'nhl_id',
    'elite_id',
    'name',
    'draft_season',
    'nhl_games_played',
    'pick_no',
    'team',
    'height',
]
data = data.drop(columns=unwanted_columns)

In [6]:
# confirm the unwanted columns are gone
data.head(2)

Unnamed: 0,dob,draft_year,prospect_gp,prospect_g,prospect_a,prospect_pts,prospect_pim,prospect_pm,birth_country,weight,shoots,position,prospect_category,amateur_league,amateur_team,200+games,birth_year,birth_month,birth_day,height_cm
0,1990-09-20,2009,56,58,46,104,54,10,CAN,209,L,C,North American Skater,OHL,London,1,1990,9,20,185.42
1,1990-12-18,2009,45,7,16,23,62,21,SWE,220,L,D,European Skater,SWEDEN,Modo,1,1990,12,18,198.12


In [7]:
# age as a whole-year difference between draft year and birth year
# (birth month/day are not taken into account)
data['age'] = data['draft_year'].sub(data['birth_year'])

In [8]:
def calc_prorated(df, prorated_games=70):
    ''' calculates player goals/assists/points prorated to a fixed-length season
        (stat / games played) * prorated_games, rounded to 2 decimals

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'prospect_g', 'prospect_a', 'prospect_pts' and
        'prospect_gp'. Modified in place: the columns 'pro_g', 'pro_a'
        and 'pro_pts' are added (or overwritten).
    prorated_games : int or float, default 70
        Season length the per-game rates are scaled to.

    Returns
    -------
    None
        The new columns are written directly onto ``df``.
    '''
    # A player with zero games played has no per-game rate. Mask those rows to
    # NaN so the result is a plain missing value instead of inf, which
    # downstream imputers/estimators cannot handle.
    games_played = df['prospect_gp'].where(df['prospect_gp'] != 0)

    stat_columns = (
        ('prospect_g', 'pro_g'),
        ('prospect_a', 'pro_a'),
        ('prospect_pts', 'pro_pts'),
    )
    for source_col, target_col in stat_columns:
        df[target_col] = ((df[source_col] / games_played) * prorated_games).round(2)
    

In [9]:
# adds the pro_g / pro_a / pro_pts columns to data in place
calc_prorated(data)

In [25]:
#define and apply league translation factor
def apply_league_quality_translation(df, league_factors=None):

    ''' adjusts players g/a/pts based on league quality
            multiply the prorated g/a/pts by the league's translation factor

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'amateur_league', 'pro_g', 'pro_a' and 'pro_pts'.
        Modified in place: 'translation_factor', 'transl_g', 'transl_a'
        and 'transl_pts' are added (or overwritten).
    league_factors : dict, optional
        Extra or overriding ``{amateur_league label: factor}`` entries that
        are layered on top of the built-in label mapping. Defaults to None
        (built-in mapping only).

    Returns
    -------
    None
        The new columns are written directly onto ``df``.
    '''

    #translation factors 2016-2017
    base_factors = {
        'KHL': 0.77,
        'SHL': 0.62,          # Swedish hockey league
        'Finland': 0.46,      # Finland SM-liiga
        'WCHA': 0.44,         # pre-2013 - Western Collegiate Hockey Association
        'NCHC': 0.43,
        'NLA': 0.43,          # Switzerland
        'Hockey East': 0.38,
        'Big 10': 0.33,
        'CCHA': 0.32,         # Central Collegiate Hockey Association, now defunct
        'OHL': 0.31,
        'WHL': 0.28,
        'QMJHL': 0.25,
        'ECAC': 0.23,
        'Czech': 0.46,
    }

    #fallback for leagues without their own entry: mean of all factors above
    other_transl = round(sum(base_factors.values()) / len(base_factors), 2)

    #'amateur_league' label -> translation factor
    # 'SWEDEN-2' is given the same factor as 'SWEDEN'.
    # NOTE(review): the NLA and ECAC factors feed the fallback average but no
    # label is mapped to them here, so those leagues receive the fallback.
    # Confirm the dataset's labels and pass them via league_factors if needed.
    factor_by_label = {
        'OHL': base_factors['OHL'],
        'SWEDEN': base_factors['SHL'],
        'SWEDEN-2': base_factors['SHL'],
        'WHL': base_factors['WHL'],
        'QMJHL': base_factors['QMJHL'],
        'WCHA': base_factors['WCHA'],
        'FINLAND': base_factors['Finland'],
        'CCHA': base_factors['CCHA'],
        'H-EAST': base_factors['Hockey East'],
        'KHL': base_factors['KHL'],
        'BIG10': base_factors['Big 10'],
        'CZECH': base_factors['Czech'],
        'NCHC': base_factors['NCHC'],
    }
    if league_factors:
        factor_by_label.update(league_factors)

    #creates translation factor column (unknown or missing leagues get the fallback)
    df['translation_factor'] = [
        factor_by_label.get(league, other_transl) for league in df['amateur_league']
    ]

    #calculates g/a/pts based on league translation factor
    for source_col, target_col in (('pro_g', 'transl_g'),
                                   ('pro_a', 'transl_a'),
                                   ('pro_pts', 'transl_pts')):
        df[target_col] = (df[source_col] * df['translation_factor']).round(2)
    

In [28]:
#apply the function: adds translation_factor, transl_g, transl_a and transl_pts to data in place
apply_league_quality_translation(data)

In [29]:
# inspect the newly added age, prorated and translated columns
data.head()

Unnamed: 0,dob,draft_year,prospect_gp,prospect_g,prospect_a,prospect_pts,prospect_pim,prospect_pm,birth_country,weight,...,birth_day,height_cm,age,pro_g,pro_a,pro_pts,translation_factor,transl_g,transl_a,transl_pts
0,1990-09-20,2009,56,58,46,104,54,10,CAN,209,...,20,185.42,19,72.5,57.5,130.0,0.31,22.48,17.82,40.3
1,1990-12-18,2009,45,7,16,23,62,21,SWE,220,...,18,198.12,19,10.89,24.89,35.78,0.62,6.75,15.43,22.18
2,1991-01-16,2009,57,31,48,79,42,32,CAN,196,...,16,182.88,18,38.07,58.95,97.02,0.31,11.8,18.27,30.08
3,1991-08-02,2009,61,48,48,96,89,51,CAN,176,...,2,185.42,18,55.08,55.08,110.16,0.28,15.42,15.42,30.84
4,1991-08-22,2009,70,32,56,88,82,20,CAN,192,...,22,182.88,18,32.0,56.0,88.0,0.28,8.96,15.68,24.64
