# The Money Puck Recommender Engine:

In [33]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import pairwise_distances

# Load the datasets
pd.set_option('display.max_columns', None) # Display Preference

In [34]:
#Saving the new MoneyPuck Datafranes as CSVs:
# All Situations:
MP_all_situations_2021_2022_df = pd.read_csv('MP_NHL_data/MoneyPuck_all_situations_2021_2022.csv')
MP_all_situations_2022_2023_df = pd.read_csv('MP_NHL_data/MoneyPuck_all_situations_2022_2023.csv')
MP_all_situations_2023_2024_df = pd.read_csv('MP_NHL_data/MoneyPuck_all_situations_2023_2024.csv')
# 5on5:
MP_5on5_2021_2022_df = pd.read_csv('MP_NHL_data/MoneyPuck_5on5_2021_2022.csv')
MP_5on5_2022_2023_df = pd.read_csv('MP_NHL_data/MoneyPuck_5on5_2022_2023.csv')
MP_5on5_2023_2024_df = pd.read_csv('MP_NHL_data/MoneyPuck_5on5_2023_2024.csv')
# 4on5:
MP_4on5_2021_2022_df = pd.read_csv('MP_NHL_data/MoneyPuck_4on5_2021_2022.csv')
MP_4on5_2022_2023_df = pd.read_csv('MP_NHL_data/MoneyPuck_4on5_2022_2023.csv')
MP_4on5_2023_2024_df = pd.read_csv('MP_NHL_data/MoneyPuck_4on5_2023_2024.csv')
#5on4:
MP_5on4_2021_2022_df = pd.read_csv('MP_NHL_data/MoneyPuck_5on4_2021_2022.csv')
MP_5on4_2022_2023_df = pd.read_csv('MP_NHL_data/MoneyPuck_5on4_2022_2023.csv')
MP_5on4_2023_2024_df = pd.read_csv('MP_NHL_data/MoneyPuck_5on4_2023_2024.csv')
#Other Situations:
MP_other_situations_2021_2022_df = pd.read_csv('MP_NHL_data/MoneyPuck_other_situations_2021_2022.csv')
MP_other_situations_2022_2023_df = pd.read_csv('MP_NHL_data/MoneyPuck_other_situations_2022_2023.csv')
MP_other_situations_2023_2024_df = pd.read_csv('MP_NHL_data/MoneyPuck_other_situations_2023_2024.csv')

In [35]:
#Concatenating the situational dataframes:
#All situations
MP_all_situations_frames = [MP_all_situations_2021_2022_df, MP_all_situations_2022_2023_df, MP_all_situations_2023_2024_df]
MP_AS_stats = pd.concat(MP_all_situations_frames, ignore_index=True)

#5on5:
MP_5on5_frames = [MP_5on5_2021_2022_df, MP_5on5_2022_2023_df, MP_5on5_2023_2024_df]
MP_5on5_stats = pd.concat(MP_5on5_frames, ignore_index=True)

#4on5:
MP_4on5_frames = [MP_4on5_2021_2022_df, MP_4on5_2022_2023_df, MP_4on5_2023_2024_df]
MP_4on5_stats = pd.concat(MP_4on5_frames, ignore_index=True)

#5on4:
MP_5on4_frames = [MP_5on4_2021_2022_df, MP_5on4_2022_2023_df, MP_5on4_2023_2024_df]
MP_5on4_stats = pd.concat(MP_5on4_frames, ignore_index=True)

#Other:
MP_other_situations_frames = [MP_other_situations_2021_2022_df, MP_other_situations_2022_2023_df, MP_other_situations_2023_2024_df]
MP_OS_stats = pd.concat(MP_other_situations_frames, ignore_index=True)

In [36]:
#Saving combined DFs to CSVs:
#All Situations:
MP_AS_stats.to_csv('MP_all_situations_2022_to_2024.csv', index=0)
#5on5:
MP_5on5_stats.to_csv('MP_5on5_2022_to_2024.csv', index=0)
#4on5:
MP_4on5_stats.to_csv('MP_4on5_2022_to_2024.csv', index=0)
#5on4:
MP_5on4_stats.to_csv('MP_5on4_2022_to_2024.csv', index=0)
#Other Situations:
MP_OS_stats.to_csv('MP_other_situations_2022_to_2024.csv', index=0)

### Adding the biographical data

In [37]:
# Reading the players' bio data
MP_player_bios = pd.read_csv('MP_NHL_data/allPlayersLookup.csv')

In [38]:
# Combining the data frames
MP_merged_AS_stats_bios = pd.merge(MP_AS_stats, MP_player_bios, on='playerId', how='left')
MP_merged_5on5_stats_bios = pd.merge(MP_5on5_stats, MP_player_bios, on='playerId', how='left')
MP_merged_4on5_stats_bios = pd.merge(MP_4on5_stats, MP_player_bios, on='playerId', how='left')
MP_merged_5on4_stats_bios = pd.merge(MP_5on4_stats, MP_player_bios, on='playerId', how='left')
MP_merged_OS_stats_bios = pd.merge(MP_OS_stats, MP_player_bios, on='playerId', how='left')

In [39]:
# Columns to drop after merge:
drop_cols_post_merge = ['primaryNumber', 'primaryPosition', 'name_y', 'position_y', 'team_y']

MP_merged_AS_stats_bios.drop(columns=drop_cols_post_merge, inplace=True)
MP_merged_5on5_stats_bios.drop(columns=drop_cols_post_merge, inplace=True)
MP_merged_4on5_stats_bios.drop(columns=drop_cols_post_merge, inplace=True)
MP_merged_5on4_stats_bios.drop(columns=drop_cols_post_merge, inplace=True)
MP_merged_OS_stats_bios.drop(columns=drop_cols_post_merge, inplace=True)

In [40]:
# Renaming the columns that were shared in the merge:
col_rename_map = {'name_x': 'name', 'team_x': 'team', 'position_x': 'position','shootsCatches': 'shoots'}
MP_AS_stats = MP_merged_AS_stats_bios.rename(mapper=col_rename_map, axis=1)
MP_5on5_stats = MP_merged_5on5_stats_bios.rename(mapper=col_rename_map, axis=1)
MP_4on5_stats = MP_merged_4on5_stats_bios.rename(mapper=col_rename_map, axis=1)
MP_5on4_stats = MP_merged_5on4_stats_bios.rename(mapper=col_rename_map, axis=1)
MP_OS_stats = MP_merged_OS_stats_bios.rename(mapper=col_rename_map, axis=1)

## Feature Engineering:

### Average Ice Time Column

In [41]:
# Making an Avg IceTime per shift column

# All Situations
MP_AS_stats['avg_ice_time/shift (s)'] = round((MP_AS_stats['icetime'] / MP_AS_stats['shifts']), 0)
cols = MP_AS_stats.columns.tolist()
cols.insert(9, cols.pop(cols.index('avg_ice_time/shift (s)')))
MP_all_situations_stats = MP_AS_stats[cols]

#5on5:
MP_5on5_stats['avg_ice_time/shift (s)'] = round((MP_5on5_stats['icetime'] / MP_5on5_stats['shifts']), 0)
cols_5on5 = MP_5on5_stats.columns.tolist()
cols_5on5.insert(9, cols_5on5.pop(cols_5on5.index('avg_ice_time/shift (s)')))
MP_5on5_stats = MP_5on5_stats[cols_5on5]

#4on5:
MP_4on5_stats['avg_ice_time/shift (s)'] = round((MP_4on5_stats['icetime'] / MP_4on5_stats['shifts']), 0)
cols_4on5 = MP_4on5_stats.columns.tolist()
cols_4on5.insert(9, cols_4on5.pop(cols_4on5.index('avg_ice_time/shift (s)')))
MP_4on5_stats = MP_4on5_stats[cols_4on5]

#5on4:
MP_5on4_stats['avg_ice_time/shift (s)'] = round((MP_5on4_stats['icetime'] / MP_5on4_stats['shifts']), 0)
cols_5on4 = MP_5on4_stats.columns.tolist()
cols_5on4.insert(9, cols_5on4.pop(cols_5on4.index('avg_ice_time/shift (s)')))
MP_5on4_stats = MP_5on4_stats[cols_5on4]

#Other situations:
MP_OS_stats['avg_ice_time/shift (s)'] = round((MP_OS_stats['icetime'] / MP_OS_stats['shifts']), 0)
cols_OS = MP_OS_stats.columns.tolist()
cols_OS.insert(9, cols_OS.pop(cols_OS.index('avg_ice_time/shift (s)')))
MP_OS_stats = MP_OS_stats[cols_OS]


### Average shifts per game column:

In [42]:
# Making an Avg shifts per game column
#AS:
MP_AS_stats['avg_shifts_per_game'] = round(MP_AS_stats['shifts']/MP_AS_stats['games_played'], 0)
#5on5:
MP_5on5_stats['avg_shifts_per_game'] = round(MP_5on5_stats['shifts']/MP_5on5_stats['games_played'], 0)
#4on5:
MP_4on5_stats['avg_shifts_per_game'] = round(MP_4on5_stats['shifts']/MP_4on5_stats['games_played'], 0)
#5on4:
MP_5on4_stats['avg_shifts_per_game'] = round(MP_5on4_stats['shifts']/MP_5on4_stats['games_played'], 0)
#OS:
MP_OS_stats['avg_shifts_per_game'] = round(MP_OS_stats['shifts']/MP_OS_stats['games_played'], 0)

### Adjusting seasn to be the year the season finished to help get an accurate player age:

In [43]:
# Update the season info to represent the year the season ended rather than the year that started the season.

season_map = {2021: 2022, 2022: 2023, 2023: 2024}
MP_AS_stats['season'] = MP_AS_stats['season'].map(season_map)
MP_5on5_stats['season'] = MP_5on5_stats['season'].map(season_map)
MP_4on5_stats['season'] = MP_4on5_stats['season'].map(season_map)
MP_5on4_stats['season'] = MP_5on4_stats['season'].map(season_map)
MP_OS_stats['season'] = MP_OS_stats['season'].map(season_map)

### Player age column:

In [44]:
def MP_calculate_playing_age(df, dob_col_name, season_col_name, age_col_name):
    """
    Updates the age of players in the DataFrame based on their date of birth.

    Parameters:
    df (pd.DataFrame): The DataFrame containing player data.
    dob_col_name (str): The name of the column with date of birth information.
    age_col_name (str): The name of the column where the age should be updated.
    current_year (int): The year to calculate current age from.

    Returns:
    pd.DataFrame: The DataFrame with updated ages.
    """
    # Convert the 'birthDate' column to datetime format
    df[dob_col_name] = pd.to_datetime(df[dob_col_name], errors='coerce')  # Handle potential errors during conversion

    # Extract the year
    df['birth_year'] = df[dob_col_name].dt.year

    # Calculate the new age and replace the 'Age' column
    df[age_col_name] = df[season_col_name] - df['birth_year']

    # Drop the helper column
    df.drop(columns='birth_year', inplace=True)

    return df

In [45]:
#Making an Age column based on taking the season column and subtracting the year from the birthdate using the function I made
MP_AS_stats = MP_calculate_playing_age(df=MP_AS_stats, dob_col_name='birthDate', 
                                    season_col_name='season', age_col_name='age')
MP_5on5_stats = MP_calculate_playing_age(df=MP_5on5_stats, dob_col_name='birthDate', 
                                    season_col_name='season', age_col_name='age')
MP_4on5_stats = MP_calculate_playing_age(df=MP_4on5_stats, dob_col_name='birthDate', 
                                    season_col_name='season', age_col_name='age')
MP_5on4_stats = MP_calculate_playing_age(df=MP_5on4_stats, dob_col_name='birthDate', 
                                    season_col_name='season', age_col_name='age')
MP_OS_stats = MP_calculate_playing_age(df=MP_OS_stats, dob_col_name='birthDate', 
                                    season_col_name='season', age_col_name='age')

In [46]:
# Handling the places where ages missing because the NaNs prevent the Pipelines from working
missing_age_dict = {
    'Adam Edstrom': 23,
 'Adam Ginning': 24,
 'Adam Klapka': 23,
 'Akil Thomas': 24,
 'Aku Raty': 23,
 'Alex Vlasic': 23,
 'Andy Andreoff': 33,
 'Angus Crookshank': 24,
 'Anton Levtchi': 28,
 'Arshdeep Bains': 23,
 'Blake Lizotte': 26,
 'Brad Lambert': 20,
 'Bradly Nadeau': 19,
 'Brandon Gignac': 26,
 'Brandon Scanlin': 25,
 'Brendan Brisson': 22,
 'Brennan Othmann': 21,
 'Brian Halonen': 25,
 'Cameron Butler': 22,
 'Cameron Crotty': 25,
 'Collin Graf': 21,
 'Cutter Gauthier': 20,
 'Declan Carlile': 24,
 'Elliot Desnoyers': 22,
 'Emil Heineman': 22,
 'Emil Lilleberg': 23,
 'Ethan Del Mastro': 21,
 'Filip Roos': 25,
 'Frank Nazar': 20,
 'Gage Goncalves': 23,
 'Gavin Brindley': 19,
 'Georgii Merkulov':23,
 'Graeme Clarke': 23,
 'Hudson Fasching': 29,
 'Isak Rosen': 21,
 'Ivan Miroshnichenko': 20,
 'Jack St. Ivany': 25,
 'Jack Thompson': 22,
 'Jackson Blake': 21,
 'Jacob MacDonald': 31,
 'James Malatesta': 21,
 'Jason Polin': 25,
 'Jayden Struble': 22,
 'Jeff Malott': 28,
 'Jiri Kulich': 20,
 'Jiri Smejkal': 27,
 'Josh Doan': 22,
 'Joshua Roy': 21,
 'Justin Brazeau': 26,
 'Kyle MacLean': 25,
 'Landon Slaggert': 22,
 'Lane Hutson': 20,
 'Liam Ohgren': 20,
 'Linus Karlsson': 24,
 'Logan Mailloux': 21,
 'Logan Morrison': 22,
 'Logan Stankoven': 21,
 'Louis Crevier': 23,
 'Luca Del Bel Belluz': 20,
 'Lukas Cormier': 22,
 'Maksymilian Szuber': 21,
 'Marat Khusnutdinov': 22,
 'Marc Johnstone': 28,
 'Marshall Rifai': 26,
 'Mason Marchment': 29,
 'Mason Morelli': 28,
 'Matt Rempe': 22,
 'Matt Roy': 29,
 'Matt Savoie': 20,
 'Mavrik Bourque': 22,
 'Maxwell Crozier': 24,
 'Nathan Bastian': 26,
 'Nikita Chibrikov': 21,
 'Olen Zellweger': 20,
 'Olle Lycksell': 24,
 'Ondrej Pavel': 23,
 'Oskar Steen': 26,
 'Patrik Koch': 27,
 'Philip Kemp': 25,
 'Pierrick Dube': 23,
 'Ruslan Iskhakov': 24,
 'Ryan Winterton': 20,
 'Ryker Evans': 22,
 'Sam Colangelo': 22,
 'Sam Malinski': 26,
 'Samuel Laberge': 27,
 'Scott Morrow': 21,
 'Shakir Mukhamadullin': 22,
 'Simon Nemec': 20,
 'Vasily Ponomarev': 22,
 'William Lockwood': 26,
 'Wyatt Kaiser': 22,
 'Yan Kuznetsov': 22,
 'Zach Dean': 21,
 'Zachary Hayes': 25,
 'Zack Bolduc': 21,
 'Zack Ostapchuk': 21
 }

In [47]:
# Applying the missing_age_dict to the original dataframes
MP_AS_stats['age'] = MP_AS_stats.apply(
    lambda row: missing_age_dict.get(row['name'], row['age']),
    axis=1
)
MP_5on5_stats['age'] = MP_5on5_stats.apply(
    lambda row: missing_age_dict.get(row['name'], row['age']),
    axis=1
)

MP_4on5_stats['age'] = MP_4on5_stats.apply(
    lambda row: missing_age_dict.get(row['name'], row['age']),
    axis=1
)

MP_5on4_stats['age'] = MP_5on4_stats.apply(
    lambda row: missing_age_dict.get(row['name'], row['age']),
    axis=1
)

MP_OS_stats['age'] = MP_OS_stats.apply(
    lambda row: missing_age_dict.get(row['name'], row['age']),
    axis=1
)

### Making a binnned age_group column based on age:

In [48]:
# COLUMN EDITS
# Age Column: Making Age Bins 
bins = [0, 20, 26, 30, 35, 45]
labels = ['New Pro', 'Young Pro', 'Prime Age', 'Vet', 'Old Vet']

MP_AS_stats['age_group'] = pd.cut(MP_AS_stats['age'], bins, labels=labels)
MP_5on5_stats['age_group'] = pd.cut(MP_5on5_stats['age'], bins, labels=labels)
MP_4on5_stats['age_group'] = pd.cut(MP_4on5_stats['age'], bins, labels=labels)
MP_5on4_stats['age_group'] = pd.cut(MP_5on4_stats['age'], bins, labels=labels)
MP_OS_stats['age_group'] = pd.cut(MP_OS_stats['age'], bins, labels=labels)


## Making the ZR_gameScore, playerRating and ZR_playerRating columns:

In [49]:
def calculate_ZR_gameScore(df):
    """
    Calculates the ZR_gameScore for a given DataFrame.

    Args:
        df: The DataFrame containing player statistics.

    Returns:
        The DataFrame with the 'ZR_gameScore' column added.
    """

    df['ZR_gameScore'] = (
        (df['I_F_goals'] * 0.75) 
        + (df['I_F_primaryAssists'] * 0.7) 
        + (df['I_F_secondaryAssists'] * 0.55)
        + (df['I_F_shotsOnGoal'] * 0.075) 
        + (df['shotsBlockedByPlayer'] * 0.05) 
        + (df['penaltiesDrawn'] * 0.15) 
        - (df['penalties'] * 0.15)
        + (df['I_F_hits'] * 0.01) 
        - (df['I_F_dZoneGiveaways'] * 0.03) 
        + (df['I_F_takeaways'] * 0.015) 
        - (df['I_F_giveaways'] * 0.015)
        + (df['onIce_corsiPercentage']) 
        + (df['faceoffsWon'] * 0.01) 
        - (df['faceoffsLost'] * 0.01)
        + (df['OnIce_F_goals'] * 0.15) 
        - (df['OnIce_A_goals'] * 0.15)
    )

    return df

# Apply the function to your DataFrames
MP_AS_stats = calculate_ZR_gameScore(MP_AS_stats)
MP_5on5_stats = calculate_ZR_gameScore(MP_5on5_stats)
MP_4on5_stats = calculate_ZR_gameScore(MP_4on5_stats)
MP_5on4_stats = calculate_ZR_gameScore(MP_5on4_stats)
MP_OS_stats = calculate_ZR_gameScore(MP_OS_stats)

In [50]:
# Group by 'season' and apply MinMaxScaler within each group
def scale_by_season(group):
    if len(group) == 1:  # Handle the case of only one value in a season
        return 100  # Assign the maximum rating if there's only one player
    else:
        scaler = MinMaxScaler(feature_range=(0, 100))
        return scaler.fit_transform(group.values.reshape(-1, 1)).ravel()

# Apply the transformation to create the 'playerRating' column
MP_AS_stats['playerRating'] = MP_AS_stats.groupby('season')['gameScore'].transform(scale_by_season)
MP_5on5_stats['playerRating'] = MP_5on5_stats.groupby('season')['gameScore'].transform(scale_by_season)
MP_4on5_stats['playerRating'] = MP_4on5_stats.groupby('season')['gameScore'].transform(scale_by_season)
MP_5on4_stats['playerRating'] = MP_5on4_stats.groupby('season')['gameScore'].transform(scale_by_season)
MP_OS_stats['playerRating'] = MP_OS_stats.groupby('season')['gameScore'].transform(scale_by_season)

# Apply the transformation to create the 'ZR_playerRating' column
MP_AS_stats['ZR_playerRating'] = MP_AS_stats.groupby('season')['ZR_gameScore'].transform(scale_by_season)
MP_5on5_stats['ZR_playerRating'] = MP_5on5_stats.groupby('season')['ZR_gameScore'].transform(scale_by_season)
MP_4on5_stats['ZR_playerRating'] = MP_4on5_stats.groupby('season')['ZR_gameScore'].transform(scale_by_season)
MP_5on4_stats['ZR_playerRating'] = MP_5on4_stats.groupby('season')['ZR_gameScore'].transform(scale_by_season)
MP_OS_stats['ZR_playerRating'] = MP_OS_stats.groupby('season')['ZR_gameScore'].transform(scale_by_season)

In [51]:
#Saving the new feature engineered dataframes to csvs:
MP_AS_stats.to_csv('MP_AS_stats_bios_new_features.csv', index=0)
MP_5on5_stats.to_csv('MP_5on5_stats_bios_new_features.csv', index=0)
MP_4on5_stats.to_csv('MP_4on5_stats_bios_new_features.csv', index=0)
MP_5on4_stats.to_csv('MP_5on4_stats_bios_new_features.csv', index=0)
MP_OS_stats.to_csv('MP_OS_stats_bios_new_features.csv', index=0)

## Making the functions of the recommender engine so that it is more user friendly:

In [52]:
def MP_create_player_index_dict(df):
      """
    Create a nested dictionary from a DataFrame that maps player names to their indices for each season.

    This function resets the index of the DataFrame to ensure that the index column 
    holds the original row indices. It then groups the DataFrame by 'name' and 'season' 
    and aggregates the indices into a list for each group. After grouping, it pivots the DataFrame 
    so each players' 'name' is a row with each 'season' as columns, containing lists of indices 
    as values. Finally, it converts the pivoted DataFrame into a nested dictionary where each player's 
    name is a key to a dictionary mapping each season to the player's indices.

    Parameters:
    df (pandas.DataFrame): The DataFrame to process, which must contain 'Player' and 'Season' columns 
                           and has a unique index.

    Returns:
    dict: A nested dictionary where the first level keys are player names, and second level keys are 
          seasons, each mapping to a list of index positions for that player in that season.
    """

    # Reset the index 
      df = df.reset_index()

    # Group by 'Player' and 'Season', then aggregate the original index values into a list.
      grouped = df.groupby(['name', 'season'])['index'].agg(lambda x: list(x)).reset_index()

    # Pivot the DataFrame to have 'Player' as rows and 'Season' as columns with list of indices as values.
      pivot_df = grouped.pivot(index='name', columns='season', values='index')

    # Convert the pivoted DataFrame into a nested dictionary.
      MP_player_index_dict = pivot_df.apply(lambda row: row.dropna().to_dict(), axis=1).to_dict()

      return MP_player_index_dict

In [53]:
# Saving Index Dict Variables:
MP_AS_player_dict = MP_create_player_index_dict(MP_AS_stats)
MP_5on5_player_dict = MP_create_player_index_dict(MP_5on5_stats)
MP_4on5_player_dict = MP_create_player_index_dict(MP_4on5_stats)
MP_5on4_player_dict = MP_create_player_index_dict(MP_5on4_stats)
MP_OS_player_dict = MP_create_player_index_dict(MP_OS_stats)

In [54]:
def MP_get_index_all_gamestates(player_name, MP_AS_dict= MP_AS_player_dict, MP_5on5_dict= MP_5on5_player_dict, 
                                MP_4on5_dict= MP_4on5_player_dict, MP_5on4_dict= MP_5on4_player_dict,
                                MP_OS_dict= MP_OS_player_dict):
    """
    Returns a string with all the indices for each game state (All Strengths, Even Strength,
    Power Play, and Penalty Kill) for a given player.
    
    Parameters:
    - player_name (str): The name of the player to lookup.
    - player_index_dict_AS (dict): The dictionary with indices for All Strengths.
    - player_index_dict_ES (dict): The dictionary with indices for Even Strength.
    - player_index_dict_PP (dict): The dictionary with indices for Power Play.
    - player_index_dict_PK (dict): The dictionary with indices for Penalty Kill.

    Returns:
    - str: A formatted string containing the indices for each game state for the player.
    """
    result_string= (
        f"{player_name}'s ALL SITUATIONS indices are: {MP_AS_dict.get(player_name)}\n"
        f"{player_name}'s 5-ON-5 indices are: {MP_5on5_dict.get(player_name)}\n"
        f"{player_name}'s 4-ON-5 indices are: {MP_4on5_dict.get(player_name)}\n"
        f"{player_name}'s 5-ON-4 indices are: {MP_5on4_dict.get(player_name)}\n"
        f"{player_name}'s OTHER SITUATIONS indices are: {MP_OS_dict.get(player_name)}\n"
    )

    return print(result_string)

In [55]:
MP_get_index_all_gamestates(player_name='Nick Suzuki')

Nick Suzuki's ALL SITUATIONS indices are: {2022: [827], 2023: [1909], 2024: [2496]}
Nick Suzuki's 5-ON-5 indices are: {2022: [827], 2023: [1909], 2024: [2496]}
Nick Suzuki's 4-ON-5 indices are: {2022: [827], 2023: [1909], 2024: [2496]}
Nick Suzuki's 5-ON-4 indices are: {2022: [827], 2023: [1909], 2024: [2496]}
Nick Suzuki's OTHER SITUATIONS indices are: {2022: [827], 2023: [1909], 2024: [2496]}



In [56]:
def MP_get_players_baseline_gamestate_stats(original_gamestate_df, player_name):
    """
    Returns the baseline performance metrics of the player you are finding comparable players of 
    so you can see how their stats are over the course of the seasons in the engine.
    Args:
    - original_gamestate_df (pd.DataFrame): DataFrame containing the original skater stats.
    - player_name: must be a string of the full name of the player you want to look up, 
    If player name is misspelled or there is no data for that player, 
    the function returns an empty dataframe.
    -Small adustment from the other function. The MP function uses 'name' instead of 'Player' 

    """
    baseline_gamestate_stats = original_gamestate_df.loc[original_gamestate_df['name'] == player_name]
    return baseline_gamestate_stats

In [57]:
MP_get_players_baseline_gamestate_stats(MP_AS_stats, 'Nick Suzuki')

Unnamed: 0,playerId,season,name,team,position,situation,games_played,icetime,shifts,gameScore,onIce_xGoalsPercentage,offIce_xGoalsPercentage,onIce_corsiPercentage,offIce_corsiPercentage,onIce_fenwickPercentage,offIce_fenwickPercentage,iceTimeRank,I_F_xOnGoal,I_F_xGoals,I_F_xRebounds,I_F_xFreeze,I_F_xPlayStopped,I_F_xPlayContinuedInZone,I_F_xPlayContinuedOutsideZone,I_F_flurryAdjustedxGoals,I_F_scoreVenueAdjustedxGoals,I_F_flurryScoreVenueAdjustedxGoals,I_F_primaryAssists,I_F_secondaryAssists,I_F_shotsOnGoal,I_F_missedShots,I_F_blockedShotAttempts,I_F_shotAttempts,I_F_points,I_F_goals,I_F_rebounds,I_F_reboundGoals,I_F_freeze,I_F_playStopped,I_F_playContinuedInZone,I_F_playContinuedOutsideZone,I_F_savedShotsOnGoal,I_F_savedUnblockedShotAttempts,penalties,I_F_penalityMinutes,I_F_faceOffsWon,I_F_hits,I_F_takeaways,I_F_giveaways,I_F_lowDangerShots,I_F_mediumDangerShots,I_F_highDangerShots,I_F_lowDangerxGoals,I_F_mediumDangerxGoals,I_F_highDangerxGoals,I_F_lowDangerGoals,I_F_mediumDangerGoals,I_F_highDangerGoals,I_F_scoreAdjustedShotsAttempts,I_F_unblockedShotAttempts,I_F_scoreAdjustedUnblockedShotAttempts,I_F_dZoneGiveaways,I_F_xGoalsFromxReboundsOfShots,I_F_xGoalsFromActualReboundsOfShots,I_F_reboundxGoals,I_F_xGoals_with_earned_rebounds,I_F_xGoals_with_earned_rebounds_scoreAdjusted,I_F_xGoals_with_earned_rebounds_scoreFlurryAdjusted,I_F_shifts,I_F_oZoneShiftStarts,I_F_dZoneShiftStarts,I_F_neutralZoneShiftStarts,I_F_flyShiftStarts,I_F_oZoneShiftEnds,I_F_dZoneShiftEnds,I_F_neutralZoneShiftEnds,I_F_flyShiftEnds,faceoffsWon,faceoffsLost,timeOnBench,penalityMinutes,penalityMinutesDrawn,penaltiesDrawn,shotsBlockedByPlayer,OnIce_F_xOnGoal,OnIce_F_xGoals,OnIce_F_flurryAdjustedxGoals,OnIce_F_scoreVenueAdjustedxGoals,OnIce_F_flurryScoreVenueAdjustedxGoals,OnIce_F_shotsOnGoal,OnIce_F_missedShots,OnIce_F_blockedShotAttempts,OnIce_F_shotAttempts,OnIce_F_goals,OnIce_F_rebounds,OnIce_F_reboundGoals,OnIce_F_lowDangerShots,OnIce_F_mediumDangerShots,OnIce_F_highDangerShots,OnIce_F_lowDangerxGoals,OnIce_F_mediumDangerxGoals,OnIce_F_highDangerxGoals,OnIce_F_lowDangerGoals,OnIce_F_mediumDangerGoals,OnIce_F_highDangerGoals,OnIce_F_scoreAdjustedShotsAttempts,OnIce_F_unblockedShotAttempts,OnIce_F_scoreAdjustedUnblockedShotAttempts,OnIce_F_xGoalsFromxReboundsOfShots,OnIce_F_xGoalsFromActualReboundsOfShots,OnIce_F_reboundxGoals,OnIce_F_xGoals_with_earned_rebounds,OnIce_F_xGoals_with_earned_rebounds_scoreAdjusted,OnIce_F_xGoals_with_earned_rebounds_scoreFlurryAdjusted,OnIce_A_xOnGoal,OnIce_A_xGoals,OnIce_A_flurryAdjustedxGoals,OnIce_A_scoreVenueAdjustedxGoals,OnIce_A_flurryScoreVenueAdjustedxGoals,OnIce_A_shotsOnGoal,OnIce_A_missedShots,OnIce_A_blockedShotAttempts,OnIce_A_shotAttempts,OnIce_A_goals,OnIce_A_rebounds,OnIce_A_reboundGoals,OnIce_A_lowDangerShots,OnIce_A_mediumDangerShots,OnIce_A_highDangerShots,OnIce_A_lowDangerxGoals,OnIce_A_mediumDangerxGoals,OnIce_A_highDangerxGoals,OnIce_A_lowDangerGoals,OnIce_A_mediumDangerGoals,OnIce_A_highDangerGoals,OnIce_A_scoreAdjustedShotsAttempts,OnIce_A_unblockedShotAttempts,OnIce_A_scoreAdjustedUnblockedShotAttempts,OnIce_A_xGoalsFromxReboundsOfShots,OnIce_A_xGoalsFromActualReboundsOfShots,OnIce_A_reboundxGoals,OnIce_A_xGoals_with_earned_rebounds,OnIce_A_xGoals_with_earned_rebounds_scoreAdjusted,OnIce_A_xGoals_with_earned_rebounds_scoreFlurryAdjusted,OffIce_F_xGoals,OffIce_A_xGoals,OffIce_F_shotAttempts,OffIce_A_shotAttempts,xGoalsForAfterShifts,xGoalsAgainstAfterShifts,corsiForAfterShifts,corsiAgainstAfterShifts,fenwickForAfterShifts,fenwickAgainstAfterShifts,birthDate,weight,height,nationality,shoots,avg_ice_time/shift (s),avg_shifts_per_game,age,age_group,ZR_gameScore,playerRating,ZR_playerRating
827,8480018,2022,Nick Suzuki,MTL,C,all,82,100910.0,1978.0,50.28,0.45,0.43,0.51,0.44,0.51,0.43,136.0,177.26,18.81,12.77,38.45,6.03,92.23,69.72,17.56,18.65,17.41,19.0,21.0,186.0,52.0,71.0,309.0,61.0,21.0,17.0,5.0,31.0,3.0,67.0,99.0,165.0,217.0,15.0,30.0,699.0,89.0,50.0,71.0,175.0,42.0,21.0,5.74,5.26,7.8,7.0,5.0,9.0,304.13,238.0,234.29,44.0,2.93,3.37,4.59,17.16,16.95,16.66,1978.0,341.0,279.0,306.0,1052.0,263.0,273.0,336.0,1106.0,699.0,711.0,197423.0,30.0,50.0,26.0,62.0,877.02,82.03,78.56,81.67,78.22,899.0,313.0,407.0,1619.0,90.0,69.0,15.0,930.0,203.0,79.0,27.1,25.01,29.92,29.0,34.0,27.0,1594.56,1212.0,1196.04,14.64,14.08,14.17,82.5,82.04,79.91,857.05,98.29,94.83,99.43,95.93,883.0,297.0,389.0,1569.0,106.0,65.0,15.0,802.0,270.0,108.0,24.45,33.76,40.07,22.0,48.0,36.0,1602.15,1180.0,1203.86,13.34,14.78,15.02,96.6,97.7,95.74,134.52,180.46,2721.0,3420.0,0.0,0.0,0.0,0.0,0.0,0.0,1999-08-10,201.0,"5' 11""",CAN,R,51.0,24.0,23.0,Young Pro,56.545,39.885403,44.627423
1909,8480018,2023,Nick Suzuki,MTL,C,all,82,103790.0,1849.0,52.7,0.44,0.4,0.49,0.42,0.49,0.42,107.0,161.52,18.09,12.02,35.49,5.34,84.7,61.36,17.3,17.92,17.15,25.0,15.0,162.0,55.0,77.0,294.0,66.0,26.0,17.0,2.0,32.0,3.0,58.0,81.0,136.0,191.0,10.0,23.0,663.0,50.0,42.0,52.0,146.0,49.0,22.0,4.65,6.08,7.36,7.0,12.0,7.0,292.48,217.0,215.59,26.0,2.8,4.03,3.26,17.63,17.54,17.19,1849.0,347.0,292.0,321.0,889.0,221.0,235.0,325.0,1068.0,663.0,738.0,194627.0,23.0,48.0,25.0,56.0,859.95,86.16,81.43,85.65,80.97,851.0,330.0,428.0,1609.0,103.0,75.0,11.0,835.0,251.0,95.0,24.04,30.39,31.73,28.0,46.0,29.0,1594.69,1181.0,1173.06,14.71,14.62,14.67,86.18,85.86,83.09,882.13,111.47,105.82,112.68,107.02,890.0,318.0,436.0,1644.0,110.0,95.0,16.0,793.0,275.0,140.0,24.73,33.88,52.85,22.0,42.0,46.0,1675.17,1208.0,1225.97,14.34,22.53,22.53,103.28,104.24,101.37,133.35,203.05,2604.0,3563.0,0.0,0.0,0.0,0.0,0.0,0.0,1999-08-10,201.0,"5' 11""",CAN,R,56.0,23.0,24.0,Young Pro,60.71,38.481139,39.421373
2496,8480018,2024,Nick Suzuki,MTL,C,all,82,104619.0,1878.0,69.75,0.55,0.39,0.54,0.4,0.55,0.4,107.0,195.82,21.76,14.25,44.74,6.58,105.04,73.64,20.97,21.73,20.94,25.0,19.0,185.0,81.0,58.0,324.0,77.0,33.0,12.0,3.0,30.0,4.0,109.0,78.0,152.0,233.0,17.0,36.0,689.0,69.0,41.0,63.0,185.0,56.0,25.0,6.43,7.2,8.12,16.0,9.0,8.0,321.77,266.0,264.56,29.0,3.33,2.86,3.69,21.4,21.33,20.75,1878.0,385.0,246.0,348.0,899.0,213.0,260.0,316.0,1089.0,689.0,622.0,196168.0,36.0,40.0,20.0,48.0,1008.31,104.79,99.91,105.08,100.15,972.0,419.0,478.0,1869.0,111.0,86.0,11.0,1007.0,266.0,118.0,31.45,32.54,40.81,44.0,31.0,36.0,1863.29,1391.0,1389.05,16.94,19.83,19.83,101.91,101.95,99.05,846.9,87.41,83.4,87.83,83.79,814.0,346.0,434.0,1594.0,101.0,76.0,10.0,867.0,190.0,103.0,25.29,24.38,37.74,38.0,34.0,29.0,1605.4,1160.0,1168.01,12.18,17.34,17.34,82.25,82.44,80.71,124.32,193.35,2664.0,3942.0,0.0,0.0,0.0,0.0,0.0,0.0,1999-08-10,201.0,"5' 11""",CAN,R,56.0,23.0,25.0,Young Pro,71.625,48.192404,49.448955


## Building the preprocessing and processing pipeline for the recommender engine:

In [58]:
# handling the values that would interfere with the encoder that includes the 'inf' and NaN
#This is mainly for the ice_time/shift' column
MP_AS_stats.replace([np.inf, -np.inf, np.nan], 0.0, inplace=True)
MP_5on5_stats.replace([np.inf, -np.inf, np.nan], 0.0, inplace=True)
MP_4on5_stats.replace([np.inf, -np.inf, np.nan], 0.0, inplace=True)
MP_5on4_stats.replace([np.inf, -np.inf, np.nan], 0.0, inplace=True)
MP_OS_stats.replace([np.inf, -np.inf, np.nan], 0.0, inplace=True)

### The Pipeline:

In [183]:
# Column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), make_column_selector(dtype_include=['int64', 'float64'])),
        ('age_group', Pipeline([
            ('ordinal', OrdinalEncoder(categories=[['New Pro', 'Old Vet', 'Young Pro', 'Vet', 'Prime Age',]])),
            ('scaler', StandardScaler())  # Scale the ordinal-encoded age_group
        ]), ['age_group']),
        ('position', Pipeline([
            ('onehot', OneHotEncoder()),  # Apply OneHotEncoder to 'position'
            ('scaler', StandardScaler(with_mean=False))  # Apply StandardScaler after OneHotEncoder
        ]), ['position'])
    ])

# My current Pipeline
MP_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('pca', PCA())
])

In [184]:
# Columns not to included in the processing:
col_not_processed = ['playerId', 'season' , 'name', 'team', 'situation',
                      'nationality' ,'birthDate', 'weight','height', 'avg_ice_time/shift (s)','shoots', 'gameScore', 'playerRating', 'ZR_gameScore', 'ZR_playerRating']

### The transformed dataframes for the recommender:

In [185]:
MP_AS_stats_transformed = MP_pipeline.fit_transform(MP_AS_stats.drop(columns=col_not_processed)) # All Situations
MP_5on5_stats_transformed = MP_pipeline.fit_transform(MP_5on5_stats.drop(columns=col_not_processed)) # 5on5
MP_4on5_stats_transformed = MP_pipeline.fit_transform(MP_4on5_stats.drop(columns=col_not_processed)) # 4on5
MP_5on4_stats_transformed = MP_pipeline.fit_transform(MP_5on4_stats.drop(columns=col_not_processed)) # 5on4
MP_OS_stats_transformed = MP_pipeline.fit_transform(MP_OS_stats.drop(columns=col_not_processed)) # Other Situations

## Running the recommmender engine:

In [63]:
def MP_recommend_skaters(original_gamestate_df, processed_gamestate_df, season, player_index, top_n=6):
    """
    Recommends skaters based on their stats using a preprocessed PCA features.

    Args:
    - original_gamestate_df (pd.DataFrame): DataFrame containing the original skater stats.
        Acceptable inputs for original_gamestate_df are: [MP_AS_stats, MP_5on5_stats, MP_4on5_stats, MP_5on4_stats, MP_OS_stats]
    - processed_gamestate_df (pd.DataFrame): PCA-transformed and scaled features of the skaters.
        Acceptable inputs for processed_gamestate_df are: 
        [MP_AS_processed_data, MP_5on5_processed_data, MP_4on5_processed_data, MP_5on4_processed_data, MP_OS_processed_data]
    - season (int): The target season for comparison.
        Acceptable inputs for season are: 2021, 2022, 2023 
    - player_index (int): Index of the player in the DataFrame to get recommendations for.
        player_index as accessed through the function: MP_get_index_all_gamestates() 
    - top_n (int): Number of top recommendations to return.

    Returns:
    - pd.DataFrame: DataFrame containing the top_n recommended skaters for the given player in the specified season.
    """

    # Filter DataFrame for the target season
    target_season_data = processed_gamestate_df[original_gamestate_df['season'] == season]

    # Compute pairwise distances between all skaters and those from the target season
    distances = pairwise_distances(processed_gamestate_df, target_season_data)

    # Find the indices of the closest skaters
    indices = np.argsort(distances, axis=1)[:, :top_n]

    # Retrieve the recommendations from the original stats DataFrame
    MP_recommended_skaters = original_gamestate_df[original_gamestate_df['season'] == season].iloc[indices[player_index], :]

    return MP_recommended_skaters

In [186]:
# AS
nick_suzuki_AS_sim_skaters = MP_recommend_skaters(original_gamestate_df=MP_AS_stats,
                                                  processed_gamestate_df=MP_AS_stats_transformed,
                                                  season=2024,
                                                  player_index=2496,
                                                  top_n=7)

#5on5
nick_suzuki_5on5_sim_skaters = MP_recommend_skaters(original_gamestate_df=MP_5on5_stats,
                                                  processed_gamestate_df=MP_5on5_stats_transformed,
                                                  season=2024,
                                                  player_index=2496,
                                                  top_n=7)

#4on5
nick_suzuki_4on5_sim_skaters = MP_recommend_skaters(original_gamestate_df=MP_4on5_stats,
                                                  processed_gamestate_df=MP_4on5_stats_transformed,
                                                  season=2024,
                                                  player_index=2496,
                                                  top_n=7)

#5on4
nick_suzuki_5on4_sim_skaters = MP_recommend_skaters(original_gamestate_df=MP_5on4_stats,
                                                  processed_gamestate_df=MP_5on4_stats_transformed,
                                                  season=2024,
                                                  player_index=2496,
                                                  top_n=7)

#OS
nick_suzuki_OS_sim_skaters = MP_recommend_skaters(original_gamestate_df=MP_OS_stats,
                                                  processed_gamestate_df=MP_OS_stats_transformed,
                                                  season=2024,
                                                  player_index=2496,
                                                  top_n=7)


In [187]:
nick_suzuki_AS_sim_skaters[['name', 'gameScore', 'ZR_gameScore', 'playerRating', 'ZR_playerRating', 'age', 'age_group']]

Unnamed: 0,name,gameScore,ZR_gameScore,playerRating,ZR_playerRating,age,age_group
2496,Nick Suzuki,69.75,71.625,48.192404,49.448955,25.0,Young Pro
2619,Robert Thomas,74.78,76.82,51.258916,53.038521,25.0,Young Pro
2476,Bo Horvat,75.16,72.955,51.490581,50.367939,29.0,Prime Age
2798,Dylan Larkin,74.57,74.805,51.130891,51.646226,28.0,Prime Age
2315,Mika Zibanejad,73.21,80.385,50.301774,55.501814,31.0,Vet
2550,Sean Monahan,55.21,61.925,39.328172,42.746588,30.0,Prime Age
2469,Elias Pettersson,91.51,94.155,61.45827,65.01641,26.0,Young Pro


In [188]:
nick_suzuki_5on5_sim_skaters[['name', 'gameScore', 'ZR_gameScore', 'playerRating', 'ZR_playerRating', 'age','age_group']]

Unnamed: 0,name,gameScore,ZR_gameScore,playerRating,ZR_playerRating,age,age_group
2496,Nick Suzuki,69.75,36.68,48.192404,44.457919,25.0,Young Pro
2619,Robert Thomas,74.78,43.44,51.258916,52.655833,25.0,Young Pro
2393,Nick Bjugstad,45.31,36.85,33.29269,44.66408,32.0,Vet
2476,Bo Horvat,75.16,46.2,51.490581,56.002911,29.0,Prime Age
2550,Sean Monahan,55.21,36.055,39.328172,43.699976,30.0,Prime Age
2131,J.T. Miller,95.5,49.285,63.890752,59.744118,31.0,Vet
2315,Mika Zibanejad,73.21,31.64,50.301774,38.345865,31.0,Vet


In [189]:
nick_suzuki_4on5_sim_skaters[['name', 'gameScore', 'ZR_gameScore', 'playerRating', 'ZR_playerRating', 'age','age_group']]

Unnamed: 0,name,gameScore,ZR_gameScore,playerRating,ZR_playerRating,age,age_group
2496,Nick Suzuki,58.39,-1.085,58.827684,39.232409,25.0,Young Pro
2387,Tanner Pearson,9.77,-0.405,15.907486,45.031983,32.0,Vet
2476,Bo Horvat,40.12,0.145,42.699506,49.722814,29.0,Prime Age
2472,Nico Hischier,66.1,0.09,65.633828,49.253731,25.0,Young Pro
2495,Reese Johnson,2.55,-0.515,9.533898,44.093817,26.0,Young Pro
2536,John Beecher,1.4,-0.94,8.518715,40.469083,23.0,Young Pro
2172,Alex Killorn,34.1,-1.09,37.38524,39.189765,35.0,Vet


In [190]:
nick_suzuki_OS_sim_skaters[['name', 'gameScore', 'ZR_gameScore', 'playerRating', 'ZR_playerRating', 'age','age_group']]

Unnamed: 0,name,gameScore,ZR_gameScore,playerRating,ZR_playerRating,age,age_group
2496,Nick Suzuki,63.33,6.81,56.406662,30.584468,25.0,Young Pro
2811,Cole Caufield,55.12,6.975,49.637203,31.244996,23.0,Young Pro
2512,Aleksander Barkov,82.48,10.425,72.19657,45.056045,29.0,Prime Age
2602,Tage Thompson,54.16,8.69,48.845646,38.110488,27.0,Prime Age
2697,Elias Lindholm,35.98,5.84,33.855541,26.701361,30.0,Prime Age
2235,Tim Sttzle,52.59,9.08,47.551121,39.671737,22.0,Young Pro
2746,Mikael Granlund,29.98,6.55,28.908311,29.543635,32.0,Vet


## Logan Cooley:

In [136]:
MP_get_index_all_gamestates('Logan Cooley')

Logan Cooley's ALL SITUATIONS indices are: {2024: [2752]}
Logan Cooley's 5-ON-5 indices are: {2024: [2752]}
Logan Cooley's 4-ON-5 indices are: {2024: [2752]}
Logan Cooley's 5-ON-4 indices are: {2024: [2752]}
Logan Cooley's OTHER SITUATIONS indices are: {2024: [2752]}



In [191]:
# AS
LC_AS_sim_skaters = MP_recommend_skaters(original_gamestate_df=MP_AS_stats,
                                                  processed_gamestate_df=MP_AS_stats_transformed,
                                                  season=2024,
                                                  player_index=2752,
                                                  top_n=7)

#5on5
LC_5on5_sim_skaters = MP_recommend_skaters(original_gamestate_df=MP_5on5_stats,
                                                  processed_gamestate_df=MP_5on5_stats_transformed,
                                                  season=2024,
                                                  player_index=2752,
                                                  top_n=7)

#4on5
LC_4on5_sim_skaters = MP_recommend_skaters(original_gamestate_df=MP_4on5_stats,
                                                  processed_gamestate_df=MP_4on5_stats_transformed,
                                                  season=2024,
                                                  player_index=2752,
                                                  top_n=7)

#5on4
LC_5on4_sim_skaters = MP_recommend_skaters(original_gamestate_df=MP_5on4_stats,
                                                  processed_gamestate_df=MP_5on4_stats_transformed,
                                                  season=2024,
                                                  player_index=2752,
                                                  top_n=7)

#OS
LC_OS_sim_skaters = MP_recommend_skaters(original_gamestate_df=MP_OS_stats,
                                                  processed_gamestate_df=MP_OS_stats_transformed,
                                                  season=2024,
                                                  player_index=2752,
                                                  top_n=7)

In [192]:
LC_AS_sim_skaters[['name', 'gameScore', 'ZR_gameScore', 'playerRating', 'ZR_playerRating', 'age', 'age_group']]

Unnamed: 0,name,gameScore,ZR_gameScore,playerRating,ZR_playerRating,age,age_group
2752,Logan Cooley,37.98,45.14,28.823996,31.14873,20.0,New Pro
2662,Connor McMichael,25.3,31.65,21.093702,21.827604,23.0,Young Pro
2257,Marco Rossi,47.34,45.565,34.530269,31.442391,23.0,Young Pro
2708,Matty Beniers,40.15,39.845,30.146924,27.490067,22.0,Young Pro
2055,Morgan Frost,45.59,45.38,33.463391,31.314562,25.0,Young Pro
2351,Morgan Geekie,31.32,42.525,24.763763,29.341855,26.0,Young Pro
2669,Anton Lundell,52.71,42.235,37.80406,29.141475,23.0,Young Pro


In [193]:
LC_5on5_sim_skaters[['name', 'gameScore', 'ZR_gameScore', 'playerRating', 'ZR_playerRating', 'age', 'age_group']]

Unnamed: 0,name,gameScore,ZR_gameScore,playerRating,ZR_playerRating,age,age_group
2752,Logan Cooley,37.98,28.305,28.823996,34.30148,20.0,New Pro
2662,Connor McMichael,25.3,23.23,21.093702,28.14698,23.0,Young Pro
2257,Marco Rossi,47.34,37.145,34.530269,45.021829,23.0,Young Pro
2451,Dylan Cozens,49.98,31.41,36.139731,38.066942,23.0,Young Pro
2055,Morgan Frost,45.59,31.255,33.463391,37.878972,25.0,Young Pro
2708,Matty Beniers,40.15,26.56,30.146924,32.185302,22.0,Young Pro
2669,Anton Lundell,52.71,28.475,37.80406,34.50764,23.0,Young Pro


In [194]:
LC_4on5_sim_skaters[['name', 'gameScore', 'ZR_gameScore', 'playerRating', 'ZR_playerRating', 'age', 'age_group']]

Unnamed: 0,name,gameScore,ZR_gameScore,playerRating,ZR_playerRating,age,age_group
2752,Logan Cooley,24.12,0.475,28.575212,52.537313,20.0,New Pro
2475,Nick Schmaltz,25.72,0.465,29.987641,52.452026,28.0,Prime Age
2204,Fabian Zetterlund,18.63,0.395,23.728814,51.855011,25.0,Young Pro
2168,Curtis Lazar,22.13,0.49,26.818503,52.665245,29.0,Prime Age
2221,Jack Roslovic,20.93,0.075,25.759181,49.1258,27.0,Prime Age
2816,Dillon Dube,4.42,-0.245,11.184675,46.396588,26.0,Young Pro
2386,Nikita Okhotiuk,-1.97,-0.84,5.543785,41.321962,24.0,Young Pro


In [195]:
LC_OS_sim_skaters[['name', 'gameScore', 'ZR_gameScore', 'playerRating', 'ZR_playerRating', 'age', 'age_group']]

Unnamed: 0,name,gameScore,ZR_gameScore,playerRating,ZR_playerRating,age,age_group
2752,Logan Cooley,23.64,2.27,23.680739,12.409928,20.0,New Pro
1962,Adam Fantilli,13.18,3.425,15.056069,17.033627,20.0,New Pro
2049,Pierre-Luc Dubois,17.8,1.945,18.865435,11.108887,26.0,Young Pro
2638,Michael Rasmussen,16.26,5.32,17.595646,24.619696,25.0,Young Pro
2283,Luke Evangelista,22.91,2.16,23.078826,11.969576,22.0,Young Pro
2257,Marco Rossi,27.91,0.355,27.201517,4.743795,23.0,Young Pro
1995,Jesperi Kotkaniemi,21.54,1.515,21.949208,9.38751,24.0,Young Pro


## Clayton Keller performance tracking:

In [202]:
MP_get_index_all_gamestates('Clayton Keller')

Clayton Keller's ALL SITUATIONS indices are: {2022: [757], 2023: [1841], 2024: [2809]}
Clayton Keller's 5-ON-5 indices are: {2022: [757], 2023: [1841], 2024: [2809]}
Clayton Keller's 4-ON-5 indices are: {2022: [757], 2023: [1841], 2024: [2809]}
Clayton Keller's 5-ON-4 indices are: {2022: [757], 2023: [1841], 2024: [2809]}
Clayton Keller's OTHER SITUATIONS indices are: {2022: [757], 2023: [1841], 2024: [2809]}



In [203]:
# AS
CK_AS_sim_skaters_22 = MP_recommend_skaters(original_gamestate_df=MP_AS_stats,
                                                  processed_gamestate_df=MP_AS_stats_transformed,
                                                  season=2022,
                                                  player_index=757,
                                                  top_n=7)

#5on5
CK_5on5_sim_skaters_22 = MP_recommend_skaters(original_gamestate_df=MP_5on5_stats,
                                                  processed_gamestate_df=MP_5on5_stats_transformed,
                                                  season=2022,
                                                  player_index=757,
                                                  top_n=7)

#4on5
CK_4on5_sim_skaters_22 = MP_recommend_skaters(original_gamestate_df=MP_4on5_stats,
                                                  processed_gamestate_df=MP_4on5_stats_transformed,
                                                  season=2022,
                                                  player_index=757,
                                                  top_n=7)

#5on4
CK_5on4_sim_skaters_22 = MP_recommend_skaters(original_gamestate_df=MP_5on4_stats,
                                                  processed_gamestate_df=MP_5on4_stats_transformed,
                                                  season=2022,
                                                  player_index=757,
                                                  top_n=7)

#OS
CK_OS_sim_skaters_22 = MP_recommend_skaters(original_gamestate_df=MP_OS_stats,
                                                  processed_gamestate_df=MP_OS_stats_transformed,
                                                  season=2022,
                                                  player_index=757,
                                                  top_n=7)

In [204]:
# AS
CK_AS_sim_skaters_23 = MP_recommend_skaters(original_gamestate_df=MP_AS_stats,
                                                  processed_gamestate_df=MP_AS_stats_transformed,
                                                  season=2023,
                                                  player_index=1841,
                                                  top_n=7)

#5on5
CK_5on5_sim_skaters_23 = MP_recommend_skaters(original_gamestate_df=MP_5on5_stats,
                                                  processed_gamestate_df=MP_5on5_stats_transformed,
                                                  season=2023,
                                                  player_index=1841,
                                                  top_n=7)

#4on5
CK_4on5_sim_skaters_23 = MP_recommend_skaters(original_gamestate_df=MP_4on5_stats,
                                                  processed_gamestate_df=MP_4on5_stats_transformed,
                                                  season=2023,
                                                  player_index=1841,
                                                  top_n=7)

#5on4
CK_5on4_sim_skaters_23 = MP_recommend_skaters(original_gamestate_df=MP_5on4_stats,
                                                  processed_gamestate_df=MP_5on4_stats_transformed,
                                                  season=2023,
                                                  player_index=1841,
                                                  top_n=7)

#OS
CK_OS_sim_skaters_23 = MP_recommend_skaters(original_gamestate_df=MP_OS_stats,
                                                  processed_gamestate_df=MP_OS_stats_transformed,
                                                  season=2023,
                                                  player_index=1841,
                                                  top_n=7)

In [205]:
# AS
CK_AS_sim_skaters_24 = MP_recommend_skaters(original_gamestate_df=MP_AS_stats,
                                                  processed_gamestate_df=MP_AS_stats_transformed,
                                                  season=2024,
                                                  player_index=2809,
                                                  top_n=7)

#5on5
CK_5on5_sim_skaters_24 = MP_recommend_skaters(original_gamestate_df=MP_5on5_stats,
                                                  processed_gamestate_df=MP_5on5_stats_transformed,
                                                  season=2024,
                                                  player_index=2809,
                                                  top_n=7)

#4on5
CK_4on5_sim_skaters_24 = MP_recommend_skaters(original_gamestate_df=MP_4on5_stats,
                                                  processed_gamestate_df=MP_4on5_stats_transformed,
                                                  season=2024,
                                                  player_index=2809,
                                                  top_n=7)

#5on4
CK_5on4_sim_skaters_24 = MP_recommend_skaters(original_gamestate_df=MP_5on4_stats,
                                                  processed_gamestate_df=MP_5on4_stats_transformed,
                                                  season=2024,
                                                  player_index=2809,
                                                  top_n=7)

#OS
CK_OS_sim_skaters_24 = MP_recommend_skaters(original_gamestate_df=MP_OS_stats,
                                                  processed_gamestate_df=MP_OS_stats_transformed,
                                                  season=2024,
                                                  player_index=2809,
                                                  top_n=7)

### AS - Clayton Keller:

In [212]:
CK_AS_sim_skaters_22[['name', 'season','position','gameScore', 'ZR_gameScore', 'playerRating', 'ZR_playerRating', 'age', 'age_group']]

Unnamed: 0,name,season,position,gameScore,ZR_gameScore,playerRating,ZR_playerRating,age,age_group
757,Clayton Keller,2022,R,50.78,61.875,40.234784,48.777544,24.0,Young Pro
703,Troy Terry,2022,R,64.63,66.125,49.912655,52.08674,25.0,Young Pro
859,Blake Wheeler,2022,R,52.85,55.295,41.681224,43.654131,36.0,Old Vet
343,Pavel Buchnevich,2022,L,73.75,75.02,56.285375,59.012692,27.0,Prime Age
954,Nick Schmaltz,2022,C,44.51,54.215,35.853539,42.813206,26.0,Young Pro
561,Tyler Bertuzzi,2022,L,57.89,61.785,45.202991,48.707467,27.0,Prime Age
318,Jordan Eberle,2022,R,42.07,43.88,34.148557,34.76602,32.0,Vet


In [213]:
CK_AS_sim_skaters_23[['name', 'season','position','gameScore', 'ZR_gameScore', 'playerRating', 'ZR_playerRating', 'age', 'age_group']]

Unnamed: 0,name,season,position,gameScore,ZR_gameScore,playerRating,ZR_playerRating,age,age_group
1841,Clayton Keller,2023,R,77.36,83.095,53.806476,53.845163,25.0,Young Pro
1695,Alex Tuch,2023,R,84.19,81.16,58.051084,52.598344,27.0,Prime Age
1655,Troy Terry,2023,R,56.97,61.8,41.134796,40.123715,26.0,Young Pro
1558,Johnny Gaudreau,2023,L,59.35,68.675,42.613884,44.553626,30.0,Prime Age
1308,Travis Konecny,2023,R,58.13,60.345,41.855696,39.186185,26.0,Young Pro
1672,Jordan Kyrou,2023,C,64.66,72.685,45.913865,47.137472,25.0,Young Pro
1831,Patrick Kane,2023,R,38.17,57.995,29.451246,37.671961,35.0,Vet


In [214]:
CK_AS_sim_skaters_24[['name', 'season','position','gameScore', 'ZR_gameScore', 'playerRating', 'ZR_playerRating', 'age', 'age_group']]

Unnamed: 0,name,season,position,gameScore,ZR_gameScore,playerRating,ZR_playerRating,age,age_group
2809,Clayton Keller,2024,R,73.75,75.76,50.630982,52.306098,26.0,Young Pro
2399,Alex DeBrincat,2024,R,65.76,73.065,45.759922,50.443945,27.0,Prime Age
2051,Jordan Kyrou,2024,C,69.45,70.465,48.00951,48.647435,26.0,Young Pro
2541,Drake Batherson,2024,R,63.08,66.63,44.126074,45.997582,26.0,Young Pro
2742,JJ Peterka,2024,R,63.86,59.07,44.601597,40.773881,22.0,Young Pro
2336,Adrian Kempe,2024,R,80.08,79.205,54.490032,54.686474,28.0,Prime Age
2112,Travis Konecny,2024,R,81.51,69.465,55.361824,47.956469,27.0,Prime Age


## Clayton Keller - 5on5:


In [209]:
CK_5on5_sim_skaters_22[['name', 'season','position','gameScore', 'ZR_gameScore', 'playerRating', 'ZR_playerRating', 'age', 'age_group']]

Unnamed: 0,name,position,gameScore,ZR_gameScore,playerRating,ZR_playerRating,age,age_group
757,Clayton Keller,R,50.78,39.44,40.234784,52.756692,24.0,Young Pro
954,Nick Schmaltz,C,44.51,35.93,35.853539,48.082301,26.0,Young Pro
700,Connor Brown,R,37.91,24.98,31.241702,33.4998,28.0,Prime Age
67,Brandon Hagel,L,43.88,28.64,35.413318,38.373951,24.0,Young Pro
859,Blake Wheeler,R,52.85,32.6,41.681224,43.647623,36.0,Old Vet
352,Josh Anderson,R,28.48,29.28,24.652365,39.226262,28.0,Prime Age
561,Tyler Bertuzzi,L,57.89,39.26,45.202991,52.51698,27.0,Prime Age


In [210]:
CK_5on5_sim_skaters_23[['name', 'season','position','gameScore', 'ZR_gameScore', 'playerRating', 'ZR_playerRating', 'age', 'age_group']]

Unnamed: 0,name,position,gameScore,ZR_gameScore,playerRating,ZR_playerRating,age,age_group
1841,Clayton Keller,R,77.36,52.325,53.806476,73.488665,25.0,Young Pro
1655,Troy Terry,R,56.97,40.67,41.134796,57.178841,26.0,Young Pro
1363,Alex DeBrincat,R,68.13,35.755,48.07035,50.300868,26.0,Young Pro
1695,Alex Tuch,R,84.19,51.0,58.051084,71.634481,27.0,Prime Age
1282,Nikita Kucherov,R,110.74,51.82,74.550991,72.781976,30.0,Prime Age
1943,Jeff Skinner,L,90.84,58.635,62.183829,82.31878,31.0,Vet
1918,Trevor Zegras,C,47.91,36.585,35.504319,51.462357,22.0,Young Pro


In [211]:
CK_5on5_sim_skaters_24[['name', 'season','position','gameScore', 'ZR_gameScore', 'playerRating', 'ZR_playerRating', 'age', 'age_group']]

Unnamed: 0,name,position,gameScore,ZR_gameScore,playerRating,ZR_playerRating,age,age_group
2809,Clayton Keller,R,73.75,40.495,50.630982,49.084405,26.0,Young Pro
2799,Alex Tuch,R,60.59,45.015,42.60806,54.56585,28.0,Prime Age
2478,Troy Terry,R,50.19,32.2,36.267756,39.024982,27.0,Prime Age
2399,Alex DeBrincat,R,65.76,47.285,45.759922,57.3187,27.0,Prime Age
2541,Drake Batherson,R,63.08,38.26,44.126074,46.374,26.0,Young Pro
2112,Travis Konecny,R,81.51,46.285,55.361824,56.105991,27.0,Prime Age
2082,Kirill Marchenko,R,44.76,34.595,32.957386,41.92942,24.0,Young Pro


## Clayton Keller - 4on5:

In [215]:
CK_4on5_sim_skaters_22[['name', 'season','position','gameScore', 'ZR_gameScore', 'playerRating', 'ZR_playerRating', 'age', 'age_group']]

Unnamed: 0,name,season,position,gameScore,ZR_gameScore,playerRating,ZR_playerRating,age,age_group
757,Clayton Keller,2022,R,46.37,1.105,46.382979,61.169968,24.0,Young Pro
713,Cal Clutterbuck,2022,R,11.92,-0.94,17.06383,42.330723,35.0,Vet
380,Garnet Hathaway,2022,R,34.05,0.13,35.897872,52.187932,31.0,Vet
624,Elias Pettersson,2022,C,39.58,1.295,40.604255,62.920313,24.0,Young Pro
597,Reilly Smith,2022,R,44.4,3.065,44.706383,79.226163,31.0,Vet
1001,Ryan Lomberg,2022,L,24.57,0.27,27.829787,53.47766,28.0,Prime Age
788,Valeri Nichushkin,2022,R,67.55,1.37,64.408511,63.611239,27.0,Prime Age


In [216]:
CK_4on5_sim_skaters_23[['name', 'season','position','gameScore', 'ZR_gameScore', 'playerRating', 'ZR_playerRating', 'age', 'age_group']]

Unnamed: 0,name,season,position,gameScore,ZR_gameScore,playerRating,ZR_playerRating,age,age_group
1841,Clayton Keller,2023,R,62.17,0.355,48.991237,49.737533,25.0,Young Pro
1402,Nicolas Deslauriers,2023,L,5.95,0.565,10.800897,51.574803,32.0,Vet
1728,Liam O'Brien,2023,C,-2.24,-1.015,5.237416,37.751531,29.0,Prime Age
1461,Mathieu Olivier,2023,R,0.74,-0.445,7.261735,42.738408,26.0,Young Pro
1053,Anthony Beauvillier,2023,L,21.41,-0.055,21.302901,46.150481,26.0,Young Pro
1569,Kyle Connor,2023,L,53.34,-0.37,42.993003,43.394576,27.0,Prime Age
1170,Vince Dunn,2023,D,51.83,0.15,41.967258,47.944007,27.0,Prime Age


In [217]:
CK_4on5_sim_skaters_24[['name', 'season','position','gameScore', 'ZR_gameScore', 'playerRating', 'ZR_playerRating', 'age', 'age_group']]

Unnamed: 0,name,season,position,gameScore,ZR_gameScore,playerRating,ZR_playerRating,age,age_group
2809,Clayton Keller,2024,R,8.86,0.0,15.104167,48.486141,26.0,Young Pro
2811,Cole Caufield,2024,R,6.1,0.0,12.667726,48.486141,23.0,Young Pro
2760,Kailer Yamamoto,2024,R,-0.16,-0.145,7.141596,47.249467,26.0,Young Pro
2249,Brock Boeser,2024,R,8.22,0.0,14.539195,48.486141,27.0,Prime Age
2040,Eeli Tolvanen,2024,R,7.01,0.265,13.471045,50.746269,25.0,Young Pro
1999,Jordan Eberle,2024,R,9.31,-0.01,15.501412,48.400853,34.0,Vet
2568,Jonathan Marchessault,2024,R,3.22,0.0,10.125353,48.486141,34.0,Vet


## Clayton Keller - 5on4:

In [219]:
CK_5on4_sim_skaters_22[['name', 'season','position','gameScore', 'ZR_gameScore', 'playerRating', 'ZR_playerRating', 'age', 'age_group']]

Unnamed: 0,name,season,position,gameScore,ZR_gameScore,playerRating,ZR_playerRating,age,age_group
757,Clayton Keller,2022,R,48.93,13.135,38.274336,33.324997,24.0,Young Pro
914,Drake Batherson,2022,R,39.4,13.12,31.473023,33.287483,24.0,Young Pro
364,Rickard Rakell,2022,R,46.52,11.965,36.554382,30.3989,29.0,Prime Age
703,Troy Terry,2022,R,60.31,13.845,46.395946,35.100663,25.0,Young Pro
564,Erik Karlsson,2022,D,33.0,9.91,26.90551,25.259472,32.0,Vet
720,Phil Kessel,2022,R,34.46,13.305,27.947474,33.750156,35.0,Vet
392,Denis Gurianov,2022,R,29.3,9.33,24.264916,23.808928,25.0,Young Pro


In [220]:
CK_5on4_sim_skaters_23[['name', 'season','position','gameScore', 'ZR_gameScore', 'playerRating', 'ZR_playerRating', 'age', 'age_group']]

Unnamed: 0,name,season,position,gameScore,ZR_gameScore,playerRating,ZR_playerRating,age,age_group
1841,Clayton Keller,2023,R,77.9,18.855,53.134309,30.226599,25.0,Young Pro
1831,Patrick Kane,2023,R,36.37,19.855,26.240124,31.828009,35.0,Vet
1763,Matt Duchene,2023,C,50.96,11.62,35.688382,18.640404,32.0,Vet
1233,Max Domi,2023,C,37.49,18.135,26.965419,29.073585,28.0,Prime Age
1094,Taylor Raddysh,2023,R,27.82,15.39,20.703277,24.677716,25.0,Young Pro
1672,Jordan Kyrou,2023,C,64.72,24.38,44.599145,39.074385,25.0,Young Pro
1676,Martin Necas,2023,C,86.43,21.23,58.658205,34.029946,24.0,Young Pro


In [221]:
CK_5on4_sim_skaters_24[['name', 'season','position','gameScore', 'ZR_gameScore', 'playerRating', 'ZR_playerRating', 'age', 'age_group']]

Unnamed: 0,name,season,position,gameScore,ZR_gameScore,playerRating,ZR_playerRating,age,age_group
2809,Clayton Keller,2024,R,73.75,28.905,48.603707,62.49059,26.0,Young Pro
2082,Kirill Marchenko,2024,R,45.14,14.855,30.445545,32.272287,24.0,Young Pro
2052,David Perron,2024,L,33.28,18.47,22.918253,40.047317,36.0,Old Vet
2112,Travis Konecny,2024,R,77.28,10.685,50.844123,23.303581,27.0,Prime Age
2475,Nick Schmaltz,2024,C,58.31,22.13,38.804265,47.919131,28.0,Prime Age
2051,Jordan Kyrou,2024,C,63.52,19.16,42.110942,41.531347,26.0,Young Pro
2038,Pavel Buchnevich,2024,L,61.19,19.65,40.63214,42.585224,29.0,Prime Age


## Random Forest Regressor:

In [66]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [67]:
# Column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), make_column_selector(dtype_include=['int64', 'float64'])),
        ('age_group', Pipeline([
            ('ordinal', OrdinalEncoder(categories=[['New Pro', 'Young Pro', 'Prime Age', 'Vet', 'Old Vet']])),
            ('scaler', StandardScaler())  # Scale the ordinal-encoded age_group
        ]), ['age_group']),
        ('position', Pipeline([
            ('onehot', OneHotEncoder()),  # Apply OneHotEncoder to 'position'
            ('scaler', StandardScaler(with_mean=False))  # Apply StandardScaler after OneHotEncoder
        ]), ['position'])
    ])

# My current Pipeline
MP_RFR_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))  # Random Forest Regressor
])

In [68]:
col_not_processed = ['playerId', 'season' , 'name', 'team', 'situation', 'iceTimeRank', 'I_F_shifts',
                      'nationality' ,'birthDate', 'weight','height', 'shoots', 'age' ,'gameScore'] 
# gameScore is the target variable

col_not_processed_without_points = ['playerId', 'season' , 'name', 'team', 'situation', 'iceTimeRank', 'I_F_shifts',
                      'nationality' ,'birthDate', 'weight','height', 'shoots', 'age' , 'I_F_points','gameScore'] 

## All Situations:

### AS with I_F_points

In [69]:
# Drop the target column to create the feature matrix X
MP_AS_X = MP_AS_stats.drop(columns=col_not_processed) 
MP_AS_y = MP_AS_stats['gameScore']  # Target variable

# Split the data into training and testing sets
MP_AS_X_train, MP_AS_X_test, MP_AS_y_train, MP_AS_y_test = train_test_split(MP_AS_X, MP_AS_y, test_size=0.2, random_state=42)

In [70]:
# Fit the pipeline to your training data
AS_model = MP_RFR_pipeline.fit(MP_AS_X_train, MP_AS_y_train)

# Access the trained Random Forest model inside the pipeline
rf_model = AS_model.named_steps['regressor']

# Access the preprocessor step to get the transformed feature names
preprocessor = AS_model.named_steps['preprocessor']

# Get feature names after the transformation
def get_feature_names(column_transformer):
    output_features = []
    for name, transformer, features in column_transformer.transformers_:
        if transformer == 'drop' or transformer is None:
            continue
        if isinstance(transformer, Pipeline):
            transformer = transformer.named_steps['onehot'] if 'onehot' in transformer.named_steps else transformer
        try:
            if hasattr(transformer, 'get_feature_names_out'):
                feature_names = transformer.get_feature_names_out(features)
                output_features.extend(feature_names)
            else:
                output_features.extend(features)
        except NotFittedError:
            output_features.extend(features)
    return output_features

# Get the transformed feature names
transformed_feature_names = get_feature_names(preprocessor)

# Get feature importances from the Random Forest model
feature_importances = pd.Series(rf_model.feature_importances_, index=transformed_feature_names)
feature_importances.sort_values(ascending=False, inplace=True)

# Display the most important features
print(feature_importances.head(10))

playerRating                   0.991202
ZR_gameScore                   0.004273
OnIce_F_blockedShotAttempts    0.000345
OnIce_F_rebounds               0.000341
ZR_playerRating                0.000220
I_F_playStopped                0.000194
I_F_points                     0.000168
OnIce_F_missedShots            0.000126
OnIce_F_highDangerShots        0.000104
OnIce_F_highDangerxGoals       0.000084
dtype: float64


#### It makes a lot of sense that the model did very well with I_F points. Points are made up of both goals and assists which are weighted more heavily on Goals, primary assists and then secondary assists. It also follows that the next most important features are the onIce_fenwickPercentage and onIce_corciPercentage which are very similar and also prioritize the individual, offensive, production of a player which undoubtedly is an important feature when trying to rank players as gameScore does. The heavy weight put on offensive metrics (even though corsi and fenwick percentage opperates on total shots including shots against) explains why the model is significantly weaker at generating a gameScore value for special teams especially 4on5 penalty kill. 

In [71]:
# Drop the target column to create the feature matrix X
MP_AS_X = MP_AS_stats.drop(columns=col_not_processed_without_points) 
MP_AS_y = MP_AS_stats['gameScore']  # Target variable

# Split the data into training and testing sets
MP_AS_X_train, MP_AS_X_test, MP_AS_y_train, MP_AS_y_test = train_test_split(MP_AS_X, MP_AS_y, test_size=0.2, random_state=42)

# Fit the pipeline to your training data
AS_model = MP_RFR_pipeline.fit(MP_AS_X_train, MP_AS_y_train)

# Access the trained Random Forest model inside the pipeline
rf_model = AS_model.named_steps['regressor']

# Access the preprocessor step to get the transformed feature names
preprocessor = AS_model.named_steps['preprocessor']

# Get the transformed feature names
transformed_feature_names = get_feature_names(preprocessor)

# Get feature importances from the Random Forest model
feature_importances = pd.Series(rf_model.feature_importances_, index=transformed_feature_names)
feature_importances.sort_values(ascending=False, inplace=True)

# Display the most important features
print(feature_importances.head(10))

playerRating                   0.991018
ZR_gameScore                   0.004465
OnIce_F_blockedShotAttempts    0.000350
OnIce_F_rebounds               0.000343
ZR_playerRating                0.000312
I_F_playStopped                0.000196
OnIce_F_missedShots            0.000132
OnIce_F_highDangerxGoals       0.000098
OnIce_F_highDangerShots        0.000096
OnIce_F_reboundxGoals          0.000084
dtype: float64


### AS Model -  Comparing the accuracy of the model with and without the I_F_points column:

In [72]:
# Re assign the variable so that the comparison doesn't throw an error
MP_AS_X = MP_AS_stats.drop(columns=col_not_processed) 
MP_AS_y = MP_AS_stats['gameScore']  # Target variable

# Split the data into training and testing sets
MP_AS_X_train, MP_AS_X_test, MP_AS_y_train, MP_AS_y_test = train_test_split(MP_AS_X, MP_AS_y, test_size=0.2, random_state=42)

# Step 1: Train and evaluate with I_F_points included
# Assume your original training set includes I_F_points
AS_model_with_points = MP_RFR_pipeline.fit(MP_AS_X_train, MP_AS_y_train)
predictions_with_points = AS_model_with_points.predict(MP_AS_X_test)

# Evaluate the model
mse_with_points = mean_squared_error(MP_AS_y_test, predictions_with_points)
r2_with_points = r2_score(MP_AS_y_test, predictions_with_points)

print("Model with I_F_points:")
print(f"Mean Squared Error: {mse_with_points}")
print(f"R2 Score: {r2_with_points}")

# Step 2: Train and evaluate with I_F_points removed
# Remove the I_F_points column from your training and testing sets
MP_AS_X_train_no_points = MP_AS_X_train.drop(columns=['I_F_points'])
MP_AS_X_test_no_points = MP_AS_X_test.drop(columns=['I_F_points'])

AS_model_without_points = MP_RFR_pipeline.fit(MP_AS_X_train_no_points, MP_AS_y_train)
predictions_without_points = AS_model_without_points.predict(MP_AS_X_test_no_points)

# Evaluate the model
mse_without_points = mean_squared_error(MP_AS_y_test, predictions_without_points)
r2_without_points = r2_score(MP_AS_y_test, predictions_without_points)

print("Model without I_F_points:")
print(f"Mean Squared Error: {mse_without_points}")
print(f"R2 Score: {r2_without_points}/n")

# Step 3: Compare the two models
print("Comparison of Model Performance:")
print(f"Difference in MSE: {mse_without_points - mse_with_points}")
print(f"Difference in R2 Score: {r2_without_points - r2_with_points}")

Model with I_F_points:
Mean Squared Error: 1.370131110434028
R2 Score: 0.9977201406180781
Model without I_F_points:
Mean Squared Error: 1.365308424652776
R2 Score: 0.997728165430697/n
Comparison of Model Performance:
Difference in MSE: -0.00482268578125189
Difference in R2 Score: 8.024812618856458e-06


## 5on5: 

### With I_F_points:

In [73]:

# Drop the target column to create the feature matrix X
MP_5on5_X = MP_5on5_stats.drop(columns=col_not_processed) 
MP_5on5_y = MP_5on5_stats['gameScore']  # Target variable

# Split the data into training and testing sets
MP_5on5_X_train, MP_5on5_X_test, MP_5on5_y_train, MP_5on5_y_test = train_test_split(MP_5on5_X, MP_5on5_y, test_size=0.2, random_state=42)

In [74]:
# Fit the pipeline to your training data
MP_5on5_model = MP_RFR_pipeline.fit(MP_5on5_X_train, MP_5on5_y_train)

# Access the trained Random Forest model inside the pipeline
rf_model = MP_5on5_model.named_steps['regressor']

# Access the preprocessor step to get the transformed feature names
preprocessor = MP_5on5_model.named_steps['preprocessor']

# Get the transformed feature names
transformed_feature_names = get_feature_names(preprocessor)

# Get feature importances from the Random Forest model
feature_importances = pd.Series(rf_model.feature_importances_, index=transformed_feature_names)
feature_importances.sort_values(ascending=False, inplace=True)

# Display the most important features
print(feature_importances.head(10))

playerRating                   0.994419
ZR_gameScore                   0.000651
ZR_playerRating                0.000347
I_F_playStopped                0.000254
OnIce_F_blockedShotAttempts    0.000170
OnIce_F_rebounds               0.000142
I_F_points                     0.000123
OnIce_F_mediumDangerxGoals     0.000113
OnIce_A_blockedShotAttempts    0.000109
OnIce_A_rebounds               0.000102
dtype: float64


### Without I_F_points:

In [75]:
# Drop the target column to create the feature matrix X
MP_5on5_X = MP_5on5_stats.drop(columns=col_not_processed_without_points) 
MP_5on5_y = MP_5on5_stats['gameScore']  # Target variable

# Split the data into training and testing sets
MP_5on5_X_train, MP_5on5_X_test, MP_5on5_y_train, MP_5on5_y_test = train_test_split(MP_5on5_X, MP_5on5_y, test_size=0.2, random_state=42)

In [76]:
# Fit the pipeline to your training data
MP_5on5_model = MP_RFR_pipeline.fit(MP_5on5_X_train, MP_5on5_y_train)

# Access the trained Random Forest model inside the pipeline
rf_model = MP_5on5_model.named_steps['regressor']

# Access the preprocessor step to get the transformed feature names
preprocessor = MP_5on5_model.named_steps['preprocessor']

# Get the transformed feature names
transformed_feature_names = get_feature_names(preprocessor)

# Get feature importances from the Random Forest model
feature_importances = pd.Series(rf_model.feature_importances_, index=transformed_feature_names)
feature_importances.sort_values(ascending=False, inplace=True)

# Display the most important features
print(feature_importances.head(10))

playerRating                   0.994437
ZR_gameScore                   0.000716
ZR_playerRating                0.000348
I_F_playStopped                0.000258
OnIce_F_blockedShotAttempts    0.000179
OnIce_F_rebounds               0.000157
OnIce_F_mediumDangerxGoals     0.000127
OnIce_A_blockedShotAttempts    0.000119
I_F_takeaways                  0.000105
OnIce_A_rebounds               0.000102
dtype: float64


### Comparing the 5on5 models with and without I_F_points

In [77]:
# Re assign the variable so that the comparison doesn't throw an error
MP_5on5_X = MP_5on5_stats.drop(columns=col_not_processed) 
MP_5on5_y = MP_5on5_stats['gameScore']  # Target variable

# Split the data into training and testing sets
MP_5on5_X_train, MP_5on5_X_test, MP_5on5_y_train, MP_5on5_y_test = train_test_split(MP_5on5_X, MP_5on5_y, test_size=0.2, random_state=42)

# Step 1: Train and evaluate with I_F_points included
# Assume your original training set includes I_F_points
MP_5on5_model_with_points = MP_RFR_pipeline.fit(MP_5on5_X_train, MP_5on5_y_train)
MP_5on5_predictions_with_points = MP_5on5_model_with_points.predict(MP_5on5_X_test)

# Evaluate the model
mse_with_points = mean_squared_error(MP_5on5_y_test, MP_5on5_predictions_with_points)
r2_with_points = r2_score(MP_5on5_y_test, MP_5on5_predictions_with_points)

print("Model with I_F_points:")
print(f"Mean Squared Error: {mse_with_points}")
print(f"R2 Score: {r2_with_points}")

# Step 2: Train and evaluate with I_F_points removed
# Remove the I_F_points column from your training and testing sets
MP_5on5_X_train_no_points = MP_5on5_X_train.drop(columns=['I_F_points'])
MP_5on5_X_test_no_points = MP_5on5_X_test.drop(columns=['I_F_points'])

MP_5on5_model_without_points = MP_RFR_pipeline.fit(MP_5on5_X_train_no_points, MP_5on5_y_train)
MP_5on5_predictions_without_points = MP_5on5_model_without_points.predict(MP_5on5_X_test_no_points)

# Evaluate the model
mse_without_points = mean_squared_error(MP_5on5_y_test, MP_5on5_predictions_without_points)
r2_without_points = r2_score(MP_5on5_y_test, MP_5on5_predictions_without_points)

print("Model without I_F_points:")
print(f"Mean Squared Error: {mse_without_points}")
print(f"R2 Score: {r2_without_points}/n")

# Step 3: Compare the two models
print("Comparison of Model Performance:")
print(f"Difference in MSE: {mse_without_points - mse_with_points}")
print(f"Difference in R2 Score: {r2_without_points - r2_with_points}")

Model with I_F_points:
Mean Squared Error: 1.799206072725692
R2 Score: 0.9970061720271317
Model without I_F_points:
Mean Squared Error: 1.8544789000173572
R2 Score: 0.9969141996071884/n
Comparison of Model Performance:
Difference in MSE: 0.05527282729166516
Difference in R2 Score: -9.197241994329808e-05


## 4on5:

### 4on5 with I_F_points:

In [78]:
# Drop the target column to create the feature matrix X
MP_4on5_X = MP_4on5_stats.drop(columns=col_not_processed) 
MP_4on5_y = MP_4on5_stats['gameScore']  # Target variable

# Split the data into training and testing sets
MP_4on5_X_train, MP_4on5_X_test, MP_4on5_y_train, MP_4on5_y_test = train_test_split(MP_4on5_X, MP_4on5_y, test_size=0.2, random_state=42)

In [79]:

MP_4on5_model = MP_RFR_pipeline.fit(MP_4on5_X_train, MP_4on5_y_train)

rf_model = MP_4on5_model.named_steps['regressor']

# Access the preprocessor step to get the transformed feature names
preprocessor = MP_4on5_model.named_steps['preprocessor']

# Get the transformed feature names
transformed_feature_names = get_feature_names(preprocessor)

# Get feature importances from the Random Forest model
feature_importances = pd.Series(rf_model.feature_importances_, index=transformed_feature_names)
feature_importances.sort_values(ascending=False, inplace=True)

print(feature_importances.head(10))

playerRating               0.982388
OffIce_F_xGoals            0.001092
timeOnBench                0.000959
OffIce_A_xGoals            0.000495
I_F_oZoneShiftStarts       0.000439
OffIce_F_shotAttempts      0.000432
ZR_playerRating            0.000417
shotsBlockedByPlayer       0.000399
I_F_flyShiftStarts         0.000392
I_F_blockedShotAttempts    0.000382
dtype: float64


### 4on5 without I_F_points:

In [80]:
# Drop the target column to create the feature matrix X
MP_4on5_X = MP_4on5_stats.drop(columns=col_not_processed_without_points) 
MP_4on5_y = MP_4on5_stats['gameScore']  # Target variable

# Split the data into training and testing sets
MP_4on5_X_train, MP_4on5_X_test, MP_4on5_y_train, MP_4on5_y_test = train_test_split(MP_4on5_X, MP_4on5_y, test_size=0.2, random_state=42)

In [81]:

MP_4on5_model = MP_RFR_pipeline.fit(MP_4on5_X_train, MP_4on5_y_train)

rf_model = MP_4on5_model.named_steps['regressor']

# Access the preprocessor step to get the transformed feature names
preprocessor = MP_4on5_model.named_steps['preprocessor']

# Get the transformed feature names
transformed_feature_names = get_feature_names(preprocessor)

# Get feature importances from the Random Forest model
feature_importances = pd.Series(rf_model.feature_importances_, index=transformed_feature_names)
feature_importances.sort_values(ascending=False, inplace=True)

print(feature_importances.head(10))

playerRating               0.982313
OffIce_F_xGoals            0.001123
timeOnBench                0.000894
ZR_playerRating            0.000528
OffIce_A_xGoals            0.000496
I_F_oZoneShiftStarts       0.000446
I_F_flyShiftStarts         0.000415
shotsBlockedByPlayer       0.000405
OffIce_F_shotAttempts      0.000397
I_F_blockedShotAttempts    0.000387
dtype: float64


### Comparing the 4on5 models with and without I_F_points

In [82]:
# Re assign the variable so that the comparison doesn't throw an error
MP_4on5_X = MP_4on5_stats.drop(columns=col_not_processed) 
MP_4on5_y = MP_4on5_stats['gameScore']  # Target variable

# Split the data into training and testing sets
MP_4on5_X_train, MP_4on5_X_test, MP_4on5_y_train, MP_4on5_y_test = train_test_split(MP_4on5_X, MP_4on5_y, test_size=0.2, random_state=42)

# Step 1: Train and evaluate with I_F_points included
# Assume your original training set includes I_F_points
MP_4on5_model_with_points = MP_RFR_pipeline.fit(MP_4on5_X_train, MP_4on5_y_train)
MP_4on5_predictions_with_points = MP_4on5_model_with_points.predict(MP_4on5_X_test)

# Evaluate the model
mse_with_points = mean_squared_error(MP_4on5_y_test, MP_4on5_predictions_with_points)
r2_with_points = r2_score(MP_4on5_y_test, MP_4on5_predictions_with_points)

print("Model with I_F_points:")
print(f"Mean Squared Error: {mse_with_points}")
print(f"R2 Score: {r2_with_points}")

# Step 2: Train and evaluate with I_F_points removed
# Remove the I_F_points column from your training and testing sets
MP_4on5_X_train_no_points = MP_4on5_X_train.drop(columns=['I_F_points'])
MP_4on5_X_test_no_points = MP_4on5_X_test.drop(columns=['I_F_points'])

MP_4on5_model_without_points = MP_RFR_pipeline.fit(MP_4on5_X_train_no_points, MP_4on5_y_train)
MP_4on5_predictions_without_points = MP_4on5_model_without_points.predict(MP_4on5_X_test_no_points)

# Evaluate the model
mse_without_points = mean_squared_error(MP_4on5_y_test, MP_4on5_predictions_without_points)
r2_without_points = r2_score(MP_4on5_y_test, MP_4on5_predictions_without_points)

print("Model without I_F_points:")
print(f"Mean Squared Error: {mse_without_points}")
print(f"R2 Score: {r2_without_points}/n")

# Step 3: Compare the two models
print("Comparison of Model Performance:")
print(f"Difference in MSE: {mse_without_points - mse_with_points}")
print(f"Difference in R2 Score: {r2_without_points - r2_with_points}")

Model with I_F_points:
Mean Squared Error: 6.503694920920136
R2 Score: 0.9800815823220064
Model without I_F_points:
Mean Squared Error: 6.5581172601388875
R2 Score: 0.9799149067788339/n
Comparison of Model Performance:
Difference in MSE: 0.0544223392187515
Difference in R2 Score: -0.00016667554317251199


#### This tells me that in fact the gameScore metric is not a very good metric to understand/evaluate player performance when players are on the ice for a 4on5 penalty kill. 
#### It also makes sense that when on the penalty kill, the most important feature to determin a player's rating on the penalty kill is 'timeOnBench' because in order for a player to do positive things on the 4on5, they would need to be on the ice. It is interesting that it's timeOnBench and not timeOnIce but it does make sense that since there are more players on the bench in that situation that bench time is the most influential feature. 
#### Furthermore, the R2 Score and MSE are actually still quite low and so the model with or without I_F_points doesn't do very well at explaining the variance in the data. To that end, I'm curious to see if timeOnBench was removed, what the feature importances might be. I think its worth exploring because timeOnBench is not a "player active" feature and so can't be worked on or improved other than through coaching decisions regarding line-changes. 

## Exploring 4on5 with and without timeOnBench rather than I_F_points

In [83]:
col_not_processed_4on5 = ['playerId', 'season' , 'name', 'team', 'situation', 'iceTimeRank', 'I_F_shifts',
                      'nationality' ,'birthDate', 'weight','height', 'shoots', 'age','gameScore'] 
# gameScore is the target variable

col_not_processed_4on5_without_timeOnBench = ['playerId', 'season' , 'name', 'team', 'situation', 'iceTimeRank', 'I_F_shifts',
                      'nationality' ,'birthDate', 'weight','height', 'shoots', 'age', 'timeOnBench','gameScore'] 

### 4on5 with timeOnBench:

In [84]:
# Drop the target column to create the feature matrix X
MP_4on5_X = MP_4on5_stats.drop(columns=col_not_processed_4on5) 
MP_4on5_y = MP_4on5_stats['gameScore']  # Target variable

# Split the data into training and testing sets
MP_4on5_X_train, MP_4on5_X_test, MP_4on5_y_train, MP_4on5_y_test = train_test_split(MP_4on5_X, MP_4on5_y, test_size=0.2, random_state=42)

In [85]:
MP_4on5_model = MP_RFR_pipeline.fit(MP_4on5_X_train, MP_4on5_y_train)

rf_model = MP_4on5_model.named_steps['regressor']

# Access the preprocessor step to get the transformed feature names
preprocessor = MP_4on5_model.named_steps['preprocessor']

# Get the transformed feature names
transformed_feature_names = get_feature_names(preprocessor)

# Get feature importances from the Random Forest model
feature_importances = pd.Series(rf_model.feature_importances_, index=transformed_feature_names)
feature_importances.sort_values(ascending=False, inplace=True)

print(feature_importances.head(10))

playerRating               0.982388
OffIce_F_xGoals            0.001092
timeOnBench                0.000959
OffIce_A_xGoals            0.000495
I_F_oZoneShiftStarts       0.000439
OffIce_F_shotAttempts      0.000432
ZR_playerRating            0.000417
shotsBlockedByPlayer       0.000399
I_F_flyShiftStarts         0.000392
I_F_blockedShotAttempts    0.000382
dtype: float64


### 4on5 without timeOnBench:

In [86]:
# Drop the target column to create the feature matrix X
MP_4on5_X = MP_4on5_stats.drop(columns=col_not_processed_4on5_without_timeOnBench) 
MP_4on5_y = MP_4on5_stats['gameScore']  # Target variable

# Split the data into training and testing sets
MP_4on5_X_train, MP_4on5_X_test, MP_4on5_y_train, MP_4on5_y_test = train_test_split(MP_4on5_X, MP_4on5_y, test_size=0.2, random_state=42)

In [87]:
MP_4on5_model = MP_RFR_pipeline.fit(MP_4on5_X_train, MP_4on5_y_train)

rf_model = MP_4on5_model.named_steps['regressor']

# Access the preprocessor step to get the transformed feature names
preprocessor = MP_4on5_model.named_steps['preprocessor']

# Get the transformed feature names
transformed_feature_names = get_feature_names(preprocessor)

# Get feature importances from the Random Forest model
feature_importances = pd.Series(rf_model.feature_importances_, index=transformed_feature_names)
feature_importances.sort_values(ascending=False, inplace=True)

print(feature_importances.head(10))

playerRating               0.982392
OffIce_F_xGoals            0.001214
ZR_playerRating            0.000581
OffIce_F_shotAttempts      0.000570
I_F_flyShiftStarts         0.000470
I_F_oZoneShiftStarts       0.000445
shotsBlockedByPlayer       0.000424
OffIce_A_xGoals            0.000403
I_F_blockedShotAttempts    0.000402
games_played               0.000335
dtype: float64


#### When timeOnBench is removed, the next most influential feature is the OffIce_F_shotAttempts. This makes a lot of sense. This means that it was a strong coaching decision to have that group of players on the ice. That feature means that though you have fewer players on the ice, you are still generating offensive chances. The other team can't score on you while you are in the offensive zone.
#### Additionally, though much less influential. The 3rd and 4th most influential features are 'games_played' and 'faceoffsLost'. This shows that for 4on5 penalty kill success, experience is by far the most important feature. This also demomstrates that generally speaking, it is more important to have centers with veteran experience and generally veteran/late prime aged players on the penalty kill.
#### All in all, more than any other gamestate, 4on5 penalty kills (probably extrapolated to other PK situations), the most important thing is the coaching decision for which personel to be on the ice and that those players should be players with the most experience in that situation. 

### Comparing the models with and without the timeOnBench

In [88]:
# Re assign the variable so that the comparison doesn't throw an error
MP_4on5_X = MP_4on5_stats.drop(columns=col_not_processed) 
MP_4on5_y = MP_4on5_stats['gameScore']  # Target variable

# Split the data into training and testing sets
MP_4on5_X_train, MP_4on5_X_test, MP_4on5_y_train, MP_4on5_y_test = train_test_split(MP_4on5_X, MP_4on5_y, test_size=0.2, random_state=42)

# Step 1: Train and evaluate with I_F_points included
# Assume your original training set includes I_F_points
MP_4on5_model_with_benchTime = MP_RFR_pipeline.fit(MP_4on5_X_train, MP_4on5_y_train)
MP_4on5_predictions_with_benchTime = MP_4on5_model_with_benchTime.predict(MP_4on5_X_test)

# Evaluate the model
mse_with_benchTime = mean_squared_error(MP_4on5_y_test, MP_4on5_predictions_with_benchTime)
r2_with_benchTime = r2_score(MP_4on5_y_test, MP_4on5_predictions_with_benchTime)

print("Model with timeOnBench:")
print(f"Mean Squared Error: {mse_with_benchTime}")
print(f"R2 Score: {r2_with_benchTime}")

# Step 2: Train and evaluate with I_F_points removed
# Remove the I_F_points column from your training and testing sets
MP_4on5_X_train_no_benchTime = MP_4on5_X_train.drop(columns=['timeOnBench'])
MP_4on5_X_test_no_benchTime = MP_4on5_X_test.drop(columns=['timeOnBench'])

MP_4on5_model_without_benchTime = MP_RFR_pipeline.fit(MP_4on5_X_train_no_benchTime, MP_4on5_y_train)
MP_4on5_predictions_without_benchTime = MP_4on5_model_without_benchTime.predict(MP_4on5_X_test_no_benchTime)

# Evaluate the model
mse_without_benchTime = mean_squared_error(MP_4on5_y_test, MP_4on5_predictions_without_benchTime)
r2_without_benchTime = r2_score(MP_4on5_y_test, MP_4on5_predictions_without_benchTime)

print("Model without I_F_points:")
print(f"Mean Squared Error: {mse_without_benchTime}")
print(f"R2 Score: {r2_without_benchTime}/n")

# Step 3: Compare the two models
print("Comparison of Model Performance:")
print(f"Difference in MSE: {mse_without_benchTime - mse_with_benchTime}")
print(f"Difference in R2 Score: {r2_without_benchTime - r2_with_benchTime}")

Model with timeOnBench:
Mean Squared Error: 6.503694920920136
R2 Score: 0.9800815823220064
Model without I_F_points:
Mean Squared Error: 6.575822961822919
R2 Score: 0.9798606807479833/n
Comparison of Model Performance:
Difference in MSE: 0.07212804090278269
Difference in R2 Score: -0.0002209015740231024


## 5on4:

### 5on4 with I_F_points:

In [89]:
# Drop the target column to create the feature matrix X
MP_5on4_X = MP_5on4_stats.drop(columns=col_not_processed) 
MP_5on4_y = MP_5on4_stats['gameScore']  # Target variable

# Split the data into training and testing sets
MP_5on4_X_train, MP_5on4_X_test, MP_5on4_y_train, MP_5on4_y_test = train_test_split(MP_5on4_X, MP_5on4_y, test_size=0.2, random_state=42)

In [90]:
MP_5on4_model = MP_RFR_pipeline.fit(MP_5on4_X_train, MP_5on4_y_train)

rf_model = MP_5on4_model.named_steps['regressor']

# Access the preprocessor step to get the transformed feature names
preprocessor = MP_5on4_model.named_steps['preprocessor']

# Get the transformed feature names
transformed_feature_names = get_feature_names(preprocessor)

# Get feature importances from the Random Forest model
feature_importances = pd.Series(rf_model.feature_importances_, index=transformed_feature_names)
feature_importances.sort_values(ascending=False, inplace=True)

print(feature_importances.head(10))

playerRating                   0.992665
OnIce_F_blockedShotAttempts    0.000569
I_F_secondaryAssists           0.000225
ZR_playerRating                0.000203
I_F_playStopped                0.000169
OffIce_F_shotAttempts          0.000166
shifts                         0.000165
OnIce_F_highDangerxGoals       0.000161
I_F_oZoneShiftEnds             0.000157
OnIce_F_highDangerShots        0.000155
dtype: float64


### 5on4 without I_F_points:

In [91]:
# Drop the target column to create the feature matrix X
MP_5on4_X = MP_5on4_stats.drop(columns=col_not_processed_without_points) 
MP_5on4_y = MP_5on4_stats['gameScore']  # Target variable

# Split the data into training and testing sets
MP_5on4_X_train, MP_5on4_X_test, MP_5on4_y_train, MP_5on4_y_test = train_test_split(MP_5on4_X, MP_5on4_y, test_size=0.2, random_state=42)

In [92]:
MP_5on4_model = MP_RFR_pipeline.fit(MP_5on4_X_train, MP_5on4_y_train)

rf_model = MP_5on4_model.named_steps['regressor']

# Access the preprocessor step to get the transformed feature names
preprocessor = MP_5on4_model.named_steps['preprocessor']

# Get the transformed feature names
transformed_feature_names = get_feature_names(preprocessor)

# Get feature importances from the Random Forest model
feature_importances = pd.Series(rf_model.feature_importances_, index=transformed_feature_names)
feature_importances.sort_values(ascending=False, inplace=True)

print(feature_importances.head(10))

playerRating                   0.992710
OnIce_F_blockedShotAttempts    0.000550
I_F_secondaryAssists           0.000259
ZR_playerRating                0.000203
OffIce_F_shotAttempts          0.000173
OnIce_F_highDangerxGoals       0.000171
I_F_playStopped                0.000168
shifts                         0.000161
OffIce_F_xGoals                0.000144
OnIce_F_highDangerShots        0.000144
dtype: float64


### 5on4 Comparison of with and without I_F_points:

In [93]:
# Re assign the variable so that the comparison doesn't throw an error
MP_5on4_X = MP_5on4_stats.drop(columns=col_not_processed) 
MP_5on4_y = MP_5on4_stats['gameScore']  # Target variable

# Split the data into training and testing sets
MP_5on4_X_train, MP_5on4_X_test, MP_5on4_y_train, MP_5on4_y_test = train_test_split(MP_5on4_X, MP_5on4_y, test_size=0.2, random_state=42)

# Step 1: Train and evaluate with I_F_points included
# Assume your original training set includes I_F_points
MP_5on4_model_with_points = MP_RFR_pipeline.fit(MP_5on4_X_train, MP_5on4_y_train)
MP_5on4_predictions_with_points = MP_5on4_model_with_points.predict(MP_5on4_X_test)

# Evaluate the model
mse_with_points = mean_squared_error(MP_5on4_y_test, MP_5on4_predictions_with_points)
r2_with_points = r2_score(MP_5on4_y_test, MP_5on4_predictions_with_points)

print("Model with I_F_points:")
print(f"Mean Squared Error: {mse_with_points}")
print(f"R2 Score: {r2_with_points}")

# Step 2: Train and evaluate with I_F_points removed
# Remove the I_F_points column from your training and testing sets
MP_5on4_X_train_no_points = MP_5on4_X_train.drop(columns=['I_F_points'])
MP_5on4_X_test_no_points = MP_5on4_X_test.drop(columns=['I_F_points'])

MP_5on4_model_without_points = MP_RFR_pipeline.fit(MP_5on4_X_train_no_points, MP_5on4_y_train)
MP_5on4_predictions_without_points = MP_5on4_model_without_points.predict(MP_5on4_X_test_no_points)

# Evaluate the model
mse_without_points = mean_squared_error(MP_5on4_y_test, MP_5on4_predictions_without_points)
r2_without_points = r2_score(MP_5on4_y_test, MP_5on4_predictions_without_points)

print("Model without I_F_points:")
print(f"Mean Squared Error: {mse_without_points}")
print(f"R2 Score: {r2_without_points}/n")

# Step 3: Compare the two models
print("Comparison of Model Performance:")
print(f"Difference in MSE: {mse_without_points - mse_with_points}")
print(f"Difference in R2 Score: {r2_without_points - r2_with_points}")

Model with I_F_points:
Mean Squared Error: 4.0533542273090255
R2 Score: 0.9931201484804324
Model without I_F_points:
Mean Squared Error: 3.989578785815973
R2 Score: 0.993228396006669/n
Comparison of Model Performance:
Difference in MSE: -0.06377544149305248
Difference in R2 Score: 0.0001082475262366156


## Other Situations:

### Other situations with I_F_points:

In [94]:
# Drop the target column to create the feature matrix X
MP_OS_X = MP_OS_stats.drop(columns=col_not_processed) 
MP_OS_y = MP_OS_stats['gameScore']  # Target variable

# Split the data into training and testing sets
MP_OS_X_train, MP_OS_X_test, MP_OS_y_train, MP_OS_y_test = train_test_split(MP_OS_X, MP_OS_y, test_size=0.2, random_state=42)

In [95]:
# Fit the pipeline to your training data
OS_model = MP_RFR_pipeline.fit(MP_OS_X_train, MP_OS_y_train)

# Access the trained Random Forest model inside the pipeline
rf_model = OS_model.named_steps['regressor']

# Access the preprocessor step to get the transformed feature names
preprocessor = OS_model.named_steps['preprocessor']

# Get the transformed feature names
transformed_feature_names = get_feature_names(preprocessor)

# Get feature importances from the Random Forest model
feature_importances = pd.Series(rf_model.feature_importances_, index=transformed_feature_names)
feature_importances.sort_values(ascending=False, inplace=True)

# Display the most important features
print(feature_importances.head(10))

playerRating                           0.998137
OffIce_F_xGoals                        0.000076
ZR_playerRating                        0.000064
OffIce_A_xGoals                        0.000055
timeOnBench                            0.000054
offIce_xGoalsPercentage                0.000049
OffIce_F_shotAttempts                  0.000046
I_F_xGoalsFromActualReboundsOfShots    0.000042
avg_ice_time/shift (s)                 0.000037
OnIce_F_lowDangerxGoals                0.000036
dtype: float64


### Other situations without I_F_points

In [96]:
# Drop the target column to create the feature matrix X
MP_OS_X = MP_OS_stats.drop(columns=col_not_processed_without_points) 
MP_OS_y = MP_OS_stats['gameScore']  # Target variable

# Split the data into training and testing sets
MP_OS_X_train, MP_OS_X_test, MP_OS_y_train, MP_OS_y_test = train_test_split(MP_OS_X, MP_OS_y, test_size=0.2, random_state=42)

# Fit the pipeline to your training data
OS_model = MP_RFR_pipeline.fit(MP_OS_X_train, MP_OS_y_train)

# Access the trained Random Forest model inside the pipeline
rf_model = OS_model.named_steps['regressor']

# Access the preprocessor step to get the transformed feature names
preprocessor = OS_model.named_steps['preprocessor']

# Get the transformed feature names
transformed_feature_names = get_feature_names(preprocessor)

# Get feature importances from the Random Forest model
feature_importances = pd.Series(rf_model.feature_importances_, index=transformed_feature_names)
feature_importances.sort_values(ascending=False, inplace=True)

# Display the most important features
print(feature_importances.head(10))

playerRating                           0.998161
OffIce_F_xGoals                        0.000071
ZR_playerRating                        0.000063
timeOnBench                            0.000052
OffIce_A_xGoals                        0.000048
OffIce_F_shotAttempts                  0.000045
offIce_xGoalsPercentage                0.000039
avg_ice_time/shift (s)                 0.000038
I_F_xGoalsFromActualReboundsOfShots    0.000037
games_played                           0.000033
dtype: float64


### Comparing the model with and without the I_F_points column

In [97]:
# Re assign the variable so that the comparison doesn't throw an error
MP_OS_X = MP_OS_stats.drop(columns=col_not_processed) 
MP_OS_y = MP_OS_stats['gameScore']  # Target variable

# Split the data into training and testing sets
MP_OS_X_train, MP_OS_X_test, MP_OS_y_train, MP_OS_y_test = train_test_split(MP_OS_X, MP_OS_y, test_size=0.2, random_state=42)

# Step 1: Train and evaluate with I_F_points included
# Assume your original training set includes I_F_points
OS_model_with_points = MP_RFR_pipeline.fit(MP_OS_X_train, MP_OS_y_train)
predictions_with_points = OS_model_with_points.predict(MP_OS_X_test)

# Evaluate the model
mse_with_points = mean_squared_error(MP_OS_y_test, predictions_with_points)
r2_with_points = r2_score(MP_OS_y_test, predictions_with_points)

print("Model with I_F_points:")
print(f"Mean Squared Error: {mse_with_points}")
print(f"R2 Score: {r2_with_points}")

# Step 2: Train and evaluate with I_F_points removed
# Remove the I_F_points column from your training and testing sets
MP_OS_X_train_no_points = MP_OS_X_train.drop(columns=['I_F_points'])
MP_OS_X_test_no_points = MP_OS_X_test.drop(columns=['I_F_points'])

OS_model_without_points = MP_RFR_pipeline.fit(MP_OS_X_train_no_points, MP_OS_y_train)
predictions_without_points = OS_model_without_points.predict(MP_OS_X_test_no_points)

# Evaluate the model
mse_without_points = mean_squared_error(MP_OS_y_test, predictions_without_points)
r2_without_points = r2_score(MP_OS_y_test, predictions_without_points)

print("Model without I_F_points:")
print(f"Mean Squared Error: {mse_without_points}")
print(f"R2 Score: {r2_without_points}/n")

# Step 3: Compare the two models
print("Comparison of Model Performance:")
print(f"Difference in MSE: {mse_without_points - mse_with_points}")
print(f"Difference in R2 Score: {r2_without_points - r2_with_points}")

Model with I_F_points:
Mean Squared Error: 0.640942075295139
R2 Score: 0.9982298433589563
Model without I_F_points:
Mean Squared Error: 0.6474760695659723
R2 Score: 0.9982117977448565/n
Comparison of Model Performance:
Difference in MSE: 0.006533994270833232
Difference in R2 Score: -1.8045614099793106e-05


# Exploring a new gameScore metric that takes into account takeaways and giveaways:

In [98]:
MP_AS_stats.loc[MP_AS_stats['name'] == 'Clayton Keller']

Unnamed: 0,playerId,season,name,team,position,situation,games_played,icetime,shifts,gameScore,onIce_xGoalsPercentage,offIce_xGoalsPercentage,onIce_corsiPercentage,offIce_corsiPercentage,onIce_fenwickPercentage,offIce_fenwickPercentage,iceTimeRank,I_F_xOnGoal,I_F_xGoals,I_F_xRebounds,I_F_xFreeze,I_F_xPlayStopped,I_F_xPlayContinuedInZone,I_F_xPlayContinuedOutsideZone,I_F_flurryAdjustedxGoals,I_F_scoreVenueAdjustedxGoals,I_F_flurryScoreVenueAdjustedxGoals,I_F_primaryAssists,I_F_secondaryAssists,I_F_shotsOnGoal,I_F_missedShots,I_F_blockedShotAttempts,I_F_shotAttempts,I_F_points,I_F_goals,I_F_rebounds,I_F_reboundGoals,I_F_freeze,I_F_playStopped,I_F_playContinuedInZone,I_F_playContinuedOutsideZone,I_F_savedShotsOnGoal,I_F_savedUnblockedShotAttempts,penalties,I_F_penalityMinutes,I_F_faceOffsWon,I_F_hits,I_F_takeaways,I_F_giveaways,I_F_lowDangerShots,I_F_mediumDangerShots,I_F_highDangerShots,I_F_lowDangerxGoals,I_F_mediumDangerxGoals,I_F_highDangerxGoals,I_F_lowDangerGoals,I_F_mediumDangerGoals,I_F_highDangerGoals,I_F_scoreAdjustedShotsAttempts,I_F_unblockedShotAttempts,I_F_scoreAdjustedUnblockedShotAttempts,I_F_dZoneGiveaways,I_F_xGoalsFromxReboundsOfShots,I_F_xGoalsFromActualReboundsOfShots,I_F_reboundxGoals,I_F_xGoals_with_earned_rebounds,I_F_xGoals_with_earned_rebounds_scoreAdjusted,I_F_xGoals_with_earned_rebounds_scoreFlurryAdjusted,I_F_shifts,I_F_oZoneShiftStarts,I_F_dZoneShiftStarts,I_F_neutralZoneShiftStarts,I_F_flyShiftStarts,I_F_oZoneShiftEnds,I_F_dZoneShiftEnds,I_F_neutralZoneShiftEnds,I_F_flyShiftEnds,faceoffsWon,faceoffsLost,timeOnBench,penalityMinutes,penalityMinutesDrawn,penaltiesDrawn,shotsBlockedByPlayer,OnIce_F_xOnGoal,OnIce_F_xGoals,OnIce_F_flurryAdjustedxGoals,OnIce_F_scoreVenueAdjustedxGoals,OnIce_F_flurryScoreVenueAdjustedxGoals,OnIce_F_shotsOnGoal,OnIce_F_missedShots,OnIce_F_blockedShotAttempts,OnIce_F_shotAttempts,OnIce_F_goals,OnIce_F_rebounds,OnIce_F_reboundGoals,OnIce_F_lowDangerShots,OnIce_F_mediumDangerShots,OnIce_F_highDangerShots,OnIce_F_lowDangerxGoals,OnIce_F_mediumDangerxGoals,OnIce_F_highDangerxGoals,OnIce_F_lowDangerGoals,OnIce_F_mediumDangerGoals,OnIce_F_highDangerGoals,OnIce_F_scoreAdjustedShotsAttempts,OnIce_F_unblockedShotAttempts,OnIce_F_scoreAdjustedUnblockedShotAttempts,OnIce_F_xGoalsFromxReboundsOfShots,OnIce_F_xGoalsFromActualReboundsOfShots,OnIce_F_reboundxGoals,OnIce_F_xGoals_with_earned_rebounds,OnIce_F_xGoals_with_earned_rebounds_scoreAdjusted,OnIce_F_xGoals_with_earned_rebounds_scoreFlurryAdjusted,OnIce_A_xOnGoal,OnIce_A_xGoals,OnIce_A_flurryAdjustedxGoals,OnIce_A_scoreVenueAdjustedxGoals,OnIce_A_flurryScoreVenueAdjustedxGoals,OnIce_A_shotsOnGoal,OnIce_A_missedShots,OnIce_A_blockedShotAttempts,OnIce_A_shotAttempts,OnIce_A_goals,OnIce_A_rebounds,OnIce_A_reboundGoals,OnIce_A_lowDangerShots,OnIce_A_mediumDangerShots,OnIce_A_highDangerShots,OnIce_A_lowDangerxGoals,OnIce_A_mediumDangerxGoals,OnIce_A_highDangerxGoals,OnIce_A_lowDangerGoals,OnIce_A_mediumDangerGoals,OnIce_A_highDangerGoals,OnIce_A_scoreAdjustedShotsAttempts,OnIce_A_unblockedShotAttempts,OnIce_A_scoreAdjustedUnblockedShotAttempts,OnIce_A_xGoalsFromxReboundsOfShots,OnIce_A_xGoalsFromActualReboundsOfShots,OnIce_A_reboundxGoals,OnIce_A_xGoals_with_earned_rebounds,OnIce_A_xGoals_with_earned_rebounds_scoreAdjusted,OnIce_A_xGoals_with_earned_rebounds_scoreFlurryAdjusted,OffIce_F_xGoals,OffIce_A_xGoals,OffIce_F_shotAttempts,OffIce_A_shotAttempts,xGoalsForAfterShifts,xGoalsAgainstAfterShifts,corsiForAfterShifts,corsiAgainstAfterShifts,fenwickForAfterShifts,fenwickAgainstAfterShifts,birthDate,weight,height,nationality,shoots,avg_ice_time/shift (s),avg_shifts_per_game,age,age_group,ZR_gameScore,playerRating,ZR_playerRating
757,8479343,2022,Clayton Keller,ARI,R,all,67,80958.0,1693.0,50.78,0.5,0.37,0.49,0.4,0.49,0.39,113.0,167.7,20.52,12.37,33.37,5.21,92.05,64.48,19.99,20.5,19.98,22.0,13.0,177.0,51.0,46.0,274.0,63.0,28.0,16.0,2.0,37.0,5.0,84.0,58.0,149.0,200.0,14.0,28.0,29.0,17.0,61.0,53.0,161.0,40.0,27.0,5.35,4.84,10.33,8.0,6.0,14.0,269.44,228.0,225.1,16.0,2.87,2.72,2.4,20.99,20.97,20.59,1693.0,276.0,271.0,272.0,874.0,172.0,248.0,280.0,993.0,29.0,39.0,162527.0,28.0,48.0,24.0,31.0,670.43,70.46,68.02,70.2,67.76,673.0,269.0,246.0,1188.0,86.0,57.0,7.0,682.0,187.0,73.0,20.34,22.42,27.7,24.0,31.0,31.0,1167.57,942.0,929.83,11.31,10.47,10.47,71.3,71.02,69.47,701.16,70.06,67.36,70.96,68.26,737.0,234.0,267.0,1238.0,74.0,56.0,9.0,712.0,196.0,63.0,21.81,23.87,24.36,32.0,19.0,23.0,1267.59,971.0,990.75,10.57,12.67,12.67,67.95,68.91,67.16,88.76,151.83,1971.0,2931.0,0.0,0.0,0.0,0.0,0.0,0.0,1998-07-29 00:00:00,170.0,"5' 10""",USA,L,48.0,25.0,24.0,Young Pro,61.875,40.234784,48.777544
1841,8479343,2023,Clayton Keller,ARI,R,all,82,102091.0,2067.0,77.36,0.52,0.36,0.53,0.38,0.53,0.37,143.0,218.67,24.95,16.13,45.29,6.88,123.61,85.14,24.32,24.9,24.27,38.0,11.0,224.0,78.0,81.0,383.0,86.0,37.0,16.0,2.0,37.0,6.0,99.0,107.0,187.0,265.0,19.0,39.0,22.0,18.0,58.0,63.0,212.0,68.0,22.0,7.56,7.76,9.63,16.0,8.0,13.0,376.21,302.0,298.95,15.0,3.63,3.19,3.42,25.15,24.99,24.47,2067.0,338.0,264.0,316.0,1149.0,209.0,309.0,329.0,1220.0,22.0,41.0,197418.0,39.0,50.0,24.0,34.0,893.73,101.42,95.04,100.68,94.32,889.0,347.0,383.0,1619.0,110.0,73.0,10.0,853.0,276.0,107.0,27.25,33.06,41.11,36.0,32.0,42.0,1592.01,1236.0,1222.29,15.26,15.82,16.17,100.52,99.75,95.16,802.13,94.89,91.48,96.1,92.64,801.0,299.0,315.0,1415.0,87.0,68.0,15.0,723.0,279.0,98.0,23.36,35.34,36.19,27.0,34.0,26.0,1445.24,1100.0,1117.55,12.19,15.8,15.8,91.28,92.39,90.75,122.12,218.39,2335.0,3814.0,0.0,0.0,0.0,0.0,0.0,0.0,1998-07-29 00:00:00,170.0,"5' 10""",USA,L,49.0,25.0,25.0,Young Pro,83.095,53.806476,53.845163
2809,8479343,2024,Clayton Keller,ARI,R,all,78,90070.0,1743.0,73.75,0.56,0.43,0.57,0.43,0.57,0.42,221.0,237.47,24.88,16.54,52.75,8.12,137.29,91.42,24.13,24.91,24.17,27.0,16.0,228.0,103.0,99.0,430.0,76.0,33.0,23.0,2.0,48.0,1.0,127.0,99.0,195.0,298.0,16.0,32.0,30.0,20.0,43.0,47.0,249.0,57.0,25.0,7.88,7.0,10.0,12.0,8.0,13.0,428.53,331.0,330.81,5.0,3.74,2.86,2.31,26.3,26.33,25.69,1743.0,310.0,189.0,318.0,926.0,206.0,264.0,317.0,956.0,30.0,35.0,193372.0,32.0,56.0,28.0,30.0,853.06,91.95,87.62,91.91,87.59,782.0,398.0,416.0,1596.0,94.0,77.0,5.0,868.0,205.0,107.0,25.76,25.27,40.92,30.0,27.0,37.0,1584.97,1180.0,1174.17,14.31,13.32,13.32,92.95,92.88,89.49,648.98,72.24,69.62,73.15,70.47,631.0,264.0,298.0,1193.0,78.0,64.0,8.0,655.0,150.0,90.0,19.58,17.96,34.7,31.0,20.0,27.0,1208.33,895.0,907.11,8.87,12.81,12.72,68.39,68.94,67.53,139.34,188.46,2767.0,3715.0,0.0,0.0,0.0,0.0,0.0,0.0,1998-07-29 00:00:00,170.0,"5' 10""",USA,L,52.0,22.0,26.0,Young Pro,75.76,50.630982,52.306098


## Exploring how to weight xGoals for each scoring chance category low, med, high:

In [99]:
AS_total_xGoals = MP_AS_stats['I_F_xGoals'].sum()
print(AS_total_xGoals)

AS_total_lowDangerxGoals = MP_AS_stats['I_F_lowDangerxGoals'].sum()
print(AS_total_lowDangerxGoals)

AS_total_medDangerxGoals = MP_AS_stats['I_F_mediumDangerxGoals'].sum()
print(AS_total_medDangerxGoals)

AS_total_highDangerxGoals = MP_AS_stats['I_F_highDangerxGoals'].sum()
print(AS_total_highDangerxGoals)

#find the scoring chance xGoals percentage for weighting.
AS_LDxG_percent = 100 * (AS_total_lowDangerxGoals/AS_total_xGoals)
print('the AS LDxG_percentage is:  ', AS_LDxG_percent)

AS_MDxG_percent = 100 * (AS_total_medDangerxGoals/AS_total_xGoals)
print('the AS MDxG_percentage is:  ', AS_MDxG_percent)

AS_HDxG_percent = 100 * (AS_total_highDangerxGoals/AS_total_xGoals)
print('the AS HDxG_percentage is:  ', AS_HDxG_percent)


24634.84
7256.17
8331.32
9047.81
the AS LDxG_percentage is:   29.454910200350398
the AS MDxG_percentage is:   33.81925760427102
the AS HDxG_percentage is:   36.727699469531764


## Making my ZR_gameScore column:

In [100]:
def calculate_ZR_gameScore(df):
    """
    Calculates the ZR_gameScore for a given DataFrame.

    Args:
        df: The DataFrame containing player statistics.

    Returns:
        The DataFrame with the 'ZR_gameScore' column added.
    """

    df['ZR_gameScore'] = (
        (df['I_F_goals'] * 0.75) 
        + (df['I_F_primaryAssists'] * 0.7) 
        + (df['I_F_secondaryAssists'] * 0.55)
        + (df['I_F_shotsOnGoal'] * 0.075) 
        + (df['shotsBlockedByPlayer'] * 0.05) 
        + (df['penaltiesDrawn'] * 0.15) 
        - (df['penalties'] * 0.15)
        + (df['I_F_hits'] * 0.01) 
        - (df['I_F_dZoneGiveaways'] * 0.03) 
        + (df['I_F_takeaways'] * 0.015) 
        - (df['I_F_giveaways'] * 0.015)
        + (df['onIce_corsiPercentage']) 
        + (df['faceoffsWon'] * 0.01) 
        - (df['faceoffsLost'] * 0.01)
        + (df['OnIce_F_goals'] * 0.15) 
        - (df['OnIce_A_goals'] * 0.15)
    )

    return df

# Apply the function to your DataFrames
MP_AS_stats = calculate_ZR_gameScore(MP_AS_stats)
MP_5on5_stats = calculate_ZR_gameScore(MP_5on5_stats)
MP_4on5_stats = calculate_ZR_gameScore(MP_4on5_stats)
MP_5on4_stats = calculate_ZR_gameScore(MP_5on4_stats)
MP_OS_stats = calculate_ZR_gameScore(MP_OS_stats)

In [101]:
gameScore_df = MP_AS_stats[['name', 'gameScore', 'playerRating','ZR_gameScore', 'ZR_playerRating']]
gameScore_df.loc[gameScore_df['name'] == 'Nick Suzuki']

Unnamed: 0,name,gameScore,playerRating,ZR_gameScore,ZR_playerRating
827,Nick Suzuki,50.28,39.885403,56.545,44.627423
1909,Nick Suzuki,52.7,38.481139,60.71,39.421373
2496,Nick Suzuki,69.75,48.192404,71.625,49.448955
