# The Money Puck Recommender Engine:

In [175]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import pairwise_distances

# Load the datasets
pd.set_option('display.max_columns', None) # Display Preference

In [176]:
#Saving the new MoneyPuck Datafranes as CSVs:
# All Situations:
MP_all_situations_2021_2022_df = pd.read_csv('MP_NHL_data/MoneyPuck_all_situations_2021_2022.csv')
MP_all_situations_2022_2023_df = pd.read_csv('MP_NHL_data/MoneyPuck_all_situations_2022_2023.csv')
MP_all_situations_2023_2024_df = pd.read_csv('MP_NHL_data/MoneyPuck_all_situations_2023_2024.csv')
# 5on5:
MP_5on5_2021_2022_df = pd.read_csv('MP_NHL_data/MoneyPuck_5on5_2021_2022.csv')
MP_5on5_2022_2023_df = pd.read_csv('MP_NHL_data/MoneyPuck_5on5_2022_2023.csv')
MP_5on5_2023_2024_df = pd.read_csv('MP_NHL_data/MoneyPuck_5on5_2023_2024.csv')
# 4on5:
MP_4on5_2021_2022_df = pd.read_csv('MP_NHL_data/MoneyPuck_4on5_2021_2022.csv')
MP_4on5_2022_2023_df = pd.read_csv('MP_NHL_data/MoneyPuck_4on5_2022_2023.csv')
MP_4on5_2023_2024_df = pd.read_csv('MP_NHL_data/MoneyPuck_4on5_2023_2024.csv')
#5on4:
MP_5on4_2021_2022_df = pd.read_csv('MP_NHL_data/MoneyPuck_5on4_2021_2022.csv')
MP_5on4_2022_2023_df = pd.read_csv('MP_NHL_data/MoneyPuck_5on4_2022_2023.csv')
MP_5on4_2023_2024_df = pd.read_csv('MP_NHL_data/MoneyPuck_5on4_2023_2024.csv')
#Other Situations:
MP_other_situations_2021_2022_df = pd.read_csv('MP_NHL_data/MoneyPuck_other_situations_2021_2022.csv')
MP_other_situations_2022_2023_df = pd.read_csv('MP_NHL_data/MoneyPuck_other_situations_2022_2023.csv')
MP_other_situations_2023_2024_df = pd.read_csv('MP_NHL_data/MoneyPuck_other_situations_2023_2024.csv')

In [177]:
#Concatenating the situational dataframes:
#All situations
MP_all_situations_frames = [MP_all_situations_2021_2022_df, MP_all_situations_2022_2023_df, MP_all_situations_2023_2024_df]
MP_AS_stats = pd.concat(MP_all_situations_frames, ignore_index=True)

#5on5:
MP_5on5_frames = [MP_5on5_2021_2022_df, MP_5on5_2022_2023_df, MP_5on5_2023_2024_df]
MP_5on5_stats = pd.concat(MP_5on5_frames, ignore_index=True)

#4on5:
MP_4on5_frames = [MP_4on5_2021_2022_df, MP_4on5_2022_2023_df, MP_4on5_2023_2024_df]
MP_4on5_stats = pd.concat(MP_4on5_frames, ignore_index=True)

#5on4:
MP_5on4_frames = [MP_5on4_2021_2022_df, MP_5on4_2022_2023_df, MP_5on4_2023_2024_df]
MP_5on4_stats = pd.concat(MP_5on4_frames, ignore_index=True)

#Other:
MP_other_situations_frames = [MP_other_situations_2021_2022_df, MP_other_situations_2022_2023_df, MP_other_situations_2023_2024_df]
MP_OS_stats = pd.concat(MP_other_situations_frames, ignore_index=True)

In [178]:
#Saving combined DFs to CSVs:
#All Situations:
MP_AS_stats.to_csv('MP_all_situations_2022_to_2024.csv', index=0)
#5on5:
MP_5on5_stats.to_csv('MP_5on5_2022_to_2024.csv', index=0)
#4on5:
MP_4on5_stats.to_csv('MP_4on5_2022_to_2024.csv', index=0)
#5on4:
MP_5on4_stats.to_csv('MP_5on4_2022_to_2024.csv', index=0)
#Other Situations:
MP_OS_stats.to_csv('MP_other_situations_2022_to_2024.csv', index=0)

### Adding the biographical data

In [179]:
# Reading the players' bio data
MP_player_bios = pd.read_csv('MP_NHL_data/allPlayersLookup.csv')

In [180]:
# Combining the data frames
MP_merged_AS_stats_bios = pd.merge(MP_AS_stats, MP_player_bios, on='playerId', how='left')
MP_merged_5on5_stats_bios = pd.merge(MP_5on5_stats, MP_player_bios, on='playerId', how='left')
MP_merged_4on5_stats_bios = pd.merge(MP_4on5_stats, MP_player_bios, on='playerId', how='left')
MP_merged_5on4_stats_bios = pd.merge(MP_5on4_stats, MP_player_bios, on='playerId', how='left')
MP_merged_OS_stats_bios = pd.merge(MP_OS_stats, MP_player_bios, on='playerId', how='left')

In [181]:
# Columns to drop after merge:
drop_cols_post_merge = ['primaryNumber', 'primaryPosition', 'name_y', 'position_y', 'team_y']

MP_merged_AS_stats_bios.drop(columns=drop_cols_post_merge, inplace=True)
MP_merged_5on5_stats_bios.drop(columns=drop_cols_post_merge, inplace=True)
MP_merged_4on5_stats_bios.drop(columns=drop_cols_post_merge, inplace=True)
MP_merged_5on4_stats_bios.drop(columns=drop_cols_post_merge, inplace=True)
MP_merged_OS_stats_bios.drop(columns=drop_cols_post_merge, inplace=True)

In [182]:
# Renaming the columns that were shared in the merge:
col_rename_map = {'name_x': 'name', 'team_x': 'team', 'position_x': 'position','shootsCatches': 'shoots'}
MP_AS_stats = MP_merged_AS_stats_bios.rename(mapper=col_rename_map, axis=1)
MP_5on5_stats = MP_merged_5on5_stats_bios.rename(mapper=col_rename_map, axis=1)
MP_4on5_stats = MP_merged_4on5_stats_bios.rename(mapper=col_rename_map, axis=1)
MP_5on4_stats = MP_merged_5on4_stats_bios.rename(mapper=col_rename_map, axis=1)
MP_OS_stats = MP_merged_OS_stats_bios.rename(mapper=col_rename_map, axis=1)

## Feature Engineering:

### Average Ice Time Column

In [183]:
# Making an Avg IceTime per shift column

# All Situations
MP_AS_stats['avg_ice_time/shift (s)'] = round((MP_AS_stats['icetime'] / MP_AS_stats['shifts']), 0)
cols = MP_AS_stats.columns.tolist()
cols.insert(9, cols.pop(cols.index('avg_ice_time/shift (s)')))
MP_all_situations_stats = MP_AS_stats[cols]

#5on5:
MP_5on5_stats['avg_ice_time/shift (s)'] = round((MP_5on5_stats['icetime'] / MP_5on5_stats['shifts']), 0)
cols_5on5 = MP_5on5_stats.columns.tolist()
cols_5on5.insert(9, cols_5on5.pop(cols_5on5.index('avg_ice_time/shift (s)')))
MP_5on5_stats = MP_5on5_stats[cols_5on5]

#4on5:
MP_4on5_stats['avg_ice_time/shift (s)'] = round((MP_4on5_stats['icetime'] / MP_4on5_stats['shifts']), 0)
cols_4on5 = MP_4on5_stats.columns.tolist()
cols_4on5.insert(9, cols_4on5.pop(cols_4on5.index('avg_ice_time/shift (s)')))
MP_4on5_stats = MP_4on5_stats[cols_4on5]

#5on4:
MP_5on4_stats['avg_ice_time/shift (s)'] = round((MP_5on4_stats['icetime'] / MP_5on4_stats['shifts']), 0)
cols_5on4 = MP_5on4_stats.columns.tolist()
cols_5on4.insert(9, cols_5on4.pop(cols_5on4.index('avg_ice_time/shift (s)')))
MP_5on4_stats = MP_5on4_stats[cols_5on4]

#Other situations:
MP_OS_stats['avg_ice_time/shift (s)'] = round((MP_OS_stats['icetime'] / MP_OS_stats['shifts']), 0)
cols_OS = MP_OS_stats.columns.tolist()
cols_OS.insert(9, cols_OS.pop(cols_OS.index('avg_ice_time/shift (s)')))
MP_OS_stats = MP_OS_stats[cols_OS]


### Average shifts per game column:

In [184]:
# Making an Avg shifts per game column
#AS:
MP_AS_stats['avg_shifts_per_game'] = round(MP_AS_stats['shifts']/MP_AS_stats['games_played'], 0)
#5on5:
MP_5on5_stats['avg_shifts_per_game'] = round(MP_5on5_stats['shifts']/MP_5on5_stats['games_played'], 0)
#4on5:
MP_4on5_stats['avg_shifts_per_game'] = round(MP_4on5_stats['shifts']/MP_4on5_stats['games_played'], 0)
#5on4:
MP_5on4_stats['avg_shifts_per_game'] = round(MP_5on4_stats['shifts']/MP_5on4_stats['games_played'], 0)
#OS:
MP_OS_stats['avg_shifts_per_game'] = round(MP_OS_stats['shifts']/MP_OS_stats['games_played'], 0)

### Adjusting seasn to be the year the season finished to help get an accurate player age:

In [185]:
# Update the season info to represent the year the season ended rather than the year that started the season.

season_map = {2021: 2022, 2022: 2023, 2023: 2024}
MP_AS_stats['season'] = MP_AS_stats['season'].map(season_map)
MP_5on5_stats['season'] = MP_5on5_stats['season'].map(season_map)
MP_4on5_stats['season'] = MP_4on5_stats['season'].map(season_map)
MP_5on4_stats['season'] = MP_5on4_stats['season'].map(season_map)
MP_OS_stats['season'] = MP_OS_stats['season'].map(season_map)

### Player age column:

In [186]:
def MP_calculate_playing_age(df, dob_col_name, season_col_name, age_col_name):
    """
    Updates the age of players in the DataFrame based on their date of birth.

    Parameters:
    df (pd.DataFrame): The DataFrame containing player data.
    dob_col_name (str): The name of the column with date of birth information.
    age_col_name (str): The name of the column where the age should be updated.
    current_year (int): The year to calculate current age from.

    Returns:
    pd.DataFrame: The DataFrame with updated ages.
    """
    # Convert the 'birthDate' column to datetime format
    df[dob_col_name] = pd.to_datetime(df[dob_col_name], errors='coerce')  # Handle potential errors during conversion

    # Extract the year
    df['birth_year'] = df[dob_col_name].dt.year

    # Calculate the new age and replace the 'Age' column
    df[age_col_name] = df[season_col_name] - df['birth_year']

    # Drop the helper column
    df.drop(columns='birth_year', inplace=True)

    return df

In [187]:
#Making an Age column based on taking the season column and subtracting the year from the birthdate using the function I made
MP_AS_stats = MP_calculate_playing_age(df=MP_AS_stats, dob_col_name='birthDate', 
                                    season_col_name='season', age_col_name='age')
MP_5on5_stats = MP_calculate_playing_age(df=MP_5on5_stats, dob_col_name='birthDate', 
                                    season_col_name='season', age_col_name='age')
MP_4on5_stats = MP_calculate_playing_age(df=MP_4on5_stats, dob_col_name='birthDate', 
                                    season_col_name='season', age_col_name='age')
MP_5on4_stats = MP_calculate_playing_age(df=MP_5on4_stats, dob_col_name='birthDate', 
                                    season_col_name='season', age_col_name='age')
MP_OS_stats = MP_calculate_playing_age(df=MP_OS_stats, dob_col_name='birthDate', 
                                    season_col_name='season', age_col_name='age')

In [188]:
# Handling the places where ages missing because the NaNs prevent the Pipelines from working
missing_age_dict = {
    'Adam Edstrom': 23,
 'Adam Ginning': 24,
 'Adam Klapka': 23,
 'Akil Thomas': 24,
 'Aku Raty': 23,
 'Alex Vlasic': 23,
 'Andy Andreoff': 33,
 'Angus Crookshank': 24,
 'Anton Levtchi': 28,
 'Arshdeep Bains': 23,
 'Blake Lizotte': 26,
 'Brad Lambert': 20,
 'Bradly Nadeau': 19,
 'Brandon Gignac': 26,
 'Brandon Scanlin': 25,
 'Brendan Brisson': 22,
 'Brennan Othmann': 21,
 'Brian Halonen': 25,
 'Cameron Butler': 22,
 'Cameron Crotty': 25,
 'Collin Graf': 21,
 'Cutter Gauthier': 20,
 'Declan Carlile': 24,
 'Elliot Desnoyers': 22,
 'Emil Heineman': 22,
 'Emil Lilleberg': 23,
 'Ethan Del Mastro': 21,
 'Filip Roos': 25,
 'Frank Nazar': 20,
 'Gage Goncalves': 23,
 'Gavin Brindley': 19,
 'Georgii Merkulov':23,
 'Graeme Clarke': 23,
 'Hudson Fasching': 29,
 'Isak Rosen': 21,
 'Ivan Miroshnichenko': 20,
 'Jack St. Ivany': 25,
 'Jack Thompson': 22,
 'Jackson Blake': 21,
 'Jacob MacDonald': 31,
 'James Malatesta': 21,
 'Jason Polin': 25,
 'Jayden Struble': 22,
 'Jeff Malott': 28,
 'Jiri Kulich': 20,
 'Jiri Smejkal': 27,
 'Josh Doan': 22,
 'Joshua Roy': 21,
 'Justin Brazeau': 26,
 'Kyle MacLean': 25,
 'Landon Slaggert': 22,
 'Lane Hutson': 20,
 'Liam Ohgren': 20,
 'Linus Karlsson': 24,
 'Logan Mailloux': 21,
 'Logan Morrison': 22,
 'Logan Stankoven': 21,
 'Louis Crevier': 23,
 'Luca Del Bel Belluz': 20,
 'Lukas Cormier': 22,
 'Maksymilian Szuber': 21,
 'Marat Khusnutdinov': 22,
 'Marc Johnstone': 28,
 'Marshall Rifai': 26,
 'Mason Marchment': 29,
 'Mason Morelli': 28,
 'Matt Rempe': 22,
 'Matt Roy': 29,
 'Matt Savoie': 20,
 'Mavrik Bourque': 22,
 'Maxwell Crozier': 24,
 'Nathan Bastian': 26,
 'Nikita Chibrikov': 21,
 'Olen Zellweger': 20,
 'Olle Lycksell': 24,
 'Ondrej Pavel': 23,
 'Oskar Steen': 26,
 'Patrik Koch': 27,
 'Philip Kemp': 25,
 'Pierrick Dube': 23,
 'Ruslan Iskhakov': 24,
 'Ryan Winterton': 20,
 'Ryker Evans': 22,
 'Sam Colangelo': 22,
 'Sam Malinski': 26,
 'Samuel Laberge': 27,
 'Scott Morrow': 21,
 'Shakir Mukhamadullin': 22,
 'Simon Nemec': 20,
 'Vasily Ponomarev': 22,
 'William Lockwood': 26,
 'Wyatt Kaiser': 22,
 'Yan Kuznetsov': 22,
 'Zach Dean': 21,
 'Zachary Hayes': 25,
 'Zack Bolduc': 21,
 'Zack Ostapchuk': 21
 }

In [189]:
# Applying the missing_age_dict to the original dataframes
MP_AS_stats['age'] = MP_AS_stats.apply(
    lambda row: missing_age_dict.get(row['name'], row['age']),
    axis=1
)
MP_5on5_stats['age'] = MP_5on5_stats.apply(
    lambda row: missing_age_dict.get(row['name'], row['age']),
    axis=1
)

MP_4on5_stats['age'] = MP_4on5_stats.apply(
    lambda row: missing_age_dict.get(row['name'], row['age']),
    axis=1
)

MP_5on4_stats['age'] = MP_5on4_stats.apply(
    lambda row: missing_age_dict.get(row['name'], row['age']),
    axis=1
)

MP_OS_stats['age'] = MP_OS_stats.apply(
    lambda row: missing_age_dict.get(row['name'], row['age']),
    axis=1
)

### Making a binnned age_group column based on age:

In [190]:
# COLUMN EDITS
# Age Column: Making Age Bins 
bins = [0, 20, 26, 30, 35, 45]
labels = ['New Pro', 'Young Pro', 'Prime Age', 'Vet', 'Old Vet']

MP_AS_stats['age_group'] = pd.cut(MP_AS_stats['age'], bins, labels=labels)
MP_5on5_stats['age_group'] = pd.cut(MP_5on5_stats['age'], bins, labels=labels)
MP_4on5_stats['age_group'] = pd.cut(MP_4on5_stats['age'], bins, labels=labels)
MP_5on4_stats['age_group'] = pd.cut(MP_5on4_stats['age'], bins, labels=labels)
MP_OS_stats['age_group'] = pd.cut(MP_OS_stats['age'], bins, labels=labels)


In [191]:
#Saving the new feature engineered dataframes to csvs:
MP_AS_stats.to_csv('MP_AS_stats_bios_new_features.csv', index=0)
MP_5on5_stats.to_csv('MP_5on5_stats_bios_new_features.csv', index=0)
MP_4on5_stats.to_csv('MP_4on5_stats_bios_new_features.csv', index=0)
MP_5on4_stats.to_csv('MP_5on4_stats_bios_new_features.csv', index=0)
MP_OS_stats.to_csv('MP_OS_stats_bios_new_features.csv', index=0)

## Making the functions of the recommender engine so that it is more user friendly:

In [192]:
def MP_create_player_index_dict(df):
      """
    Create a nested dictionary from a DataFrame that maps player names to their indices for each season.

    This function resets the index of the DataFrame to ensure that the index column 
    holds the original row indices. It then groups the DataFrame by 'name' and 'season' 
    and aggregates the indices into a list for each group. After grouping, it pivots the DataFrame 
    so each players' 'name' is a row with each 'season' as columns, containing lists of indices 
    as values. Finally, it converts the pivoted DataFrame into a nested dictionary where each player's 
    name is a key to a dictionary mapping each season to the player's indices.

    Parameters:
    df (pandas.DataFrame): The DataFrame to process, which must contain 'Player' and 'Season' columns 
                           and has a unique index.

    Returns:
    dict: A nested dictionary where the first level keys are player names, and second level keys are 
          seasons, each mapping to a list of index positions for that player in that season.
    """

    # Reset the index 
      df = df.reset_index()

    # Group by 'Player' and 'Season', then aggregate the original index values into a list.
      grouped = df.groupby(['name', 'season'])['index'].agg(lambda x: list(x)).reset_index()

    # Pivot the DataFrame to have 'Player' as rows and 'Season' as columns with list of indices as values.
      pivot_df = grouped.pivot(index='name', columns='season', values='index')

    # Convert the pivoted DataFrame into a nested dictionary.
      MP_player_index_dict = pivot_df.apply(lambda row: row.dropna().to_dict(), axis=1).to_dict()

      return MP_player_index_dict

In [193]:
# Saving Index Dict Variables:
MP_AS_player_dict = MP_create_player_index_dict(MP_AS_stats)
MP_5on5_player_dict = MP_create_player_index_dict(MP_5on5_stats)
MP_4on5_player_dict = MP_create_player_index_dict(MP_4on5_stats)
MP_5on4_player_dict = MP_create_player_index_dict(MP_5on4_stats)
MP_OS_player_dict = MP_create_player_index_dict(MP_OS_stats)

In [194]:
def MP_get_index_all_gamestates(player_name, MP_AS_dict= MP_AS_player_dict, MP_5on5_dict= MP_5on5_player_dict, 
                                MP_4on5_dict= MP_4on5_player_dict, MP_5on4_dict= MP_5on4_player_dict,
                                MP_OS_dict= MP_OS_player_dict):
    """
    Returns a string with all the indices for each game state (All Strengths, Even Strength,
    Power Play, and Penalty Kill) for a given player.
    
    Parameters:
    - player_name (str): The name of the player to lookup.
    - player_index_dict_AS (dict): The dictionary with indices for All Strengths.
    - player_index_dict_ES (dict): The dictionary with indices for Even Strength.
    - player_index_dict_PP (dict): The dictionary with indices for Power Play.
    - player_index_dict_PK (dict): The dictionary with indices for Penalty Kill.

    Returns:
    - str: A formatted string containing the indices for each game state for the player.
    """
    result_string= (
        f"{player_name}'s ALL SITUATIONS indices are: {MP_AS_dict.get(player_name)}\n"
        f"{player_name}'s 5-ON-5 indices are: {MP_5on5_dict.get(player_name)}\n"
        f"{player_name}'s 4-ON-5 indices are: {MP_4on5_dict.get(player_name)}\n"
        f"{player_name}'s 5-ON-4 indices are: {MP_5on4_dict.get(player_name)}\n"
        f"{player_name}'s OTHER SITUATIONS indices are: {MP_OS_dict.get(player_name)}\n"
    )

    return print(result_string)

In [204]:
MP_get_index_all_gamestates(player_name='Nick Suzuki')

Nick Suzuki's ALL SITUATIONS indices are: {2022: [827], 2023: [1909], 2024: [2496]}
Nick Suzuki's 5-ON-5 indices are: {2022: [827], 2023: [1909], 2024: [2496]}
Nick Suzuki's 4-ON-5 indices are: {2022: [827], 2023: [1909], 2024: [2496]}
Nick Suzuki's 5-ON-4 indices are: {2022: [827], 2023: [1909], 2024: [2496]}
Nick Suzuki's OTHER SITUATIONS indices are: {2022: [827], 2023: [1909], 2024: [2496]}



In [195]:
def MP_get_players_baseline_gamestate_stats(original_gamestate_df, player_name):
    """
    Returns the baseline performance metrics of the player you are finding comparable players of 
    so you can see how their stats are over the course of the seasons in the engine.
    Args:
    - original_gamestate_df (pd.DataFrame): DataFrame containing the original skater stats.
    - player_name: must be a string of the full name of the player you want to look up, 
    If player name is misspelled or there is no data for that player, 
    the function returns an empty dataframe.
    -Small adustment from the other function. The MP function uses 'name' instead of 'Player' 

    """
    baseline_gamestate_stats = original_gamestate_df.loc[original_gamestate_df['name'] == player_name]
    return baseline_gamestate_stats

In [205]:
MP_get_players_baseline_gamestate_stats(MP_AS_stats, 'Nick Suzuki')

Unnamed: 0,playerId,season,name,team,position,situation,games_played,icetime,shifts,gameScore,onIce_xGoalsPercentage,offIce_xGoalsPercentage,onIce_corsiPercentage,offIce_corsiPercentage,onIce_fenwickPercentage,offIce_fenwickPercentage,iceTimeRank,I_F_xOnGoal,I_F_xGoals,I_F_xRebounds,I_F_xFreeze,I_F_xPlayStopped,I_F_xPlayContinuedInZone,I_F_xPlayContinuedOutsideZone,I_F_flurryAdjustedxGoals,I_F_scoreVenueAdjustedxGoals,I_F_flurryScoreVenueAdjustedxGoals,I_F_primaryAssists,I_F_secondaryAssists,I_F_shotsOnGoal,I_F_missedShots,I_F_blockedShotAttempts,I_F_shotAttempts,I_F_points,I_F_goals,I_F_rebounds,I_F_reboundGoals,I_F_freeze,I_F_playStopped,I_F_playContinuedInZone,I_F_playContinuedOutsideZone,I_F_savedShotsOnGoal,I_F_savedUnblockedShotAttempts,penalties,I_F_penalityMinutes,I_F_faceOffsWon,I_F_hits,I_F_takeaways,I_F_giveaways,I_F_lowDangerShots,I_F_mediumDangerShots,I_F_highDangerShots,I_F_lowDangerxGoals,I_F_mediumDangerxGoals,I_F_highDangerxGoals,I_F_lowDangerGoals,I_F_mediumDangerGoals,I_F_highDangerGoals,I_F_scoreAdjustedShotsAttempts,I_F_unblockedShotAttempts,I_F_scoreAdjustedUnblockedShotAttempts,I_F_dZoneGiveaways,I_F_xGoalsFromxReboundsOfShots,I_F_xGoalsFromActualReboundsOfShots,I_F_reboundxGoals,I_F_xGoals_with_earned_rebounds,I_F_xGoals_with_earned_rebounds_scoreAdjusted,I_F_xGoals_with_earned_rebounds_scoreFlurryAdjusted,I_F_shifts,I_F_oZoneShiftStarts,I_F_dZoneShiftStarts,I_F_neutralZoneShiftStarts,I_F_flyShiftStarts,I_F_oZoneShiftEnds,I_F_dZoneShiftEnds,I_F_neutralZoneShiftEnds,I_F_flyShiftEnds,faceoffsWon,faceoffsLost,timeOnBench,penalityMinutes,penalityMinutesDrawn,penaltiesDrawn,shotsBlockedByPlayer,OnIce_F_xOnGoal,OnIce_F_xGoals,OnIce_F_flurryAdjustedxGoals,OnIce_F_scoreVenueAdjustedxGoals,OnIce_F_flurryScoreVenueAdjustedxGoals,OnIce_F_shotsOnGoal,OnIce_F_missedShots,OnIce_F_blockedShotAttempts,OnIce_F_shotAttempts,OnIce_F_goals,OnIce_F_rebounds,OnIce_F_reboundGoals,OnIce_F_lowDangerShots,OnIce_F_mediumDangerShots,OnIce_F_highDangerShots,OnIce_F_lowDangerxGoals,OnIce_F_mediumDangerxGoals,OnIce_F_highDangerxGoals,OnIce_F_lowDangerGoals,OnIce_F_mediumDangerGoals,OnIce_F_highDangerGoals,OnIce_F_scoreAdjustedShotsAttempts,OnIce_F_unblockedShotAttempts,OnIce_F_scoreAdjustedUnblockedShotAttempts,OnIce_F_xGoalsFromxReboundsOfShots,OnIce_F_xGoalsFromActualReboundsOfShots,OnIce_F_reboundxGoals,OnIce_F_xGoals_with_earned_rebounds,OnIce_F_xGoals_with_earned_rebounds_scoreAdjusted,OnIce_F_xGoals_with_earned_rebounds_scoreFlurryAdjusted,OnIce_A_xOnGoal,OnIce_A_xGoals,OnIce_A_flurryAdjustedxGoals,OnIce_A_scoreVenueAdjustedxGoals,OnIce_A_flurryScoreVenueAdjustedxGoals,OnIce_A_shotsOnGoal,OnIce_A_missedShots,OnIce_A_blockedShotAttempts,OnIce_A_shotAttempts,OnIce_A_goals,OnIce_A_rebounds,OnIce_A_reboundGoals,OnIce_A_lowDangerShots,OnIce_A_mediumDangerShots,OnIce_A_highDangerShots,OnIce_A_lowDangerxGoals,OnIce_A_mediumDangerxGoals,OnIce_A_highDangerxGoals,OnIce_A_lowDangerGoals,OnIce_A_mediumDangerGoals,OnIce_A_highDangerGoals,OnIce_A_scoreAdjustedShotsAttempts,OnIce_A_unblockedShotAttempts,OnIce_A_scoreAdjustedUnblockedShotAttempts,OnIce_A_xGoalsFromxReboundsOfShots,OnIce_A_xGoalsFromActualReboundsOfShots,OnIce_A_reboundxGoals,OnIce_A_xGoals_with_earned_rebounds,OnIce_A_xGoals_with_earned_rebounds_scoreAdjusted,OnIce_A_xGoals_with_earned_rebounds_scoreFlurryAdjusted,OffIce_F_xGoals,OffIce_A_xGoals,OffIce_F_shotAttempts,OffIce_A_shotAttempts,xGoalsForAfterShifts,xGoalsAgainstAfterShifts,corsiForAfterShifts,corsiAgainstAfterShifts,fenwickForAfterShifts,fenwickAgainstAfterShifts,birthDate,weight,height,nationality,shoots,avg_ice_time/shift (s),avg_shifts_per_game,age,age_group
827,8480018,2022,Nick Suzuki,MTL,C,all,82,100910.0,1978.0,50.28,0.45,0.43,0.51,0.44,0.51,0.43,136.0,177.26,18.81,12.77,38.45,6.03,92.23,69.72,17.56,18.65,17.41,19.0,21.0,186.0,52.0,71.0,309.0,61.0,21.0,17.0,5.0,31.0,3.0,67.0,99.0,165.0,217.0,15.0,30.0,699.0,89.0,50.0,71.0,175.0,42.0,21.0,5.74,5.26,7.8,7.0,5.0,9.0,304.13,238.0,234.29,44.0,2.93,3.37,4.59,17.16,16.95,16.66,1978.0,341.0,279.0,306.0,1052.0,263.0,273.0,336.0,1106.0,699.0,711.0,197423.0,30.0,50.0,26.0,62.0,877.02,82.03,78.56,81.67,78.22,899.0,313.0,407.0,1619.0,90.0,69.0,15.0,930.0,203.0,79.0,27.1,25.01,29.92,29.0,34.0,27.0,1594.56,1212.0,1196.04,14.64,14.08,14.17,82.5,82.04,79.91,857.05,98.29,94.83,99.43,95.93,883.0,297.0,389.0,1569.0,106.0,65.0,15.0,802.0,270.0,108.0,24.45,33.76,40.07,22.0,48.0,36.0,1602.15,1180.0,1203.86,13.34,14.78,15.02,96.6,97.7,95.74,134.52,180.46,2721.0,3420.0,0.0,0.0,0.0,0.0,0.0,0.0,1999-08-10 00:00:00,201.0,"5' 11""",CAN,R,51.0,24.0,23.0,Young Pro
1909,8480018,2023,Nick Suzuki,MTL,C,all,82,103790.0,1849.0,52.7,0.44,0.4,0.49,0.42,0.49,0.42,107.0,161.52,18.09,12.02,35.49,5.34,84.7,61.36,17.3,17.92,17.15,25.0,15.0,162.0,55.0,77.0,294.0,66.0,26.0,17.0,2.0,32.0,3.0,58.0,81.0,136.0,191.0,10.0,23.0,663.0,50.0,42.0,52.0,146.0,49.0,22.0,4.65,6.08,7.36,7.0,12.0,7.0,292.48,217.0,215.59,26.0,2.8,4.03,3.26,17.63,17.54,17.19,1849.0,347.0,292.0,321.0,889.0,221.0,235.0,325.0,1068.0,663.0,738.0,194627.0,23.0,48.0,25.0,56.0,859.95,86.16,81.43,85.65,80.97,851.0,330.0,428.0,1609.0,103.0,75.0,11.0,835.0,251.0,95.0,24.04,30.39,31.73,28.0,46.0,29.0,1594.69,1181.0,1173.06,14.71,14.62,14.67,86.18,85.86,83.09,882.13,111.47,105.82,112.68,107.02,890.0,318.0,436.0,1644.0,110.0,95.0,16.0,793.0,275.0,140.0,24.73,33.88,52.85,22.0,42.0,46.0,1675.17,1208.0,1225.97,14.34,22.53,22.53,103.28,104.24,101.37,133.35,203.05,2604.0,3563.0,0.0,0.0,0.0,0.0,0.0,0.0,1999-08-10 00:00:00,201.0,"5' 11""",CAN,R,56.0,23.0,24.0,Young Pro
2496,8480018,2024,Nick Suzuki,MTL,C,all,82,104619.0,1878.0,69.75,0.55,0.39,0.54,0.4,0.55,0.4,107.0,195.82,21.76,14.25,44.74,6.58,105.04,73.64,20.97,21.73,20.94,25.0,19.0,185.0,81.0,58.0,324.0,77.0,33.0,12.0,3.0,30.0,4.0,109.0,78.0,152.0,233.0,17.0,36.0,689.0,69.0,41.0,63.0,185.0,56.0,25.0,6.43,7.2,8.12,16.0,9.0,8.0,321.77,266.0,264.56,29.0,3.33,2.86,3.69,21.4,21.33,20.75,1878.0,385.0,246.0,348.0,899.0,213.0,260.0,316.0,1089.0,689.0,622.0,196168.0,36.0,40.0,20.0,48.0,1008.31,104.79,99.91,105.08,100.15,972.0,419.0,478.0,1869.0,111.0,86.0,11.0,1007.0,266.0,118.0,31.45,32.54,40.81,44.0,31.0,36.0,1863.29,1391.0,1389.05,16.94,19.83,19.83,101.91,101.95,99.05,846.9,87.41,83.4,87.83,83.79,814.0,346.0,434.0,1594.0,101.0,76.0,10.0,867.0,190.0,103.0,25.29,24.38,37.74,38.0,34.0,29.0,1605.4,1160.0,1168.01,12.18,17.34,17.34,82.25,82.44,80.71,124.32,193.35,2664.0,3942.0,0.0,0.0,0.0,0.0,0.0,0.0,1999-08-10 00:00:00,201.0,"5' 11""",CAN,R,56.0,23.0,25.0,Young Pro


## Building the preprocessing and processing pipeline for the recommender engine:

In [196]:
# handling the values that would interfere with the encoder that includes the 'inf' and NaN
#This is mainly for the ice_time/shift' column
MP_AS_stats.replace([np.inf, -np.inf, np.nan], 0.0, inplace=True)
MP_5on5_stats.replace([np.inf, -np.inf, np.nan], 0.0, inplace=True)
MP_4on5_stats.replace([np.inf, -np.inf, np.nan], 0.0, inplace=True)
MP_5on4_stats.replace([np.inf, -np.inf, np.nan], 0.0, inplace=True)
MP_OS_stats.replace([np.inf, -np.inf, np.nan], 0.0, inplace=True)

### The Pipeline:

In [197]:
# Column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), make_column_selector(dtype_include=['int64', 'float64'])),
        ('age_group', Pipeline([
            ('ordinal', OrdinalEncoder(categories=[['New Pro', 'Young Pro', 'Prime Age', 'Vet', 'Old Vet']])),
            ('scaler', StandardScaler())  # Scale the ordinal-encoded age_group
        ]), ['age_group']),
        ('position', Pipeline([
            ('onehot', OneHotEncoder()),  # Apply OneHotEncoder to 'position'
            ('scaler', StandardScaler(with_mean=False))  # Apply StandardScaler after OneHotEncoder
        ]), ['position'])
    ])

# My current Pipeline
MP_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('pca', PCA())
])

In [271]:
# Columns not to included in the processing:
col_not_processed = ['playerId', 'season' , 'name', 'team', 'situation', 'iceTimeRank', 'I_F_shifts',
                      'nationality' ,'birthDate', 'weight','height', 'shoots', 'age', 'gameScore', 'I_F_points',]

### The transformed dataframes for the recommender:

In [272]:
MP_AS_stats_transformed = MP_pipeline.fit_transform(MP_AS_stats.drop(columns=col_not_processed)) # All Situations
MP_5on5_stats_transformed = MP_pipeline.fit_transform(MP_5on5_stats.drop(columns=col_not_processed)) # 5on5
MP_4on5_stats_transformed = MP_pipeline.fit_transform(MP_4on5_stats.drop(columns=col_not_processed)) # 4on5
MP_5on4_stats_transformed = MP_pipeline.fit_transform(MP_5on4_stats.drop(columns=col_not_processed)) # 5on4
MP_OS_stats_transformed = MP_pipeline.fit_transform(MP_OS_stats.drop(columns=col_not_processed)) # Other Situations

## Running the recommmender engine:

In [201]:
def MP_recommend_skaters(original_gamestate_df, processed_gamestate_df, season, player_index, top_n=6):
    """
    Recommends skaters based on their stats using a preprocessed PCA features.

    Args:
    - original_gamestate_df (pd.DataFrame): DataFrame containing the original skater stats.
        Acceptable inputs for original_gamestate_df are: [MP_AS_stats, MP_5on5_stats, MP_4on5_stats, MP_5on4_stats, MP_OS_stats]
    - processed_gamestate_df (pd.DataFrame): PCA-transformed and scaled features of the skaters.
        Acceptable inputs for processed_gamestate_df are: 
        [MP_AS_processed_data, MP_5on5_processed_data, MP_4on5_processed_data, MP_5on4_processed_data, MP_OS_processed_data]
    - season (int): The target season for comparison.
        Acceptable inputs for season are: 2021, 2022, 2023 
    - player_index (int): Index of the player in the DataFrame to get recommendations for.
        player_index as accessed through the function: MP_get_index_all_gamestates() 
    - top_n (int): Number of top recommendations to return.

    Returns:
    - pd.DataFrame: DataFrame containing the top_n recommended skaters for the given player in the specified season.
    """

    # Filter DataFrame for the target season
    target_season_data = processed_gamestate_df[original_gamestate_df['season'] == season]

    # Compute pairwise distances between all skaters and those from the target season
    distances = pairwise_distances(processed_gamestate_df, target_season_data)

    # Find the indices of the closest skaters
    indices = np.argsort(distances, axis=1)[:, :top_n]

    # Retrieve the recommendations from the original stats DataFrame
    MP_recommended_skaters = original_gamestate_df[original_gamestate_df['season'] == season].iloc[indices[player_index], :]

    return MP_recommended_skaters

In [273]:
# AS
nick_suzuki_AS_sim_skaters = MP_recommend_skaters(original_gamestate_df=MP_AS_stats,
                                                  processed_gamestate_df=MP_AS_stats_transformed,
                                                  season=2024,
                                                  player_index=2496,
                                                  top_n=6)

#5on5
nick_suzuki_5on5_sim_skaters = MP_recommend_skaters(original_gamestate_df=MP_5on5_stats,
                                                  processed_gamestate_df=MP_5on5_stats_transformed,
                                                  season=2024,
                                                  player_index=2496,
                                                  top_n=6)

#4on5
nick_suzuki_4on5_sim_skaters = MP_recommend_skaters(original_gamestate_df=MP_4on5_stats,
                                                  processed_gamestate_df=MP_4on5_stats_transformed,
                                                  season=2024,
                                                  player_index=2496,
                                                  top_n=6)

#5on4
nick_suzuki_5on4_sim_skaters = MP_recommend_skaters(original_gamestate_df=MP_5on4_stats,
                                                  processed_gamestate_df=MP_5on4_stats_transformed,
                                                  season=2024,
                                                  player_index=2496,
                                                  top_n=6)

#OS
nick_suzuki_OS_sim_skaters = MP_recommend_skaters(original_gamestate_df=MP_OS_stats,
                                                  processed_gamestate_df=MP_OS_stats_transformed,
                                                  season=2024,
                                                  player_index=2496,
                                                  top_n=6)


In [274]:
nick_suzuki_AS_sim_skaters

Unnamed: 0,playerId,season,name,team,position,situation,games_played,icetime,shifts,gameScore,onIce_xGoalsPercentage,offIce_xGoalsPercentage,onIce_corsiPercentage,offIce_corsiPercentage,onIce_fenwickPercentage,offIce_fenwickPercentage,iceTimeRank,I_F_xOnGoal,I_F_xGoals,I_F_xRebounds,I_F_xFreeze,I_F_xPlayStopped,I_F_xPlayContinuedInZone,I_F_xPlayContinuedOutsideZone,I_F_flurryAdjustedxGoals,I_F_scoreVenueAdjustedxGoals,I_F_flurryScoreVenueAdjustedxGoals,I_F_primaryAssists,I_F_secondaryAssists,I_F_shotsOnGoal,I_F_missedShots,I_F_blockedShotAttempts,I_F_shotAttempts,I_F_points,I_F_goals,I_F_rebounds,I_F_reboundGoals,I_F_freeze,I_F_playStopped,I_F_playContinuedInZone,I_F_playContinuedOutsideZone,I_F_savedShotsOnGoal,I_F_savedUnblockedShotAttempts,penalties,I_F_penalityMinutes,I_F_faceOffsWon,I_F_hits,I_F_takeaways,I_F_giveaways,I_F_lowDangerShots,I_F_mediumDangerShots,I_F_highDangerShots,I_F_lowDangerxGoals,I_F_mediumDangerxGoals,I_F_highDangerxGoals,I_F_lowDangerGoals,I_F_mediumDangerGoals,I_F_highDangerGoals,I_F_scoreAdjustedShotsAttempts,I_F_unblockedShotAttempts,I_F_scoreAdjustedUnblockedShotAttempts,I_F_dZoneGiveaways,I_F_xGoalsFromxReboundsOfShots,I_F_xGoalsFromActualReboundsOfShots,I_F_reboundxGoals,I_F_xGoals_with_earned_rebounds,I_F_xGoals_with_earned_rebounds_scoreAdjusted,I_F_xGoals_with_earned_rebounds_scoreFlurryAdjusted,I_F_shifts,I_F_oZoneShiftStarts,I_F_dZoneShiftStarts,I_F_neutralZoneShiftStarts,I_F_flyShiftStarts,I_F_oZoneShiftEnds,I_F_dZoneShiftEnds,I_F_neutralZoneShiftEnds,I_F_flyShiftEnds,faceoffsWon,faceoffsLost,timeOnBench,penalityMinutes,penalityMinutesDrawn,penaltiesDrawn,shotsBlockedByPlayer,OnIce_F_xOnGoal,OnIce_F_xGoals,OnIce_F_flurryAdjustedxGoals,OnIce_F_scoreVenueAdjustedxGoals,OnIce_F_flurryScoreVenueAdjustedxGoals,OnIce_F_shotsOnGoal,OnIce_F_missedShots,OnIce_F_blockedShotAttempts,OnIce_F_shotAttempts,OnIce_F_goals,OnIce_F_rebounds,OnIce_F_reboundGoals,OnIce_F_lowDangerShots,OnIce_F_mediumDangerShots,OnIce_F_highDangerShots,OnIce_F_lowDangerxGoals,OnIce_F_mediumDangerxGoals,OnIce_F_highDangerxGoals,OnIce_F_lowDangerGoals,OnIce_F_mediumDangerGoals,OnIce_F_highDangerGoals,OnIce_F_scoreAdjustedShotsAttempts,OnIce_F_unblockedShotAttempts,OnIce_F_scoreAdjustedUnblockedShotAttempts,OnIce_F_xGoalsFromxReboundsOfShots,OnIce_F_xGoalsFromActualReboundsOfShots,OnIce_F_reboundxGoals,OnIce_F_xGoals_with_earned_rebounds,OnIce_F_xGoals_with_earned_rebounds_scoreAdjusted,OnIce_F_xGoals_with_earned_rebounds_scoreFlurryAdjusted,OnIce_A_xOnGoal,OnIce_A_xGoals,OnIce_A_flurryAdjustedxGoals,OnIce_A_scoreVenueAdjustedxGoals,OnIce_A_flurryScoreVenueAdjustedxGoals,OnIce_A_shotsOnGoal,OnIce_A_missedShots,OnIce_A_blockedShotAttempts,OnIce_A_shotAttempts,OnIce_A_goals,OnIce_A_rebounds,OnIce_A_reboundGoals,OnIce_A_lowDangerShots,OnIce_A_mediumDangerShots,OnIce_A_highDangerShots,OnIce_A_lowDangerxGoals,OnIce_A_mediumDangerxGoals,OnIce_A_highDangerxGoals,OnIce_A_lowDangerGoals,OnIce_A_mediumDangerGoals,OnIce_A_highDangerGoals,OnIce_A_scoreAdjustedShotsAttempts,OnIce_A_unblockedShotAttempts,OnIce_A_scoreAdjustedUnblockedShotAttempts,OnIce_A_xGoalsFromxReboundsOfShots,OnIce_A_xGoalsFromActualReboundsOfShots,OnIce_A_reboundxGoals,OnIce_A_xGoals_with_earned_rebounds,OnIce_A_xGoals_with_earned_rebounds_scoreAdjusted,OnIce_A_xGoals_with_earned_rebounds_scoreFlurryAdjusted,OffIce_F_xGoals,OffIce_A_xGoals,OffIce_F_shotAttempts,OffIce_A_shotAttempts,xGoalsForAfterShifts,xGoalsAgainstAfterShifts,corsiForAfterShifts,corsiAgainstAfterShifts,fenwickForAfterShifts,fenwickAgainstAfterShifts,birthDate,weight,height,nationality,shoots,avg_ice_time/shift (s),avg_shifts_per_game,age,age_group
2496,8480018,2024,Nick Suzuki,MTL,C,all,82,104619.0,1878.0,69.75,0.55,0.39,0.54,0.4,0.55,0.4,107.0,195.82,21.76,14.25,44.74,6.58,105.04,73.64,20.97,21.73,20.94,25.0,19.0,185.0,81.0,58.0,324.0,77.0,33.0,12.0,3.0,30.0,4.0,109.0,78.0,152.0,233.0,17.0,36.0,689.0,69.0,41.0,63.0,185.0,56.0,25.0,6.43,7.2,8.12,16.0,9.0,8.0,321.77,266.0,264.56,29.0,3.33,2.86,3.69,21.4,21.33,20.75,1878.0,385.0,246.0,348.0,899.0,213.0,260.0,316.0,1089.0,689.0,622.0,196168.0,36.0,40.0,20.0,48.0,1008.31,104.79,99.91,105.08,100.15,972.0,419.0,478.0,1869.0,111.0,86.0,11.0,1007.0,266.0,118.0,31.45,32.54,40.81,44.0,31.0,36.0,1863.29,1391.0,1389.05,16.94,19.83,19.83,101.91,101.95,99.05,846.9,87.41,83.4,87.83,83.79,814.0,346.0,434.0,1594.0,101.0,76.0,10.0,867.0,190.0,103.0,25.29,24.38,37.74,38.0,34.0,29.0,1605.4,1160.0,1168.01,12.18,17.34,17.34,82.25,82.44,80.71,124.32,193.35,2664.0,3942.0,0.0,0.0,0.0,0.0,0.0,0.0,1999-08-10 00:00:00,201.0,"5' 11""",CAN,R,56.0,23.0,25.0,Young Pro
2476,8477500,2024,Bo Horvat,NYI,C,all,81,95712.0,1916.0,75.16,0.53,0.44,0.55,0.42,0.56,0.43,173.0,247.22,23.4,17.31,55.6,8.23,137.72,96.74,22.43,23.62,22.64,19.0,16.0,248.0,91.0,91.0,430.0,68.0,33.0,24.0,3.0,54.0,0.0,112.0,116.0,215.0,306.0,13.0,29.0,765.0,61.0,35.0,44.0,246.0,71.0,22.0,7.63,8.39,7.39,14.0,13.0,6.0,434.2,339.0,343.2,11.0,3.9,5.64,3.74,23.56,23.76,23.08,1916.0,354.0,221.0,249.0,1092.0,252.0,270.0,322.0,1072.0,765.0,654.0,199674.0,29.0,29.0,13.0,42.0,1060.69,100.9,96.64,101.27,97.01,1034.0,422.0,496.0,1952.0,114.0,113.0,18.0,1108.0,231.0,117.0,30.25,29.08,41.58,46.0,39.0,29.0,1962.01,1456.0,1464.93,17.08,23.75,23.75,94.24,94.67,91.97,831.94,88.18,83.01,88.32,83.13,801.0,349.0,446.0,1596.0,90.0,87.0,11.0,869.0,174.0,107.0,24.93,21.66,41.58,30.0,22.0,38.0,1596.65,1150.0,1150.15,11.87,18.25,18.25,81.8,81.88,79.38,135.78,174.05,2797.0,3866.0,0.0,0.0,0.0,0.0,0.0,0.0,1995-04-05 00:00:00,215.0,"6' 0""",CAN,L,50.0,24.0,29.0,Prime Age
2619,8480023,2024,Robert Thomas,STL,C,all,82,103155.0,1893.0,74.78,0.52,0.43,0.53,0.43,0.52,0.44,131.0,176.92,22.59,13.32,40.63,6.33,95.98,64.15,20.43,22.52,20.37,35.0,25.0,170.0,73.0,83.0,326.0,86.0,26.0,19.0,3.0,25.0,2.0,92.0,79.0,144.0,217.0,19.0,38.0,876.0,16.0,70.0,54.0,162.0,55.0,26.0,5.38,7.14,10.08,10.0,4.0,12.0,324.98,243.0,242.18,13.0,3.19,6.63,4.93,20.86,20.73,19.58,1893.0,379.0,361.0,330.0,823.0,204.0,229.0,322.0,1138.0,876.0,773.0,195492.0,38.0,34.0,17.0,47.0,1005.81,114.23,107.0,114.31,107.09,958.0,428.0,485.0,1871.0,116.0,104.0,14.0,993.0,263.0,130.0,30.16,33.12,50.94,37.0,30.0,49.0,1866.69,1386.0,1386.68,17.14,24.6,24.21,107.15,107.16,103.05,925.59,105.51,101.52,105.38,101.43,854.0,416.0,395.0,1665.0,98.0,80.0,10.0,891.0,263.0,116.0,26.08,32.79,46.63,23.0,32.0,43.0,1667.85,1270.0,1272.03,14.52,15.56,15.56,104.46,104.61,102.01,124.69,167.92,2680.0,3485.0,0.0,0.0,0.0,0.0,0.0,0.0,1999-07-02 00:00:00,188.0,"6' 0""",CAN,R,54.0,23.0,25.0,Young Pro
2550,8477497,2024,Sean Monahan,WPG,C,all,83,90109.0,1803.0,55.21,0.52,0.44,0.52,0.44,0.53,0.45,309.0,174.78,24.8,14.41,36.23,5.79,88.27,66.5,23.72,24.81,23.74,21.0,12.0,171.0,65.0,61.0,297.0,59.0,26.0,16.0,4.0,32.0,0.0,70.0,92.0,145.0,210.0,5.0,12.0,698.0,29.0,43.0,33.0,138.0,62.0,36.0,4.65,7.86,12.29,7.0,7.0,12.0,297.33,236.0,236.53,20.0,3.4,2.72,6.84,21.36,21.4,20.88,1803.0,339.0,320.0,245.0,899.0,210.0,222.0,306.0,1065.0,698.0,574.0,212705.0,12.0,16.0,8.0,44.0,868.07,93.09,88.49,93.4,88.79,838.0,355.0,435.0,1628.0,106.0,81.0,16.0,852.0,231.0,110.0,24.51,29.18,39.4,30.0,28.0,48.0,1627.65,1193.0,1193.61,15.22,18.74,18.74,89.56,89.8,86.82,774.66,85.04,80.4,85.21,80.57,743.0,333.0,413.0,1489.0,79.0,90.0,7.0,768.0,207.0,101.0,21.55,25.81,37.68,23.0,27.0,29.0,1496.33,1076.0,1080.03,11.88,21.12,21.12,75.79,76.03,74.08,148.35,189.19,3068.0,3855.0,0.0,0.0,0.0,0.0,0.0,0.0,1994-10-12 00:00:00,200.0,"6' 2""",CAN,L,50.0,22.0,30.0,Prime Age
2798,8477946,2024,Dylan Larkin,DET,C,all,68,82304.0,1622.0,74.57,0.53,0.44,0.55,0.43,0.54,0.44,121.0,229.97,27.95,16.25,48.75,7.44,122.99,89.61,26.78,27.93,26.77,26.0,10.0,221.0,92.0,112.0,425.0,69.0,33.0,21.0,3.0,34.0,0.0,106.0,119.0,188.0,280.0,13.0,29.0,716.0,44.0,48.0,44.0,224.0,51.0,38.0,6.69,6.17,15.1,10.0,9.0,14.0,425.47,313.0,312.37,17.0,3.77,3.79,5.65,26.08,26.12,25.19,1622.0,248.0,248.0,304.0,822.0,216.0,224.0,274.0,908.0,716.0,577.0,166074.0,29.0,63.0,30.0,34.0,843.43,87.31,83.29,87.35,83.33,815.0,347.0,426.0,1588.0,106.0,71.0,13.0,863.0,201.0,98.0,25.71,23.81,37.79,39.0,31.0,36.0,1586.86,1162.0,1159.73,13.87,15.12,15.12,86.06,86.17,83.23,719.66,76.24,71.35,76.33,71.42,693.0,298.0,331.0,1322.0,82.0,73.0,9.0,722.0,182.0,87.0,20.98,22.57,32.69,31.0,30.0,21.0,1323.44,991.0,991.91,10.89,16.39,16.39,70.73,70.78,67.71,111.78,141.24,2337.0,3042.0,0.0,0.0,0.0,0.0,0.0,0.0,1996-07-30 00:00:00,198.0,"6' 1""",USA,L,51.0,24.0,28.0,Prime Age
2315,8476459,2024,Mika Zibanejad,NYR,C,all,81,95955.0,1974.0,73.21,0.59,0.47,0.57,0.48,0.57,0.48,217.0,240.49,25.94,16.65,54.59,8.41,140.0,94.41,24.36,25.97,24.38,30.0,16.0,221.0,119.0,74.0,414.0,72.0,26.0,19.0,2.0,36.0,0.0,138.0,121.0,195.0,314.0,10.0,20.0,548.0,49.0,48.0,46.0,253.0,59.0,28.0,7.8,6.85,11.29,10.0,7.0,9.0,414.69,340.0,341.23,9.0,3.91,3.06,4.18,25.67,25.71,24.39,1974.0,387.0,314.0,335.0,938.0,229.0,228.0,315.0,1202.0,548.0,564.0,198923.0,20.0,48.0,23.0,55.0,1020.7,119.11,110.31,119.03,110.25,972.0,458.0,497.0,1927.0,128.0,98.0,14.0,1026.0,254.0,150.0,29.49,31.43,58.19,35.0,39.0,54.0,1923.65,1430.0,1429.99,17.7,21.18,21.15,115.66,115.55,108.67,774.4,83.07,80.17,83.12,80.21,744.0,318.0,388.0,1450.0,67.0,79.0,7.0,782.0,188.0,92.0,22.36,23.67,37.05,17.0,26.0,24.0,1443.64,1062.0,1059.59,11.94,14.04,14.04,80.98,81.02,79.44,144.15,163.82,3149.0,3414.0,0.0,0.0,0.0,0.0,0.0,0.0,1993-04-18 00:00:00,213.0,"6' 2""",SWE,R,49.0,24.0,31.0,Vet


In [224]:
# print("Nick Suzuki top 5 AS similar players in the 2024 season are:", nick_suzuki_AS_sim_skaters['name'])
# print("Nick Suzuki top 5 5on5 similar players in the 2024 season are:", nick_suzuki_5on5_sim_skaters['name'])
# print("Nick Suzuki top 5 5on4 similar players in the 2024 season are:", nick_suzuki_5on4_sim_skaters['name'])
# print("Nick Suzuki top 5 4on5 similar players in the 2024 season are:", nick_suzuki_4on5_sim_skaters['name'])
# print("Nick Suzuki top 5 OS similar players in the 2024 season are:", nick_suzuki_OS_sim_skaters['name'])

In [211]:
MP_AS_stats.loc[MP_AS_stats['name'] == 'Sidney Crosby']

Unnamed: 0,playerId,season,name,team,position,situation,games_played,icetime,shifts,gameScore,onIce_xGoalsPercentage,offIce_xGoalsPercentage,onIce_corsiPercentage,offIce_corsiPercentage,onIce_fenwickPercentage,offIce_fenwickPercentage,iceTimeRank,I_F_xOnGoal,I_F_xGoals,I_F_xRebounds,I_F_xFreeze,I_F_xPlayStopped,I_F_xPlayContinuedInZone,I_F_xPlayContinuedOutsideZone,I_F_flurryAdjustedxGoals,I_F_scoreVenueAdjustedxGoals,I_F_flurryScoreVenueAdjustedxGoals,I_F_primaryAssists,I_F_secondaryAssists,I_F_shotsOnGoal,I_F_missedShots,I_F_blockedShotAttempts,I_F_shotAttempts,I_F_points,I_F_goals,I_F_rebounds,I_F_reboundGoals,I_F_freeze,I_F_playStopped,I_F_playContinuedInZone,I_F_playContinuedOutsideZone,I_F_savedShotsOnGoal,I_F_savedUnblockedShotAttempts,penalties,I_F_penalityMinutes,I_F_faceOffsWon,I_F_hits,I_F_takeaways,I_F_giveaways,I_F_lowDangerShots,I_F_mediumDangerShots,I_F_highDangerShots,I_F_lowDangerxGoals,I_F_mediumDangerxGoals,I_F_highDangerxGoals,I_F_lowDangerGoals,I_F_mediumDangerGoals,I_F_highDangerGoals,I_F_scoreAdjustedShotsAttempts,I_F_unblockedShotAttempts,I_F_scoreAdjustedUnblockedShotAttempts,I_F_dZoneGiveaways,I_F_xGoalsFromxReboundsOfShots,I_F_xGoalsFromActualReboundsOfShots,I_F_reboundxGoals,I_F_xGoals_with_earned_rebounds,I_F_xGoals_with_earned_rebounds_scoreAdjusted,I_F_xGoals_with_earned_rebounds_scoreFlurryAdjusted,I_F_shifts,I_F_oZoneShiftStarts,I_F_dZoneShiftStarts,I_F_neutralZoneShiftStarts,I_F_flyShiftStarts,I_F_oZoneShiftEnds,I_F_dZoneShiftEnds,I_F_neutralZoneShiftEnds,I_F_flyShiftEnds,faceoffsWon,faceoffsLost,timeOnBench,penalityMinutes,penalityMinutesDrawn,penaltiesDrawn,shotsBlockedByPlayer,OnIce_F_xOnGoal,OnIce_F_xGoals,OnIce_F_flurryAdjustedxGoals,OnIce_F_scoreVenueAdjustedxGoals,OnIce_F_flurryScoreVenueAdjustedxGoals,OnIce_F_shotsOnGoal,OnIce_F_missedShots,OnIce_F_blockedShotAttempts,OnIce_F_shotAttempts,OnIce_F_goals,OnIce_F_rebounds,OnIce_F_reboundGoals,OnIce_F_lowDangerShots,OnIce_F_mediumDangerShots,OnIce_F_highDangerShots,OnIce_F_lowDangerxGoals,OnIce_F_mediumDangerxGoals,OnIce_F_highDangerxGoals,OnIce_F_lowDangerGoals,OnIce_F_mediumDangerGoals,OnIce_F_highDangerGoals,OnIce_F_scoreAdjustedShotsAttempts,OnIce_F_unblockedShotAttempts,OnIce_F_scoreAdjustedUnblockedShotAttempts,OnIce_F_xGoalsFromxReboundsOfShots,OnIce_F_xGoalsFromActualReboundsOfShots,OnIce_F_reboundxGoals,OnIce_F_xGoals_with_earned_rebounds,OnIce_F_xGoals_with_earned_rebounds_scoreAdjusted,OnIce_F_xGoals_with_earned_rebounds_scoreFlurryAdjusted,OnIce_A_xOnGoal,OnIce_A_xGoals,OnIce_A_flurryAdjustedxGoals,OnIce_A_scoreVenueAdjustedxGoals,OnIce_A_flurryScoreVenueAdjustedxGoals,OnIce_A_shotsOnGoal,OnIce_A_missedShots,OnIce_A_blockedShotAttempts,OnIce_A_shotAttempts,OnIce_A_goals,OnIce_A_rebounds,OnIce_A_reboundGoals,OnIce_A_lowDangerShots,OnIce_A_mediumDangerShots,OnIce_A_highDangerShots,OnIce_A_lowDangerxGoals,OnIce_A_mediumDangerxGoals,OnIce_A_highDangerxGoals,OnIce_A_lowDangerGoals,OnIce_A_mediumDangerGoals,OnIce_A_highDangerGoals,OnIce_A_scoreAdjustedShotsAttempts,OnIce_A_unblockedShotAttempts,OnIce_A_scoreAdjustedUnblockedShotAttempts,OnIce_A_xGoalsFromxReboundsOfShots,OnIce_A_xGoalsFromActualReboundsOfShots,OnIce_A_reboundxGoals,OnIce_A_xGoals_with_earned_rebounds,OnIce_A_xGoals_with_earned_rebounds_scoreAdjusted,OnIce_A_xGoals_with_earned_rebounds_scoreFlurryAdjusted,OffIce_F_xGoals,OffIce_A_xGoals,OffIce_F_shotAttempts,OffIce_A_shotAttempts,xGoalsForAfterShifts,xGoalsAgainstAfterShifts,corsiForAfterShifts,corsiAgainstAfterShifts,fenwickForAfterShifts,fenwickAgainstAfterShifts,birthDate,weight,height,nationality,shoots,avg_ice_time/shift (s),avg_shifts_per_game,age,age_group
935,8471675,2022,Sidney Crosby,PIT,C,all,69,82694.0,1691.0,86.95,0.6,0.48,0.6,0.47,0.59,0.49,169.0,203.16,26.02,15.46,40.49,6.49,107.36,76.18,24.59,26.44,24.99,31.0,22.0,208.0,64.0,62.0,334.0,84.0,31.0,20.0,2.0,31.0,10.0,71.0,109.0,177.0,241.0,17.0,32.0,760.0,50.0,34.0,54.0,181.0,60.0,31.0,6.71,7.01,12.3,11.0,8.0,12.0,340.16,272.0,276.43,12.0,3.54,4.77,4.62,24.94,25.27,24.59,1691.0,389.0,116.0,335.0,851.0,137.0,224.0,274.0,1056.0,760.0,690.0,168653.0,32.0,30.0,15.0,37.0,952.93,110.01,103.48,110.81,104.25,980.0,334.0,394.0,1708.0,117.0,88.0,11.0,904.0,301.0,109.0,29.06,36.19,44.76,32.0,49.0,36.0,1726.75,1314.0,1325.51,16.67,18.57,19.75,106.93,107.74,104.09,671.13,72.13,70.34,71.95,70.18,696.0,214.0,248.0,1158.0,58.0,61.0,4.0,646.0,182.0,82.0,19.26,23.09,29.79,18.0,18.0,22.0,1149.51,910.0,904.46,9.42,11.84,11.69,69.87,69.8,69.02,119.26,129.49,2391.0,2650.0,0.0,0.0,0.0,0.0,0.0,0.0,1987-08-07 00:00:00,200.0,"5' 11""",CAN,L,49.0,25.0,35.0,Vet
1779,8471675,2023,Sidney Crosby,PIT,C,all,82,99106.0,1945.0,100.72,0.6,0.48,0.61,0.47,0.6,0.47,181.0,260.63,33.31,19.79,58.86,9.2,132.56,94.29,30.51,33.64,30.82,36.0,24.0,255.0,93.0,95.0,443.0,93.0,33.0,35.0,5.0,53.0,6.0,99.0,122.0,222.0,315.0,16.0,32.0,978.0,71.0,59.0,67.0,205.0,106.0,37.0,6.29,14.03,12.99,12.0,10.0,11.0,446.0,348.0,350.9,14.0,4.54,6.44,10.65,27.2,27.34,26.14,1945.0,446.0,147.0,431.0,921.0,156.0,256.0,316.0,1217.0,978.0,868.0,198867.0,32.0,44.0,21.0,47.0,1153.56,137.5,128.48,137.99,128.94,1129.0,471.0,590.0,2190.0,131.0,144.0,22.0,1045.0,401.0,154.0,31.4,49.76,56.33,33.0,47.0,51.0,2198.31,1600.0,1606.5,21.16,30.25,30.25,128.41,128.71,123.54,786.63,89.98,86.76,90.25,87.02,798.0,288.0,340.0,1426.0,77.0,63.0,6.0,748.0,235.0,103.0,22.74,29.65,37.59,24.0,22.0,31.0,1429.3,1086.0,1087.98,11.32,12.98,12.98,88.32,88.54,86.39,162.48,177.5,3016.0,3432.0,0.0,0.0,0.0,0.0,0.0,0.0,1987-08-07 00:00:00,200.0,"5' 11""",CAN,L,51.0,24.0,36.0,Old Vet
1968,8471675,2024,Sidney Crosby,PIT,C,all,82,98827.0,1996.0,107.43,0.62,0.46,0.61,0.46,0.59,0.46,197.0,284.29,40.47,21.41,64.14,10.7,155.11,106.2,38.1,40.52,38.16,34.0,18.0,278.0,120.0,92.0,490.0,94.0,42.0,33.0,4.0,32.0,1.0,154.0,136.0,236.0,356.0,20.0,40.0,1090.0,101.0,73.0,57.0,255.0,85.0,58.0,8.16,10.25,22.04,15.0,7.0,20.0,490.85,398.0,399.29,17.0,4.97,5.96,7.36,38.07,38.19,36.64,1996.0,439.0,159.0,427.0,971.0,164.0,268.0,355.0,1209.0,1090.0,782.0,199462.0,40.0,60.0,29.0,37.0,1222.48,146.41,134.82,146.45,134.9,1175.0,510.0,656.0,2341.0,134.0,159.0,16.0,1169.0,333.0,183.0,35.06,41.58,69.76,52.0,32.0,50.0,2343.87,1685.0,1687.42,21.61,31.96,32.23,135.78,135.94,128.84,840.52,90.17,86.68,90.49,87.02,790.0,362.0,338.0,1490.0,97.0,87.0,17.0,833.0,208.0,111.0,24.87,26.04,39.26,30.0,37.0,30.0,1493.53,1152.0,1154.81,11.89,18.62,18.62,83.43,83.73,81.94,139.69,165.8,3020.0,3553.0,0.0,0.0,0.0,0.0,0.0,0.0,1987-08-07 00:00:00,200.0,"5' 11""",CAN,L,50.0,24.0,37.0,Old Vet


# Creating Player Rating Category:

### I want to find out which features are most important to generating gameScore so I can build my own gameScore-style column to demonstrate easily the player comparison and make it easily explainable to stakeholders

## Random Forest Regressor:

In [284]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [285]:
# Column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), make_column_selector(dtype_include=['int64', 'float64'])),
        ('age_group', Pipeline([
            ('ordinal', OrdinalEncoder(categories=[['New Pro', 'Young Pro', 'Prime Age', 'Vet', 'Old Vet']])),
            ('scaler', StandardScaler())  # Scale the ordinal-encoded age_group
        ]), ['age_group']),
        ('position', Pipeline([
            ('onehot', OneHotEncoder()),  # Apply OneHotEncoder to 'position'
            ('scaler', StandardScaler(with_mean=False))  # Apply StandardScaler after OneHotEncoder
        ]), ['position'])
    ])

# My current Pipeline
MP_RFR_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))  # Random Forest Regressor
])

In [286]:
col_not_processed = ['playerId', 'season' , 'name', 'team', 'situation', 'iceTimeRank', 'I_F_shifts',
                      'nationality' ,'birthDate', 'weight','height', 'shoots', 'age' ,'gameScore'] 
# gameScore is the target variable

col_not_processed_without_points = ['playerId', 'season' , 'name', 'team', 'situation', 'iceTimeRank', 'I_F_shifts',
                      'nationality' ,'birthDate', 'weight','height', 'shoots', 'age' , 'I_F_points','gameScore'] 

## All Situations:

### AS with I_F_points

In [287]:
# Drop the target column to create the feature matrix X
MP_AS_X = MP_AS_stats.drop(columns=col_not_processed) 
MP_AS_y = MP_AS_stats['gameScore']  # Target variable

# Split the data into training and testing sets
MP_AS_X_train, MP_AS_X_test, MP_AS_y_train, MP_AS_y_test = train_test_split(MP_AS_X, MP_AS_y, test_size=0.2, random_state=42)

In [288]:
# Fit the pipeline to your training data
AS_model = MP_RFR_pipeline.fit(MP_AS_X_train, MP_AS_y_train)

# Access the trained Random Forest model inside the pipeline
rf_model = AS_model.named_steps['regressor']

# Access the preprocessor step to get the transformed feature names
preprocessor = AS_model.named_steps['preprocessor']

# Get feature names after the transformation
def get_feature_names(column_transformer):
    output_features = []
    for name, transformer, features in column_transformer.transformers_:
        if transformer == 'drop' or transformer is None:
            continue
        if isinstance(transformer, Pipeline):
            transformer = transformer.named_steps['onehot'] if 'onehot' in transformer.named_steps else transformer
        try:
            if hasattr(transformer, 'get_feature_names_out'):
                feature_names = transformer.get_feature_names_out(features)
                output_features.extend(feature_names)
            else:
                output_features.extend(features)
        except NotFittedError:
            output_features.extend(features)
    return output_features

# Get the transformed feature names
transformed_feature_names = get_feature_names(preprocessor)

# Get feature importances from the Random Forest model
feature_importances = pd.Series(rf_model.feature_importances_, index=transformed_feature_names)
feature_importances.sort_values(ascending=False, inplace=True)

# Display the most important features
print(feature_importances.head(10))

I_F_points                                    0.904845
onIce_fenwickPercentage                       0.013960
onIce_corsiPercentage                         0.009082
OnIce_F_scoreAdjustedUnblockedShotAttempts    0.004477
I_F_scoreAdjustedShotsAttempts                0.003807
offIce_xGoalsPercentage                       0.003801
offIce_corsiPercentage                        0.003559
offIce_fenwickPercentage                      0.003338
onIce_xGoalsPercentage                        0.003032
I_F_oZoneShiftEnds                            0.001937
dtype: float64


In [289]:
# Drop the target column to create the feature matrix X
MP_AS_X = MP_AS_stats.drop(columns=col_not_processed_without_points) 
MP_AS_y = MP_AS_stats['gameScore']  # Target variable

# Split the data into training and testing sets
MP_AS_X_train, MP_AS_X_test, MP_AS_y_train, MP_AS_y_test = train_test_split(MP_AS_X, MP_AS_y, test_size=0.2, random_state=42)

# Fit the pipeline to your training data
AS_model = MP_RFR_pipeline.fit(MP_AS_X_train, MP_AS_y_train)

# Access the trained Random Forest model inside the pipeline
rf_model = AS_model.named_steps['regressor']

# Access the preprocessor step to get the transformed feature names
preprocessor = AS_model.named_steps['preprocessor']

# Get the transformed feature names
transformed_feature_names = get_feature_names(preprocessor)

# Get feature importances from the Random Forest model
feature_importances = pd.Series(rf_model.feature_importances_, index=transformed_feature_names)
feature_importances.sort_values(ascending=False, inplace=True)

# Display the most important features
print(feature_importances.head(10))

I_F_scoreAdjustedUnblockedShotAttempts    0.418901
OnIce_F_goals                             0.101088
I_F_xOnGoal                               0.081860
OnIce_F_xGoalsFromxReboundsOfShots        0.059631
I_F_lowDangerxGoals                       0.052338
I_F_xPlayContinuedInZone                  0.051980
I_F_xPlayStopped                          0.038196
I_F_scoreAdjustedShotsAttempts            0.035224
I_F_shotsOnGoal                           0.019388
onIce_fenwickPercentage                   0.011265
dtype: float64


### AS Model -  Comparing the accuracy of the model with and without the I_F_points column:

In [291]:
# Re assign the variable so that the comparison doesn't throw an error
MP_AS_X = MP_AS_stats.drop(columns=col_not_processed) 
MP_AS_y = MP_AS_stats['gameScore']  # Target variable

# Split the data into training and testing sets
MP_AS_X_train, MP_AS_X_test, MP_AS_y_train, MP_AS_y_test = train_test_split(MP_AS_X, MP_AS_y, test_size=0.2, random_state=42)

# Step 1: Train and evaluate with I_F_points included
# Assume your original training set includes I_F_points
AS_model_with_points = MP_RFR_pipeline.fit(MP_AS_X_train, MP_AS_y_train)
predictions_with_points = AS_model_with_points.predict(MP_AS_X_test)

# Evaluate the model
mse_with_points = mean_squared_error(MP_AS_y_test, predictions_with_points)
r2_with_points = r2_score(MP_AS_y_test, predictions_with_points)

print("Model with I_F_points:")
print(f"Mean Squared Error: {mse_with_points}")
print(f"R2 Score: {r2_with_points}")

# Step 2: Train and evaluate with I_F_points removed
# Remove the I_F_points column from your training and testing sets
MP_AS_X_train_no_points = MP_AS_X_train.drop(columns=['I_F_points'])
MP_AS_X_test_no_points = MP_AS_X_test.drop(columns=['I_F_points'])

AS_model_without_points = MP_RFR_pipeline.fit(MP_AS_X_train_no_points, MP_AS_y_train)
predictions_without_points = AS_model_without_points.predict(MP_AS_X_test_no_points)

# Evaluate the model
mse_without_points = mean_squared_error(MP_AS_y_test, predictions_without_points)
r2_without_points = r2_score(MP_AS_y_test, predictions_without_points)

print("Model without I_F_points:")
print(f"Mean Squared Error: {mse_without_points}")
print(f"R2 Score: {r2_without_points}/n")

# Step 3: Compare the two models
print("Comparison of Model Performance:")
print(f"Difference in MSE: {mse_without_points - mse_with_points}")
print(f"Difference in R2 Score: {r2_without_points - r2_with_points}")

Model with I_F_points:
Mean Squared Error: 16.62153429479167
R2 Score: 0.9723422374579082
Model without I_F_points:
Mean Squared Error: 22.02977477850695
R2 Score: 0.9633430783901441/n
Comparison of Model Performance:
Difference in MSE: 5.408240483715282
Difference in R2 Score: -0.008999159067764051


## 5on5: 

### With I_F_points:

In [255]:

# Drop the target column to create the feature matrix X
MP_5on5_X = MP_5on5_stats.drop(columns=col_not_processed) 
MP_5on5_y = MP_5on5_stats['gameScore']  # Target variable

# Split the data into training and testing sets
MP_5on5_X_train, MP_5on5_X_test, MP_5on5_y_train, MP_5on5_y_test = train_test_split(MP_5on5_X, MP_5on5_y, test_size=0.2, random_state=42)

In [256]:
# Fit the pipeline to your training data
MP_5on5_model = MP_RFR_pipeline.fit(MP_5on5_X_train, MP_5on5_y_train)

# Access the trained Random Forest model inside the pipeline
rf_model = MP_5on5_model.named_steps['regressor']

# Access the preprocessor step to get the transformed feature names
preprocessor = MP_5on5_model.named_steps['preprocessor']

# Get the transformed feature names
transformed_feature_names = get_feature_names(preprocessor)

# Get feature importances from the Random Forest model
feature_importances = pd.Series(rf_model.feature_importances_, index=transformed_feature_names)
feature_importances.sort_values(ascending=False, inplace=True)

# Display the most important features
print(feature_importances.head(10))

I_F_points                                    0.839199
onIce_corsiPercentage                         0.029955
onIce_fenwickPercentage                       0.015417
OnIce_F_scoreAdjustedUnblockedShotAttempts    0.007215
OnIce_F_lowDangerxGoals                       0.007126
I_F_oZoneShiftStarts                          0.005628
OnIce_F_scoreAdjustedShotsAttempts            0.004592
I_F_scoreAdjustedShotsAttempts                0.004561
OnIce_F_xGoalsFromxReboundsOfShots            0.004160
I_F_lowDangerxGoals                           0.003420
dtype: float64


### Without I_F_points:

In [261]:
# Drop the target column to create the feature matrix X
MP_5on5_X = MP_5on5_stats.drop(columns=col_not_processed_without_points) 
MP_5on5_y = MP_5on5_stats['gameScore']  # Target variable

# Split the data into training and testing sets
MP_5on5_X_train, MP_5on5_X_test, MP_5on5_y_train, MP_5on5_y_test = train_test_split(MP_5on5_X, MP_5on5_y, test_size=0.2, random_state=42)

In [258]:
# Fit the pipeline to your training data
MP_5on5_model = MP_RFR_pipeline.fit(MP_5on5_X_train, MP_5on5_y_train)

# Access the trained Random Forest model inside the pipeline
rf_model = MP_5on5_model.named_steps['regressor']

# Access the preprocessor step to get the transformed feature names
preprocessor = MP_5on5_model.named_steps['preprocessor']

# Get the transformed feature names
transformed_feature_names = get_feature_names(preprocessor)

# Get feature importances from the Random Forest model
feature_importances = pd.Series(rf_model.feature_importances_, index=transformed_feature_names)
feature_importances.sort_values(ascending=False, inplace=True)

# Display the most important features
print(feature_importances.head(10))

I_F_scoreAdjustedUnblockedShotAttempts    0.371925
I_F_lowDangerxGoals                       0.168417
I_F_oZoneShiftStarts                      0.085230
I_F_xPlayContinuedInZone                  0.066758
I_F_primaryAssists                        0.031127
OnIce_F_goals                             0.030560
I_F_shotsOnGoal                           0.027182
I_F_xOnGoal                               0.024491
I_F_xPlayContinuedOutsideZone             0.021077
onIce_corsiPercentage                     0.019968
dtype: float64


### Comparing the 5on5 models with and without I_F_points

In [292]:
# Re assign the variable so that the comparison doesn't throw an error
MP_5on5_X = MP_5on5_stats.drop(columns=col_not_processed) 
MP_5on5_y = MP_5on5_stats['gameScore']  # Target variable

# Split the data into training and testing sets
MP_5on5_X_train, MP_5on5_X_test, MP_5on5_y_train, MP_5on5_y_test = train_test_split(MP_5on5_X, MP_5on5_y, test_size=0.2, random_state=42)

# Step 1: Train and evaluate with I_F_points included
# Assume your original training set includes I_F_points
MP_5on5_model_with_points = MP_RFR_pipeline.fit(MP_5on5_X_train, MP_5on5_y_train)
MP_5on5_predictions_with_points = MP_5on5_model_with_points.predict(MP_5on5_X_test)

# Evaluate the model
mse_with_points = mean_squared_error(MP_5on5_y_test, MP_5on5_predictions_with_points)
r2_with_points = r2_score(MP_5on5_y_test, MP_5on5_predictions_with_points)

print("Model with I_F_points:")
print(f"Mean Squared Error: {mse_with_points}")
print(f"R2 Score: {r2_with_points}")

# Step 2: Train and evaluate with I_F_points removed
# Remove the I_F_points column from your training and testing sets
MP_5on5_X_train_no_points = MP_5on5_X_train.drop(columns=['I_F_points'])
MP_5on5_X_test_no_points = MP_5on5_X_test.drop(columns=['I_F_points'])

MP_5on5_model_without_points = MP_RFR_pipeline.fit(MP_5on5_X_train_no_points, MP_5on5_y_train)
MP_5on5_predictions_without_points = MP_5on5_model_without_points.predict(MP_5on5_X_test_no_points)

# Evaluate the model
mse_without_points = mean_squared_error(MP_5on5_y_test, MP_5on5_predictions_without_points)
r2_without_points = r2_score(MP_5on5_y_test, MP_5on5_predictions_without_points)

print("Model without I_F_points:")
print(f"Mean Squared Error: {mse_without_points}")
print(f"R2 Score: {r2_without_points}/n")

# Step 3: Compare the two models
print("Comparison of Model Performance:")
print(f"Difference in MSE: {mse_without_points - mse_with_points}")
print(f"Difference in R2 Score: {r2_without_points - r2_with_points}")

Model with I_F_points:
Mean Squared Error: 28.69208522243055
R2 Score: 0.9522571824089604
Model without I_F_points:
Mean Squared Error: 32.00765860326388
R2 Score: 0.9467401621609136/n
Comparison of Model Performance:
Difference in MSE: 3.315573380833328
Difference in R2 Score: -0.005517020248046789


## 4on5:

### 4on5 with I_F_points:

In [263]:
# Drop the target column to create the feature matrix X
MP_4on5_X = MP_4on5_stats.drop(columns=col_not_processed) 
MP_4on5_y = MP_4on5_stats['gameScore']  # Target variable

# Split the data into training and testing sets
MP_4on5_X_train, MP_4on5_X_test, MP_4on5_y_train, MP_4on5_y_test = train_test_split(MP_4on5_X, MP_4on5_y, test_size=0.2, random_state=42)

In [264]:

MP_4on5_model = MP_RFR_pipeline.fit(MP_4on5_X_train, MP_4on5_y_train)

rf_model = MP_4on5_model.named_steps['regressor']

# Access the preprocessor step to get the transformed feature names
preprocessor = MP_4on5_model.named_steps['preprocessor']

# Get the transformed feature names
transformed_feature_names = get_feature_names(preprocessor)

# Get feature importances from the Random Forest model
feature_importances = pd.Series(rf_model.feature_importances_, index=transformed_feature_names)
feature_importances.sort_values(ascending=False, inplace=True)

print(feature_importances.head(10))

timeOnBench                   0.529996
faceoffsLost                  0.034284
penalityMinutesDrawn          0.033011
games_played                  0.029911
penaltiesDrawn                0.025901
shotsBlockedByPlayer          0.025319
OnIce_F_mediumDangerxGoals    0.020186
I_F_takeaways                 0.012557
I_F_xFreeze                   0.011759
OffIce_F_xGoals               0.008115
dtype: float64


### 4on5 without I_F_points:

In [282]:
# Drop the target column to create the feature matrix X
MP_4on5_X = MP_4on5_stats.drop(columns=col_not_processed_without_points) 
MP_4on5_y = MP_4on5_stats['gameScore']  # Target variable

# Split the data into training and testing sets
MP_4on5_X_train, MP_4on5_X_test, MP_4on5_y_train, MP_4on5_y_test = train_test_split(MP_4on5_X, MP_4on5_y, test_size=0.2, random_state=42)

In [283]:

MP_4on5_model = MP_RFR_pipeline.fit(MP_4on5_X_train, MP_4on5_y_train)

rf_model = MP_4on5_model.named_steps['regressor']

# Access the preprocessor step to get the transformed feature names
preprocessor = MP_4on5_model.named_steps['preprocessor']

# Get the transformed feature names
transformed_feature_names = get_feature_names(preprocessor)

# Get feature importances from the Random Forest model
feature_importances = pd.Series(rf_model.feature_importances_, index=transformed_feature_names)
feature_importances.sort_values(ascending=False, inplace=True)

print(feature_importances.head(10))

timeOnBench                   0.530298
faceoffsLost                  0.034620
penaltiesDrawn                0.030993
games_played                  0.029959
penalityMinutesDrawn          0.028477
shotsBlockedByPlayer          0.025171
OnIce_F_mediumDangerxGoals    0.020666
I_F_takeaways                 0.012356
I_F_xFreeze                   0.011854
OffIce_F_xGoals               0.008586
dtype: float64


### Comparing the 4on5 models with and without I_F_points

In [293]:
# Re assign the variable so that the comparison doesn't throw an error
MP_4on5_X = MP_4on5_stats.drop(columns=col_not_processed) 
MP_4on5_y = MP_4on5_stats['gameScore']  # Target variable

# Split the data into training and testing sets
MP_4on5_X_train, MP_4on5_X_test, MP_4on5_y_train, MP_4on5_y_test = train_test_split(MP_4on5_X, MP_4on5_y, test_size=0.2, random_state=42)

# Step 1: Train and evaluate with I_F_points included
# Assume your original training set includes I_F_points
MP_4on5_model_with_points = MP_RFR_pipeline.fit(MP_4on5_X_train, MP_4on5_y_train)
MP_4on5_predictions_with_points = MP_4on5_model_with_points.predict(MP_4on5_X_test)

# Evaluate the model
mse_with_points = mean_squared_error(MP_4on5_y_test, MP_4on5_predictions_with_points)
r2_with_points = r2_score(MP_4on5_y_test, MP_4on5_predictions_with_points)

print("Model with I_F_points:")
print(f"Mean Squared Error: {mse_with_points}")
print(f"R2 Score: {r2_with_points}")

# Step 2: Train and evaluate with I_F_points removed
# Remove the I_F_points column from your training and testing sets
MP_4on5_X_train_no_points = MP_4on5_X_train.drop(columns=['I_F_points'])
MP_4on5_X_test_no_points = MP_4on5_X_test.drop(columns=['I_F_points'])

MP_4on5_model_without_points = MP_RFR_pipeline.fit(MP_4on5_X_train_no_points, MP_4on5_y_train)
MP_4on5_predictions_without_points = MP_4on5_model_without_points.predict(MP_4on5_X_test_no_points)

# Evaluate the model
mse_without_points = mean_squared_error(MP_4on5_y_test, MP_4on5_predictions_without_points)
r2_without_points = r2_score(MP_4on5_y_test, MP_4on5_predictions_without_points)

print("Model without I_F_points:")
print(f"Mean Squared Error: {mse_without_points}")
print(f"R2 Score: {r2_without_points}/n")

# Step 3: Compare the two models
print("Comparison of Model Performance:")
print(f"Difference in MSE: {mse_without_points - mse_with_points}")
print(f"Difference in R2 Score: {r2_without_points - r2_with_points}")

Model with I_F_points:
Mean Squared Error: 75.12914597234374
R2 Score: 0.7699071485572728
Model without I_F_points:
Mean Squared Error: 76.47908517623266
R2 Score: 0.7657727829036094/n
Comparison of Model Performance:
Difference in MSE: 1.3499392038889226
Difference in R2 Score: -0.00413436565366343


#### This tells me that in fact the gameScore metric is not a very good metric to understand/evaluate player performance when players are on the ice for a 4on5 penalty kill. 
#### It also makes sense that when on the penalty kill, the most important feature to determin a player's rating on the penalty kill is 'timeOnBench' because in order for a player to do positive things on the 4on5, they would need to be on the ice. It is interesting that it's timeOnBench and not timeOnIce but it does make sense that since there are more players on the bench in that situation that bench time is the most influential feature. 
#### Furthermore, the R2 Score and MSE are actually still quite low and so the model with or without I_F_points doesn't do very well at explaining the variance in the data. To that end, I'm curious to see if timeOnBench was removed, what the feature importances might be. I think its worth exploring because timeOnBench is not a "player active" feature and so can't be worked on or improved other than through coaching decisions regarding line-changes. 

## Exploring 4on5 with and without timeOnBench rather than I_F_points

In [304]:
col_not_processed_4on5 = ['playerId', 'season' , 'name', 'team', 'situation', 'iceTimeRank', 'I_F_shifts',
                      'nationality' ,'birthDate', 'weight','height', 'shoots', 'age','gameScore'] 
# gameScore is the target variable

col_not_processed_4on5_without_timeOnBench = ['playerId', 'season' , 'name', 'team', 'situation', 'iceTimeRank', 'I_F_shifts',
                      'nationality' ,'birthDate', 'weight','height', 'shoots', 'age', 'timeOnBench','gameScore'] 

### 4on5 with timeOnBench:

In [None]:
# Drop the target column to create the feature matrix X
MP_4on5_X = MP_4on5_stats.drop(columns=col_not_processed_4on5) 
MP_4on5_y = MP_4on5_stats['gameScore']  # Target variable

# Split the data into training and testing sets
MP_4on5_X_train, MP_4on5_X_test, MP_4on5_y_train, MP_4on5_y_test = train_test_split(MP_4on5_X, MP_4on5_y, test_size=0.2, random_state=42)

In [305]:
MP_4on5_model = MP_RFR_pipeline.fit(MP_4on5_X_train, MP_4on5_y_train)

rf_model = MP_4on5_model.named_steps['regressor']

# Access the preprocessor step to get the transformed feature names
preprocessor = MP_4on5_model.named_steps['preprocessor']

# Get the transformed feature names
transformed_feature_names = get_feature_names(preprocessor)

# Get feature importances from the Random Forest model
feature_importances = pd.Series(rf_model.feature_importances_, index=transformed_feature_names)
feature_importances.sort_values(ascending=False, inplace=True)

print(feature_importances.head(10))

timeOnBench                   0.529996
faceoffsLost                  0.034284
penalityMinutesDrawn          0.033011
games_played                  0.029911
penaltiesDrawn                0.025901
shotsBlockedByPlayer          0.025319
OnIce_F_mediumDangerxGoals    0.020186
I_F_takeaways                 0.012557
I_F_xFreeze                   0.011759
OffIce_F_xGoals               0.008115
dtype: float64


### 4on5 without timeOnBench:

In [306]:
# Drop the target column to create the feature matrix X
MP_4on5_X = MP_4on5_stats.drop(columns=col_not_processed_4on5_without_timeOnBench) 
MP_4on5_y = MP_4on5_stats['gameScore']  # Target variable

# Split the data into training and testing sets
MP_4on5_X_train, MP_4on5_X_test, MP_4on5_y_train, MP_4on5_y_test = train_test_split(MP_4on5_X, MP_4on5_y, test_size=0.2, random_state=42)

In [307]:
MP_4on5_model = MP_RFR_pipeline.fit(MP_4on5_X_train, MP_4on5_y_train)

rf_model = MP_4on5_model.named_steps['regressor']

# Access the preprocessor step to get the transformed feature names
preprocessor = MP_4on5_model.named_steps['preprocessor']

# Get the transformed feature names
transformed_feature_names = get_feature_names(preprocessor)

# Get feature importances from the Random Forest model
feature_importances = pd.Series(rf_model.feature_importances_, index=transformed_feature_names)
feature_importances.sort_values(ascending=False, inplace=True)

print(feature_importances.head(10))

OffIce_F_shotAttempts         0.402510
OffIce_A_shotAttempts         0.072049
games_played                  0.042851
faceoffsLost                  0.037595
penaltiesDrawn                0.032538
shotsBlockedByPlayer          0.028096
penalityMinutesDrawn          0.026823
OffIce_F_xGoals               0.020611
OnIce_F_mediumDangerxGoals    0.015637
I_F_takeaways                 0.014763
dtype: float64


#### When timeOnBench is removed, the next most influential feature is the OffIce_F_shotAttempts. This makes a lot of sense. This means that it was a strong coaching decision to have that group of players on the ice. That feature means that though you have fewer players on the ice, you are still generating offensive chances. The other team can't score on you while you are in the offensive zone.
#### Additionally, though much less influential. The 3rd and 4th most influential features are 'games_played' and 'faceoffsLost'. This shows that for 4on5 penalty kill success, experience is by far the most important feature. This also demomstrates that generally speaking, it is more important to have centers with veteran experience and generally veteran/late prime aged players on the penalty kill.
#### All in all, more than any other gamestate, 4on5 penalty kills (probably extrapolated to other PK situations), the most important thing is the coaching decision for which personel to be on the ice and that those players should be players with the most experience in that situation. 

### Comparing the models with and without the timeOnBench

In [308]:
# Re assign the variable so that the comparison doesn't throw an error
MP_4on5_X = MP_4on5_stats.drop(columns=col_not_processed) 
MP_4on5_y = MP_4on5_stats['gameScore']  # Target variable

# Split the data into training and testing sets
MP_4on5_X_train, MP_4on5_X_test, MP_4on5_y_train, MP_4on5_y_test = train_test_split(MP_4on5_X, MP_4on5_y, test_size=0.2, random_state=42)

# Step 1: Train and evaluate with I_F_points included
# Assume your original training set includes I_F_points
MP_4on5_model_with_benchTime = MP_RFR_pipeline.fit(MP_4on5_X_train, MP_4on5_y_train)
MP_4on5_predictions_with_benchTime = MP_4on5_model_with_benchTime.predict(MP_4on5_X_test)

# Evaluate the model
mse_with_benchTime = mean_squared_error(MP_4on5_y_test, MP_4on5_predictions_with_benchTime)
r2_with_benchTime = r2_score(MP_4on5_y_test, MP_4on5_predictions_with_benchTime)

print("Model with timeOnBench:")
print(f"Mean Squared Error: {mse_with_benchTime}")
print(f"R2 Score: {r2_with_benchTime}")

# Step 2: Train and evaluate with I_F_points removed
# Remove the I_F_points column from your training and testing sets
MP_4on5_X_train_no_benchTime = MP_4on5_X_train.drop(columns=['timeOnBench'])
MP_4on5_X_test_no_benchTime = MP_4on5_X_test.drop(columns=['timeOnBench'])

MP_4on5_model_without_benchTime = MP_RFR_pipeline.fit(MP_4on5_X_train_no_benchTime, MP_4on5_y_train)
MP_4on5_predictions_without_benchTime = MP_4on5_model_without_benchTime.predict(MP_4on5_X_test_no_benchTime)

# Evaluate the model
mse_without_benchTime = mean_squared_error(MP_4on5_y_test, MP_4on5_predictions_without_benchTime)
r2_without_benchTime = r2_score(MP_4on5_y_test, MP_4on5_predictions_without_benchTime)

print("Model without I_F_points:")
print(f"Mean Squared Error: {mse_without_benchTime}")
print(f"R2 Score: {r2_without_benchTime}/n")

# Step 3: Compare the two models
print("Comparison of Model Performance:")
print(f"Difference in MSE: {mse_without_benchTime - mse_with_benchTime}")
print(f"Difference in R2 Score: {r2_without_benchTime - r2_with_benchTime}")

Model with timeOnBench:
Mean Squared Error: 75.12914597234374
R2 Score: 0.7699071485572728
Model without I_F_points:
Mean Squared Error: 77.43742001720484
R2 Score: 0.7628377569114682/n
Comparison of Model Performance:
Difference in MSE: 2.3082740448611077
Difference in R2 Score: -0.007069391645804668


## 5on4:

### 5on4 with I_F_points:

In [294]:
# Drop the target column to create the feature matrix X
MP_5on4_X = MP_5on4_stats.drop(columns=col_not_processed) 
MP_5on4_y = MP_5on4_stats['gameScore']  # Target variable

# Split the data into training and testing sets
MP_5on4_X_train, MP_5on4_X_test, MP_5on4_y_train, MP_5on4_y_test = train_test_split(MP_5on4_X, MP_5on4_y, test_size=0.2, random_state=42)

In [295]:
MP_5on4_model = MP_RFR_pipeline.fit(MP_5on4_X_train, MP_5on4_y_train)

rf_model = MP_5on4_model.named_steps['regressor']

# Access the preprocessor step to get the transformed feature names
preprocessor = MP_5on4_model.named_steps['preprocessor']

# Get the transformed feature names
transformed_feature_names = get_feature_names(preprocessor)

# Get feature importances from the Random Forest model
feature_importances = pd.Series(rf_model.feature_importances_, index=transformed_feature_names)
feature_importances.sort_values(ascending=False, inplace=True)

print(feature_importances.head(10))

shifts                                                     0.142544
OnIce_F_xGoals_with_earned_rebounds_scoreFlurryAdjusted    0.100175
OnIce_F_flurryScoreVenueAdjustedxGoals                     0.088463
OnIce_F_flurryAdjustedxGoals                               0.083220
OnIce_F_scoreVenueAdjustedxGoals                           0.080145
OnIce_F_xGoals                                             0.079953
I_F_points                                                 0.055406
OnIce_F_xGoals_with_earned_rebounds                        0.030141
OnIce_F_scoreAdjustedUnblockedShotAttempts                 0.028875
OnIce_F_shotsOnGoal                                        0.025919
dtype: float64


### 5on4 without I_F_points:

In [297]:
# Drop the target column to create the feature matrix X
MP_5on4_X = MP_5on4_stats.drop(columns=col_not_processed_without_points) 
MP_5on4_y = MP_5on4_stats['gameScore']  # Target variable

# Split the data into training and testing sets
MP_5on4_X_train, MP_5on4_X_test, MP_5on4_y_train, MP_5on4_y_test = train_test_split(MP_5on4_X, MP_5on4_y, test_size=0.2, random_state=42)

In [298]:
MP_5on4_model = MP_RFR_pipeline.fit(MP_5on4_X_train, MP_5on4_y_train)

rf_model = MP_5on4_model.named_steps['regressor']

# Access the preprocessor step to get the transformed feature names
preprocessor = MP_5on4_model.named_steps['preprocessor']

# Get the transformed feature names
transformed_feature_names = get_feature_names(preprocessor)

# Get feature importances from the Random Forest model
feature_importances = pd.Series(rf_model.feature_importances_, index=transformed_feature_names)
feature_importances.sort_values(ascending=False, inplace=True)

print(feature_importances.head(10))

shifts                                                     0.142586
OnIce_F_xGoals_with_earned_rebounds_scoreFlurryAdjusted    0.107190
OnIce_F_flurryScoreVenueAdjustedxGoals                     0.088337
OnIce_F_flurryAdjustedxGoals                               0.083515
OnIce_F_xGoals                                             0.082359
OnIce_F_scoreVenueAdjustedxGoals                           0.080296
OnIce_F_shotsOnGoal                                        0.032125
OnIce_F_xGoals_with_earned_rebounds                        0.030383
OnIce_F_xOnGoal                                            0.027864
OnIce_F_xGoals_with_earned_rebounds_scoreAdjusted          0.025060
dtype: float64


### 5on4 Comparison of with and without I_F_points:

In [299]:
# Re assign the variable so that the comparison doesn't throw an error
MP_5on4_X = MP_5on4_stats.drop(columns=col_not_processed) 
MP_5on4_y = MP_5on4_stats['gameScore']  # Target variable

# Split the data into training and testing sets
MP_5on4_X_train, MP_5on4_X_test, MP_5on4_y_train, MP_5on4_y_test = train_test_split(MP_5on4_X, MP_5on4_y, test_size=0.2, random_state=42)

# Step 1: Train and evaluate with I_F_points included
# Assume your original training set includes I_F_points
MP_5on4_model_with_points = MP_RFR_pipeline.fit(MP_5on4_X_train, MP_5on4_y_train)
MP_5on4_predictions_with_points = MP_5on4_model_with_points.predict(MP_5on4_X_test)

# Evaluate the model
mse_with_points = mean_squared_error(MP_5on4_y_test, MP_5on4_predictions_with_points)
r2_with_points = r2_score(MP_5on4_y_test, MP_5on4_predictions_with_points)

print("Model with I_F_points:")
print(f"Mean Squared Error: {mse_with_points}")
print(f"R2 Score: {r2_with_points}")

# Step 2: Train and evaluate with I_F_points removed
# Remove the I_F_points column from your training and testing sets
MP_5on4_X_train_no_points = MP_5on4_X_train.drop(columns=['I_F_points'])
MP_5on4_X_test_no_points = MP_5on4_X_test.drop(columns=['I_F_points'])

MP_5on4_model_without_points = MP_RFR_pipeline.fit(MP_5on4_X_train_no_points, MP_5on4_y_train)
MP_5on4_predictions_without_points = MP_5on4_model_without_points.predict(MP_5on4_X_test_no_points)

# Evaluate the model
mse_without_points = mean_squared_error(MP_5on4_y_test, MP_5on4_predictions_without_points)
r2_without_points = r2_score(MP_5on4_y_test, MP_5on4_predictions_without_points)

print("Model without I_F_points:")
print(f"Mean Squared Error: {mse_without_points}")
print(f"R2 Score: {r2_without_points}/n")

# Step 3: Compare the two models
print("Comparison of Model Performance:")
print(f"Difference in MSE: {mse_without_points - mse_with_points}")
print(f"Difference in R2 Score: {r2_without_points - r2_with_points}")

Model with I_F_points:
Mean Squared Error: 57.10492802555556
R2 Score: 0.9030744899608139
Model without I_F_points:
Mean Squared Error: 58.93719043137152
R2 Score: 0.8999645487639725/n
Comparison of Model Performance:
Difference in MSE: 1.8322624058159604
Difference in R2 Score: -0.0031099411968413815


## Other Situations:

### Other situations with I_F_points:

In [300]:
# Drop the target column to create the feature matrix X
MP_OS_X = MP_OS_stats.drop(columns=col_not_processed) 
MP_OS_y = MP_OS_stats['gameScore']  # Target variable

# Split the data into training and testing sets
MP_OS_X_train, MP_OS_X_test, MP_OS_y_train, MP_OS_y_test = train_test_split(MP_OS_X, MP_OS_y, test_size=0.2, random_state=42)

In [301]:
# Fit the pipeline to your training data
OS_model = MP_RFR_pipeline.fit(MP_OS_X_train, MP_OS_y_train)

# Access the trained Random Forest model inside the pipeline
rf_model = OS_model.named_steps['regressor']

# Access the preprocessor step to get the transformed feature names
preprocessor = OS_model.named_steps['preprocessor']

# Get the transformed feature names
transformed_feature_names = get_feature_names(preprocessor)

# Get feature importances from the Random Forest model
feature_importances = pd.Series(rf_model.feature_importances_, index=transformed_feature_names)
feature_importances.sort_values(ascending=False, inplace=True)

# Display the most important features
print(feature_importances.head(10))

OnIce_F_scoreAdjustedUnblockedShotAttempts                 0.236193
OnIce_F_unblockedShotAttempts                              0.213680
OnIce_F_scoreAdjustedShotsAttempts                         0.063591
I_F_points                                                 0.047669
OnIce_F_shotsOnGoal                                        0.043997
OnIce_F_xGoals_with_earned_rebounds_scoreAdjusted          0.038317
OnIce_F_shotAttempts                                       0.034535
OnIce_F_xGoals_with_earned_rebounds_scoreFlurryAdjusted    0.032342
OnIce_F_xOnGoal                                            0.031354
OnIce_F_xGoals_with_earned_rebounds                        0.029061
dtype: float64


### Other situations without I_F_points

In [302]:
# Drop the target column to create the feature matrix X
MP_OS_X = MP_OS_stats.drop(columns=col_not_processed_without_points) 
MP_OS_y = MP_OS_stats['gameScore']  # Target variable

# Split the data into training and testing sets
MP_OS_X_train, MP_OS_X_test, MP_OS_y_train, MP_OS_y_test = train_test_split(MP_OS_X, MP_OS_y, test_size=0.2, random_state=42)

# Fit the pipeline to your training data
OS_model = MP_RFR_pipeline.fit(MP_OS_X_train, MP_OS_y_train)

# Access the trained Random Forest model inside the pipeline
rf_model = OS_model.named_steps['regressor']

# Access the preprocessor step to get the transformed feature names
preprocessor = OS_model.named_steps['preprocessor']

# Get the transformed feature names
transformed_feature_names = get_feature_names(preprocessor)

# Get feature importances from the Random Forest model
feature_importances = pd.Series(rf_model.feature_importances_, index=transformed_feature_names)
feature_importances.sort_values(ascending=False, inplace=True)

# Display the most important features
print(feature_importances.head(10))

OnIce_F_scoreAdjustedUnblockedShotAttempts                 0.269358
OnIce_F_unblockedShotAttempts                              0.180553
OnIce_F_scoreAdjustedShotsAttempts                         0.062064
OnIce_F_xGoals_with_earned_rebounds                        0.046606
OnIce_F_shotsOnGoal                                        0.044020
OnIce_F_xGoals_with_earned_rebounds_scoreFlurryAdjusted    0.037822
OnIce_F_shotAttempts                                       0.036326
OnIce_F_xOnGoal                                            0.030908
OnIce_F_xGoals_with_earned_rebounds_scoreAdjusted          0.028926
I_F_shotAttempts                                           0.021694
dtype: float64


### Comparing the model with and without the I_F_points column

In [303]:
# Re assign the variable so that the comparison doesn't throw an error
MP_OS_X = MP_OS_stats.drop(columns=col_not_processed) 
MP_OS_y = MP_OS_stats['gameScore']  # Target variable

# Split the data into training and testing sets
MP_OS_X_train, MP_OS_X_test, MP_OS_y_train, MP_OS_y_test = train_test_split(MP_OS_X, MP_OS_y, test_size=0.2, random_state=42)

# Step 1: Train and evaluate with I_F_points included
# Assume your original training set includes I_F_points
OS_model_with_points = MP_RFR_pipeline.fit(MP_OS_X_train, MP_OS_y_train)
predictions_with_points = OS_model_with_points.predict(MP_OS_X_test)

# Evaluate the model
mse_with_points = mean_squared_error(MP_OS_y_test, predictions_with_points)
r2_with_points = r2_score(MP_OS_y_test, predictions_with_points)

print("Model with I_F_points:")
print(f"Mean Squared Error: {mse_with_points}")
print(f"R2 Score: {r2_with_points}")

# Step 2: Train and evaluate with I_F_points removed
# Remove the I_F_points column from your training and testing sets
MP_OS_X_train_no_points = MP_OS_X_train.drop(columns=['I_F_points'])
MP_OS_X_test_no_points = MP_OS_X_test.drop(columns=['I_F_points'])

OS_model_without_points = MP_RFR_pipeline.fit(MP_OS_X_train_no_points, MP_OS_y_train)
predictions_without_points = OS_model_without_points.predict(MP_OS_X_test_no_points)

# Evaluate the model
mse_without_points = mean_squared_error(MP_OS_y_test, predictions_without_points)
r2_without_points = r2_score(MP_OS_y_test, predictions_without_points)

print("Model without I_F_points:")
print(f"Mean Squared Error: {mse_without_points}")
print(f"R2 Score: {r2_without_points}/n")

# Step 3: Compare the two models
print("Comparison of Model Performance:")
print(f"Difference in MSE: {mse_without_points - mse_with_points}")
print(f"Difference in R2 Score: {r2_without_points - r2_with_points}")

Model with I_F_points:
Mean Squared Error: 40.28691845317709
R2 Score: 0.8887354115202359
Model without I_F_points:
Mean Squared Error: 40.86486431128473
R2 Score: 0.8871392381087433/n
Comparison of Model Performance:
Difference in MSE: 0.5779458581076398
Difference in R2 Score: -0.0015961734114926518
