# The Money Puck Recommender Engine:

In [175]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import pairwise_distances

# Load the datasets
pd.set_option('display.max_columns', None) # Display Preference

In [176]:
#Saving the new MoneyPuck Datafranes as CSVs:
# All Situations:
MP_all_situations_2021_2022_df = pd.read_csv('MP_NHL_data/MoneyPuck_all_situations_2021_2022.csv')
MP_all_situations_2022_2023_df = pd.read_csv('MP_NHL_data/MoneyPuck_all_situations_2022_2023.csv')
MP_all_situations_2023_2024_df = pd.read_csv('MP_NHL_data/MoneyPuck_all_situations_2023_2024.csv')
# 5on5:
MP_5on5_2021_2022_df = pd.read_csv('MP_NHL_data/MoneyPuck_5on5_2021_2022.csv')
MP_5on5_2022_2023_df = pd.read_csv('MP_NHL_data/MoneyPuck_5on5_2022_2023.csv')
MP_5on5_2023_2024_df = pd.read_csv('MP_NHL_data/MoneyPuck_5on5_2023_2024.csv')
# 4on5:
MP_4on5_2021_2022_df = pd.read_csv('MP_NHL_data/MoneyPuck_4on5_2021_2022.csv')
MP_4on5_2022_2023_df = pd.read_csv('MP_NHL_data/MoneyPuck_4on5_2022_2023.csv')
MP_4on5_2023_2024_df = pd.read_csv('MP_NHL_data/MoneyPuck_4on5_2023_2024.csv')
#5on4:
MP_5on4_2021_2022_df = pd.read_csv('MP_NHL_data/MoneyPuck_5on4_2021_2022.csv')
MP_5on4_2022_2023_df = pd.read_csv('MP_NHL_data/MoneyPuck_5on4_2022_2023.csv')
MP_5on4_2023_2024_df = pd.read_csv('MP_NHL_data/MoneyPuck_5on4_2023_2024.csv')
#Other Situations:
MP_other_situations_2021_2022_df = pd.read_csv('MP_NHL_data/MoneyPuck_other_situations_2021_2022.csv')
MP_other_situations_2022_2023_df = pd.read_csv('MP_NHL_data/MoneyPuck_other_situations_2022_2023.csv')
MP_other_situations_2023_2024_df = pd.read_csv('MP_NHL_data/MoneyPuck_other_situations_2023_2024.csv')

In [177]:
#Concatenating the situational dataframes:
#All situations
MP_all_situations_frames = [MP_all_situations_2021_2022_df, MP_all_situations_2022_2023_df, MP_all_situations_2023_2024_df]
MP_AS_stats = pd.concat(MP_all_situations_frames, ignore_index=True)

#5on5:
MP_5on5_frames = [MP_5on5_2021_2022_df, MP_5on5_2022_2023_df, MP_5on5_2023_2024_df]
MP_5on5_stats = pd.concat(MP_5on5_frames, ignore_index=True)

#4on5:
MP_4on5_frames = [MP_4on5_2021_2022_df, MP_4on5_2022_2023_df, MP_4on5_2023_2024_df]
MP_4on5_stats = pd.concat(MP_4on5_frames, ignore_index=True)

#5on4:
MP_5on4_frames = [MP_5on4_2021_2022_df, MP_5on4_2022_2023_df, MP_5on4_2023_2024_df]
MP_5on4_stats = pd.concat(MP_5on4_frames, ignore_index=True)

#Other:
MP_other_situations_frames = [MP_other_situations_2021_2022_df, MP_other_situations_2022_2023_df, MP_other_situations_2023_2024_df]
MP_OS_stats = pd.concat(MP_other_situations_frames, ignore_index=True)

In [178]:
#Saving combined DFs to CSVs:
#All Situations:
MP_AS_stats.to_csv('MP_all_situations_2022_to_2024.csv', index=0)
#5on5:
MP_5on5_stats.to_csv('MP_5on5_2022_to_2024.csv', index=0)
#4on5:
MP_4on5_stats.to_csv('MP_4on5_2022_to_2024.csv', index=0)
#5on4:
MP_5on4_stats.to_csv('MP_5on4_2022_to_2024.csv', index=0)
#Other Situations:
MP_OS_stats.to_csv('MP_other_situations_2022_to_2024.csv', index=0)

### Adding the biographical data

In [179]:
# Reading the players' bio data
MP_player_bios = pd.read_csv('MP_NHL_data/allPlayersLookup.csv')

In [180]:
# Combining the data frames
MP_merged_AS_stats_bios = pd.merge(MP_AS_stats, MP_player_bios, on='playerId', how='left')
MP_merged_5on5_stats_bios = pd.merge(MP_5on5_stats, MP_player_bios, on='playerId', how='left')
MP_merged_4on5_stats_bios = pd.merge(MP_4on5_stats, MP_player_bios, on='playerId', how='left')
MP_merged_5on4_stats_bios = pd.merge(MP_5on4_stats, MP_player_bios, on='playerId', how='left')
MP_merged_OS_stats_bios = pd.merge(MP_OS_stats, MP_player_bios, on='playerId', how='left')

In [181]:
# Columns to drop after merge:
drop_cols_post_merge = ['primaryNumber', 'primaryPosition', 'name_y', 'position_y', 'team_y']

MP_merged_AS_stats_bios.drop(columns=drop_cols_post_merge, inplace=True)
MP_merged_5on5_stats_bios.drop(columns=drop_cols_post_merge, inplace=True)
MP_merged_4on5_stats_bios.drop(columns=drop_cols_post_merge, inplace=True)
MP_merged_5on4_stats_bios.drop(columns=drop_cols_post_merge, inplace=True)
MP_merged_OS_stats_bios.drop(columns=drop_cols_post_merge, inplace=True)

In [182]:
# Renaming the columns that were shared in the merge:
col_rename_map = {'name_x': 'name', 'team_x': 'team', 'position_x': 'position','shootsCatches': 'shoots'}
MP_AS_stats = MP_merged_AS_stats_bios.rename(mapper=col_rename_map, axis=1)
MP_5on5_stats = MP_merged_5on5_stats_bios.rename(mapper=col_rename_map, axis=1)
MP_4on5_stats = MP_merged_4on5_stats_bios.rename(mapper=col_rename_map, axis=1)
MP_5on4_stats = MP_merged_5on4_stats_bios.rename(mapper=col_rename_map, axis=1)
MP_OS_stats = MP_merged_OS_stats_bios.rename(mapper=col_rename_map, axis=1)

## Feature Engineering:

### Average Ice Time Column

In [183]:
# Making an Avg IceTime per shift column

# All Situations
MP_AS_stats['avg_ice_time/shift (s)'] = round((MP_AS_stats['icetime'] / MP_AS_stats['shifts']), 0)
cols = MP_AS_stats.columns.tolist()
cols.insert(9, cols.pop(cols.index('avg_ice_time/shift (s)')))
MP_all_situations_stats = MP_AS_stats[cols]

#5on5:
MP_5on5_stats['avg_ice_time/shift (s)'] = round((MP_5on5_stats['icetime'] / MP_5on5_stats['shifts']), 0)
cols_5on5 = MP_5on5_stats.columns.tolist()
cols_5on5.insert(9, cols_5on5.pop(cols_5on5.index('avg_ice_time/shift (s)')))
MP_5on5_stats = MP_5on5_stats[cols_5on5]

#4on5:
MP_4on5_stats['avg_ice_time/shift (s)'] = round((MP_4on5_stats['icetime'] / MP_4on5_stats['shifts']), 0)
cols_4on5 = MP_4on5_stats.columns.tolist()
cols_4on5.insert(9, cols_4on5.pop(cols_4on5.index('avg_ice_time/shift (s)')))
MP_4on5_stats = MP_4on5_stats[cols_4on5]

#5on4:
MP_5on4_stats['avg_ice_time/shift (s)'] = round((MP_5on4_stats['icetime'] / MP_5on4_stats['shifts']), 0)
cols_5on4 = MP_5on4_stats.columns.tolist()
cols_5on4.insert(9, cols_5on4.pop(cols_5on4.index('avg_ice_time/shift (s)')))
MP_5on4_stats = MP_5on4_stats[cols_5on4]

#Other situations:
MP_OS_stats['avg_ice_time/shift (s)'] = round((MP_OS_stats['icetime'] / MP_OS_stats['shifts']), 0)
cols_OS = MP_OS_stats.columns.tolist()
cols_OS.insert(9, cols_OS.pop(cols_OS.index('avg_ice_time/shift (s)')))
MP_OS_stats = MP_OS_stats[cols_OS]


### Average shifts per game column:

In [184]:
# Making an Avg shifts per game column
#AS:
MP_AS_stats['avg_shifts_per_game'] = round(MP_AS_stats['shifts']/MP_AS_stats['games_played'], 0)
#5on5:
MP_5on5_stats['avg_shifts_per_game'] = round(MP_5on5_stats['shifts']/MP_5on5_stats['games_played'], 0)
#4on5:
MP_4on5_stats['avg_shifts_per_game'] = round(MP_4on5_stats['shifts']/MP_4on5_stats['games_played'], 0)
#5on4:
MP_5on4_stats['avg_shifts_per_game'] = round(MP_5on4_stats['shifts']/MP_5on4_stats['games_played'], 0)
#OS:
MP_OS_stats['avg_shifts_per_game'] = round(MP_OS_stats['shifts']/MP_OS_stats['games_played'], 0)

### Adjusting seasn to be the year the season finished to help get an accurate player age:

In [185]:
# Update the season info to represent the year the season ended rather than the year that started the season.

season_map = {2021: 2022, 2022: 2023, 2023: 2024}
MP_AS_stats['season'] = MP_AS_stats['season'].map(season_map)
MP_5on5_stats['season'] = MP_5on5_stats['season'].map(season_map)
MP_4on5_stats['season'] = MP_4on5_stats['season'].map(season_map)
MP_5on4_stats['season'] = MP_5on4_stats['season'].map(season_map)
MP_OS_stats['season'] = MP_OS_stats['season'].map(season_map)

### Player age column:

In [186]:
def MP_calculate_playing_age(df, dob_col_name, season_col_name, age_col_name):
    """
    Updates the age of players in the DataFrame based on their date of birth.

    Parameters:
    df (pd.DataFrame): The DataFrame containing player data.
    dob_col_name (str): The name of the column with date of birth information.
    age_col_name (str): The name of the column where the age should be updated.
    current_year (int): The year to calculate current age from.

    Returns:
    pd.DataFrame: The DataFrame with updated ages.
    """
    # Convert the 'birthDate' column to datetime format
    df[dob_col_name] = pd.to_datetime(df[dob_col_name], errors='coerce')  # Handle potential errors during conversion

    # Extract the year
    df['birth_year'] = df[dob_col_name].dt.year

    # Calculate the new age and replace the 'Age' column
    df[age_col_name] = df[season_col_name] - df['birth_year']

    # Drop the helper column
    df.drop(columns='birth_year', inplace=True)

    return df

In [187]:
#Making an Age column based on taking the season column and subtracting the year from the birthdate using the function I made
MP_AS_stats = MP_calculate_playing_age(df=MP_AS_stats, dob_col_name='birthDate', 
                                    season_col_name='season', age_col_name='age')
MP_5on5_stats = MP_calculate_playing_age(df=MP_5on5_stats, dob_col_name='birthDate', 
                                    season_col_name='season', age_col_name='age')
MP_4on5_stats = MP_calculate_playing_age(df=MP_4on5_stats, dob_col_name='birthDate', 
                                    season_col_name='season', age_col_name='age')
MP_5on4_stats = MP_calculate_playing_age(df=MP_5on4_stats, dob_col_name='birthDate', 
                                    season_col_name='season', age_col_name='age')
MP_OS_stats = MP_calculate_playing_age(df=MP_OS_stats, dob_col_name='birthDate', 
                                    season_col_name='season', age_col_name='age')

In [188]:
# Handling the places where ages missing because the NaNs prevent the Pipelines from working
missing_age_dict = {
    'Adam Edstrom': 23,
 'Adam Ginning': 24,
 'Adam Klapka': 23,
 'Akil Thomas': 24,
 'Aku Raty': 23,
 'Alex Vlasic': 23,
 'Andy Andreoff': 33,
 'Angus Crookshank': 24,
 'Anton Levtchi': 28,
 'Arshdeep Bains': 23,
 'Blake Lizotte': 26,
 'Brad Lambert': 20,
 'Bradly Nadeau': 19,
 'Brandon Gignac': 26,
 'Brandon Scanlin': 25,
 'Brendan Brisson': 22,
 'Brennan Othmann': 21,
 'Brian Halonen': 25,
 'Cameron Butler': 22,
 'Cameron Crotty': 25,
 'Collin Graf': 21,
 'Cutter Gauthier': 20,
 'Declan Carlile': 24,
 'Elliot Desnoyers': 22,
 'Emil Heineman': 22,
 'Emil Lilleberg': 23,
 'Ethan Del Mastro': 21,
 'Filip Roos': 25,
 'Frank Nazar': 20,
 'Gage Goncalves': 23,
 'Gavin Brindley': 19,
 'Georgii Merkulov':23,
 'Graeme Clarke': 23,
 'Hudson Fasching': 29,
 'Isak Rosen': 21,
 'Ivan Miroshnichenko': 20,
 'Jack St. Ivany': 25,
 'Jack Thompson': 22,
 'Jackson Blake': 21,
 'Jacob MacDonald': 31,
 'James Malatesta': 21,
 'Jason Polin': 25,
 'Jayden Struble': 22,
 'Jeff Malott': 28,
 'Jiri Kulich': 20,
 'Jiri Smejkal': 27,
 'Josh Doan': 22,
 'Joshua Roy': 21,
 'Justin Brazeau': 26,
 'Kyle MacLean': 25,
 'Landon Slaggert': 22,
 'Lane Hutson': 20,
 'Liam Ohgren': 20,
 'Linus Karlsson': 24,
 'Logan Mailloux': 21,
 'Logan Morrison': 22,
 'Logan Stankoven': 21,
 'Louis Crevier': 23,
 'Luca Del Bel Belluz': 20,
 'Lukas Cormier': 22,
 'Maksymilian Szuber': 21,
 'Marat Khusnutdinov': 22,
 'Marc Johnstone': 28,
 'Marshall Rifai': 26,
 'Mason Marchment': 29,
 'Mason Morelli': 28,
 'Matt Rempe': 22,
 'Matt Roy': 29,
 'Matt Savoie': 20,
 'Mavrik Bourque': 22,
 'Maxwell Crozier': 24,
 'Nathan Bastian': 26,
 'Nikita Chibrikov': 21,
 'Olen Zellweger': 20,
 'Olle Lycksell': 24,
 'Ondrej Pavel': 23,
 'Oskar Steen': 26,
 'Patrik Koch': 27,
 'Philip Kemp': 25,
 'Pierrick Dube': 23,
 'Ruslan Iskhakov': 24,
 'Ryan Winterton': 20,
 'Ryker Evans': 22,
 'Sam Colangelo': 22,
 'Sam Malinski': 26,
 'Samuel Laberge': 27,
 'Scott Morrow': 21,
 'Shakir Mukhamadullin': 22,
 'Simon Nemec': 20,
 'Vasily Ponomarev': 22,
 'William Lockwood': 26,
 'Wyatt Kaiser': 22,
 'Yan Kuznetsov': 22,
 'Zach Dean': 21,
 'Zachary Hayes': 25,
 'Zack Bolduc': 21,
 'Zack Ostapchuk': 21
 }

In [189]:
# Applying the missing_age_dict to the original dataframes
MP_AS_stats['age'] = MP_AS_stats.apply(
    lambda row: missing_age_dict.get(row['name'], row['age']),
    axis=1
)
MP_5on5_stats['age'] = MP_5on5_stats.apply(
    lambda row: missing_age_dict.get(row['name'], row['age']),
    axis=1
)

MP_4on5_stats['age'] = MP_4on5_stats.apply(
    lambda row: missing_age_dict.get(row['name'], row['age']),
    axis=1
)

MP_5on4_stats['age'] = MP_5on4_stats.apply(
    lambda row: missing_age_dict.get(row['name'], row['age']),
    axis=1
)

MP_OS_stats['age'] = MP_OS_stats.apply(
    lambda row: missing_age_dict.get(row['name'], row['age']),
    axis=1
)

### Making a binnned age_group column based on age:

In [190]:
# COLUMN EDITS
# Age Column: Making Age Bins 
bins = [0, 20, 26, 30, 35, 45]
labels = ['New Pro', 'Young Pro', 'Prime Age', 'Vet', 'Old Vet']

MP_AS_stats['age_group'] = pd.cut(MP_AS_stats['age'], bins, labels=labels)
MP_5on5_stats['age_group'] = pd.cut(MP_5on5_stats['age'], bins, labels=labels)
MP_4on5_stats['age_group'] = pd.cut(MP_4on5_stats['age'], bins, labels=labels)
MP_5on4_stats['age_group'] = pd.cut(MP_5on4_stats['age'], bins, labels=labels)
MP_OS_stats['age_group'] = pd.cut(MP_OS_stats['age'], bins, labels=labels)


In [191]:
#Saving the new feature engineered dataframes to csvs:
MP_AS_stats.to_csv('MP_AS_stats_bios_new_features.csv', index=0)
MP_5on5_stats.to_csv('MP_5on5_stats_bios_new_features.csv', index=0)
MP_4on5_stats.to_csv('MP_4on5_stats_bios_new_features.csv', index=0)
MP_5on4_stats.to_csv('MP_5on4_stats_bios_new_features.csv', index=0)
MP_OS_stats.to_csv('MP_OS_stats_bios_new_features.csv', index=0)

## Making the functions of the recommender engine so that it is more user friendly:

In [192]:
def MP_create_player_index_dict(df):
      """
    Create a nested dictionary from a DataFrame that maps player names to their indices for each season.

    This function resets the index of the DataFrame to ensure that the index column 
    holds the original row indices. It then groups the DataFrame by 'name' and 'season' 
    and aggregates the indices into a list for each group. After grouping, it pivots the DataFrame 
    so each players' 'name' is a row with each 'season' as columns, containing lists of indices 
    as values. Finally, it converts the pivoted DataFrame into a nested dictionary where each player's 
    name is a key to a dictionary mapping each season to the player's indices.

    Parameters:
    df (pandas.DataFrame): The DataFrame to process, which must contain 'Player' and 'Season' columns 
                           and has a unique index.

    Returns:
    dict: A nested dictionary where the first level keys are player names, and second level keys are 
          seasons, each mapping to a list of index positions for that player in that season.
    """

    # Reset the index 
      df = df.reset_index()

    # Group by 'Player' and 'Season', then aggregate the original index values into a list.
      grouped = df.groupby(['name', 'season'])['index'].agg(lambda x: list(x)).reset_index()

    # Pivot the DataFrame to have 'Player' as rows and 'Season' as columns with list of indices as values.
      pivot_df = grouped.pivot(index='name', columns='season', values='index')

    # Convert the pivoted DataFrame into a nested dictionary.
      MP_player_index_dict = pivot_df.apply(lambda row: row.dropna().to_dict(), axis=1).to_dict()

      return MP_player_index_dict

In [193]:
# Saving Index Dict Variables:
MP_AS_player_dict = MP_create_player_index_dict(MP_AS_stats)
MP_5on5_player_dict = MP_create_player_index_dict(MP_5on5_stats)
MP_4on5_player_dict = MP_create_player_index_dict(MP_4on5_stats)
MP_5on4_player_dict = MP_create_player_index_dict(MP_5on4_stats)
MP_OS_player_dict = MP_create_player_index_dict(MP_OS_stats)

In [194]:
def MP_get_index_all_gamestates(player_name, MP_AS_dict= MP_AS_player_dict, MP_5on5_dict= MP_5on5_player_dict, 
                                MP_4on5_dict= MP_4on5_player_dict, MP_5on4_dict= MP_5on4_player_dict,
                                MP_OS_dict= MP_OS_player_dict):
    """
    Returns a string with all the indices for each game state (All Strengths, Even Strength,
    Power Play, and Penalty Kill) for a given player.
    
    Parameters:
    - player_name (str): The name of the player to lookup.
    - player_index_dict_AS (dict): The dictionary with indices for All Strengths.
    - player_index_dict_ES (dict): The dictionary with indices for Even Strength.
    - player_index_dict_PP (dict): The dictionary with indices for Power Play.
    - player_index_dict_PK (dict): The dictionary with indices for Penalty Kill.

    Returns:
    - str: A formatted string containing the indices for each game state for the player.
    """
    result_string= (
        f"{player_name}'s ALL SITUATIONS indices are: {MP_AS_dict.get(player_name)}\n"
        f"{player_name}'s 5-ON-5 indices are: {MP_5on5_dict.get(player_name)}\n"
        f"{player_name}'s 4-ON-5 indices are: {MP_4on5_dict.get(player_name)}\n"
        f"{player_name}'s 5-ON-4 indices are: {MP_5on4_dict.get(player_name)}\n"
        f"{player_name}'s OTHER SITUATIONS indices are: {MP_OS_dict.get(player_name)}\n"
    )

    return print(result_string)

In [204]:
MP_get_index_all_gamestates(player_name='Nick Suzuki')

Nick Suzuki's ALL SITUATIONS indices are: {2022: [827], 2023: [1909], 2024: [2496]}
Nick Suzuki's 5-ON-5 indices are: {2022: [827], 2023: [1909], 2024: [2496]}
Nick Suzuki's 4-ON-5 indices are: {2022: [827], 2023: [1909], 2024: [2496]}
Nick Suzuki's 5-ON-4 indices are: {2022: [827], 2023: [1909], 2024: [2496]}
Nick Suzuki's OTHER SITUATIONS indices are: {2022: [827], 2023: [1909], 2024: [2496]}



In [195]:
def MP_get_players_baseline_gamestate_stats(original_gamestate_df, player_name):
    """
    Returns the baseline performance metrics of the player you are finding comparable players of 
    so you can see how their stats are over the course of the seasons in the engine.
    Args:
    - original_gamestate_df (pd.DataFrame): DataFrame containing the original skater stats.
    - player_name: must be a string of the full name of the player you want to look up, 
    If player name is misspelled or there is no data for that player, 
    the function returns an empty dataframe.
    -Small adustment from the other function. The MP function uses 'name' instead of 'Player' 

    """
    baseline_gamestate_stats = original_gamestate_df.loc[original_gamestate_df['name'] == player_name]
    return baseline_gamestate_stats

In [205]:
MP_get_players_baseline_gamestate_stats(MP_AS_stats, 'Nick Suzuki')

Unnamed: 0,playerId,season,name,team,position,situation,games_played,icetime,shifts,gameScore,onIce_xGoalsPercentage,offIce_xGoalsPercentage,onIce_corsiPercentage,offIce_corsiPercentage,onIce_fenwickPercentage,offIce_fenwickPercentage,iceTimeRank,I_F_xOnGoal,I_F_xGoals,I_F_xRebounds,I_F_xFreeze,I_F_xPlayStopped,I_F_xPlayContinuedInZone,I_F_xPlayContinuedOutsideZone,I_F_flurryAdjustedxGoals,I_F_scoreVenueAdjustedxGoals,I_F_flurryScoreVenueAdjustedxGoals,I_F_primaryAssists,I_F_secondaryAssists,I_F_shotsOnGoal,I_F_missedShots,I_F_blockedShotAttempts,I_F_shotAttempts,I_F_points,I_F_goals,I_F_rebounds,I_F_reboundGoals,I_F_freeze,I_F_playStopped,I_F_playContinuedInZone,I_F_playContinuedOutsideZone,I_F_savedShotsOnGoal,I_F_savedUnblockedShotAttempts,penalties,I_F_penalityMinutes,I_F_faceOffsWon,I_F_hits,I_F_takeaways,I_F_giveaways,I_F_lowDangerShots,I_F_mediumDangerShots,I_F_highDangerShots,I_F_lowDangerxGoals,I_F_mediumDangerxGoals,I_F_highDangerxGoals,I_F_lowDangerGoals,I_F_mediumDangerGoals,I_F_highDangerGoals,I_F_scoreAdjustedShotsAttempts,I_F_unblockedShotAttempts,I_F_scoreAdjustedUnblockedShotAttempts,I_F_dZoneGiveaways,I_F_xGoalsFromxReboundsOfShots,I_F_xGoalsFromActualReboundsOfShots,I_F_reboundxGoals,I_F_xGoals_with_earned_rebounds,I_F_xGoals_with_earned_rebounds_scoreAdjusted,I_F_xGoals_with_earned_rebounds_scoreFlurryAdjusted,I_F_shifts,I_F_oZoneShiftStarts,I_F_dZoneShiftStarts,I_F_neutralZoneShiftStarts,I_F_flyShiftStarts,I_F_oZoneShiftEnds,I_F_dZoneShiftEnds,I_F_neutralZoneShiftEnds,I_F_flyShiftEnds,faceoffsWon,faceoffsLost,timeOnBench,penalityMinutes,penalityMinutesDrawn,penaltiesDrawn,shotsBlockedByPlayer,OnIce_F_xOnGoal,OnIce_F_xGoals,OnIce_F_flurryAdjustedxGoals,OnIce_F_scoreVenueAdjustedxGoals,OnIce_F_flurryScoreVenueAdjustedxGoals,OnIce_F_shotsOnGoal,OnIce_F_missedShots,OnIce_F_blockedShotAttempts,OnIce_F_shotAttempts,OnIce_F_goals,OnIce_F_rebounds,OnIce_F_reboundGoals,OnIce_F_lowDangerShots,OnIce_F_mediumDangerShots,OnIce_F_highDangerShots,OnIce_F_lowDangerxGoals,OnIce_F_mediumDangerxGoals,OnIce_F_highDangerxGoals,OnIce_F_lowDangerGoals,OnIce_F_mediumDangerGoals,OnIce_F_highDangerGoals,OnIce_F_scoreAdjustedShotsAttempts,OnIce_F_unblockedShotAttempts,OnIce_F_scoreAdjustedUnblockedShotAttempts,OnIce_F_xGoalsFromxReboundsOfShots,OnIce_F_xGoalsFromActualReboundsOfShots,OnIce_F_reboundxGoals,OnIce_F_xGoals_with_earned_rebounds,OnIce_F_xGoals_with_earned_rebounds_scoreAdjusted,OnIce_F_xGoals_with_earned_rebounds_scoreFlurryAdjusted,OnIce_A_xOnGoal,OnIce_A_xGoals,OnIce_A_flurryAdjustedxGoals,OnIce_A_scoreVenueAdjustedxGoals,OnIce_A_flurryScoreVenueAdjustedxGoals,OnIce_A_shotsOnGoal,OnIce_A_missedShots,OnIce_A_blockedShotAttempts,OnIce_A_shotAttempts,OnIce_A_goals,OnIce_A_rebounds,OnIce_A_reboundGoals,OnIce_A_lowDangerShots,OnIce_A_mediumDangerShots,OnIce_A_highDangerShots,OnIce_A_lowDangerxGoals,OnIce_A_mediumDangerxGoals,OnIce_A_highDangerxGoals,OnIce_A_lowDangerGoals,OnIce_A_mediumDangerGoals,OnIce_A_highDangerGoals,OnIce_A_scoreAdjustedShotsAttempts,OnIce_A_unblockedShotAttempts,OnIce_A_scoreAdjustedUnblockedShotAttempts,OnIce_A_xGoalsFromxReboundsOfShots,OnIce_A_xGoalsFromActualReboundsOfShots,OnIce_A_reboundxGoals,OnIce_A_xGoals_with_earned_rebounds,OnIce_A_xGoals_with_earned_rebounds_scoreAdjusted,OnIce_A_xGoals_with_earned_rebounds_scoreFlurryAdjusted,OffIce_F_xGoals,OffIce_A_xGoals,OffIce_F_shotAttempts,OffIce_A_shotAttempts,xGoalsForAfterShifts,xGoalsAgainstAfterShifts,corsiForAfterShifts,corsiAgainstAfterShifts,fenwickForAfterShifts,fenwickAgainstAfterShifts,birthDate,weight,height,nationality,shoots,avg_ice_time/shift (s),avg_shifts_per_game,age,age_group
827,8480018,2022,Nick Suzuki,MTL,C,all,82,100910.0,1978.0,50.28,0.45,0.43,0.51,0.44,0.51,0.43,136.0,177.26,18.81,12.77,38.45,6.03,92.23,69.72,17.56,18.65,17.41,19.0,21.0,186.0,52.0,71.0,309.0,61.0,21.0,17.0,5.0,31.0,3.0,67.0,99.0,165.0,217.0,15.0,30.0,699.0,89.0,50.0,71.0,175.0,42.0,21.0,5.74,5.26,7.8,7.0,5.0,9.0,304.13,238.0,234.29,44.0,2.93,3.37,4.59,17.16,16.95,16.66,1978.0,341.0,279.0,306.0,1052.0,263.0,273.0,336.0,1106.0,699.0,711.0,197423.0,30.0,50.0,26.0,62.0,877.02,82.03,78.56,81.67,78.22,899.0,313.0,407.0,1619.0,90.0,69.0,15.0,930.0,203.0,79.0,27.1,25.01,29.92,29.0,34.0,27.0,1594.56,1212.0,1196.04,14.64,14.08,14.17,82.5,82.04,79.91,857.05,98.29,94.83,99.43,95.93,883.0,297.0,389.0,1569.0,106.0,65.0,15.0,802.0,270.0,108.0,24.45,33.76,40.07,22.0,48.0,36.0,1602.15,1180.0,1203.86,13.34,14.78,15.02,96.6,97.7,95.74,134.52,180.46,2721.0,3420.0,0.0,0.0,0.0,0.0,0.0,0.0,1999-08-10 00:00:00,201.0,"5' 11""",CAN,R,51.0,24.0,23.0,Young Pro
1909,8480018,2023,Nick Suzuki,MTL,C,all,82,103790.0,1849.0,52.7,0.44,0.4,0.49,0.42,0.49,0.42,107.0,161.52,18.09,12.02,35.49,5.34,84.7,61.36,17.3,17.92,17.15,25.0,15.0,162.0,55.0,77.0,294.0,66.0,26.0,17.0,2.0,32.0,3.0,58.0,81.0,136.0,191.0,10.0,23.0,663.0,50.0,42.0,52.0,146.0,49.0,22.0,4.65,6.08,7.36,7.0,12.0,7.0,292.48,217.0,215.59,26.0,2.8,4.03,3.26,17.63,17.54,17.19,1849.0,347.0,292.0,321.0,889.0,221.0,235.0,325.0,1068.0,663.0,738.0,194627.0,23.0,48.0,25.0,56.0,859.95,86.16,81.43,85.65,80.97,851.0,330.0,428.0,1609.0,103.0,75.0,11.0,835.0,251.0,95.0,24.04,30.39,31.73,28.0,46.0,29.0,1594.69,1181.0,1173.06,14.71,14.62,14.67,86.18,85.86,83.09,882.13,111.47,105.82,112.68,107.02,890.0,318.0,436.0,1644.0,110.0,95.0,16.0,793.0,275.0,140.0,24.73,33.88,52.85,22.0,42.0,46.0,1675.17,1208.0,1225.97,14.34,22.53,22.53,103.28,104.24,101.37,133.35,203.05,2604.0,3563.0,0.0,0.0,0.0,0.0,0.0,0.0,1999-08-10 00:00:00,201.0,"5' 11""",CAN,R,56.0,23.0,24.0,Young Pro
2496,8480018,2024,Nick Suzuki,MTL,C,all,82,104619.0,1878.0,69.75,0.55,0.39,0.54,0.4,0.55,0.4,107.0,195.82,21.76,14.25,44.74,6.58,105.04,73.64,20.97,21.73,20.94,25.0,19.0,185.0,81.0,58.0,324.0,77.0,33.0,12.0,3.0,30.0,4.0,109.0,78.0,152.0,233.0,17.0,36.0,689.0,69.0,41.0,63.0,185.0,56.0,25.0,6.43,7.2,8.12,16.0,9.0,8.0,321.77,266.0,264.56,29.0,3.33,2.86,3.69,21.4,21.33,20.75,1878.0,385.0,246.0,348.0,899.0,213.0,260.0,316.0,1089.0,689.0,622.0,196168.0,36.0,40.0,20.0,48.0,1008.31,104.79,99.91,105.08,100.15,972.0,419.0,478.0,1869.0,111.0,86.0,11.0,1007.0,266.0,118.0,31.45,32.54,40.81,44.0,31.0,36.0,1863.29,1391.0,1389.05,16.94,19.83,19.83,101.91,101.95,99.05,846.9,87.41,83.4,87.83,83.79,814.0,346.0,434.0,1594.0,101.0,76.0,10.0,867.0,190.0,103.0,25.29,24.38,37.74,38.0,34.0,29.0,1605.4,1160.0,1168.01,12.18,17.34,17.34,82.25,82.44,80.71,124.32,193.35,2664.0,3942.0,0.0,0.0,0.0,0.0,0.0,0.0,1999-08-10 00:00:00,201.0,"5' 11""",CAN,R,56.0,23.0,25.0,Young Pro


## Building the preprocessing and processing pipeline for the recommender engine:

In [196]:
# handling the values that would interfere with the encoder that includes the 'inf' and NaN
#This is mainly for the ice_time/shift' column
MP_AS_stats.replace([np.inf, -np.inf, np.nan], 0.0, inplace=True)
MP_5on5_stats.replace([np.inf, -np.inf, np.nan], 0.0, inplace=True)
MP_4on5_stats.replace([np.inf, -np.inf, np.nan], 0.0, inplace=True)
MP_5on4_stats.replace([np.inf, -np.inf, np.nan], 0.0, inplace=True)
MP_OS_stats.replace([np.inf, -np.inf, np.nan], 0.0, inplace=True)

### The Pipeline:

In [197]:
# Column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), make_column_selector(dtype_include=['int64', 'float64'])),
        ('age_group', Pipeline([
            ('ordinal', OrdinalEncoder(categories=[['New Pro', 'Young Pro', 'Prime Age', 'Vet', 'Old Vet']])),
            ('scaler', StandardScaler())  # Scale the ordinal-encoded age_group
        ]), ['age_group']),
        ('position', Pipeline([
            ('onehot', OneHotEncoder()),  # Apply OneHotEncoder to 'position'
            ('scaler', StandardScaler(with_mean=False))  # Apply StandardScaler after OneHotEncoder
        ]), ['position'])
    ])

# My current Pipeline
MP_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('pca', PCA())
])

In [198]:
MP_AS_stats.head()

Unnamed: 0,playerId,season,name,team,position,situation,games_played,icetime,shifts,gameScore,onIce_xGoalsPercentage,offIce_xGoalsPercentage,onIce_corsiPercentage,offIce_corsiPercentage,onIce_fenwickPercentage,offIce_fenwickPercentage,iceTimeRank,I_F_xOnGoal,I_F_xGoals,I_F_xRebounds,I_F_xFreeze,I_F_xPlayStopped,I_F_xPlayContinuedInZone,I_F_xPlayContinuedOutsideZone,I_F_flurryAdjustedxGoals,I_F_scoreVenueAdjustedxGoals,I_F_flurryScoreVenueAdjustedxGoals,I_F_primaryAssists,I_F_secondaryAssists,I_F_shotsOnGoal,I_F_missedShots,I_F_blockedShotAttempts,I_F_shotAttempts,I_F_points,I_F_goals,I_F_rebounds,I_F_reboundGoals,I_F_freeze,I_F_playStopped,I_F_playContinuedInZone,I_F_playContinuedOutsideZone,I_F_savedShotsOnGoal,I_F_savedUnblockedShotAttempts,penalties,I_F_penalityMinutes,I_F_faceOffsWon,I_F_hits,I_F_takeaways,I_F_giveaways,I_F_lowDangerShots,I_F_mediumDangerShots,I_F_highDangerShots,I_F_lowDangerxGoals,I_F_mediumDangerxGoals,I_F_highDangerxGoals,I_F_lowDangerGoals,I_F_mediumDangerGoals,I_F_highDangerGoals,I_F_scoreAdjustedShotsAttempts,I_F_unblockedShotAttempts,I_F_scoreAdjustedUnblockedShotAttempts,I_F_dZoneGiveaways,I_F_xGoalsFromxReboundsOfShots,I_F_xGoalsFromActualReboundsOfShots,I_F_reboundxGoals,I_F_xGoals_with_earned_rebounds,I_F_xGoals_with_earned_rebounds_scoreAdjusted,I_F_xGoals_with_earned_rebounds_scoreFlurryAdjusted,I_F_shifts,I_F_oZoneShiftStarts,I_F_dZoneShiftStarts,I_F_neutralZoneShiftStarts,I_F_flyShiftStarts,I_F_oZoneShiftEnds,I_F_dZoneShiftEnds,I_F_neutralZoneShiftEnds,I_F_flyShiftEnds,faceoffsWon,faceoffsLost,timeOnBench,penalityMinutes,penalityMinutesDrawn,penaltiesDrawn,shotsBlockedByPlayer,OnIce_F_xOnGoal,OnIce_F_xGoals,OnIce_F_flurryAdjustedxGoals,OnIce_F_scoreVenueAdjustedxGoals,OnIce_F_flurryScoreVenueAdjustedxGoals,OnIce_F_shotsOnGoal,OnIce_F_missedShots,OnIce_F_blockedShotAttempts,OnIce_F_shotAttempts,OnIce_F_goals,OnIce_F_rebounds,OnIce_F_reboundGoals,OnIce_F_lowDangerShots,OnIce_F_mediumDangerShots,OnIce_F_highDangerShots,OnIce_F_lowDangerxGoals,OnIce_F_mediumDangerxGoals,OnIce_F_highDangerxGoals,OnIce_F_lowDangerGoals,OnIce_F_mediumDangerGoals,OnIce_F_highDangerGoals,OnIce_F_scoreAdjustedShotsAttempts,OnIce_F_unblockedShotAttempts,OnIce_F_scoreAdjustedUnblockedShotAttempts,OnIce_F_xGoalsFromxReboundsOfShots,OnIce_F_xGoalsFromActualReboundsOfShots,OnIce_F_reboundxGoals,OnIce_F_xGoals_with_earned_rebounds,OnIce_F_xGoals_with_earned_rebounds_scoreAdjusted,OnIce_F_xGoals_with_earned_rebounds_scoreFlurryAdjusted,OnIce_A_xOnGoal,OnIce_A_xGoals,OnIce_A_flurryAdjustedxGoals,OnIce_A_scoreVenueAdjustedxGoals,OnIce_A_flurryScoreVenueAdjustedxGoals,OnIce_A_shotsOnGoal,OnIce_A_missedShots,OnIce_A_blockedShotAttempts,OnIce_A_shotAttempts,OnIce_A_goals,OnIce_A_rebounds,OnIce_A_reboundGoals,OnIce_A_lowDangerShots,OnIce_A_mediumDangerShots,OnIce_A_highDangerShots,OnIce_A_lowDangerxGoals,OnIce_A_mediumDangerxGoals,OnIce_A_highDangerxGoals,OnIce_A_lowDangerGoals,OnIce_A_mediumDangerGoals,OnIce_A_highDangerGoals,OnIce_A_scoreAdjustedShotsAttempts,OnIce_A_unblockedShotAttempts,OnIce_A_scoreAdjustedUnblockedShotAttempts,OnIce_A_xGoalsFromxReboundsOfShots,OnIce_A_xGoalsFromActualReboundsOfShots,OnIce_A_reboundxGoals,OnIce_A_xGoals_with_earned_rebounds,OnIce_A_xGoals_with_earned_rebounds_scoreAdjusted,OnIce_A_xGoals_with_earned_rebounds_scoreFlurryAdjusted,OffIce_F_xGoals,OffIce_A_xGoals,OffIce_F_shotAttempts,OffIce_A_shotAttempts,xGoalsForAfterShifts,xGoalsAgainstAfterShifts,corsiForAfterShifts,corsiAgainstAfterShifts,fenwickForAfterShifts,fenwickAgainstAfterShifts,birthDate,weight,height,nationality,shoots,avg_ice_time/shift (s),avg_shifts_per_game,age,age_group
0,8480950,2022,Ilya Lyubushkin,TOR,D,all,77,80654.0,1788.0,11.65,0.42,0.51,0.43,0.5,0.43,0.51,360.0,51.47,1.91,2.99,14.42,1.77,31.36,22.54,1.85,1.91,1.84,5.0,8.0,54.0,21.0,29.0,104.0,15.0,2.0,0.0,1.0,10.0,2.0,26.0,35.0,52.0,73.0,24.0,51.0,0.0,187.0,15.0,37.0,73.0,2.0,0.0,1.65,0.26,0.0,2.0,0.0,0.0,103.01,75.0,74.08,29.0,0.6,0.0,0.15,2.35,2.34,2.27,1788.0,115.0,287.0,254.0,1132.0,178.0,264.0,208.0,1138.0,0.0,0.0,199043.0,51.0,23.0,10.0,92.0,563.07,49.05,47.39,49.4,47.74,553.0,232.0,248.0,1033.0,52.0,32.0,7.0,597.0,152.0,36.0,17.15,18.76,13.14,23.0,17.0,12.0,1031.01,785.0,785.03,7.56,6.4,6.4,50.22,50.66,49.98,749.21,67.52,64.21,68.04,64.72,774.0,265.0,311.0,1350.0,79.0,64.0,8.0,790.0,183.0,66.0,22.58,21.45,23.49,27.0,24.0,28.0,1367.66,1039.0,1049.98,11.96,12.62,12.57,66.91,67.46,65.22,167.08,159.7,3052.0,2994.0,0.0,0.0,0.0,0.0,0.0,0.0,1994-04-06 00:00:00,201.0,"6' 2""",RUS,R,45.0,23.0,28.0,Prime Age
1,8476952,2022,Dominic Toninato,WPG,C,all,77,38801.0,962.0,7.34,0.36,0.51,0.36,0.51,0.36,0.51,796.0,48.28,6.85,3.49,9.91,1.56,22.74,19.45,6.67,6.93,6.75,6.0,1.0,44.0,20.0,24.0,88.0,14.0,7.0,1.0,0.0,9.0,0.0,16.0,31.0,37.0,57.0,11.0,22.0,87.0,92.0,11.0,10.0,34.0,19.0,11.0,1.32,2.41,3.12,1.0,4.0,2.0,89.82,64.0,64.83,9.0,0.75,0.19,2.24,5.37,5.39,5.35,962.0,32.0,112.0,102.0,716.0,141.0,136.0,111.0,574.0,87.0,118.0,241615.0,22.0,10.0,5.0,32.0,207.57,20.61,20.3,20.74,20.43,214.0,73.0,107.0,394.0,20.0,13.0,0.0,208.0,57.0,22.0,5.92,7.45,7.23,5.0,7.0,8.0,401.81,287.0,291.14,2.85,3.21,3.21,20.24,20.36,20.25,372.15,36.42,34.73,36.69,34.95,381.0,136.0,177.0,694.0,36.0,32.0,6.0,381.0,99.0,37.0,10.92,12.08,13.41,8.0,12.0,16.0,693.1,517.0,517.36,6.49,6.78,6.78,36.13,36.29,35.22,235.39,223.36,4033.0,3827.0,0.0,0.0,0.0,0.0,0.0,0.0,1994-03-09 00:00:00,200.0,"6' 2""",USA,L,40.0,12.0,28.0,Prime Age
2,8477210,2022,Buddy Robinson,ANA,R,all,32,17754.0,436.0,6.14,0.46,0.42,0.45,0.46,0.47,0.46,360.0,35.32,3.27,2.1,7.55,1.13,16.51,15.44,3.06,3.38,3.17,3.0,2.0,38.0,8.0,5.0,51.0,6.0,1.0,4.0,1.0,11.0,1.0,11.0,18.0,37.0,45.0,5.0,19.0,0.0,56.0,7.0,6.0,32.0,11.0,3.0,0.88,1.4,0.99,0.0,0.0,1.0,51.05,46.0,46.42,4.0,0.44,1.04,1.21,2.51,2.58,2.53,436.0,41.0,43.0,74.0,278.0,68.0,79.0,40.0,249.0,0.0,6.0,99265.0,19.0,17.0,4.0,12.0,146.97,10.84,10.34,11.13,10.62,159.0,39.0,45.0,243.0,8.0,11.0,2.0,156.0,32.0,10.0,3.98,4.1,2.76,2.0,3.0,3.0,245.1,198.0,200.1,1.85,2.65,2.65,10.04,10.25,10.07,159.91,12.75,12.42,12.59,12.26,164.0,62.0,71.0,297.0,7.0,9.0,0.0,171.0,47.0,8.0,5.06,5.56,2.13,3.0,3.0,1.0,294.87,226.0,225.08,2.31,1.96,1.96,13.1,12.97,12.69,66.97,93.05,1411.0,1653.0,0.0,0.0,0.0,0.0,0.0,0.0,1991-09-30 00:00:00,232.0,"6' 6""",USA,R,41.0,14.0,31.0,Vet
3,8481186,2022,Logan O'Connor,COL,R,all,81,67415.0,1539.0,29.36,0.43,0.56,0.44,0.55,0.44,0.54,610.0,117.94,12.23,8.13,24.34,3.54,57.52,50.24,12.05,12.19,12.02,12.0,4.0,127.0,29.0,46.0,202.0,24.0,8.0,10.0,1.0,23.0,8.0,49.0,58.0,119.0,148.0,16.0,38.0,5.0,100.0,38.0,17.0,102.0,42.0,12.0,3.56,5.1,3.57,2.0,5.0,1.0,205.59,156.0,157.43,5.0,1.71,1.68,2.13,11.81,11.85,11.75,1539.0,123.0,243.0,265.0,908.0,257.0,224.0,190.0,868.0,5.0,17.0,227042.0,38.0,54.0,24.0,59.0,467.79,44.2,43.5,44.52,43.83,504.0,142.0,223.0,869.0,42.0,31.0,5.0,473.0,129.0,44.0,14.19,15.75,14.26,13.0,14.0,15.0,888.78,646.0,656.13,6.46,6.23,5.93,44.73,45.25,44.77,590.34,57.84,54.71,57.58,54.48,607.0,214.0,299.0,1120.0,54.0,45.0,8.0,611.0,154.0,56.0,17.55,18.85,21.43,17.0,18.0,19.0,1101.68,821.0,811.71,9.92,11.92,11.92,55.84,55.56,53.65,228.73,181.34,4223.0,3403.0,0.0,0.0,0.0,0.0,0.0,0.0,1996-08-14 00:00:00,175.0,"6' 0""",USA,R,44.0,19.0,26.0,Young Pro
4,8476391,2022,T.J. Tynan,LAK,C,all,2,1301.0,31.0,0.21,0.49,0.37,0.53,0.5,0.54,0.54,21.0,2.13,0.04,0.11,0.54,0.06,1.34,0.91,0.04,0.04,0.04,0.0,0.0,3.0,0.0,1.0,4.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0,3.0,3.0,0.0,0.0,6.0,1.0,0.0,0.0,3.0,0.0,0.0,0.04,0.0,0.0,0.0,0.0,0.0,4.08,3.0,3.1,0.0,0.02,0.0,0.0,0.06,0.06,0.06,31.0,6.0,0.0,5.0,20.0,3.0,3.0,2.0,23.0,6.0,7.0,5899.0,0.0,0.0,0.0,2.0,9.58,0.69,0.68,0.74,0.73,10.0,3.0,5.0,18.0,0.0,0.0,0.0,10.0,2.0,1.0,0.2,0.2,0.28,0.0,0.0,0.0,18.76,13.0,13.69,0.13,0.0,0.0,0.82,0.88,0.87,8.0,0.72,0.72,0.71,0.71,7.0,4.0,5.0,16.0,0.0,0.0,0.0,8.0,2.0,1.0,0.29,0.22,0.21,0.0,0.0,0.0,15.99,11.0,10.85,0.1,0.0,0.0,0.82,0.8,0.8,3.55,5.97,94.0,94.0,0.0,0.0,0.0,0.0,0.0,0.0,1992-02-25 00:00:00,165.0,"5' 8""",USA,R,42.0,16.0,30.0,Prime Age


In [199]:
# Columns not to included in the processing:
col_not_processed = ['playerId', 'season' , 'name', 'team', 'situation', 'iceTimeRank', 'I_F_shifts',
                      'nationality' ,'birthDate', 'weight','height', 'shoots', 'age']

### The transformed dataframes for the recommender:

In [200]:
MP_AS_stats_transformed = MP_pipeline.fit_transform(MP_AS_stats.drop(columns=col_not_processed)) # All Situations
MP_5on5_stats_transformed = MP_pipeline.fit_transform(MP_5on5_stats.drop(columns=col_not_processed)) # 5on5
MP_4on5_stats_transformed = MP_pipeline.fit_transform(MP_4on5_stats.drop(columns=col_not_processed)) # 4on5
MP_5on4_stats_transformed = MP_pipeline.fit_transform(MP_5on4_stats.drop(columns=col_not_processed)) # 5on4
MP_OS_stats_transformed = MP_pipeline.fit_transform(MP_OS_stats.drop(columns=col_not_processed)) # Other Situations

## Running the recommmender engine:

In [201]:
def MP_recommend_skaters(original_gamestate_df, processed_gamestate_df, season, player_index, top_n=6):
    """
    Recommends skaters based on their stats using a preprocessed PCA features.

    Args:
    - original_gamestate_df (pd.DataFrame): DataFrame containing the original skater stats.
        Acceptable inputs for original_gamestate_df are: [MP_AS_stats, MP_5on5_stats, MP_4on5_stats, MP_5on4_stats, MP_OS_stats]
    - processed_gamestate_df (pd.DataFrame): PCA-transformed and scaled features of the skaters.
        Acceptable inputs for processed_gamestate_df are: 
        [MP_AS_processed_data, MP_5on5_processed_data, MP_4on5_processed_data, MP_5on4_processed_data, MP_OS_processed_data]
    - season (int): The target season for comparison.
        Acceptable inputs for season are: 2021, 2022, 2023 
    - player_index (int): Index of the player in the DataFrame to get recommendations for.
        player_index as accessed through the function: MP_get_index_all_gamestates() 
    - top_n (int): Number of top recommendations to return.

    Returns:
    - pd.DataFrame: DataFrame containing the top_n recommended skaters for the given player in the specified season.
    """

    # Filter DataFrame for the target season
    target_season_data = processed_gamestate_df[original_gamestate_df['season'] == season]

    # Compute pairwise distances between all skaters and those from the target season
    distances = pairwise_distances(processed_gamestate_df, target_season_data)

    # Find the indices of the closest skaters
    indices = np.argsort(distances, axis=1)[:, :top_n]

    # Retrieve the recommendations from the original stats DataFrame
    MP_recommended_skaters = original_gamestate_df[original_gamestate_df['season'] == season].iloc[indices[player_index], :]

    return MP_recommended_skaters

In [202]:
# AS
nick_suzuki_AS_sim_skaters = MP_recommend_skaters(original_gamestate_df=MP_AS_stats,
                                                  processed_gamestate_df=MP_AS_stats_transformed,
                                                  season=2024,
                                                  player_index=2496,
                                                  top_n=6)

#5on5
nick_suzuki_5on5_sim_skaters = MP_recommend_skaters(original_gamestate_df=MP_5on5_stats,
                                                  processed_gamestate_df=MP_5on5_stats_transformed,
                                                  season=2024,
                                                  player_index=2496,
                                                  top_n=6)

#4on5
nick_suzuki_4on5_sim_skaters = MP_recommend_skaters(original_gamestate_df=MP_4on5_stats,
                                                  processed_gamestate_df=MP_4on5_stats_transformed,
                                                  season=2024,
                                                  player_index=2496,
                                                  top_n=6)

#5on4
nick_suzuki_5on4_sim_skaters = MP_recommend_skaters(original_gamestate_df=MP_5on4_stats,
                                                  processed_gamestate_df=MP_5on4_stats_transformed,
                                                  season=2024,
                                                  player_index=2496,
                                                  top_n=6)

#OS
nick_suzuki_OS_sim_skaters = MP_recommend_skaters(original_gamestate_df=MP_OS_stats,
                                                  processed_gamestate_df=MP_OS_stats_transformed,
                                                  season=2024,
                                                  player_index=2496,
                                                  top_n=6)


In [203]:
print("Nick Suzuki top 5 AS similar players in the 2024 season are:", nick_suzuki_AS_sim_skaters['name'])
print("Nick Suzuki top 5 5on5 similar players in the 2024 season are:", nick_suzuki_5on5_sim_skaters['name'])
print("Nick Suzuki top 5 5on4 similar players in the 2024 season are:", nick_suzuki_5on4_sim_skaters['name'])
print("Nick Suzuki top 5 4on5 similar players in the 2024 season are:", nick_suzuki_4on5_sim_skaters['name'])
print("Nick Suzuki top 5 OS similar players in the 2024 season are:", nick_suzuki_OS_sim_skaters['name'])

Nick Suzuki top 5 AS similar players in the 2024 season are: 2496       Nick Suzuki
2476         Bo Horvat
2619     Robert Thomas
2798      Dylan Larkin
2550      Sean Monahan
2315    Mika Zibanejad
Name: name, dtype: object
Nick Suzuki top 5 5on5 similar players in the 2024 season are: 2496      Nick Suzuki
2619    Robert Thomas
2476        Bo Horvat
2393    Nick Bjugstad
2550     Sean Monahan
2074     Dylan Strome
Name: name, dtype: object
Nick Suzuki top 5 5on4 similar players in the 2024 season are: 2496           Nick Suzuki
2476             Bo Horvat
2713           Kevin Fiala
2002         Mathew Barzal
2551    Yegor Sharangovich
2809        Clayton Keller
Name: name, dtype: object
Nick Suzuki top 5 4on5 similar players in the 2024 season are: 2496         Nick Suzuki
2387      Tanner Pearson
2476           Bo Horvat
2472       Nico Hischier
2172        Alex Killorn
2234    Michael McCarron
Name: name, dtype: object
Nick Suzuki top 5 OS similar players in the 2024 season are: 249