In [21]:
import pandas as pd
import numpy as np
import os
import sys
import logging
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Any, Dict, List, Optional, Set, Union
import unicodedata
from datetime import datetime, timedelta

# Configure pandas to display all columns
pd.set_option('display.max_columns', None)

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Add project root to sys.path
current_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(current_dir, '..'))
if project_root not in sys.path:
    sys.path.append(project_root)

from src.entities.lineup import Lineup, Player, Position

from src.db.nst_db_utils import *
from src.db.nhl_db_utils import get_player_full_name, insert_player_data
from src.data_processing.nst_scraper import *
from src.data_processing.pbp_utils import *
from src.data_processing.game_utils import *
from src.data_processing.team_utils import *
from src.data_processing.player_utils import *

In [22]:
db_prefix = 'NHL_DB_'

def extract_team_goalies(team: str, reference_date: Optional[str] = None) -> Lineup:
    """
    Builds a `Lineup` containing the goalies listed for a team in its most recent game.

    Args:
        team (str): The three-letter team code (e.g., 'TOR'). Compared case-insensitively
            against the boxscore abbreviations.
        reference_date (Optional[str]): The reference date in 'YYYY-MM-DD' format passed to
            `get_most_recent_game_id`. When omitted, the local time minus 1 day and 6 hours is used.

    Returns:
        Lineup: A `Lineup` whose goalie slots are filled in boxscore order and whose
        `back_to_back` attribute carries the flag returned by `get_most_recent_game_id`.

    Raises:
        ValueError: If no recent game is found for the team, if the boxscore lacks team
            abbreviations, or if the team is not part of the retrieved game.
    """
    # Step 1: Determine the reference date
    if reference_date is None:
        # NOTE(review): the extra 6 hours presumably compensates for a UTC offset - confirm.
        reference_date = (datetime.now() - timedelta(days=1, hours=6)).strftime('%Y-%m-%d')

    # Step 2: Retrieve the most recent game ID for the team
    game_id, back_to_back = get_most_recent_game_id(team, reference_date)
    if game_id is None:
        raise ValueError(f"No recent game found for team '{team}' before {reference_date}.")

    print(f"Game ID: {game_id}")

    # Step 3: Fetch the raw boxscore and split it per side (only the goalie frames are needed here)
    game_data = get_game_boxscore(game_id, clean=False)
    _, away_goalies, _, home_goalies = display_boxscore(game_data)

    away_team_code = game_data.get('awayTeam', {}).get('abbrev')
    home_team_code = game_data.get('homeTeam', {}).get('abbrev')

    if not away_team_code or not home_team_code:
        raise ValueError("Team abbreviations not found in game data.")

    # Step 4: Pick the goalie frame for the requested team.
    # UTA did not exist in the previous season, so fall back to ARI when UTA itself is not in the game.
    team_code = team.upper()
    candidate_codes = [team_code]
    if team_code == 'UTA':
        candidate_codes.append('ARI')

    for code in candidate_codes:
        if code == away_team_code.upper():
            goalies = away_goalies
            break
        if code == home_team_code.upper():
            goalies = home_goalies
            break
    else:
        raise ValueError(f"Team '{team}' not found in game ID {game_id}.")

    # Step 5: Construct the Lineup object
    lineup = Lineup(name=f"{team_code} Lineup from Game {game_id}")
    print(f"Back to back: {back_to_back}")
    lineup.back_to_back = back_to_back

    # Step 6: Place each goalie in the first free goalie slot
    for _, goalie in goalies.iterrows():
        player = Player(
            player_id=goalie['playerId'],
            name=get_player_full_name(goalie['playerId'], db_prefix, suppress_log=True),
            team=team_code,
            position=Position.G
        )
        empty_slot = next((i for i, p in enumerate(lineup.goalies) if p is None), None)
        if empty_slot is None:
            print(f"No available goalie slot to add player '{player.name}'.")
        else:
            lineup.set_goalie(player, empty_slot)

    return lineup
# col_lineup = extract_team_lineup('COL', '2024-12-10')

In [23]:
# Record/identity columns shared by every situation table; they are never prefixed
# (and therefore never copied) when PP/PK stats are merged into the matchup frame.
_SHARED_TEAM_COLUMNS = ['team', 'gp', 'w', 'l', 'otl', 'row', 'points', 'point_pct', 'last_game_date', 'season']


def _normalize_team_name(name: str) -> str:
    """Strip combining accent marks and punctuation, keeping letters, digits and whitespace."""
    return ''.join(
        c for c in unicodedata.normalize('NFD', name)
        if unicodedata.category(c) != 'Mn' and (c.isalnum() or c.isspace())
    )


def _toi_to_seconds(toi: Any) -> int:
    """Convert an 'MM:SS' time-on-ice value to seconds; missing or malformed values count as 0."""
    try:
        minutes, seconds = str(toi).split(':')
        return int(minutes) * 60 + int(seconds)
    except (TypeError, ValueError):
        return 0


def _empty_goalie_info() -> Dict[str, Any]:
    """Placeholder goalie record used when no goalie can be determined."""
    return {'goalie_name': None, 'goalie_team': None, 'goalie_id': None}


def _append_situation_stats(matchup_df, situation_stats, prefix, team_df_key, opponent_df_key,
                            team_fullname, opponent_team_fullname):
    """Append `prefix`-ed PP/PK columns to the two-row matchup frame (row 0 gets the opponent's stats)."""
    if situation_stats is None or 'team' not in situation_stats[team_df_key].columns:
        return matchup_df

    team_frame = situation_stats[team_df_key]
    opponent_frame = situation_stats[opponent_df_key]
    team_row = team_frame[team_frame['team'].str.lower() == team_fullname.lower()]
    opponent_row = opponent_frame[opponent_frame['team'].str.lower() == opponent_team_fullname.lower()]
    if team_row.empty or opponent_row.empty:
        return matchup_df

    # Prefix every stat column so it cannot collide with the un-prefixed 5v5 columns
    renamed_cols = {col: f'{prefix}_{col}' for col in team_row.columns if col not in _SHARED_TEAM_COLUMNS}
    team_renamed = team_row.rename(columns=renamed_cols)
    opponent_renamed = opponent_row.rename(columns=renamed_cols)

    situation_data = pd.DataFrame(index=[0, 1])
    for col in renamed_cols.values():
        # Same swap as the 5v5 rows: opponent's stats on the team's row and vice versa
        situation_data.loc[0, col] = opponent_renamed[col].values[0]
        situation_data.loc[1, col] = team_renamed[col].values[0]

    return pd.concat([matchup_df, situation_data], axis=1)


def _goalie_from_boxscore(boxscore, is_home, team_code) -> Dict[str, Any]:
    """Pick the goalie with the most time on ice for one side of a raw (clean=False) boxscore."""
    team_type = 'homeTeam' if is_home else 'awayTeam'
    goalies = boxscore['playerByGameStats'][team_type]['goalies']
    if not goalies:
        return _empty_goalie_info()

    # Compare TOI numerically; comparing the raw 'MM:SS' strings is only correct when zero-padded
    starting_goalie = max(goalies, key=lambda g: _toi_to_seconds(g.get('toi', '00:00')))
    if starting_goalie.get('toi') == '00:00':
        return _empty_goalie_info()

    goalie_id = starting_goalie.get('playerId')
    goalie_name = get_player_full_name(goalie_id, 'NHL_DB_', suppress_log=True)
    if goalie_name is None:
        # Unknown player: fetch and store the record, then retry the name lookup
        player_data = fetch_player_data(goalie_id)
        if player_data:
            insert_player_data(player_data, 'NHL_DB_')
        goalie_name = get_player_full_name(goalie_id, 'NHL_DB_', suppress_log=True)

    return {'goalie_name': goalie_name, 'goalie_team': team_code, 'goalie_id': goalie_id}


def get_pregame_matchup_stats(input_date: str, team: str, last_n: Optional[int]=None, team_5v5=None, team_pp=None, team_pk=None, home_away_split: bool=False) -> Optional[pd.DataFrame]:
    """
    Retrieves and compiles pre-game statistics for both teams in a matchup.

    This function performs the following steps:
    1. Calculates the reference date (input date minus one day) and loads or scrapes team statistics
    2. Identifies the matchup game and opponent team
    3. Compiles team-level statistics (5v5, plus PP/PK when available) for both teams
    4. Identifies a goalie for each team (from the boxscore for past dates, otherwise from
       `extract_team_goalies`)

    Args:
        input_date (str): The game date in 'YYYY-MM-DD' format
        team (str): The three-letter team code (e.g., 'TOR')
        last_n (int, optional): Number of previous games to consider for team statistics
        team_5v5 (dict, optional): Pre-loaded 5v5 team statistics with 'home', 'away', and/or 'both' keys
        team_pp (dict, optional): Pre-loaded power play team statistics with 'home', 'away', and/or 'both' keys
        team_pk (dict, optional): Pre-loaded penalty kill team statistics with 'home', 'away', and/or 'both' keys
        home_away_split (bool, optional): Whether to use home/away split stats (True) or combined stats (False).
            Pre-loaded dicts must contain 'home' and 'away' when True, and 'both' when False.

    Returns:
        Optional[pd.DataFrame]: A DataFrame with two rows (row 0 for `team`, row 1 for the opponent), or
        None if any step fails (the error and traceback are printed). Each row contains:
            - Game context (game_date, game_id, home, b2b, opp_b2b)
            - Team statistics (5v5, PP, PK). Note that each row carries the *other* team's statistics:
              row 0 is labelled `team` but holds the opponent's numbers, and vice versa.
            - Goalie information (goalie_name, goalie_team, goalie_id)
    """
    try:
        # Step 1a: Stats are taken up to the day before the game
        reference_datetime = datetime.strptime(input_date, '%Y-%m-%d') - timedelta(days=1)
        reference_date_str = reference_datetime.strftime('%Y-%m-%d')
        print(f"Fetching data for reference date: {reference_date_str}")

        def scrape(sit_value, loc):
            return nst_team_on_ice_scraper(
                startdate='',
                enddate=reference_date_str,
                stype=2,
                sit=sit_value,
                last_n=last_n,
                loc=loc
            )

        # Step 1b: Use the provided stats, scraping only the situations that were not supplied
        team_stats = {}
        for sit_value, existing_stats in (('5v5', team_5v5), ('pp', team_pp), ('pk', team_pk)):
            if existing_stats is not None:
                team_stats[sit_value] = existing_stats
            elif home_away_split:
                team_stats[sit_value] = {
                    'away': scrape(sit_value, 'A'),
                    'home': scrape(sit_value, 'H'),
                    'both': pd.DataFrame()
                }
            else:
                team_stats[sit_value] = {
                    'both': scrape(sit_value, 'B'),
                    'away': pd.DataFrame(),
                    'home': pd.DataFrame()
                }

        team_5v5 = team_stats['5v5']
        team_pp = team_stats['pp']
        team_pk = team_stats['pk']

        # Step 2: Retrieve matchup games for the input date
        print(f"Retrieving matchup games for date {input_date}.")
        temp_data = get_matchup_games(input_date, input_date)
        game_ids = temp_data.get('game_ids', {}).get('id', [])
        game_dates = temp_data.get('game_ids', {}).get('date', [])

        game_id = None
        opponent_team_tricode = None
        side = None

        # Step 3: Identify the game_id involving the specified team
        for gid, gdate in zip(game_ids, game_dates):
            print(f"Checking Game ID: {gid} on Date: {gdate}")
            boxscore = get_game_boxscore(gid, clean=True)
            away_team = boxscore.get('away_team')
            home_team = boxscore.get('home_team')
            print(f"Away Team: {away_team}, Home Team: {home_team}")

            # A boxscore without team codes cannot be matched; skip it instead of aborting the scan
            if not away_team or not home_team:
                print(f"Skipping Game ID {gid}: team codes missing from boxscore.")
                continue

            if team.upper() == away_team.upper():
                side = 'away'  # Team is away, opponent is home
                opponent_team_tricode = home_team.upper()
            elif team.upper() == home_team.upper():
                side = 'home'  # Team is home, opponent is away
                opponent_team_tricode = away_team.upper()
            else:
                continue

            game_id = gid
            print(f"Team {team} found as {side.capitalize()} Team in Game ID {gid}. Opponent TriCode: {opponent_team_tricode}")
            break

        if not game_id or not opponent_team_tricode:
            raise ValueError(f"Team {team} did not play on {input_date} or could not determine opponent.")

        # Back-to-back status for both teams
        _, team_b2b = get_most_recent_game_id(team, input_date)
        _, opponent_b2b = get_most_recent_game_id(opponent_team_tricode, input_date)

        # Step 4: Get full names for both teams
        team_fullname = get_fullname_by_tricode(team)
        opponent_team_fullname = get_fullname_by_tricode(opponent_team_tricode)

        if team_fullname is None or opponent_team_fullname is None:
            raise ValueError(f"Could not find full names for teams: {team} or {opponent_team_tricode}")

        # Remove accent marks and punctuation from both team names
        team_fullname = _normalize_team_name(team_fullname)
        opponent_team_fullname = _normalize_team_name(opponent_team_fullname)

        # Step 5: Choose which table to read for each team ('home'/'away' split or combined 'both')
        opponent_side = 'home' if side == 'away' else 'away'
        if home_away_split:
            team_df_key, opponent_df_key = side, opponent_side
        else:
            team_df_key = opponent_df_key = 'both'

        missing_keys = sorted({team_df_key, opponent_df_key} - set(team_5v5))
        if missing_keys:
            raise KeyError(
                f"team_5v5 is missing the {missing_keys} table(s) required when home_away_split={home_away_split}."
            )

        if 'team' not in team_5v5[team_df_key].columns:
            raise KeyError(f"Column 'team' not found in team_5v5[{team_df_key}]. Please verify the scraped data.")

        # Filter 5v5 stats for both teams using the appropriate dataframe
        team_5v5_row = team_5v5[team_df_key][team_5v5[team_df_key]['team'].str.lower() == team_fullname.lower()]
        opponent_5v5_row = team_5v5[opponent_df_key][team_5v5[opponent_df_key]['team'].str.lower() == opponent_team_fullname.lower()]

        if team_5v5_row.empty or opponent_5v5_row.empty:
            raise ValueError(f"Could not find 5v5 statistics for one or both teams")

        # Step 6: Create base matchup DataFrame
        # First row: Keep team name but use opponent's stats
        first_row = opponent_5v5_row.copy()
        first_row['team'] = team

        # Second row: Keep opponent name but use team's stats
        second_row = team_5v5_row.copy()
        second_row['team'] = opponent_team_tricode

        matchup_df = pd.concat([first_row, second_row], ignore_index=True)

        # Steps 7-8: Add PP and PK stats if available (columns prefixed 'pp_' / 'pk_')
        matchup_df = _append_situation_stats(
            matchup_df, team_pp, 'pp', team_df_key, opponent_df_key, team_fullname, opponent_team_fullname
        )
        matchup_df = _append_situation_stats(
            matchup_df, team_pk, 'pk', team_df_key, opponent_df_key, team_fullname, opponent_team_fullname
        )

        # Step 9: Add additional context columns
        matchup_df['home'] = [side == 'home', side == 'away']
        matchup_df['game_id'] = game_id
        matchup_df['game_date'] = input_date
        matchup_df['b2b'] = [team_b2b, opponent_b2b]  # First row has team's b2b, second row has opponent's b2b
        matchup_df['opp_b2b'] = [opponent_b2b, team_b2b]  # First row has opponent's b2b, second row has team's b2b

        # Ensure team names are correct
        matchup_df.loc[0, 'team'] = team
        matchup_df.loc[1, 'team'] = opponent_team_tricode

        # Step 10: Drop record columns ('gp' and 'toi' are kept for feature extraction)
        columns_to_drop = ['w', 'l', 'otl', 'row', 'points', 'point_pct']
        matchup_df = matchup_df.drop(columns=columns_to_drop, errors='ignore')

        # Step 11: Reorder columns to put game_date, game_id and home first,
        # and move last_game_date and season right after opp_b2b
        cols = matchup_df.columns.tolist()
        for col in ['game_date', 'game_id', 'home', 'last_game_date', 'season']:
            if col in cols:
                cols.remove(col)

        cols = ['game_date', 'game_id', 'home'] + cols

        if 'opp_b2b' in cols:
            opp_b2b_pos = cols.index('opp_b2b')
            if 'last_game_date' in matchup_df.columns:
                cols.insert(opp_b2b_pos + 1, 'last_game_date')
            if 'season' in matchup_df.columns:
                cols.insert(opp_b2b_pos + 2 if 'last_game_date' in matchup_df.columns else opp_b2b_pos + 1, 'season')

        matchup_df = matchup_df[cols]

        # Step 12: Get goalie information for both teams.
        # Past games read the goalie from the boxscore; otherwise the most recent lineup is used.
        is_past_game = input_date < datetime.now().strftime('%Y-%m-%d')
        raw_boxscore = None  # fetched lazily and shared by both rows (same game)
        goalie_info = []
        for _, row in matchup_df.iterrows():
            try:
                if is_past_game:
                    if raw_boxscore is None:
                        raw_boxscore = get_game_boxscore(row['game_id'], clean=False)
                    goalie_info.append(_goalie_from_boxscore(raw_boxscore, row['home'], row['team']))
                else:
                    lineup = extract_team_goalies(row['team'], input_date)
                    goalie = lineup.goalies[0]  # first goalie slot is treated as the starter
                    if goalie is not None:
                        goalie_info.append({
                            'goalie_name': goalie.name,
                            'goalie_team': goalie.team,
                            'goalie_id': goalie.player_id
                        })
                    else:
                        goalie_info.append(_empty_goalie_info())
            except Exception as e:
                # Best effort: a failed goalie lookup leaves that row's goalie fields empty
                print(f"Error getting goalie for {row['team']}: {e}")
                goalie_info.append(_empty_goalie_info())

        # Step 13: Add goalie information to matchup_df
        goalie_data = pd.DataFrame(goalie_info)
        if not goalie_data.empty:
            for col in goalie_data.columns:
                matchup_df[col] = goalie_data[col].values

        # Step 14: Ensure we only have 2 rows
        if len(matchup_df) > 2:
            print(f"Warning: Matchup dataframe has {len(matchup_df)} rows, expected 2. Keeping only the first 2 rows.")
            matchup_df = matchup_df.iloc[:2]

        # Step 15: Create a clean copy to defragment the DataFrame
        matchup_df = matchup_df.copy()

        return matchup_df

    except Exception as e:
        print(f"An error occurred during processing: {e}")
        import traceback
        traceback.print_exc()
        return None

In [24]:
# Example usage of the get_pregame_matchup_stats function
# input_date = '2025-02-08'
# team = 'CHI' 
# last_n = 10
# team_5v5 = {
#     'away': get_team_stats(end_date=input_date, last_n=last_n, db_prefix="NST_DB_", situation="5v5", stype=2, side="away"),
#     'home': get_team_stats(end_date=input_date, last_n=last_n, db_prefix="NST_DB_", situation="5v5", stype=2, side="home")
# }
# team_pk = {
#     'away': get_team_stats(end_date=input_date, last_n=last_n, db_prefix="NST_DB_", situation="pk", stype=2, side="away"),
#     'home': get_team_stats(end_date=input_date, last_n=last_n, db_prefix="NST_DB_", situation="pk", stype=2, side="home")
# }
# team_pp = {
#     'away': get_team_stats(end_date=input_date, last_n=last_n, db_prefix="NST_DB_", situation="pp", stype=2, side="away"),
#     'home': get_team_stats(end_date=input_date, last_n=last_n, db_prefix="NST_DB_", situation="pp", stype=2, side="home")
# }
# matchup_data = get_pregame_matchup_stats(input_date, team, last_n=last_n, team_5v5=team_5v5, team_pk=team_pk, team_pp=team_pp, home_away_split=True)  # dicts above only hold 'home'/'away' tables
# matchup_data

In [25]:
# Long source column names mapped to the short names used for the rolling features.
_GOALIE_COLUMN_RENAMES = {
    'shots_against': 'sa',
    'hd_shots_against': 'hd_sa',
    'md_shots_against': 'md_sa',
    'ld_shots_against': 'ld_sa'
}

# Columns (short names) for which rolling mean/std features are produced.
_GOALIE_FEATURE_COLUMNS = [
    'sa', 'sv_pct', 'gaa', 'gsaa',
    'xg_against',
    'hd_sa', 'hdsv_pct',
    'md_sa', 'mdsv_pct',
    'ld_sa', 'ldsv_pct'
]

# Columns that may arrive as object dtype and are coerced to float.
_GOALIE_NUMERIC_COLUMNS = ['sv_pct', 'gaa', 'gsaa', 'xg_against', 'hdsv_pct',
                           'mdsv_pct', 'ldsv_pct', 'avg_shot_distance', 'avg_goal_distance']


def _coerce_goalie_numeric_columns(frame: pd.DataFrame) -> pd.DataFrame:
    """Convert the known numeric columns that are present to floats (unparseable values become NaN)."""
    for col in _GOALIE_NUMERIC_COLUMNS:
        if col in frame.columns:
            frame[col] = pd.to_numeric(frame[col], errors='coerce')
    return frame


def _league_average_goalie_stats(df: pd.DataFrame, date: Optional[str] = None) -> pd.DataFrame:
    """Single-row frame of league-wide mean/std per feature, restricted to rows before `date` if given."""
    league_df = df.copy()
    league_df['date'] = pd.to_datetime(league_df['date'])

    if date:
        cutoff_date = pd.to_datetime(date)
        league_df = league_df[league_df['date'] < cutoff_date].copy()

    # Rename to the short names first so the feature lookup works for either naming scheme
    league_df = _coerce_goalie_numeric_columns(league_df.rename(columns=_GOALIE_COLUMN_RENAMES))

    league_averages = pd.DataFrame([{
        f'{col}_roll_avg': league_df[col].mean() for col in _GOALIE_FEATURE_COLUMNS
    }])
    for col in _GOALIE_FEATURE_COLUMNS:
        league_averages[f'{col}_roll_std'] = league_df[col].std()

    # Workload features use fixed typical values
    league_averages['rest'] = 3.0  # typical rest between games
    league_averages['l7'] = 2.0  # typical games in 7 days

    return league_averages


def calculate_goalie_rolling_stats(df, player_name: str, window_size: int = 10, date: Optional[str] = None):
    """Create rolling averages and statistics for a specific goalie

    Args:
        df (pd.DataFrame): Input dataframe with per-game goalie statistics. Must contain 'player',
            'team', 'date' and the feature columns (shot counts may use either the long
            '*shots_against' names or the short '*sa' names).
        player_name (str): Name of the player to process
        window_size (int, optional): Size of rolling window for statistics. Defaults to 10.
        date (str, optional): If provided, only games strictly before this date are used and a
            single row (the goalie's latest game before the date) is returned.

    Returns:
        pd.DataFrame: Rolling features for the player. With `date`, one row; without it, one row per
        game. If the player has no rows at all, or no rows before `date`, a single row of league
        averages (computed from rows before `date` when given) is returned instead; that row has no
        'date', 'player' or 'team' columns.
    """
    player_df = df[df['player'] == player_name].copy()

    # If no data found for player, use league averages
    if player_df.empty:
        print(f"No data found for player: {player_name}. Using league averages.")
        return _league_average_goalie_stats(df, date)

    player_df['date'] = pd.to_datetime(player_df['date'])
    player_df = player_df.sort_values('date')

    # A goalie with no games before the cutoff has no usable history: fall back to league averages
    # rather than to the goalie's own (future) games, which would leak post-game information.
    cutoff_date = pd.to_datetime(date) if date else None
    if cutoff_date is not None and not (player_df['date'] < cutoff_date).any():
        print(f"No data found for {player_name} before {date}. Using league averages.")
        return _league_average_goalie_stats(df, date)

    player_df = _coerce_goalie_numeric_columns(player_df)
    player_df = player_df.rename(columns=_GOALIE_COLUMN_RENAMES)

    try:
        print(f"Processing player: {player_name}")
        print(f"Data shape: {player_df.shape}")

        # Rolling mean/std over the last `window_size` games (including the row's own game)
        for col in _GOALIE_FEATURE_COLUMNS:
            player_df[f'{col}_roll_avg'] = player_df[col].rolling(
                window=window_size, min_periods=1
            ).mean()
            player_df[f'{col}_roll_std'] = player_df[col].rolling(
                window=window_size, min_periods=1
            ).std()

        # Workload features: days since the previous game, and games in the trailing 7 days
        player_df['rest'] = player_df['date'].diff().dt.days
        player_df['l7'] = player_df.rolling('7D', on='date')['date'].count()

        print(f"Successfully processed {player_name}")

        # Keep only the latest game before the cutoff when a date is provided
        if cutoff_date is not None:
            player_df = player_df[player_df['date'] < cutoff_date]
            player_df = player_df.tail(1).reset_index(drop=True)

        # Keep only identity columns, rolling features, and workload features
        rolling_cols = [col for col in player_df.columns if '_roll_' in col]
        cols_to_keep = ['date', 'player', 'team'] + rolling_cols + ['rest', 'l7']
        player_df = player_df[cols_to_keep]

        return player_df

    except Exception as e:
        print(f"Error processing player {player_name}: {str(e)}")
        print("Data types:", player_df.dtypes)
        raise

In [26]:
def enrich_matchup_with_goalie_stats(
    matchup_df: pd.DataFrame,
    goalie_stats_5v5: Dict[str, pd.DataFrame],
    goalie_stats_pk: Dict[str, pd.DataFrame],
    window_size: int = 10,
    home_away_split: bool = False
) -> pd.DataFrame:
    """
    Enriches matchup data with rolling goalie statistics.

    Args:
        matchup_df (pd.DataFrame): Output from get_pregame_matchup_stats; must contain the
            'goalie_name', 'home' and 'game_date' columns.
        goalie_stats_5v5 (Dict[str, pd.DataFrame]): Raw 5v5 goalie statistics keyed by
            'home', 'away' and/or 'both'.
        goalie_stats_pk (Dict[str, pd.DataFrame]): Raw penalty-kill goalie statistics with the same keys.
        window_size (int): Window size for rolling calculations
        home_away_split (bool): When True, read the 'home' or 'away' table matching each row's
            'home' flag; when False, read the 'both' table.

    Returns:
        pd.DataFrame: Copy of the matchup data with 'g_*' (5v5) and 'g_pk_*' (penalty kill) goalie
        columns added. Rows without a goalie, or whose goalie lookup fails, are left unchanged.
    """
    # Work on a copy to avoid modifying the caller's frame
    enriched_df = matchup_df.copy()
    identity_cols = ['date', 'player', 'team']

    for idx, row in enriched_df.iterrows():
        if pd.isna(row['goalie_name']):
            continue

        # Select which stats table to read for this row
        if home_away_split:
            side = 'home' if row['home'] == True else 'away'  # noqa: E712 - keep exact equality semantics
        else:
            side = 'both'

        try:
            goalie_rolling_5v5 = calculate_goalie_rolling_stats(
                goalie_stats_5v5[side],
                player_name=row['goalie_name'],
                window_size=window_size,
                date=row['game_date']
            )
            goalie_rolling_pk = calculate_goalie_rolling_stats(
                goalie_stats_pk[side],
                player_name=row['goalie_name'],
                window_size=window_size,
                date=row['game_date']
            )

            if goalie_rolling_5v5.empty or goalie_rolling_pk.empty:
                continue

            latest_5v5 = goalie_rolling_5v5.iloc[0]
            latest_pk = goalie_rolling_pk.iloc[0]

            # 5v5 stats get the plain 'g_' prefix (no situation tag, like the team stats)
            for col in goalie_rolling_5v5.columns:
                if col not in identity_cols:
                    enriched_df.at[idx, f'g_{col}'] = latest_5v5[col]

            # PK stats get 'g_pk_'; rest and l7 are skipped because they duplicate the 5v5 values
            for col in goalie_rolling_pk.columns:
                if col not in identity_cols and col not in ['rest', 'l7']:
                    enriched_df.at[idx, f'g_pk_{col}'] = latest_pk[col]

        except Exception as e:
            # Best effort: report the failure and continue with the next goalie
            print(f"Error processing goalie {row['goalie_name']}: {e}")
            continue

    return enriched_df

# matchup_data = enrich_matchup_with_goalie_stats(matchup_data, goalie_stats)

In [27]:
def get_matchup_goalie_results(matchup_df: pd.DataFrame) -> pd.DataFrame:
    """
    Retrieves actual game results for goalies from a single-game matchup DataFrame.

    The boxscore is fetched once, for the game_id found in the first row, so all
    rows are expected to belong to that game. For each row, the goalie on the
    row's side (home or away) whose full name equals `goalie_name` is used; when
    no name matches, the first goalie listed for that side is used instead.

    Args:
        matchup_df (pd.DataFrame): DataFrame containing matchup data with
            `game_id`, `home` and `goalie_name` columns.

    Returns:
        pd.DataFrame: Copy of the matchup data with additional result columns
            (`res_sv`, `res_sa`, `res_sv_pct`, `res_ga`, `res_des`, `res_toi`)
            filled for every row whose side lists goalies in the boxscore.
            An empty input is returned as an empty copy without any lookup.
    """
    # Work on a copy so the caller's frame is never modified
    enriched_df = matchup_df.copy()

    # `.iloc[0]` below would raise IndexError on an empty frame
    if enriched_df.empty:
        return enriched_df

    # Get the boxscore data (one request for the whole frame)
    game_id = enriched_df['game_id'].iloc[0]
    game_data = get_game_boxscore(game_id, clean=False)
    player_stats = game_data.get('playerByGameStats', {})

    # Process each team's data
    for idx, row in enriched_df.iterrows():
        team_type = 'homeTeam' if row['home'] else 'awayTeam'
        goalies = player_stats.get(team_type, {}).get('goalies', [])

        if not goalies:
            # No goalie line for this side: leave the result columns unset
            continue

        # Find the goalie whose name matches the one in enriched_df;
        # the generator stops at the first match, so later names are not looked up
        goalie = next(
            (
                g for g in goalies
                if get_player_full_name(g.get('playerId'), 'NHL_DB_', suppress_log=True) == row['goalie_name']
            ),
            None,
        )
        # If no matching goalie found, use the first one listed
        if goalie is None:
            goalie = goalies[0]

        shots = goalie.get('shotsAgainst', 0)
        saves = goalie.get('saves', 0)

        enriched_df.loc[idx, 'res_sv'] = saves
        enriched_df.loc[idx, 'res_sa'] = shots
        # A goalie who faced no shots gets 0.0 rather than a division error
        enriched_df.loc[idx, 'res_sv_pct'] = round(saves / shots if shots > 0 else 0.0, 3)
        enriched_df.loc[idx, 'res_ga'] = goalie.get('goalsAgainst', 0)
        enriched_df.loc[idx, 'res_des'] = goalie.get('decision', 'N/A')
        enriched_df.loc[idx, 'res_toi'] = goalie.get('toi', '00:00')  # Time on ice

    return enriched_df

In [28]:
def _collect_stats_by_side(fetch, home_away_split: bool) -> Dict[str, pd.DataFrame]:
    """
    Build the {'both', 'away', 'home'} mapping of stat DataFrames for one situation.

    `fetch` is called with 'away' and then 'home' when the split is requested,
    otherwise once with None. Sides that are not fetched hold an empty DataFrame.
    """
    if home_away_split:
        return {
            'both': pd.DataFrame(),
            'away': fetch('away'),
            'home': fetch('home'),
        }
    return {
        'both': fetch(None),
        'away': pd.DataFrame(),
        'home': pd.DataFrame(),
    }


def process_matchups_for_date(input_date: str, last_n: Optional[int] = None, home_away_split: bool = False) -> pd.DataFrame:
    """
    Processes all matchup games for a given date by getting matchup stats,
    enriching with goalie stats, and getting actual game results.

    All pre-game stats are requested with the day before `input_date` as their
    end date. Team stats are read from the database first; if that raises, they
    are scraped instead.

    Args:
        input_date (str): The reference date in 'YYYY-MM-DD' format
        last_n (int, optional): Number of last games to consider for stats. Also
            passed as the goalie rolling window size.
        home_away_split (bool): When True, goalie and team stats are fetched
            separately for the 'away' and 'home' sides and the 'both' entry is
            left empty; when False only the combined 'both' entry is fetched.

    Returns:
        pd.DataFrame: A DataFrame containing all matchups for the date with
                     pre-game stats and actual results. Empty when no games are
                     found, when no game could be processed, or when any error
                     occurs outside the per-game step (the error is printed).
    """
    # Venue codes passed to the scraper as `loc`, keyed by the side names used above
    scraper_loc = {None: 'B', 'away': 'A', 'home': 'H'}
    team_situations = ('5v5', 'pp', 'pk')

    try:
        # Step 1: Get matchup games for the date
        temp_data = get_matchup_games(input_date, input_date)
        game_ids = temp_data.get('game_ids', {}).get('id', [])

        if not game_ids:
            print(f"No games found for the date {input_date}.")
            return pd.DataFrame()

        # Parse the input date and subtract one day
        day_before = (datetime.strptime(input_date, '%Y-%m-%d') - timedelta(days=1)).strftime('%Y-%m-%d')

        # Step 2: Get goalie stats for enrichment
        # Nested dictionary: stat type ('5v5' or 'pk') -> side ('both', 'away', 'home') -> DataFrame
        goalie_stats = {}
        for situation in ('5v5', 'pk'):
            goalie_stats[situation] = _collect_stats_by_side(
                lambda side, sit=situation: get_goalie_stats(
                    end_date=day_before,
                    situation=sit,
                    side=side
                ),
                home_away_split,
            )

        # Step 3: Try to get team stats from database first, fall back to scraper if needed
        # Nested dictionary: situation ('5v5', 'pp', 'pk') -> side ('both', 'away', 'home') -> DataFrame
        team_stats = {}
        try:
            print(f"Attempting to retrieve team stats from database for date: {day_before}")

            for situation in team_situations:
                team_stats[situation] = _collect_stats_by_side(
                    lambda side, sit=situation: get_team_stats(
                        end_date=day_before,
                        last_n=last_n,
                        db_prefix="NST_DB_",
                        situation=sit,
                        stype=2,
                        side=side
                    ),
                    home_away_split,
                )

            print("Successfully retrieved all team stats from database.")
        except Exception as e:
            print(f"Could not retrieve team stats from database: {e}")
            print("Falling back to scraper to get team stats...")

            # Every situation is re-fetched, replacing anything the database returned before failing
            for situation in team_situations:
                team_stats[situation] = _collect_stats_by_side(
                    lambda side, sit=situation: nst_team_on_ice_scraper(
                        startdate='',
                        enddate=day_before,
                        stype=2,
                        sit=sit,
                        last_n=last_n,
                        loc=scraper_loc[side]
                    ),
                    home_away_split,
                )

        results = []

        # Step 4: Process each game
        for game_id in game_ids:
            print(f"\nProcessing Game ID: {game_id}")

            # Get boxscore to determine teams
            boxscore = get_game_boxscore(game_id, clean=True)
            away_team = boxscore.get('away_team')
            home_team = boxscore.get('home_team')

            if not away_team or not home_team:
                print(f"Could not extract teams for Game ID: {game_id}. Skipping.")
                continue

            # The matchup is looked up through the away team; a failure skips only this game
            try:
                matchup = get_pregame_matchup_stats(
                    input_date,
                    away_team,
                    last_n=last_n,
                    team_5v5=team_stats['5v5'],
                    team_pp=team_stats['pp'],
                    team_pk=team_stats['pk'],
                    home_away_split=home_away_split,
                )
                matchup = enrich_matchup_with_goalie_stats(
                    matchup,
                    goalie_stats['5v5'],
                    goalie_stats['pk'],
                    window_size=last_n,
                    home_away_split=home_away_split,
                )
                matchup = get_matchup_goalie_results(matchup)
                results.append(matchup)
            except Exception as e:
                print(f"Error processing away team {away_team}: {e}")

        # Combine all results
        if not results:
            print("No matchups were successfully processed.")
            return pd.DataFrame()

        final_df = pd.concat(results, ignore_index=True)
        print("\nAll matchups processed successfully.")
        return final_df

    except Exception as e:
        # Best-effort contract: report the problem and hand back an empty frame
        print(f"An error occurred during processing: {e}")
        return pd.DataFrame()

In [None]:
# Build the matchup rows for a single sample date, using the last 10 games for the pre-game stats
matchups = process_matchups_for_date('2024-12-11', last_n=10)

In [None]:
# Display the frame returned by process_matchups_for_date (empty if processing failed)
matchups

In [31]:
def process_matchups_for_date_range(start_date: str, end_date: str, last_n: Optional[int] = None) -> pd.DataFrame:
    """
    Processes matchup games for a range of dates by calling process_matchups_for_date for each date.

    Every calendar day from `start_date` to `end_date` (inclusive) is processed in
    order, with a random pause of 1 or 2 seconds after each day.

    Args:
        start_date (str): Start date in 'YYYY-MM-DD' format
        end_date (str): End date in 'YYYY-MM-DD' format
        last_n (int, optional): Number of last games to consider for stats

    Returns:
        pd.DataFrame: A DataFrame containing all matchups for the date range with
                     pre-game stats and actual results. Empty when no day produced
                     any rows (including when `start_date` is after `end_date`).

    Raises:
        Exception: Any error raised while processing is printed and re-raised.
    """
    # Imported locally: the notebook's import cell imports neither module, so the
    # function would otherwise depend on these names leaking in from a star import.
    import random
    import time

    try:
        # Generate one timestamp per calendar day in the range
        dates = pd.date_range(start=pd.to_datetime(start_date), end=pd.to_datetime(end_date), freq='D')

        all_results = []

        # Process each date
        for date in dates:
            date_str = date.strftime('%Y-%m-%d')
            print(f"\nProcessing date: {date_str}")

            # Days without games (or with a failure) come back empty and are skipped
            daily_results = process_matchups_for_date(date_str, last_n=last_n)
            if not daily_results.empty:
                all_results.append(daily_results)

            # Pause between days before the next round of requests
            time.sleep(random.randint(1, 2))

        # Combine all results
        if not all_results:
            print("No matchups were successfully processed for the date range.")
            return pd.DataFrame()

        final_df = pd.concat(all_results, ignore_index=True)
        print(f"\nProcessed {len(dates)} days of matchups successfully.")
        return final_df

    except Exception as e:
        print(f"An error occurred during processing: {e}")
        raise

# Example usage:
# matchups_range = process_matchups_for_date_range('2023-01-01', '2023-01-31', last_n=15)

In [32]:
# matchups_range.to_csv('../data/g_15_01_23.csv', index=False)

In [33]:
from src.data_processing.season_utils import get_season_end_date, get_season_start_date
import calendar

def process_season_by_month(year, last_n=15):
    """
    Process an entire NHL season by month and save each month's data to a separate CSV file.

    The season is walked one calendar month at a time, with the first and last
    month clamped to the season start and end dates. Each non-empty month is
    written to '../data/g_{last_n}-1_{mon}_{yy}.csv'. A month that fails is
    reported and skipped; a month whose CSV cannot be written is still returned.

    Args:
        year (int): The starting year of the season (e.g., 2021 for the 2021-2022 season)
        last_n (int): Number of previous games to consider for rolling stats

    Returns:
        dict: Dictionary keyed by '{mon}_{yy}' (e.g. 'oct_23') with that month's
              DataFrame as value. Months without data are omitted.
    """
    season = int(f"{year}{year+1}")

    # Every season is assumed to open on October 1st, except the COVID-shortened
    # 2020-2021 season, which started in January 2021
    start_date = "2021-01-13" if year == 2020 else f"{year}-10-01"

    try:
        # Try to get the official end date from season_utils
        end_date = get_season_end_date(season, stype=2)  # Regular season end
    except ValueError:
        # If not available, use a reasonable default (end of April next year)
        end_date = f"{year+1}-04-30"

    print(f"Processing {year}-{year+1} NHL season from {start_date} to {end_date}")

    # Convert dates to datetime for easier manipulation
    start = pd.to_datetime(start_date)
    end = pd.to_datetime(end_date)

    results = {}
    current = start

    while current <= end:
        month_name = current.strftime('%b').lower()
        year_short = current.strftime('%y')
        month_key = f"{month_name}_{year_short}"

        # Calendar bounds of the current month (monthrange also covers December)
        first_of_month = current.replace(day=1)
        last_of_month = current.replace(day=calendar.monthrange(current.year, current.month)[1])

        # Clamp to the season boundaries for the first and last month
        month_start = max(first_of_month, start).strftime('%Y-%m-%d')
        month_end = min(last_of_month, end).strftime('%Y-%m-%d')

        print(f"\nProcessing month: {month_name.upper()} {current.year}")

        # Process the month
        try:
            monthly_data = process_matchups_for_date_range(month_start, month_end, last_n=last_n)

            if not monthly_data.empty:
                # Record the frame before writing, so a failed save (e.g. missing
                # '../data' directory) cannot discard the data already computed
                results[month_key] = monthly_data

                # Save to CSV
                filename = f"../data/g_{last_n}-1_{month_name}_{year_short}.csv"
                monthly_data.to_csv(filename, index=False)
                print(f"Saved {len(monthly_data)} matchups to {filename}")
            else:
                print(f"No data found for {month_name.upper()} {current.year}")
        except Exception as e:
            print(f"Error processing {month_name.upper()} {current.year}: {e}")

        # Move to the first day of the next month (rolls over the year after December)
        current = first_of_month + pd.DateOffset(months=1)

    return results

# Example usage:
# season_2022_2023 = process_season_by_month(2022, last_n=15)


In [None]:
# Build and save the monthly CSVs for four seasons (newest first), using a 7-game window.
# Each call walks its season day by day with a pause after every day, so this cell is slow.
season_2023_2024 = process_season_by_month(2023, last_n=7)
season_2022_2023 = process_season_by_month(2022, last_n=7)
season_2021_2022 = process_season_by_month(2021, last_n=7)
season_2020_2021 = process_season_by_month(2020, last_n=7)

In [None]:
# Month-by-month export for 2024-10-04 through 2025-02-28, again with a 7-game window.
# The files use numeric months (g_7-1_10_24.csv), whereas process_season_by_month
# writes abbreviated month names (g_7-1_oct_24.csv style).
# process_matchups_for_date_range returns an empty frame when nothing was processed,
# in which case the corresponding CSV is written empty.
matchups_range1 = process_matchups_for_date_range('2024-10-04', '2024-10-31', last_n=7)
matchups_range1.to_csv('../data/g_7-1_10_24.csv', index=False)
matchups_range2 = process_matchups_for_date_range('2024-11-01', '2024-11-30', last_n=7)
matchups_range2.to_csv('../data/g_7-1_11_24.csv', index=False)
matchups_range3 = process_matchups_for_date_range('2024-12-01', '2024-12-31', last_n=7)
matchups_range3.to_csv('../data/g_7-1_12_24.csv', index=False)
matchups_range4 = process_matchups_for_date_range('2025-01-01', '2025-01-31', last_n=7)
matchups_range4.to_csv('../data/g_7-1_01_25.csv', index=False)
matchups_range5 = process_matchups_for_date_range('2025-02-01', '2025-02-28', last_n=7)
matchups_range5.to_csv('../data/g_7-1_02_25.csv', index=False)

In [None]:
# from sklearn.decomposition import PCA
# from sklearn.preprocessing import StandardScaler
# from sklearn.feature_selection import SelectKBest, f_regression


# def analyze_features_for_saves_prediction(df):
#     # Remove columns we don't want to consider as features
#     exclude_cols = [
#         'game_date', 'game_id', 'team', 'goalie_name', 'goalie_team', 'goalie_id', 
#         'res_decision', 'res_saves', 'res_shots_against', 'res_save_pct', 'res_goals_against'
#     ]
    
#     # Convert boolean columns to int
#     df['home'] = df['home'].astype(int)
#     df['b2b'] = df['b2b'].astype(int)
#     df['opp_b2b'] = df['opp_b2b'].astype(int)
    
#     # Select numeric columns
#     numeric_cols = df.select_dtypes(include=[np.number]).columns
#     feature_cols = [col for col in numeric_cols if col not in exclude_cols]
    
#     # Create feature matrix and target vector
#     X = df[feature_cols].copy()
#     y = df['res_saves']
    
#     # Handle missing values
#     X = X.fillna(X.mean())
    
#     # Scale the features
#     scaler = StandardScaler()
#     X_scaled = scaler.fit_transform(X)
    
#     # 1. PCA Analysis
#     pca = PCA()
#     X_pca = pca.fit_transform(X_scaled)
    
#     # 2. Direct correlation with target
#     correlations = []
#     for col in feature_cols:
#         corr = df[col].corr(y)
#         correlations.append((col, abs(corr)))
    
#     correlation_df = pd.DataFrame(correlations, columns=['Feature', 'Correlation'])
#     correlation_df = correlation_df.sort_values('Correlation', ascending=False)
    
#     # 3. Feature selection using f_regression
#     selector = SelectKBest(score_func=f_regression, k='all')
#     selector.fit(X_scaled, y)
#     f_scores = selector.scores_
    
#     # Combine all feature importance metrics
#     feature_importance_df = pd.DataFrame({
#         'Feature': feature_cols,
#         'PCA_Importance': np.abs(pca.components_[0]),
#         'Correlation': [abs(df[col].corr(y)) for col in feature_cols],
#         'F_Score': f_scores
#     })
    
#     # Normalize F-scores
#     feature_importance_df['F_Score_Norm'] = feature_importance_df['F_Score'] / feature_importance_df['F_Score'].max()
    
#     # Calculate combined importance score
#     feature_importance_df['Combined_Score'] = (
#         feature_importance_df['PCA_Importance'] + 
#         feature_importance_df['Correlation'] + 
#         feature_importance_df['F_Score_Norm']
#     ) / 3
    
#     feature_importance_df = feature_importance_df.sort_values('Combined_Score', ascending=False)
    
#     # Plotting
#     plt.figure(figsize=(15, 10))
    
#     # Plot 1: Top 20 Features by Combined Score
#     plt.subplot(2, 1, 1)
#     sns.barplot(data=feature_importance_df.head(20), 
#                 x='Combined_Score', y='Feature')
#     plt.title('Top 20 Features by Combined Importance Score')
    
#     # Plot 2: Correlation Matrix of Top Features with Target
#     plt.subplot(2, 1, 2)
#     top_features = feature_importance_df.head(10)['Feature'].tolist() + ['res_saves']
#     correlation_matrix = df[top_features].corr()
#     sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
#     plt.title('Correlation Matrix of Top 10 Features with Saves')
    
#     plt.tight_layout()
#     plt.show()
    
#     # Print detailed analysis
#     print("\nTop 20 Most Important Features for Predicting Saves:")
#     print(feature_importance_df[['Feature', 'Combined_Score', 'Correlation', 'PCA_Importance', 'F_Score_Norm']].head(20))
    
#     # Comprehensive feature grouping
#     feature_groups = {
#         'Goalie Performance History': [col for col in feature_cols if col.startswith('g_')],
#         'Game Context': ['home', 'b2b', 'opp_b2b'],
        
#         # Team possession metrics
#         'Possession Metrics': ['cf', 'ca', 'cf%', 'ff', 'fa', 'ff%'],
        
#         # Shot metrics
#         'Shot Metrics': ['sf', 'sa', 'sf%', 'sh%', 'sv%', 'pdo'],
        
#         # Goal metrics
#         'Goal Metrics': ['gf', 'ga', 'gf%'],
        
#         # Expected goals metrics
#         'Expected Goals': ['xgf', 'xga', 'xgf%'],
        
#         # Scoring chances metrics
#         'Scoring Chances': ['scf', 'sca', 'scf%', 'scsf', 'scsa', 'scsf%', 'scgf', 'scga', 'scgf%', 'scsh%', 'scsv%'],
        
#         # High-danger chances metrics
#         'High-Danger Chances': ['hdcf', 'hdca', 'hdcf%', 'hdsf', 'hdsa', 'hdsf%', 'hdgf', 'hdga', 'hdgf%', 'hdsh%', 'hdsv%'],
        
#         # Medium-danger chances metrics
#         'Medium-Danger Chances': ['mdcf', 'mdca', 'mdcf%', 'mdsf', 'mdsa', 'mdsf%', 'mdgf', 'mdga', 'mdgf%', 'mdsh%', 'mdsv%'],
        
#         # Low-danger chances metrics
#         'Low-Danger Chances': ['ldcf', 'ldca', 'ldcf%', 'ldsf', 'ldsa', 'ldsf%', 'ldgf', 'ldga', 'ldgf%', 'ldsh%', 'ldsv%']
#     }
    
#     # Calculate and print average importance by feature group
#     print("\nAverage Importance by Feature Group:")
#     group_importance_data = []
    
#     for group, features in feature_groups.items():
#         valid_features = [f for f in features if f in feature_importance_df['Feature'].values]
#         if valid_features:
#             group_importance = feature_importance_df[
#                 feature_importance_df['Feature'].isin(valid_features)
#             ]['Combined_Score'].mean()
            
#             # Get top feature in this group
#             top_feature = feature_importance_df[
#                 feature_importance_df['Feature'].isin(valid_features)
#             ].iloc[0]['Feature'] if len(valid_features) > 0 else "None"
            
#             # Count features in this group
#             feature_count = len(valid_features)
            
#             group_importance_data.append({
#                 'Group': group,
#                 'Average_Importance': group_importance,
#                 'Feature_Count': feature_count,
#                 'Top_Feature': top_feature
#             })
            
#             print(f"{group}: {group_importance:.4f} (Top: {top_feature}, Count: {feature_count})")
    
#     # Create a DataFrame for group importance and plot it
#     group_importance_df = pd.DataFrame(group_importance_data)
#     group_importance_df = group_importance_df.sort_values('Average_Importance', ascending=False)
    
#     plt.figure(figsize=(12, 8))
#     sns.barplot(data=group_importance_df, x='Average_Importance', y='Group')
#     plt.title('Average Feature Importance by Feature Group')
#     plt.tight_layout()
#     plt.show()
    
#     # Print detailed feature analysis for each group
#     print("\nDetailed Feature Analysis by Group:")
#     for group, features in feature_groups.items():
#         valid_features = [f for f in features if f in feature_importance_df['Feature'].values]
#         if valid_features:
#             print(f"\n{group}:")
#             group_df = feature_importance_df[feature_importance_df['Feature'].isin(valid_features)]
#             group_df = group_df.sort_values('Combined_Score', ascending=False)
#             print(group_df[['Feature', 'Combined_Score', 'Correlation']].to_string(index=False))
    
#     return feature_importance_df

# # Run the analysis
# feature_importance_df = analyze_features_for_saves_prediction(processed_df)

In [4]:
def predict_next_game(goalie_history, model, scaler, window_size=10):
    """
    Predict the save percentage for a goalie's next game.

    Features are built from `goalie_history` with `prepare_features`; only the
    last feature row is scaled with `scaler` and passed to `model`.

    Returns:
        The first (and only) value predicted by `model` for that row.
    """
    feature_frame, _ = prepare_features(goalie_history, window_size)

    # Double brackets keep the selection two-dimensional (a one-row frame)
    latest_features = feature_frame.iloc[[-1]]

    return model.predict(scaler.transform(latest_features))[0]

def calculate_performance_scalar(predicted_sv, league_avg_sv=0.910):
    """
    Convert a predicted save percentage to a performance scalar.

    The scalar is 1 minus the gap between the prediction and the league average,
    so an above-average prediction yields a value below 1 and a below-average
    prediction a value above 1.
    """
    return 1 - (predicted_sv - league_avg_sv)

In [None]:
# Now let's evaluate the model
# NOTE(review): `prepare_game_data`, `train_model` and a module-level `goalie_stats` are not
# defined in the nearby cells (`goalie_stats` is only a local inside process_matchups_for_date).
# Presumably they come from an earlier cell or a star import; confirm before Restart & Run All.
processed_data = prepare_game_data(goalie_stats)
model, scaler, metrics = train_model(processed_data)

# Print evaluation metrics
# `metrics` is read as a mapping with 'r2_score', 'mae' and 'rmse' entries
print("Model Evaluation Metrics:")
print(f"R² Score: {metrics['r2_score']:.4f}")
print(f"Mean Absolute Error: {metrics['mae']:.4f}")
print(f"Root Mean Squared Error: {metrics['rmse']:.4f}")

In [None]:
# Plot feature importance
# Relies on `metrics` from the evaluation cell; `metrics['feature_importance']` is read
# as a mapping of feature name -> importance value.
plt.figure(figsize=(12, 6))
importance_df = pd.DataFrame({
    'feature': list(metrics['feature_importance'].keys()),
    'importance': list(metrics['feature_importance'].values())
}).sort_values('importance', ascending=False)

# Only the ten highest-importance features are drawn
sns.barplot(data=importance_df.head(10), x='importance', y='feature')
plt.title('Top 10 Most Important Features')
plt.tight_layout()
plt.show()

In [None]:
# Plot actual vs predicted values for a specific goalie
# Relies on `processed_data`, `scaler` and `model` from the evaluation cell.
goalie_name = 'Philipp Grubauer'
goalie_data = processed_data[processed_data['player'] == goalie_name].copy()  # Make a copy
# NOTE(review): this rebinds the notebook-level name `processed_df` to the second value
# returned by prepare_features; confirm no other cell expects a different `processed_df`.
X, processed_df = prepare_features(goalie_data)
X_scaled = scaler.transform(X)
predictions = model.predict(X_scaled)

# Print dimensions to debug
print(f"Original data length: {len(goalie_data)}")
print(f"Predictions length: {len(predictions)}")
print(f"X_scaled shape: {X_scaled.shape}")

# Create the plot with aligned data
plt.figure(figsize=(10, 6))
# Use the same date range for both actual and predicted values
# NOTE(review): the slicing below only lines up if `predictions` has exactly one entry per
# row of `goalie_data`; if prepare_features drops rows (e.g. for a rolling window), plotting
# raises on a length mismatch. Check the lengths printed above.
dates = goalie_data['date'].iloc[:-1]  # Remove last date
actual_values = goalie_data['sv_pct'].iloc[:-1]  # Remove last actual value

plt.plot(dates, actual_values, label='Actual', marker='o')
plt.plot(dates, predictions[:-1], label='Predicted Next Game', marker='o')
plt.title(f'Actual vs Predicted Save Percentage - {goalie_name}')
plt.xlabel('Date')
plt.ylabel('Save Percentage')
plt.legend()
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Print the prediction for the next game
# The last prediction is the one left out of the plot above
print(f"\nPredicted save percentage for {goalie_name}'s next game: {predictions[-1]:.3f}")

In [None]:
# NOTE(review): no nearby cell assigns `performance_scalar` (only the function
# calculate_performance_scalar is defined); confirm it exists, or this cell raises NameError on Run All.
performance_scalar

In [None]:
# NOTE(review): bare `g` looks like a leftover scratch expression; no module-level `g` is
# assigned in the nearby cells (the `g` in get_matchup_goalie_results is a local). Confirm and remove if unused.
g