In [1]:
import pandas as pd
import numpy as np
import os
import sys
import logging
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from typing import Tuple, List

# Show every column when a DataFrame is rendered (no column truncation).
pd.set_option('display.max_columns', None)

# Emit log records at INFO level and above; `logger` is this notebook's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Put the parent of the current working directory on sys.path, presumably so
# that the `src` package imported right after this block can be found.
# The membership check avoids appending a duplicate entry when the cell is re-run.
current_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(current_dir, '..'))
if project_root not in sys.path:
    sys.path.append(project_root)

from src.db.nst_db_utils import *

In [2]:
# Load goalie game logs for the given date range.
# NOTE(review): get_goalie_stats is not defined in this notebook; presumably it
# comes from the star import of src.db.nst_db_utils — confirm its return schema.
goalie_stats = get_goalie_stats(
    start_date="2024-10-01",
    end_date="2025-01-15"
)

INFO:src.db.base_utils:Database connection established.
INFO:src.db.base_utils:Database connection closed.


In [3]:
# Display the loaded frame to inspect its columns and a sample of rows.
goalie_stats

Unnamed: 0,date,player,team,toi,shots_against,saves,goals_against,sv_pct,gaa,gsaa,xg_against,hd_shots_against,hd_saves,hd_goals_against,hdsv_pct,md_shots_against,md_saves,md_goals_against,mdsv_pct,ld_shots_against,ld_saves,ld_goals_against,ldsv_pct,rush_attempts_against,rebound_attempts_against,avg_shot_distance,avg_goal_distance
0,2025-01-15,Ukko-Pekka Luukkonen,BUF,60.0,37,35,2,0.946,2.0,1.67,3.65,4,4,0,1.0,6,6,0,1.0,24,22,2,0.917,5,13,47.43,47.0
1,2025-01-15,Filip Gustavsson,MIN,57.6,36,31,5,0.861,5.21,-1.43,2.51,7,5,2,0.714,4,4,0,1.0,24,21,3,0.875,3,1,33.81,15.4
2,2025-01-15,Calvin Pickard,EDM,60.0,34,31,3,0.912,3.0,0.37,2.72,6,5,1,0.833,11,10,1,0.909,16,15,1,0.938,4,5,35.0,14.67
3,2025-01-15,Dustin Tokarski,CAR,57.72,24,21,3,0.875,3.12,-0.62,2.3,7,6,1,0.857,6,4,2,0.667,6,6,0,1.0,4,3,54.58,27.67
4,2025-01-14,Justus Annunen,NSH,60.0,22,19,3,0.864,3.0,-0.93,2.26,5,4,1,0.8,10,8,2,0.8,5,5,0,1.0,3,6,38.5,26.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1477,2024-10-08,Philipp Grubauer,SEA,58.3,25,22,3,0.88,3.09,0.05,1.91,5,3,2,0.6,8,8,0,1.0,11,10,1,0.909,2,2,42.28,20.67
1478,2024-10-05,Devon Levi,BUF,56.77,37,34,3,0.919,3.17,-0.31,3.41,9,7,2,0.778,11,10,1,0.909,15,15,0,1.0,3,6,35.08,19.33
1479,2024-10-05,Jake Allen,NJD,60.0,18,17,1,0.944,1.0,0.31,1.78,3,2,1,0.667,1,1,0,1.0,12,12,0,1.0,1,6,52.94,7.0
1480,2024-10-04,Ukko-Pekka Luukkonen,BUF,59.08,22,19,3,0.864,3.05,-1.34,2.61,2,1,1,0.5,9,8,1,0.889,11,10,1,0.909,4,8,36.27,36.67


In [4]:
def _toi_to_minutes(value):
    """Convert one time-on-ice value to minutes as a float.

    Accepts clock strings ("MM:SS" or "HH:MM:SS") as well as plain numbers or
    number-like objects (e.g. Decimal), which are taken to be minutes already.
    Missing values are returned as NaN.
    """
    if pd.isna(value):
        return float('nan')
    text = str(value)
    if ':' not in text:
        return float(text)
    # Right-most field is seconds; each field to the left is worth 60x more.
    total_seconds = sum(
        int(part) * 60 ** power
        for power, part in enumerate(reversed(text.split(':')))
    )
    return total_seconds / 60.0


def prepare_rolling_features(df, window_size=10):
    """Create rolling averages and statistics for each goalie.

    For every goalie (rows grouped by ``player`` and ordered by ``date``) this adds:

    * ``<col>_rolling_avg`` / ``<col>_rolling_std``: exponentially weighted mean
      and standard deviation (``span=window_size``) of each base stat column;
    * ``sv_pct_trend`` / ``gsaa_trend``: game-over-game difference;
    * ``xg_diff`` (goals against minus expected goals against) and its simple
      rolling mean ``xg_diff_rolling``;
    * ``days_rest``: whole days since the goalie's previous game;
    * ``games_last_7``: number of the goalie's games in the trailing 7-day window.

    The first row of each goalie has no previous game, so its ``*_trend``,
    ``*_rolling_std`` and ``days_rest`` values are NaN.

    Args:
        df: Game-level goalie stats. Must contain ``player``, ``date``,
            ``goals_against`` and every base stat column listed below. The
            input frame is not modified.
        window_size: Span of the exponential window and length of the simple
            rolling window.

    Returns:
        A new DataFrame sorted by ``player`` then ``date`` (fresh 0..n-1 index)
        containing the original columns plus the derived ones.

    Raises:
        ValueError: If a required column is missing, or if no goalie could be
            processed.
    """
    feature_columns = [
        'shots_against', 'sv_pct', 'gaa', 'gsaa',
        'xg_against',
        'hd_shots_against', 'hdsv_pct',
        'md_shots_against', 'mdsv_pct',
        'ld_shots_against', 'ldsv_pct',
        'toi'
    ]
    # Columns used below that are not rolling-feature inputs themselves.
    required_columns = feature_columns + ['player', 'date', 'goals_against']

    # Validate before touching any column so a missing one is reported clearly
    # instead of surfacing as a KeyError or a swallowed per-player failure.
    missing_cols = [col for col in required_columns if col not in df.columns]
    if missing_cols:
        raise ValueError(f"Missing columns: {missing_cols}")

    # Work on a copy to avoid modifying the caller's dataframe.
    df = df.copy()

    # Normalise TOI to float minutes. Clock strings are converted to minutes
    # (not seconds) so they share a unit with numerically stored TOI.
    if not pd.api.types.is_numeric_dtype(df['toi']):
        df['toi'] = df['toi'].apply(_toi_to_minutes)

    # days_rest and the 7-day window need real datetimes; no-op if already so.
    df['date'] = pd.to_datetime(df['date'])

    df = df.sort_values(['player', 'date']).reset_index(drop=True)

    rolling_stats = []
    for name, group in df.groupby('player'):
        # Copy the group to avoid SettingWithCopyWarning on assignment.
        group = group.copy()

        try:
            # Exponentially weighted mean / std of every base stat.
            for col in feature_columns:
                weighted = group[col].ewm(span=window_size, min_periods=1)
                group[f'{col}_rolling_avg'] = weighted.mean()
                group[f'{col}_rolling_std'] = weighted.std()

            # Trend features.
            group['sv_pct_trend'] = group['sv_pct'].diff()
            group['gsaa_trend'] = group['gsaa'].diff()
            group['xg_diff'] = group['goals_against'] - group['xg_against']
            group['xg_diff_rolling'] = group['xg_diff'].rolling(
                window=window_size, min_periods=1
            ).mean()

            # Workload features.
            group['days_rest'] = group['date'].diff().dt.days
            group['games_last_7'] = group.rolling('7D', on='date')['date'].count()

            rolling_stats.append(group)

        except Exception as e:
            # Best effort: skip a goalie whose rows cannot be processed, but
            # report it through logging rather than a bare print.
            logging.getLogger(__name__).warning(
                "Error processing player %s: %s", name, e
            )

    if not rolling_stats:
        raise ValueError("No data was processed successfully")

    return pd.concat(rolling_stats)

def prepare_features(df, window_size=10):
    """Build the model's feature matrix from raw goalie game logs.

    Runs ``prepare_rolling_features`` and keeps every resulting column whose
    name contains one of the marker substrings below. Missing values in the
    selected columns are replaced with 0.

    Note: ``xg_diff`` and ``xg_diff_rolling`` contain none of the markers, so
    they are computed upstream but are not part of the returned matrix.

    Returns:
        (X, rolling_df): the selected, NaN-filled feature columns and the full
        frame they were taken from (same row order and index).
    """
    markers = ('rolling_avg', 'rolling_std', 'trend', 'games_last_7', 'days_rest')

    rolling_df = prepare_rolling_features(df, window_size)

    # Preserve the frame's column order while picking the feature columns.
    feature_cols = []
    for column in rolling_df.columns:
        if any(marker in column for marker in markers):
            feature_cols.append(column)

    feature_matrix = rolling_df[feature_cols].fillna(0)
    return feature_matrix, rolling_df

def train_model(historical_data, window_size=10):
    """Fit a random forest that predicts a goalie's next-game save percentage.

    Each training row pairs the features of one game with the ``sv_pct`` of the
    row that follows it in the processed frame. Every goalie's final row is
    discarded, since the row after it is not that goalie's next game, and so is
    any row whose target is NaN.

    Returns:
        (model, scaler): the fitted ``RandomForestRegressor`` and the
        ``StandardScaler`` fitted on the training features. The scaler must be
        applied to features before calling ``model.predict``.
    """
    features, processed = prepare_features(historical_data, window_size)

    # Target = save percentage recorded on the following row.
    target = processed['sv_pct'].shift(-1)

    # Index labels of the last row for every goalie.
    final_rows = processed.groupby('player').tail(1).index
    features = features.drop(final_rows)
    target = target.drop(final_rows)

    # Keep only rows that still have a usable target.
    has_target = target.notna()
    features, target = features[has_target], target[has_target]

    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(features)

    forest = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
    forest.fit(scaled_features, target)

    return forest, scaler

def predict_next_game(goalie_history, model, scaler, window_size=10):
    """Predict the save percentage of the game after ``goalie_history``.

    Features are rebuilt from the supplied history, the last row of the
    resulting matrix is scaled with ``scaler`` and passed to ``model``.

    Returns:
        The single predicted value (first element of ``model.predict``).
    """
    features, _ = prepare_features(goalie_history, window_size)

    # Double brackets keep the final row as a one-row DataFrame, not a Series.
    latest_row = features.iloc[[-1]]
    latest_scaled = scaler.transform(latest_row)

    return model.predict(latest_scaled)[0]

def calculate_performance_scalar(predicted_sv, league_avg_sv=0.910):
    """Turn a predicted save percentage into a multiplier centred on 1.

    The result is ``1 - (predicted_sv - league_avg_sv)``: exactly 1 at the
    league average, below 1 when the prediction is above the league average
    and above 1 when it is below.
    """
    return 1 - (predicted_sv - league_avg_sv)

def _as_float(value):
    """Convert one scraped stat value to float; missing values become NaN.

    Going through ``str`` lets ``Decimal`` objects and numeric strings be
    converted the same way as ordinary numbers.
    """
    if pd.isna(value):
        return float('nan')
    return float(str(value))


def prepare_game_data(df):
    """Prepare scraped game data for model input.

    Returns a copy of ``df`` in which ``date`` is a datetime column and every
    stat column listed below is float. The frame passed in is left untouched,
    so re-running the calling cell always starts from the same raw data.

    Args:
        df: Raw game-level goalie stats containing ``date`` and all of the
            numeric stat columns.

    Returns:
        A new DataFrame with converted dtypes.

    Raises:
        KeyError: If ``date`` or one of the stat columns is missing.
        ValueError: If a non-missing stat value cannot be parsed as a number.
    """
    numeric_cols = [
        'toi', 'shots_against', 'saves', 'goals_against', 'sv_pct',
        'gaa', 'gsaa', 'xg_against',
        'hd_shots_against', 'hd_saves', 'hd_goals_against', 'hdsv_pct',
        'md_shots_against', 'md_saves', 'md_goals_against', 'mdsv_pct',
        'ld_shots_against', 'ld_saves', 'ld_goals_against', 'ldsv_pct'
    ]

    # Never mutate the caller's frame.
    df = df.copy()

    # Convert date to datetime unless it already is one (any unit / timezone).
    if not pd.api.types.is_datetime64_any_dtype(df['date']):
        df['date'] = pd.to_datetime(df['date'])

    # Cast every stat column to float. Missing entries (None / NaN / NA) are
    # kept as NaN instead of failing on the string 'None'.
    for col in numeric_cols:
        df[col] = df[col].map(_as_float).astype(float)

    return df

In [5]:
# Display goalie_stats again (same frame as loaded by get_goalie_stats).
goalie_stats

Unnamed: 0,date,player,team,toi,shots_against,saves,goals_against,sv_pct,gaa,gsaa,xg_against,hd_shots_against,hd_saves,hd_goals_against,hdsv_pct,md_shots_against,md_saves,md_goals_against,mdsv_pct,ld_shots_against,ld_saves,ld_goals_against,ldsv_pct,rush_attempts_against,rebound_attempts_against,avg_shot_distance,avg_goal_distance
0,2025-01-15,Ukko-Pekka Luukkonen,BUF,60.0,37,35,2,0.946,2.0,1.67,3.65,4,4,0,1.0,6,6,0,1.0,24,22,2,0.917,5,13,47.43,47.0
1,2025-01-15,Filip Gustavsson,MIN,57.6,36,31,5,0.861,5.21,-1.43,2.51,7,5,2,0.714,4,4,0,1.0,24,21,3,0.875,3,1,33.81,15.4
2,2025-01-15,Calvin Pickard,EDM,60.0,34,31,3,0.912,3.0,0.37,2.72,6,5,1,0.833,11,10,1,0.909,16,15,1,0.938,4,5,35.0,14.67
3,2025-01-15,Dustin Tokarski,CAR,57.72,24,21,3,0.875,3.12,-0.62,2.3,7,6,1,0.857,6,4,2,0.667,6,6,0,1.0,4,3,54.58,27.67
4,2025-01-14,Justus Annunen,NSH,60.0,22,19,3,0.864,3.0,-0.93,2.26,5,4,1,0.8,10,8,2,0.8,5,5,0,1.0,3,6,38.5,26.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1477,2024-10-08,Philipp Grubauer,SEA,58.3,25,22,3,0.88,3.09,0.05,1.91,5,3,2,0.6,8,8,0,1.0,11,10,1,0.909,2,2,42.28,20.67
1478,2024-10-05,Devon Levi,BUF,56.77,37,34,3,0.919,3.17,-0.31,3.41,9,7,2,0.778,11,10,1,0.909,15,15,0,1.0,3,6,35.08,19.33
1479,2024-10-05,Jake Allen,NJD,60.0,18,17,1,0.944,1.0,0.31,1.78,3,2,1,0.667,1,1,0,1.0,12,12,0,1.0,1,6,52.94,7.0
1480,2024-10-04,Ukko-Pekka Luukkonen,BUF,59.08,22,19,3,0.864,3.05,-1.34,2.61,2,1,1,0.5,9,8,1,0.889,11,10,1,0.909,4,8,36.27,36.67


In [7]:
# Prepare your data: parse dates and cast the stat columns to float
processed_data = prepare_game_data(goalie_stats)

# Train the model (random forest on the next game's save percentage)
model, scaler = train_model(processed_data)

# Make predictions for a specific goalie
# NOTE(review): the goalie name is hard-coded; a name with no rows in the data
# ends in ValueError("No data was processed successfully") from feature prep.
goalie_data = processed_data[processed_data['player'] == 'Adin Hill']
predicted_sv = predict_next_game(goalie_data, model, scaler)
performance_scalar = calculate_performance_scalar(predicted_sv)

In [11]:
# Predicted next-game save percentage returned by predict_next_game.
predicted_sv

np.float64(0.8912598003037444)

In [10]:
# Scalar derived from predicted_sv by calculate_performance_scalar.
performance_scalar

np.float64(1.0187401996962557)

In [None]:
# NOTE(review): unexecuted scratch cell; `g` is not defined in any cell shown
# here, so running it would presumably raise NameError — consider deleting it.
g