In [17]:
import os
import sys
import pandas as pd
import requests
import logging
from datetime import datetime, timedelta
import random
import time
import psycopg2
from io import StringIO
from dotenv import load_dotenv

# Put the parent of the working directory on the import path so that the
# project's `src` package can be imported from this notebook.
current_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(current_dir, os.pardir))
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

# Echo the resulting search path to make import problems easy to diagnose
print("Current sys.path:", sys.path)

from src.data_processing.nst_scraper import nst_on_ice_scraper, nst_team_on_ice_scraper
from src.db.base_utils import connect_db, disconnect_db

# Lift pandas' column display limit so wide frames (the goalie table has 30+
# columns) are shown without column truncation.
pd.set_option('display.max_columns', None)


Current sys.path: ['C:\\Python39\\python39.zip', 'C:\\Python39\\DLLs', 'C:\\Python39\\lib', 'C:\\Python39', 'c:\\Users\\bills\\Documents\\python\\accurateshothelper\\.venv', '', 'c:\\Users\\bills\\Documents\\python\\accurateshothelper\\.venv\\lib\\site-packages', 'c:\\users\\bills\\documents\\python\\accurateshothelper\\structureboost', 'c:\\Users\\bills\\Documents\\python\\accurateshothelper\\.venv\\lib\\site-packages\\win32', 'c:\\Users\\bills\\Documents\\python\\accurateshothelper\\.venv\\lib\\site-packages\\win32\\lib', 'c:\\Users\\bills\\Documents\\python\\accurateshothelper\\.venv\\lib\\site-packages\\Pythonwin', 'c:\\Users\\bills\\Documents\\python\\accurateshothelper']


In [18]:
# nst_team_on_ice_scraper(startdate='', enddate='2024-10-12',last_n=30)
# df = nst_on_ice_scraper(startdate='', enddate='2024-10-09', last_n=30, rate='y', lines='single') 

In [19]:
# One-off scrape of goalie (pos='G') rate stats ending 2024-10-12. The empty
# start date together with last_n=30 asks for the rolling 30-game window.
goalie_stats_df = nst_on_ice_scraper(
    startdate='', enddate='2024-10-12',
    pos='G', rate='y', stdoi='g',
    last_n=30, lines='single',
)


In [20]:
# Bare expression: renders the scraped goalie frame as this cell's output
goalie_stats_df

Unnamed: 0,player,team,gp,toi,toi/gp,shots_against/60,saves/60,sv%,gaa,gsaa/60,xg_against/60,hd_shots_against/60,hd_saves/60,hdsv%,hdgaa,hdgsaa/60,md_shots_against/60,md_saves/60,mdsv%,mdgaa,mdgsaa/60,ld_shots_against/60,ld_saves/60,ldsv%,ldgaa,ldgsaa/60,rush_attempts_against/60,rebound_attempts_against/60,avg._shot_distance,avg._goal_distance
0,Marc-Andre Fleury,MIN,5,246.83,49.37,28.68,26.01,0.907,2.67,-0.14,2.02,6.32,4.38,0.692,1.94,-0.81,5.10,4.38,0.857,0.73,-0.13,15.07,15.07,1.000,0.00,0.50,1.46,2.67,38.13,12.18
1,Jonathan Quick,NYR,3,146.33,48.78,28.29,24.19,0.855,4.10,-1.60,2.15,4.51,2.46,0.545,2.05,-1.24,9.02,7.38,0.818,1.64,-0.58,13.12,12.71,0.969,0.41,0.02,3.28,3.69,37.06,19.80
2,James Reimer,DET,3,157.83,52.61,28.13,24.71,0.878,3.42,-0.93,2.80,5.70,3.80,0.667,1.90,-0.88,9.88,8.36,0.846,1.52,-0.35,11.78,11.78,1.000,0.00,0.39,1.14,4.18,32.19,14.67
3,Semyon Varlamov,NYI,9,450.05,50.01,28.93,27.20,0.940,1.73,0.83,2.34,7.33,6.40,0.873,0.93,0.38,6.40,5.87,0.917,0.53,0.22,13.33,13.07,0.980,0.27,0.17,1.87,4.53,35.89,20.54
4,Jacob Markstrom,"CGY, NJD",8,382.83,47.85,27.58,24.92,0.903,2.66,-0.22,2.69,7.37,6.27,0.851,1.10,0.22,6.74,5.64,0.837,1.10,-0.30,12.38,11.91,0.962,0.47,-0.06,2.35,5.33,35.61,23.29
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73,Jesper Wallstedt,MIN,2,109.18,54.59,26.93,25.83,0.959,1.10,1.28,1.48,2.20,2.20,1.000,0.00,0.39,7.14,6.59,0.923,0.55,0.29,17.04,16.49,0.968,0.55,0.01,2.75,3.30,40.69,44.00
74,Arvid Soderblom,CHI,6,305.97,50.99,31.96,30.00,0.939,1.96,0.87,2.82,6.86,5.88,0.857,0.98,0.25,8.24,7.65,0.929,0.59,0.38,15.30,14.90,0.974,0.39,0.11,1.37,4.90,34.98,18.10
75,Jet Greaves,CBJ,6,297.05,49.51,36.76,33.53,0.912,3.23,0.02,3.19,8.69,6.67,0.767,2.02,-0.46,7.88,7.27,0.923,0.61,0.32,18.18,17.57,0.967,0.61,-0.01,1.82,5.45,37.15,19.88
76,Dennis Hildeby,TOR,1,44.27,44.27,18.98,17.62,0.929,1.36,0.32,1.96,4.07,4.07,1.000,0.00,0.73,9.49,8.13,0.857,1.36,-0.24,5.42,5.42,1.000,0.00,0.18,1.36,8.13,34.86,31.00


In [21]:
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Columns written to the goalie_stats table, in INSERT order. This tuple is the
# single source of truth for the required-column check, the INSERT column list,
# the VALUES placeholders and the ON CONFLICT ... SET clause.
_GOALIE_STATS_COLUMNS = (
    'player', 'team', 'gp', 'toi', 'toi_per_gp',
    'shots_against_per_60', 'saves_per_60', 'sv_pct', 'gaa',
    'gsaa_per_60', 'xg_against_per_60', 'hd_shots_against_per_60',
    'hd_saves_per_60', 'hdsv_pct', 'hdgaa', 'hdgsaa_per_60',
    'md_shots_against_per_60', 'md_saves_per_60', 'mdsv_pct',
    'mdgaa', 'mdgsaa_per_60', 'ld_shots_against_per_60',
    'ld_saves_per_60', 'ldsv_pct', 'ldgaa', 'ldgsaa_per_60',
    'rush_attempts_against_per_60', 'rebound_attempts_against_per_60',
    'avg_shot_distance', 'avg_goal_distance',
    'date', 'last_n_games',
)

# Columns forming the ON CONFLICT target; they are excluded from the SET list.
# NOTE(review): because the target is (player, gp) and `date` is in the SET
# list, a later scrape date with an unchanged `gp` overwrites the earlier row.
# Confirm this matches the table's unique constraint and the intended history.
_GOALIE_STATS_CONFLICT_COLUMNS = ('player', 'gp')


def _clean_goalie_column_names(columns):
    """Return the column labels normalised to the PostgreSQL column names."""
    return (
        columns
        .str.replace('/', '_per_', regex=False)      # 'toi/gp' -> 'toi_per_gp'
        .str.replace('%', '_pct', regex=False)       # 'sv%' -> 'sv_pct'
        .str.replace(r'[^a-zA-Z0-9]', '_', regex=True)  # Replace other special chars
        .str.replace(r'_+', '_', regex=True)             # Collapse multiple '_'
        .str.strip('_')
        .str.lower()
    )


def _build_goalie_upsert_query():
    """Build the INSERT ... ON CONFLICT statement from the column tuple."""
    # Only the fixed identifiers above are interpolated; row values are still
    # passed separately through the %(name)s placeholders.
    insert_cols = ', '.join(_GOALIE_STATS_COLUMNS)
    placeholders = ', '.join(f'%({col})s' for col in _GOALIE_STATS_COLUMNS)
    conflict_cols = ', '.join(_GOALIE_STATS_CONFLICT_COLUMNS)
    assignments = ',\n        '.join(
        f'{col} = EXCLUDED.{col}'
        for col in _GOALIE_STATS_COLUMNS
        if col not in _GOALIE_STATS_CONFLICT_COLUMNS
    )
    return (
        f"INSERT INTO goalie_stats ({insert_cols})\n"
        f"VALUES ({placeholders})\n"
        f"ON CONFLICT ({conflict_cols}) DO UPDATE SET\n"
        f"        {assignments};"
    )


def insert_goalie_stats_df(df: pd.DataFrame, conn, cursor) -> None:
    """
    Insert goalie stats dataframe into database using psycopg2

    The frame's column labels are normalised ('toi/gp' -> 'toi_per_gp',
    'sv%' -> 'sv_pct', ...) on a copy, so the caller's DataFrame is left
    untouched. Each row is upserted into ``goalie_stats`` and the transaction
    is committed on success.

    Args:
        df: Scraped goalie stats; after normalisation it must contain every
            column in ``_GOALIE_STATS_COLUMNS``. Extra columns are ignored.
        conn: Open database connection; must provide commit() and rollback().
        cursor: Cursor on ``conn``; must provide executemany().

    Raises:
        KeyError: If any required column is missing after normalisation.
        Exception: Any error raised while executing or committing the batch is
            re-raised after the transaction has been rolled back.
    """
    # Work on a copy so renaming the columns does not leak into the caller
    frame = df.copy()
    frame.columns = _clean_goalie_column_names(frame.columns)

    # Log cleaned columns for verification
    logger.info(f"Cleaned columns: {frame.columns.tolist()}")

    # Ensure required columns are present
    missing = [col for col in _GOALIE_STATS_COLUMNS if col not in frame.columns]
    if missing:
        logger.error(f"Missing columns after cleaning: {missing}")
        raise KeyError(f"Missing columns: {missing}")

    # One dict per row, restricted to the inserted columns. Missing values
    # (NaN/NaT) become None so the database stores NULL rather than NaN.
    records = [
        {name: (None if pd.isna(value) else value) for name, value in row.items()}
        for row in frame[list(_GOALIE_STATS_COLUMNS)].to_dict('records')
    ]

    # Execute batch insert; undo the partial batch if anything fails
    try:
        cursor.executemany(_build_goalie_upsert_query(), records)
        conn.commit()
    except Exception:
        conn.rollback()
        raise

def scrape_goalie_stats_range(start_date: str, end_date: str, 
                             db_prefix: str = "NST_DB_",
                             delay_min: int = 3, 
                             delay_max: int = 7,
                             last_n: int = 30):
    """
    Scrape goalie stats across a date range and save to database

    For every calendar day from ``start_date`` to ``end_date`` (inclusive) the
    rolling goalie stats ending on that day are scraped, tagged with the date,
    window size and season, and written via ``insert_goalie_stats_df``. Each
    day uses its own database connection. A failure on one day is logged and
    counted, and the loop moves on to the next day.

    Args:
        start_date (str): Start date in 'YYYY-MM-DD' format
        end_date (str): End date in 'YYYY-MM-DD' format
        db_prefix (str): Prefix for database environment variables
        delay_min (int): Minimum delay between requests in seconds
        delay_max (int): Maximum delay between requests in seconds
        last_n (int): Size of the rolling game window requested from the
            scraper and stored in the ``last_n_games`` column (default 30)

    Raises:
        ValueError: If either date string does not match 'YYYY-MM-DD'.
    """
    # Convert dates to datetime objects
    start = datetime.strptime(start_date, '%Y-%m-%d')
    end = datetime.strptime(end_date, '%Y-%m-%d')

    if start > end:
        # Nothing will be scraped; say so instead of silently reporting zeros
        logger.warning(f"start_date {start_date} is after end_date {end_date}; nothing to scrape")

    # Initialize counters
    successful_scrapes = 0
    failed_scrapes = 0

    current_date = start
    while current_date <= end:
        # Computed before the try block so the error handler can always use it
        current_date_str = current_date.strftime('%Y-%m-%d')
        conn = None
        cursor = None
        try:
            logger.info(f"Scraping data for date: {current_date_str}")

            # Scrape data
            goalie_stats_df = nst_on_ice_scraper(
                startdate='',  # Empty for rolling stats
                enddate=current_date_str,
                pos='G',
                rate='y',
                stdoi='g',
                last_n=last_n,  # Rolling window size
                lines='single'
            )

            # Add date information
            goalie_stats_df['date'] = current_date.date()
            goalie_stats_df['last_n_games'] = last_n

            # Determine season: January-June belongs to the season that started
            # the previous calendar year (e.g. 2024-03 -> '2023-24').
            # NOTE: the INSERT in insert_goalie_stats_df has no 'season' column,
            # so this value currently only shows up in the logged column list.
            year = current_date.year
            month = current_date.month
            season = f"{year-1}-{str(year)[2:]}" if month < 7 else f"{year}-{str(year+1)[2:]}"
            goalie_stats_df['season'] = season

            # Log DataFrame info
            logger.info(f"Goalie Stats DataFrame shape: {goalie_stats_df.shape}")
            logger.info(f"Goalie Stats DataFrame columns: {goalie_stats_df.columns.tolist()}")

            # Connect to database
            conn = connect_db(db_prefix)
            if conn is None:
                raise Exception("Failed to establish database connection")

            cursor = conn.cursor()

            # Insert data
            insert_goalie_stats_df(goalie_stats_df, conn, cursor)

            successful_scrapes += 1
            logger.info(f"Successfully saved data for {current_date_str}")

        except Exception as e:
            # Best effort: record the failure and carry on with the next date
            failed_scrapes += 1
            logger.error(f"Error processing data for {current_date_str}: {str(e)}")
            if conn:
                conn.rollback()

        finally:
            # Only resource cleanup lives here, so an interrupt
            # (e.g. KeyboardInterrupt) propagates without first sleeping.
            if cursor:
                cursor.close()
            if conn:
                disconnect_db(conn)

        # Random delay between requests
        if current_date < end:  # Only delay if not the last iteration
            delay = random.uniform(delay_min, delay_max)
            logger.info(f"Waiting {delay:.1f} seconds before next request...")
            time.sleep(delay)

        # Move to next date
        current_date += timedelta(days=1)

    # Final summary
    logger.info(f"""
    Scraping completed:
    - Successful scrapes: {successful_scrapes}
    - Failed scrapes: {failed_scrapes}
    - Date range: {start_date} to {end_date}
    """)

In [22]:
# Prefix of the database environment variables; handed to
# scrape_goalie_stats_range, which forwards it to connect_db
db_prefix = 'NST_DB_'

# Read the .env file so its variables are available to the process
load_dotenv()

In [23]:
# Single-day run: start and end are the same date, so exactly one day is scraped
scrape_goalie_stats_range(
    start_date='2024-10-04', end_date='2024-10-04',
    db_prefix=db_prefix,  # Make sure this matches your environment variables
)

INFO:__main__:Scraping data for date: 2024-10-04
INFO:__main__:Goalie Stats DataFrame shape: (76, 33)
INFO:__main__:Goalie Stats DataFrame columns: ['player', 'team', 'gp', 'toi', 'toi/gp', 'shots_against/60', 'saves/60', 'sv%', 'gaa', 'gsaa/60', 'xg_against/60', 'hd_shots_against/60', 'hd_saves/60', 'hdsv%', 'hdgaa', 'hdgsaa/60', 'md_shots_against/60', 'md_saves/60', 'mdsv%', 'mdgaa', 'mdgsaa/60', 'ld_shots_against/60', 'ld_saves/60', 'ldsv%', 'ldgaa', 'ldgsaa/60', 'rush_attempts_against/60', 'rebound_attempts_against/60', 'avg._shot_distance', 'avg._goal_distance', 'date', 'last_n_games', 'season']
INFO:src.db.base_utils:Database connection established.
INFO:__main__:Cleaned columns: ['player', 'team', 'gp', 'toi', 'toi_per_gp', 'shots_against_per_60', 'saves_per_60', 'sv_pct', 'gaa', 'gsaa_per_60', 'xg_against_per_60', 'hd_shots_against_per_60', 'hd_saves_per_60', 'hdsv_pct', 'hdgaa', 'hdgsaa_per_60', 'md_shots_against_per_60', 'md_saves_per_60', 'mdsv_pct', 'mdgaa', 'mdgsaa_per_60'