In [1]:
import os
import sys
import pandas as pd
import requests
import logging
from datetime import datetime, timedelta
import random
import time
import psycopg2
from io import StringIO
from dotenv import load_dotenv

# Make the parent of the working directory importable; the notebook is
# presumably launched from a subfolder of the repository, so the parent
# is the project root that holds the `src` package.
current_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(current_dir, os.pardir))
root_already_listed = project_root in sys.path
if not root_already_listed:
    sys.path.append(project_root)

# Print the resulting module search path so the change can be verified.
print("Current sys.path:", sys.path)

from src.data_processing.nst_scraper import nst_on_ice_scraper, nst_team_on_ice_scraper
from src.db.base_utils import connect_db, disconnect_db

# Show every column when a DataFrame is displayed; None removes pandas'
# limit on the number of columns rendered.
pd.set_option('display.max_columns', None)


Current sys.path: ['C:\\Python39\\python39.zip', 'C:\\Python39\\DLLs', 'C:\\Python39\\lib', 'C:\\Python39', 'c:\\Users\\bills\\Documents\\python\\accurateshothelper\\.venv', '', 'c:\\Users\\bills\\Documents\\python\\accurateshothelper\\.venv\\lib\\site-packages', 'c:\\users\\bills\\documents\\python\\accurateshothelper\\structureboost', 'c:\\Users\\bills\\Documents\\python\\accurateshothelper\\.venv\\lib\\site-packages\\win32', 'c:\\Users\\bills\\Documents\\python\\accurateshothelper\\.venv\\lib\\site-packages\\win32\\lib', 'c:\\Users\\bills\\Documents\\python\\accurateshothelper\\.venv\\lib\\site-packages\\Pythonwin', 'c:\\Users\\bills\\Documents\\python\\accurateshothelper']


In [2]:
# nst_team_on_ice_scraper(startdate='', enddate='2024-10-12',last_n=30)
# df = nst_on_ice_scraper(startdate='', enddate='2024-10-09', last_n=30, rate='y', lines='single') 

In [3]:
# Fetch goalie stats for a single day (start and end date are identical)
# so the scraper's raw output can be inspected in the next cell.
goalie_stats_df = nst_on_ice_scraper(
    startdate='2024-10-12',
    enddate='2024-10-12',
    pos='G',
    rate='n',
    stdoi='g',
    lines='single',
)


In [4]:
# Inspect the raw column labels returned by the scraper (before any renaming).
goalie_stats_df.columns

Index(['player', 'team', 'gp', 'toi', 'shots_against', 'saves',
       'goals_against', 'sv%', 'gaa', 'gsaa', 'xg_against', 'hd_shots_against',
       'hd_saves', 'hd_goals_against', 'hdsv%', 'hdgaa', 'hdgsaa',
       'md_shots_against', 'md_saves', 'md_goals_against', 'mdsv%', 'mdgaa',
       'mdgsaa', 'ld_shots_against', 'ld_saves', 'ld_goals_against', 'ldsv%',
       'ldgaa', 'ldgsaa', 'rush_attempts_against', 'rebound_attempts_against',
       'avg._shot_distance', 'avg._goal_distance'],
      dtype='object')

In [5]:
# Configure logging: request INFO-level output on the root logger
# (basicConfig does nothing if the root logger already has handlers).
logging.basicConfig(level=logging.INFO)
# Notebook-level logger used by the functions defined in later cells.
logger = logging.getLogger(__name__)

In [6]:
# Stat columns that are coerced to numeric before insertion.
_GOALIE_NUMERIC_COLUMNS = (
    'gp', 'toi', 'shots_against', 'saves', 'goals_against', 'sv_pct',
    'gaa', 'gsaa', 'xg_against', 'hd_shots_against', 'hd_saves', 'hd_goals_against',
    'hdsv_pct', 'hdgaa', 'hdgsaa', 'md_shots_against', 'md_saves', 'md_goals_against',
    'mdsv_pct', 'mdgaa', 'mdgsaa', 'ld_shots_against', 'ld_saves', 'ld_goals_against',
    'ldsv_pct', 'ldgaa', 'ldgsaa', 'rush_attempts_against', 'rebound_attempts_against',
    'avg_shot_distance', 'avg_goal_distance',
)

# Column order of the INSERT statement: identifiers, stats, then the date key.
# This single tuple drives the required-column check, the VALUES placeholders
# and the ON CONFLICT assignments, so the lists cannot drift apart.
_GOALIE_INSERT_COLUMNS = ('player', 'team') + _GOALIE_NUMERIC_COLUMNS + ('date',)

# Conflict target of the upsert; these columns are not overwritten on conflict.
_GOALIE_CONFLICT_COLUMNS = ('player', 'date')


def _clean_goalie_column_names(columns):
    """Normalise scraped column labels to the snake_case names used by the table."""
    return (
        columns
        .str.replace('/', '_per_', regex=False)         # In case any '/' exists
        .str.replace('%', '_pct', regex=False)          # "sv%" -> "sv_pct"
        .str.replace(r'[^a-zA-Z0-9]', '_', regex=True)  # Other special characters -> "_"
        .str.replace(r'_+', '_', regex=True)            # Collapse repeated underscores
        .str.strip('_')
        .str.lower()
    )


def _to_db_value(value):
    """Return None for the '-' placeholder and for NaN/NaT/None, else the value unchanged."""
    if isinstance(value, str):
        return None if value == "-" else value
    return None if pd.isna(value) else value


def _build_goalie_upsert_query():
    """Build the INSERT ... ON CONFLICT ... DO UPDATE statement from the column tuples."""
    column_list = ", ".join(_GOALIE_INSERT_COLUMNS)
    placeholders = ", ".join(f"%({name})s" for name in _GOALIE_INSERT_COLUMNS)
    conflict_target = ", ".join(_GOALIE_CONFLICT_COLUMNS)
    assignments = ", ".join(
        f"{name} = EXCLUDED.{name}"
        for name in _GOALIE_INSERT_COLUMNS
        if name not in _GOALIE_CONFLICT_COLUMNS
    )
    return (
        f"INSERT INTO goalie_stats ({column_list}) "
        f"VALUES ({placeholders}) "
        f"ON CONFLICT ({conflict_target}) DO UPDATE SET {assignments};"
    )


def insert_goalie_stats_df(df, conn, cursor):
    """
    Upsert a goalie stats DataFrame into the ``goalie_stats`` table.

    The column labels are normalised first (for example "sv%" becomes "sv_pct"
    and "avg._shot_distance" becomes "avg_shot_distance"). After cleaning, the
    frame must contain ``player``, ``team``, ``date`` and every stat column
    listed in ``_GOALIE_NUMERIC_COLUMNS``; any other column (such as
    ``season``) is ignored.

    Stat columns are coerced with ``pd.to_numeric(errors='coerce')``. The '-'
    placeholder and any missing value (NaN/None) are sent as ``None`` so the
    driver writes SQL NULL; a float NaN would otherwise be passed through as a
    NaN value rather than NULL.

    All work happens on a copy: the caller's DataFrame is left untouched.

    Args:
        df: DataFrame holding one row per goalie.
        conn: Open database connection; ``commit()`` is called after the
            batch has been executed.
        cursor: Cursor of ``conn``; ``executemany()`` is called with a
            statement using named ``%(column)s`` placeholders.

    Returns:
        None. If the frame has no rows, nothing is executed or committed.

    Raises:
        KeyError: If a required column is missing after cleaning.
    """
    # Resolve the logger here so the function does not depend on a global
    # defined in another notebook cell; in the notebook this is the same
    # logger object as the module-level one.
    log = logging.getLogger(__name__)

    # Work on a copy so the caller's frame keeps its original labels/values.
    cleaned = df.copy()
    cleaned.columns = _clean_goalie_column_names(cleaned.columns)

    # Log cleaned columns for verification
    log.info(f"Cleaned columns: {cleaned.columns.tolist()}")

    missing = [col for col in _GOALIE_INSERT_COLUMNS if col not in cleaned.columns]
    if missing:
        log.error(f"Missing columns after cleaning: {missing}")
        raise KeyError(f"Missing columns: {missing}")

    # Non-numeric entries (including the '-' placeholder) become NaN here and
    # are turned into None when the records are built below.
    for col in _GOALIE_NUMERIC_COLUMNS:
        cleaned[col] = pd.to_numeric(cleaned[col], errors='coerce')

    # One dict per row, restricted to the columns the statement references.
    records = [
        {col: _to_db_value(value) for col, value in row.items()}
        for row in cleaned[list(_GOALIE_INSERT_COLUMNS)].to_dict('records')
    ]
    if not records:
        log.info("No goalie rows to insert; skipping database write.")
        return

    cursor.executemany(_build_goalie_upsert_query(), records)
    conn.commit()

In [7]:
def scrape_goalie_stats_range(start_date: str, end_date: str,
                              db_prefix: str = "NST_DB_",
                              delay_min: int = 3,
                              delay_max: int = 7):
    """
    Scrape goalie stats across a date range and save to the database.

    The range is walked one day at a time (both ends inclusive). For each day
    the scraper is called with startdate and enddate set to that day, a
    ``date`` column is added, and the rows are passed to
    ``insert_goalie_stats_df`` on a fresh database connection. A day whose
    scrape or insert raises is logged, counted as failed and rolled back; the
    loop then continues with the next day. A day for which the scraper returns
    an empty DataFrame is counted as skipped and no connection is opened.

    Between days the function sleeps for a random time in
    [delay_min, delay_max] seconds. The sleep happens after the per-day
    cleanup, not inside it, so interrupting the kernel stops the run
    immediately.

    Args:
        start_date (str): Start date in 'YYYY-MM-DD' format.
        end_date (str): End date in 'YYYY-MM-DD' format.
        db_prefix (str): Prefix for database environment variables; passed
            unchanged to ``connect_db``.
        delay_min (int): Minimum delay between requests (seconds).
        delay_max (int): Maximum delay between requests (seconds).

    Raises:
        ValueError: If either date string does not match 'YYYY-MM-DD'
            (raised by ``datetime.strptime``).
    """
    # Convert dates to datetime objects
    start = datetime.strptime(start_date, '%Y-%m-%d')
    end = datetime.strptime(end_date, '%Y-%m-%d')
    if start > end:
        logger.warning(
            f"start_date {start_date} is after end_date {end_date}; nothing to scrape"
        )

    # Counters for the final summary
    successful_scrapes = 0
    failed_scrapes = 0
    skipped_scrapes = 0

    current_date = start
    while current_date <= end:
        # Computed before the try block so the error handler can always use it.
        current_date_str = current_date.strftime('%Y-%m-%d')
        conn = None
        cursor = None
        try:
            logger.info(f"Scraping data for date: {current_date_str}")

            # Same start and end date restricts the scrape to a single day.
            goalie_stats_df = nst_on_ice_scraper(
                startdate=current_date_str,
                enddate=current_date_str,
                pos='G',
                rate='n',   # NOTE(review): presumably raw counts rather than rates - confirm in nst_on_ice_scraper
                stdoi='g',
                lines='single'
            )
            if goalie_stats_df is None:
                raise ValueError("Scraper returned no DataFrame")

            if goalie_stats_df.empty:
                # Nothing to store for this day; do not open a connection.
                skipped_scrapes += 1
                logger.info(f"No goalie data returned for {current_date_str}; skipping database write")
            else:
                # The date is part of the (player, date) conflict target of the INSERT.
                goalie_stats_df['date'] = current_date.date()

                # Season label for logging or downstream processing.
                # NOTE(review): with this cutoff a July date is labelled as the
                # following season (e.g. 2021-07-07 -> '2021-22'); confirm the
                # cutoff if the label is ever persisted.
                year = current_date.year
                month = current_date.month
                season = f"{year-1}-{str(year)[2:]}" if month < 7 else f"{year}-{str(year+1)[2:]}"
                goalie_stats_df['season'] = season

                # Log details about the returned DataFrame for debugging.
                logger.info(f"Goalie Stats DataFrame shape: {goalie_stats_df.shape}")
                logger.info(f"Goalie Stats DataFrame columns: {goalie_stats_df.columns.tolist()}")

                # Connect to the database.
                conn = connect_db(db_prefix)
                if conn is None:
                    raise Exception("Failed to establish database connection")
                cursor = conn.cursor()

                # Insert (or update) the day's data into the database.
                insert_goalie_stats_df(goalie_stats_df, conn, cursor)

                successful_scrapes += 1
                logger.info(f"Successfully saved data for {current_date_str}")
        except Exception as e:
            failed_scrapes += 1
            logger.error(f"Error processing data for {current_date_str}: {str(e)}")
            if conn:
                conn.rollback()
        finally:
            # Only resource cleanup belongs here: this also runs while a
            # KeyboardInterrupt propagates, so it must not sleep.
            if cursor:
                cursor.close()
            if conn:
                disconnect_db(conn)

        # If not processing the last date, wait a random delay between requests.
        if current_date < end:
            delay = random.uniform(delay_min, delay_max)
            logger.info(f"Waiting {delay:.1f} seconds before next request...")
            time.sleep(delay)

        # Move on to the next day.
        current_date += timedelta(days=1)

    # Log the final summary of the scrape process.
    logger.info(f"""
    Scraping completed:
    - Successful scrapes: {successful_scrapes}
    - Failed scrapes: {failed_scrapes}
    - Skipped (no data): {skipped_scrapes}
    - Date range: {start_date} to {end_date}
    """)

In [8]:
# Load environment variables from .env file
load_dotenv()

# Prefix handed to scrape_goalie_stats_range, which forwards it to connect_db;
# presumably it selects the NST_DB_* environment variables for the
# connection - confirm in src.db.base_utils.
db_prefix = 'NST_DB_'

In [9]:
# NHL_SEASONS = {
#     20242025: {
#         'start': '2024-10-04',
#         'regular_end': '2025-04-18',  # Estimated
#         'playoff_end': '2025-06-30'  # Estimated
#     },
#     20232024: {
#         'start': '2023-10-10',
#         'regular_end': '2024-04-18',
#         'playoff_end': '2024-06-24'
#     },
#     20222023: {
#         'start': '2022-10-07',
#         'regular_end': '2023-04-14',
#         'playoff_end': '2023-06-13'
#     },
#     20212022: {
#         'start': '2021-10-12',
#         'regular_end': '2022-04-29',
#         'playoff_end': '2022-06-26'
#     },
#     20202021: {
#         'start': '2021-01-13',
#         'regular_end': '2021-05-19',
#         'playoff_end': '2021-07-07'  # Covid-shortened season
#     }
# }

In [10]:
# Backfill goalie stats for 2023-10-10 through 2023-10-18 (both days included).
scrape_goalie_stats_range(
    start_date='2023-10-10',
    end_date='2023-10-18',
    db_prefix=db_prefix,  # Make sure this matches your environment variables
)

INFO:__main__:Scraping data for date: 2023-10-10
INFO:__main__:Goalie Stats DataFrame shape: (6, 35)
INFO:__main__:Goalie Stats DataFrame columns: ['player', 'team', 'gp', 'toi', 'shots_against', 'saves', 'goals_against', 'sv%', 'gaa', 'gsaa', 'xg_against', 'hd_shots_against', 'hd_saves', 'hd_goals_against', 'hdsv%', 'hdgaa', 'hdgsaa', 'md_shots_against', 'md_saves', 'md_goals_against', 'mdsv%', 'mdgaa', 'mdgsaa', 'ld_shots_against', 'ld_saves', 'ld_goals_against', 'ldsv%', 'ldgaa', 'ldgsaa', 'rush_attempts_against', 'rebound_attempts_against', 'avg._shot_distance', 'avg._goal_distance', 'date', 'season']
INFO:src.db.base_utils:Database connection established.
INFO:__main__:Cleaned columns: ['player', 'team', 'gp', 'toi', 'shots_against', 'saves', 'goals_against', 'sv_pct', 'gaa', 'gsaa', 'xg_against', 'hd_shots_against', 'hd_saves', 'hd_goals_against', 'hdsv_pct', 'hdgaa', 'hdgsaa', 'md_shots_against', 'md_saves', 'md_goals_against', 'mdsv_pct', 'mdgaa', 'mdgsaa', 'ld_shots_against', 

KeyboardInterrupt: 