In [None]:
# imports
from utils.utils import *
import pandas as pd
import time
import os
from tqdm import tqdm
import requests

In [6]:
# Notebook-wide settings.

# Column names given to the (initially empty) turnover table.
tov_cols = [
    'team',
    'player_id',
    'player',
    'type',
    'period',
    'clock',
    'gameId',
    'actionNumber',
    'next_pos_points',
    'shot_clock',
    'opp_team',
]

# Season to process -- change this value to switch seasons.
year = '2014_15'

In [8]:
def get_season_tov_df(year = '2024-25'):
    """Collect turnover rows for every game of a season, resuming from disk.

    The season's games come from ``get_game_df``. For each game that is not
    already present in the saved CSV, the play-by-play is fetched with
    ``get_play_df`` and passed to ``tov_processor``; the result is appended to
    the running table, which is rewritten to ``tov_data_{year}.csv`` after
    every successful game. Games that still fail after all retries are
    recorded in ``failed_games_{year}.csv``.

    Parameters
    ----------
    year : str
        Season label. It is used verbatim in the CSV file names; underscores
        are replaced by hyphens before it is passed to ``get_game_df``.

    Returns
    -------
    pandas.DataFrame
        The accumulated turnover table (previously saved rows plus the rows
        added during this call).

    Notes
    -----
    Only ``requests`` exceptions are retried; any other exception (for
    example one raised by ``tov_processor``) propagates to the caller.
    """

    # get dataframe of all games
    season_games = get_game_df(year.replace('_', '-'))

    # Load or initialize data
    tov_save_path = f'tov_data_{year}.csv'
    failed_save_path = f'failed_games_{year}.csv'

    if os.path.exists(tov_save_path):
        tov_df = pd.read_csv(tov_save_path)
        processed_games = set(tov_df['gameId'].unique())
    else:
        tov_df = pd.DataFrame(columns=tov_cols)
        processed_games = set()

    if os.path.exists(failed_save_path):
        failed_games = pd.read_csv(failed_save_path)['gameId'].tolist()
    else:
        failed_games = []

    max_retries = 3  # fetch attempts per game
    fails = 0        # games that failed in a row; reset by any successful fetch

    # Main loop (total is passed so tqdm can show a real progress bar)
    for row in tqdm(season_games.itertuples(index=False), total=len(season_games)):
        game = row.GAME_ID
        teams = row.TEAM_ABBREVIATION

        if int(game) in processed_games:
            continue  # Skip already processed games

        # Pause between games before hitting the endpoint again
        time.sleep(1)

        retries = 0

        while retries < max_retries:
            try:
                play_by_play_df = get_play_df(str(game), timeout=2)
                fails = 0
                processed_df = tov_processor(play_by_play_df, teams)

                tov_df = pd.concat([tov_df, processed_df], ignore_index=True)

                # Save progress after each successful game
                tov_df.to_csv(tov_save_path, index=False)

                break
            # RequestException is the base class of Timeout, so it covers both
            except requests.exceptions.RequestException as e:
                print(f"Timeout or error for game {game}: {e}. Retrying...")
                # Wait 10 s, 12 s, then 14 s before the next attempt
                time.sleep((retries + 5) * 2)
                retries += 1
        else:
            # Reached only when every attempt failed (the loop ended without break)
            print(f"Failed to fetch data for game {game} after {max_retries} retries.")
            failed_games.append(game)
            pd.DataFrame({'gameId': failed_games}).to_csv(failed_save_path, index=False)
            fails += 1

        # Long cool-down once three or more games in a row have failed
        if fails >= 3:
            time.sleep(500)

    # Return the table so callers can use the result (previously returned None)
    return tov_df


1230it [00:00, 1657241.86it/s]


In [None]:
# Progress is saved to disk automatically. If several games fail in a row,
# stop execution and resume it after an hour or more.
df = get_season_tov_df(year=year)
df.head()