In [19]:
# Calculate pitch data from json

import json
import os # Useful for path handling
import math

def read_pitch_data_from_json(file_path):
    """
    Read a JSON file and return the whole decoded document.

    No sub-structure is extracted here: callers navigate the returned object
    themselves (for example ``obj['result']['textRelayData']['textRelays']``).

    Args:
        file_path (str): The full path to the JSON file.

    Returns:
        The decoded JSON document exactly as ``json.load`` produces it, or
        ``None`` if the file does not exist, cannot be read, or does not
        contain valid JSON. A message describing the failure is printed in
        each of those cases.
    """
    # Report a missing file with its own message rather than a generic error.
    if not os.path.exists(file_path):
        print(f"Error: File not found at {file_path}")
        return None

    try:
        # 'with open(...)' ensures the file is closed automatically
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON from file: {e}")
    except Exception as e:
        # Best-effort boundary for a notebook helper: any other failure
        # (permissions, bad encoding, path is a directory, ...) is reported
        # and turned into None instead of aborting the cell.
        print(f"An unexpected error occurred: {e}")
    return None

def calculate_plate_height(pitch_data):
    """
    Calculates the flight time and the vertical position (z_plate) at the
    moment the trajectory reaches y = 0.

    The trajectory is modelled with constant acceleration on each axis:
    ``y(t) = y0 + vy0*t + 0.5*ay*t^2`` and likewise for z.

    Args:
        pitch_data (dict): Must contain 'y0', 'vy0', 'ay', 'z0', 'vz0', 'az'.
            The values are used as-is (no conversion), so they must already
            be numbers.

    Returns:
        dict or None:
            * ``{"time_of_flight": t, "z_plate": z}`` on success.
            * ``{"error": message}`` when the quadratic cannot be solved
              (negative discriminant, or zero y-acceleration).
            * ``None`` when a required key is missing (a message is printed).
    """
    try:
        y0 = pitch_data['y0']
        vy0 = pitch_data['vy0']
        ay = pitch_data['ay']
        z0 = pitch_data['z0']
        vz0 = pitch_data['vz0']
        az = pitch_data['az']
    except KeyError as e:
        print(f"Missing required key in pitch data: {e}")
        return None

    # --- Step 1: Calculate Flight Time (t) ---
    # 0 = (0.5 * ay) * t^2 + vy0 * t + y0
    a = 0.5 * ay
    b = vy0
    c = y0

    # With a == 0 the equation is not quadratic and the formula below would
    # divide by zero; report it the same way as other unusable input.
    if a == 0:
        return {"error": "Invalid trajectory data (zero y-acceleration)."}

    discriminant = (b**2) - (4 * a * c)

    if discriminant < 0:
        return {"error": "Invalid trajectory data (negative discriminant)."}

    # Using the 'minus' branch for the physical solution (time to reach plate)
    time_of_flight = (-b - math.sqrt(discriminant)) / (2 * a)

    # --- Step 2: Calculate Vertical Position (z_plate) ---
    # z_plate = z0 + vz0 * t + 0.5 * az * t^2
    z_plate = z0 + (vz0 * time_of_flight) + (0.5 * az * (time_of_flight**2))

    # Strike-zone classification is intentionally not done here: this function
    # only needs the six kinematic keys, so it also works on pitches that lack
    # 'topSz' / 'bottomSz' / 'crossPlateX' / 'stance'.
    return {
        "time_of_flight": time_of_flight,
        "z_plate": z_plate
    }

def classify_pitch_location(pitch_data, calculated_z):
    """
    Label a pitch as STRIKE or BALL from its horizontal plate crossing and
    its calculated height.

    Args:
        pitch_data (dict): Must contain 'topSz', 'bottomSz', 'crossPlateX'
            and 'stance'. A stance of "L" is treated as a left-handed batter;
            any other value takes the other branch.
        calculated_z: Height of the pitch, compared against 'bottomSz' and
            'topSz'.

    Returns:
        dict: 'x_location' and 'z_location' describe where the pitch is
        relative to the zone on each axis, and 'is_strike' holds the string
        "STRIKE" or "BALL".
    """
    # Half of the 17-inch plate, in feet; the zone is symmetric about x = 0.
    half_plate = 0.70835

    zone_top = pitch_data['topSz']
    zone_bottom = pitch_data['bottomSz']
    plate_x = pitch_data['crossPlateX']

    inside_x = -half_plate <= plate_x <= half_plate
    inside_z = zone_bottom <= calculated_z <= zone_top

    # Horizontal label: which side counts as "inside" flips with the stance.
    batter_is_lefty = pitch_data['stance'] == "L"
    if plate_x > half_plate:
        x_status = "Inside Out" if batter_is_lefty else "Outside Out"
    elif plate_x < -half_plate:
        x_status = "Outside Out" if batter_is_lefty else "Inside Out"
    else:
        x_status = "Within Strikezone X"

    # Vertical label.
    if calculated_z > zone_top:
        z_status = "High Out"
    elif calculated_z < zone_bottom:
        z_status = "Low Out"
    else:
        z_status = "Within Strikezone Z"

    return {
        "x_location": x_status,
        "z_location": z_status,
        "is_strike": "STRIKE" if (inside_x and inside_z) else "BALL",
    }

# 1. Define the path to your file
json_file_path = '251007 nc vs ss inn1.json'

# 2. Load the document and pick the first entry of the 'textRelays' list.
obj = read_pitch_data_from_json(json_file_path)
try:
    eighth = obj['result']['textRelayData']['textRelays'][0]
    textOptions = eighth['textOptions']
    ptsOptions = eighth['ptsOptions']
except (KeyError, IndexError, TypeError) as e:
    # obj is None when the file could not be read (TypeError on subscript);
    # KeyError / IndexError mean the document does not have the expected shape.
    print(f"Error: could not locate pitch data in {json_file_path}: {e!r}")
    eighth, textOptions, ptsOptions = None, [], []

processed_pitches = []

for pitch in ptsOptions:
    # calculate_plate_height returns None (missing keys) or a dict without
    # 'z_plate' (unsolvable trajectory); both end up as None values below.
    calculation_result = calculate_plate_height(pitch) or {}

    pitch_summary = pitch.copy()
    pitch_summary['calculated_z_plate'] = calculation_result.get('z_plate')
    pitch_summary['calculated_time'] = calculation_result.get('time_of_flight')

    processed_pitches.append(pitch_summary)

# --- Output the Results ---
print("\n--- Processed Pitch Data ---")
for p in processed_pitches:
    z_plate = p['calculated_z_plate']
    # A None height cannot be formatted with ':.4f'; show a placeholder instead.
    z_text = "N/A" if z_plate is None else f"{z_plate:.4f}"
    print(f"Pitch ID: {p['pitchId']} | Inning: {p['inn']} | Ball Count: {p['ballcount']}")
    print(f"  Cross Plate X: {p['crossPlateX']:.4f}")
    print(f"  Cross Plate Z: {z_text}")
    print("-" * 20)


--- Processed Pitch Data ---
Pitch ID: 251007_151509 | Inning: 1 | Ball Count: 1
  Cross Plate X: -0.0287
  Cross Plate Z: 1.2181
--------------------
Pitch ID: 251007_151529 | Inning: 1 | Ball Count: 2
  Cross Plate X: 0.0875
  Cross Plate Z: 1.3088
--------------------
Pitch ID: 251007_151553 | Inning: 1 | Ball Count: 3
  Cross Plate X: -0.2572
  Cross Plate Z: 2.5155
--------------------
Pitch ID: 251007_151616 | Inning: 1 | Ball Count: 4
  Cross Plate X: -0.2342
  Cross Plate Z: 2.3427
--------------------


In [54]:
# Get all pitch data from 1 inning from json

import json
import os
import math
import pandas as pd

# https://api-gw.sports.naver.com/schedule/games/44441007NCSS02025/relay?inning=1

# --- KINEMATICS AND ZONE CLASSIFICATION FUNCTIONS ---
def calculate_plate_height(pitch_data):
    """
    Solve ``y0 + vy0*t + 0.5*ay*t^2 = 0`` for the flight time and evaluate
    the z trajectory at that time.

    Returns ``None`` if any of 'y0', 'vy0', 'ay', 'z0', 'vz0', 'az' is
    missing, ``{"error": ...}`` if the quadratic cannot be solved, and
    otherwise ``{"time_of_flight": t, "z_plate": z}``.
    """
    required_keys = ('y0', 'vy0', 'ay', 'z0', 'vz0', 'az')
    try:
        y0, vy0, ay, z0, vz0, az = (pitch_data[key] for key in required_keys)
    except KeyError:
        # Stay quiet: this runs once per pitch during bulk processing.
        return None

    # Quadratic coefficients of y(t) = 0.
    half_ay = 0.5 * ay
    discriminant = vy0 ** 2 - 4 * half_ay * y0

    if half_ay == 0 or discriminant < 0:
        return {"error": "Invalid kinematics data."}

    # Minus branch of the quadratic formula (time to the plate, t > 0).
    t = (-vy0 - math.sqrt(discriminant)) / (2 * half_ay)

    return {
        "time_of_flight": t,
        "z_plate": z0 + vz0 * t + 0.5 * az * t ** 2,
    }

def classify_5x5_zone(crossPlateX, calculated_z, topSz, bottomSz):
    """
    Classifies a pitch into one of 25 zones (11 to 55) based on a 5x5 grid
    (3x3 strike zone + 1 block of shadow on every side), and flags pitches
    that fall outside even the shadow.

    Six ascending boundaries per axis delimit the five grid cells. The raw
    index on each axis is the position of the first boundary strictly greater
    than the coordinate:

        0      -> below the first boundary (outside the grid)
        1 .. 5 -> inside grid cell 1 .. 5
        6      -> at or beyond the last boundary (outside the grid)

    Args:
        crossPlateX: Horizontal plate-crossing coordinate, in feet.
        calculated_z: Height of the pitch, in the same units as topSz/bottomSz.
        topSz: Top of the strike zone.
        bottomSz: Bottom of the strike zone.

    Returns:
        dict with:
            'zone_5x5_id' (str): ``row * 10 + column`` with both clamped to
                1..5; row 1 is the lowest row, column 1 the most negative x.
            'is_outside_boundary' (bool): True only when either raw index is
                0 or 6, i.e. the pitch is outside the 5x5 grid.
            'raw_x_index', 'raw_z_index' (int): the unclamped indices (0..6).
    """
    # 1. Define Standard Baseball Constants
    PLATE_WIDTH_FT = 1.4167  # 17 inches
    HALF_PLATE = PLATE_WIDTH_FT / 2  # 0.70835

    # 2. Define Zone Block Sizes (one third of the zone on each axis)
    X_BLOCK = PLATE_WIDTH_FT / 3
    Z_BLOCK = (topSz - bottomSz) / 3

    # 3. X boundaries (6 boundaries delimit 5 cells)
    x_boundaries = [
        -HALF_PLATE - X_BLOCK,   # outer edge of the left shadow
        -HALF_PLATE,             # left edge of plate
        -HALF_PLATE + X_BLOCK,   # left-center boundary
        HALF_PLATE - X_BLOCK,    # right-center boundary
        HALF_PLATE,              # right edge of plate
        HALF_PLATE + X_BLOCK     # outer edge of the right shadow
    ]

    # 4. Z boundaries (6 boundaries delimit 5 cells)
    z_boundaries = [
        bottomSz - Z_BLOCK,      # outer edge of the low shadow
        bottomSz,                # bottom of zone
        bottomSz + Z_BLOCK,      # low-mid boundary
        topSz - Z_BLOCK,         # mid-high boundary
        topSz,                   # top of zone
        topSz + Z_BLOCK          # outer edge of the high shadow
    ]

    def first_boundary_above(value, boundaries):
        """Index of the first boundary > value, or len(boundaries) if none."""
        for i, boundary in enumerate(boundaries):
            if value < boundary:
                return i
        # Past every boundary: use a sentinel distinct from the last cell (5),
        # otherwise the right/high shadow cell is mistaken for "outside".
        return len(boundaries)

    x_index = first_boundary_above(crossPlateX, x_boundaries)
    z_index = first_boundary_above(calculated_z, z_boundaries)

    # Outside the grid means before the first boundary or past the last one.
    outside = (0, len(x_boundaries))
    is_outside_boundary = x_index in outside or z_index in outside

    # Clamp to 1..5 so out-of-grid pitches are reported in the nearest edge
    # cell (never 0x / 6x / x0 / x6) while the flag above still marks them.
    final_z_index = max(1, min(z_index, 5))
    final_x_index = max(1, min(x_index, 5))

    # Zone ID (e.g., Row 1 x 10 + Col 1 = 11, Row 5 x 10 + Col 5 = 55)
    zone_id = final_z_index * 10 + final_x_index

    return {
        "zone_5x5_id": str(zone_id),
        "is_outside_boundary": is_outside_boundary,
        "raw_x_index": x_index,
        "raw_z_index": z_index
    }

def read_pitch_data_from_json(file_path):
    """
    Load ``file_path`` as UTF-8 JSON and return the decoded object.

    Returns ``None`` (after printing a message) when the path does not exist
    or when reading / decoding fails for any reason.
    """
    if os.path.exists(file_path):
        try:
            with open(file_path, 'r', encoding='utf-8') as handle:
                parsed = json.load(handle)
        except Exception as err:
            print(f"An error occurred while reading/decoding the file: {err}")
            parsed = None
        return parsed

    print(f"Error: File not found at {file_path}")
    return None

# --- MAIN PROCESSING SCRIPT ---

# 1. Define the path to your file
json_file_path = '251007 nc vs ss inn1.json'

# 2. Load the full JSON object (None if the file is missing or unreadable)
full_data_obj = read_pitch_data_from_json(json_file_path)

# Bind both results up front so a failure below can never leave them
# undefined (or holding stale values from an earlier run of this cell).
pitcher_lookup = {}
pa_list = []

# Assuming the structure: obj -> 'result' -> 'textRelayData' -> 'textRelays' (list of PAs)
try:
    pa_list = full_data_obj['result']['textRelayData']['textRelays']
except (KeyError, TypeError):
    print("Error: Could not find 'textRelays' list. Check JSON path.")
    pa_list = []

# The lineup is looked up separately: a missing lineup only costs the pitcher
# names / handedness, it should not throw away the plate appearances.
try:
    home_pitcher_list = full_data_obj['result']['textRelayData']['homeLineup']['pitcher']
    away_pitcher_list = full_data_obj['result']['textRelayData']['awayLineup']['pitcher']
    all_pitchers_list = away_pitcher_list + home_pitcher_list
    # Map each pitcher's 'pcode' to the full lineup entry.
    pitcher_lookup = {p.get('pcode'): p for p in all_pitchers_list}
except (KeyError, TypeError) as e:
    print(f"Error: Could not read pitcher lineups ({e!r}). Pitcher details will be 'N/A'.")
    pitcher_lookup = {}

all_processed_pitches = []
# Loop through every Plate Appearance (PA) in the list
for pa_data in pa_list:

    # 1. Map ptsPitchId -> text entry, for the pitch-info entries (type 1) only.
    # We want: {"251007_151509": {"speed": "148", "stuff": "직구", ...}}
    pitch_details_map = {
        detail.get('ptsPitchId'): detail
        for detail in (pa_data.get('textOptions') or [])
        if detail.get('type') == 1
    }

    # 2. Extract PA-level context from the title, e.g. "8번타자 류지혁":
    #    last word -> batter name, first character -> lineup position.
    batter_title = (pa_data.get('title') or 'N/A').split(' ')
    batter_name = batter_title[-1]
    batter_lineup_pos = batter_title[0][:1]  # slice: safe on an empty first word
    is_batter_home = pa_data.get('homeOrAway') == "1"

    # Only a list of trajectory dicts can be processed; anything else
    # (missing, null, dict keyed by pitch id) has no usable pitches here.
    pts_pitches = pa_data.get('ptsOptions', [])
    if not isinstance(pts_pitches, list):
        continue

    for pitch in pts_pitches:
        # Start from the trajectory data (ptsOptions)
        pitch_summary = dict(pitch)

        # Matching text data (textOptions) via ptsPitchId lookup; may be absent.
        pitch_id = pitch.get('pitchId')
        details = pitch_details_map.get(pitch_id, {})

        # Game context. Falls back to {} so a pitch without a text entry
        # yields None fields instead of an AttributeError.
        currentGameState = details.get('currentGameState') or {}
        pitch_summary['home_score'] = currentGameState.get('homeScore')
        pitch_summary['away_score'] = currentGameState.get('awayScore')
        pitch_summary['strike'] = currentGameState.get('strike')
        pitch_summary['ball'] = currentGameState.get('ball')
        pitch_summary['out'] = currentGameState.get('out')
        pitch_summary['base1'] = currentGameState.get('base1')
        pitch_summary['base2'] = currentGameState.get('base2')
        pitch_summary['base3'] = currentGameState.get('base3')

        # Batter info
        pitch_summary['is_batter_home'] = is_batter_home
        pitch_summary['batter_id'] = currentGameState.get('batter')
        pitch_summary['batter_name'] = batter_name
        pitch_summary['batter_lineup_pos'] = batter_lineup_pos

        # Pitcher info
        pitcher_id = currentGameState.get('pitcher')
        pitcher_data = pitcher_lookup.get(pitcher_id, {})

        pitch_summary['pitcher_id'] = pitcher_id
        pitch_summary['pitcher_name'] = pitcher_data.get('name', 'N/A')

        # 'hitType' has the form X투X타; a leading '우' (right) means a
        # right-handed thrower. Missing / null values default to 'L'.
        hit_type = pitcher_data.get('hitType') or ''
        pitch_summary['pitcher_stance'] = 'R' if hit_type.startswith('우') else 'L'

        pitch_summary['pitch_type'] = details.get('stuff', 'N/A')
        pitch_summary['pitch_speed_kph'] = details.get('speed', 'N/A')
        pitch_summary['pitch_result'] = details.get('pitchResult', 'N/A')
        # True when any base field differs from "0".
        pitch_summary['is_throwing_stretch'] = any(
            pitch_summary[base] != "0" for base in ('base1', 'base2', 'base3')
        )

        # --- CALCULATIONS & ENRICHMENT ---
        # calculate_plate_height returns None (missing keys) or a dict without
        # 'z_plate' (unsolvable trajectory); skip the zone columns in both cases.
        calculation_result = calculate_plate_height(pitch) or {}
        calculated_z = calculation_result.get('z_plate')

        if calculated_z is not None:
            # 5x5 zone classification
            zone_results = classify_5x5_zone(
                pitch.get('crossPlateX', 0.0),
                calculated_z,
                pitch.get('topSz', 3.3),
                pitch.get('bottomSz', 1.6)
            )

            pitch_summary['plate_z_ft'] = round(calculated_z, 4)
            pitch_summary['zone_5x5_id'] = zone_results['zone_5x5_id']
            pitch_summary['is_outside_boundary'] = zone_results['is_outside_boundary']

        all_processed_pitches.append(pitch_summary)

# 5. Build the DataFrame and give the raw API columns snake_case names
column_renames = {
    'pitchId': 'pitch_id',
    'crossPlateX': 'plate_x_ft',
    'ballcount': 'ball_count',
    'crossPlateY': 'plate_y_ft',
    'topSz': 'strikezone_top',
    'bottomSz': 'strikezone_btm',
    'stance': 'batter_stance',
    'inn': 'inning',
}
df = pd.DataFrame(all_processed_pitches).rename(columns=column_renames)

# --- Output the Results ---
# For a quick look while debugging: print(df.head()), print(df.tail()),
# or print(all_processed_pitches[0]).

# Save to CSV (no index column)
df.to_csv('251007 nc vs ss inn1_pitch_data_processed.csv', index=False)

In [84]:
# Request json from api, get all innings pitch data
import os
import requests
import time
import json
import math
import pandas as pd


# --- Configuration ---
# Folder holding the raw per-inning JSON files (created on demand by get_json_files).
JSON_FOLDER = 'pitch_raw'
# Folder the processed per-game CSV is written to.
CSV_FOLDER = 'pitch_processed'
# Appended to the raw JSON file name, just before the ".json" extension.
JSON_SUFFIX = '_raw'
# Appended to the processed CSV file name, just before the ".csv" extension.
CSV_SUFFIX = '_processed'
# ---------------------

def _extract_game_data(data):
    """
    Safely extracts max_inning and the PA list from one inning's JSON data.

    Args:
        data (dict): Decoded JSON; the values are read from
            ``data['result']['textRelayData']``. Missing or null levels are
            treated as empty.

    Returns:
        tuple: ``(max_inning, pa_list, data)`` where
            * ``max_inning`` is the 'inn' value as an int (0 if it is missing
              or not numeric),
            * ``pa_list`` is a NEW list holding the 'textRelays' entries in
              reversed order (the original author's note says reversing gives
              chronological order); ``data`` itself is left untouched, so
              calling this twice on the same object gives the same result,
            * ``data`` is the input object, returned unchanged for callers
              that need the full document later.
    """
    if not isinstance(data, dict):
        return 0, [], data

    # 'or {}' also covers keys that are present but null in the JSON.
    text_relay_data = (data.get('result') or {}).get('textRelayData') or {}

    # max_inning needs to be extracted as an integer
    try:
        max_inning = int(text_relay_data.get('inn', 0))
    except (TypeError, ValueError):
        max_inning = 0

    # Reverse into a copy rather than in place, so the caller's JSON keeps
    # its original order.
    pa_list = list(reversed(text_relay_data.get('textRelays') or []))

    return max_inning, pa_list, data

def get_json_files(game_id):
    """
    Collects the plate appearances of a game, one inning at a time.

    For each inning the raw JSON is loaded from ``JSON_FOLDER`` when a cached
    file exists, otherwise it is fetched from the relay API and saved there.
    The number of innings to fetch is taken from the inning-1 data.

    Args:
        game_id (str): Game identifier used in the API URL and file names.

    Returns:
        tuple: ``(master_json_list, initial_data)`` - the list of all PAs
        across the collected innings, and the full JSON object of inning 1
        (``None`` if inning 1 could not be obtained).

    Collection stops early (returning what was gathered so far) on a fetch
    error, on empty/unusable data, when no valid max inning is available, or
    when an inning contains no plate appearances.
    """
    os.makedirs(JSON_FOLDER, exist_ok=True)
    inning_counter = 1
    max_inning = 1
    master_json_list = []
    initial_data = None  # Full JSON from inning 1, needed for pitcher lineups

    while inning_counter <= max_inning:
        raw_file_path = os.path.join(JSON_FOLDER, f'{game_id}_{inning_counter}{JSON_SUFFIX}.json')
        data = None      # Reset for each loop iteration
        fetched = False  # True only when this iteration hit the network

        # 1. CHECK FOR EXISTING FILE
        if os.path.exists(raw_file_path):
            print(f"Inning {inning_counter} JSON already exists. Loading from disk.")
            try:
                with open(raw_file_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)
            except (json.JSONDecodeError, OSError) as e:
                print(f"Error loading existing JSON file {raw_file_path}: {e}. Proceeding to refetch.")
                data = None  # Triggers the refetch below

        # 2. FETCH NEW DATA if necessary (or if file was corrupt)
        if data is None:
            print(f"Fetching Inning {inning_counter}...")
            url = f"https://api-gw.sports.naver.com/schedule/games/{game_id}/relay?inning={inning_counter}"

            try:
                r = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
                r.raise_for_status()
                data = r.json()
                fetched = True

                # Save the Raw JSON for the inning
                with open(raw_file_path, 'w', encoding='utf-8') as f:
                    json.dump(data, f, ensure_ascii=False, indent=4)
                print(f"  -> Successfully fetched and saved raw JSON to {raw_file_path}")

            # ValueError covers a non-JSON response body on requests versions
            # whose decode error is not a RequestException.
            except (requests.exceptions.RequestException, ValueError) as e:
                print(f"Error fetching Inning {inning_counter}: {e}")
                break  # Stop on any fetching error

        # 3. PROCESS DATA (both loaded and fetched)
        # Empty or non-dict data can never advance the inning counter, so stop
        # here instead of looping forever on it.
        if not data or not isinstance(data, dict):
            print(f"Inning {inning_counter} returned no usable data. Stopping.")
            break

        current_max_inning, pa_list, full_json_data = _extract_game_data(data)

        if inning_counter == 1:
            # Update max_inning based on the first inning's data
            max_inning = current_max_inning
            initial_data = full_json_data  # Store the full data for pitcher lookup
            print(f"Max inning set to: {max_inning}")

        if max_inning == 0:
            print(f"Unable to get a valid max_inning (is 0). Stopping.")
            break

        if not pa_list:
            print(f"Inning {inning_counter} contains no Plate Appearances. Stopping.")
            break

        # Collect the 'textRelays' list for later merging
        master_json_list.extend(pa_list)
        inning_counter += 1

        # Pause only after a real network fetch, to respect the API limits;
        # cached innings are processed back to back.
        if fetched:
            time.sleep(1)

    print(f"\nFinished data collection. Total PA lists collected: {len(master_json_list)}")
    return master_json_list, initial_data
    
def extract_team_codes(game_id):
    """
    Return ``(away_code, home_code)`` sliced out of a game id.

    The away code is characters 8-9 and the home code characters 10-11
    (0-based), e.g. '20250930LTHH02025' -> ('LT', 'HH'). Ids shorter than
    12 characters yield ('N/A', 'N/A').
    """
    if len(game_id) >= 12:
        return game_id[8:10], game_id[10:12]

    # Unexpected format: too short to contain both codes.
    return 'N/A', 'N/A'
    
def build_pitcher_lookup(full_inning_1_data):
    """
    Extracts pitcher data from the full Inning 1 game object and creates a lookup dictionary.

    Args:
        full_inning_1_data (dict): The full JSON object from the first inning fetch.
            Pitchers are read from ``['result']['textRelayData']`` under
            ``['homeLineup']['pitcher']`` and ``['awayLineup']['pitcher']``.

    Returns:
        dict: Maps each pitcher's 'pcode' to a shallow copy of their lineup
        entry with one extra key, 'stance_derived' ('R' when 'hitType' starts
        with '우', otherwise 'L'). Entries without a 'pcode' are skipped.
        Returns ``{}`` (after printing a message) when the lineup data cannot
        be found. The input object is not modified.
    """
    pitcher_lookup = {}

    try:
        data = full_inning_1_data['result']['textRelayData']
        home_pitcher_list = data['homeLineup']['pitcher']
        away_pitcher_list = data['awayLineup']['pitcher']
        all_pitchers_list = away_pitcher_list + home_pitcher_list

        for p in all_pitchers_list:
            pcode = p.get('pcode')
            if not pcode:
                continue

            # 'or' also covers a 'hitType' that is present but null, which
            # would otherwise fail on .startswith().
            hit_type = p.get('hitType') or ''

            # Copy so the derived key is not written into the caller's JSON.
            entry = dict(p)
            entry['stance_derived'] = 'R' if hit_type.startswith('우') else 'L'
            pitcher_lookup[pcode] = entry

        return pitcher_lookup

    except (KeyError, TypeError) as e:
        print(f"Error building pitcher lookup from lineup data: {e}")
        return {}

# --- KINEMATICS AND ZONE CLASSIFICATION FUNCTIONS ---

def calculate_plate_height(pitch_data):
    """
    Compute when the trajectory reaches y = 0 and how high the pitch is then.

    Uses constant-acceleration motion on the y and z axes. The result is
    ``{"time_of_flight": t, "z_plate": z}``; ``{"error": ...}`` is returned
    when the quadratic has no usable solution, and ``None`` when one of the
    keys 'y0', 'vy0', 'ay', 'z0', 'vz0', 'az' is absent.
    """
    try:
        start_y, vel_y, acc_y, start_z, vel_z, acc_z = [
            pitch_data[name] for name in ('y0', 'vy0', 'ay', 'z0', 'vz0', 'az')
        ]
    except KeyError:
        # No message on purpose: this is called for every pitch in bulk runs.
        return None

    # Step 1: flight time from 0 = a*t^2 + b*t + c
    a = 0.5 * acc_y
    discriminant = (vel_y ** 2) - (4 * a * start_y)

    if discriminant < 0:
        return {"error": "Invalid kinematics data."}
    if a == 0:
        return {"error": "Invalid kinematics data."}

    # The minus branch is the time to the plate (t > 0).
    root = math.sqrt(discriminant)
    time_of_flight = (-vel_y - root) / (2 * a)

    # Step 2: height at that time
    z_plate = start_z + (vel_z * time_of_flight) + (0.5 * acc_z * (time_of_flight ** 2))

    result = {"time_of_flight": time_of_flight}
    result["z_plate"] = z_plate
    return result

def classify_5x5_zone(crossPlateX, calculated_z, topSz, bottomSz):
    """
    Classifies a pitch into one of 25 zones (11 to 55) based on a 5x5 grid
    (3x3 strike zone + 1 block of shadow on every side), and flags pitches
    that fall outside even the shadow.

    Six ascending boundaries per axis delimit the five grid cells. The raw
    index on each axis is the position of the first boundary strictly greater
    than the coordinate:

        0      -> below the first boundary (outside the grid)
        1 .. 5 -> inside grid cell 1 .. 5
        6      -> at or beyond the last boundary (outside the grid)

    Args:
        crossPlateX: Horizontal plate-crossing coordinate, in feet.
        calculated_z: Height of the pitch, in the same units as topSz/bottomSz.
        topSz: Top of the strike zone.
        bottomSz: Bottom of the strike zone.

    Returns:
        dict with:
            'zone_5x5_id' (str): ``row * 10 + column`` with both clamped to
                1..5; row 1 is the lowest row, column 1 the most negative x.
            'is_outside_boundary' (bool): True only when either raw index is
                0 or 6, i.e. the pitch is outside the 5x5 grid.
            'raw_x_index', 'raw_z_index' (int): the unclamped indices (0..6).
    """
    # 1. Define Standard Baseball Constants
    PLATE_WIDTH_FT = 1.4167  # 17 inches
    HALF_PLATE = PLATE_WIDTH_FT / 2  # 0.70835

    # 2. Define Zone Block Sizes (one third of the zone on each axis)
    X_BLOCK = PLATE_WIDTH_FT / 3
    Z_BLOCK = (topSz - bottomSz) / 3

    # 3. X boundaries (6 boundaries delimit 5 cells)
    x_boundaries = [
        -HALF_PLATE - X_BLOCK,   # outer edge of the left shadow
        -HALF_PLATE,             # left edge of plate
        -HALF_PLATE + X_BLOCK,   # left-center boundary
        HALF_PLATE - X_BLOCK,    # right-center boundary
        HALF_PLATE,              # right edge of plate
        HALF_PLATE + X_BLOCK     # outer edge of the right shadow
    ]

    # 4. Z boundaries (6 boundaries delimit 5 cells)
    z_boundaries = [
        bottomSz - Z_BLOCK,      # outer edge of the low shadow
        bottomSz,                # bottom of zone
        bottomSz + Z_BLOCK,      # low-mid boundary
        topSz - Z_BLOCK,         # mid-high boundary
        topSz,                   # top of zone
        topSz + Z_BLOCK          # outer edge of the high shadow
    ]

    def first_boundary_above(value, boundaries):
        """Index of the first boundary > value, or len(boundaries) if none."""
        for i, boundary in enumerate(boundaries):
            if value < boundary:
                return i
        # Past every boundary: use a sentinel distinct from the last cell (5),
        # otherwise the right/high shadow cell is mistaken for "outside".
        return len(boundaries)

    x_index = first_boundary_above(crossPlateX, x_boundaries)
    z_index = first_boundary_above(calculated_z, z_boundaries)

    # Outside the grid means before the first boundary or past the last one.
    outside = (0, len(x_boundaries))
    is_outside_boundary = x_index in outside or z_index in outside

    # Clamp to 1..5 so out-of-grid pitches are reported in the nearest edge
    # cell (never 0x / 6x / x0 / x6) while the flag above still marks them.
    final_z_index = max(1, min(z_index, 5))
    final_x_index = max(1, min(x_index, 5))

    # Zone ID (e.g., Row 1 x 10 + Col 1 = 11, Row 5 x 10 + Col 5 = 55)
    zone_id = final_z_index * 10 + final_x_index

    return {
        "zone_5x5_id": str(zone_id),
        "is_outside_boundary": is_outside_boundary,
        "raw_x_index": x_index,
        "raw_z_index": z_index
    }
    
def process_plate_appearance(pa_data, pitcher_lookup, away_code, home_code):
    """
    Builds one summary dict per tracked pitch of a single Plate Appearance (PA).

    The PA's 'textOptions' events are walked in order:
      * type 8 events update the current batter (name, batting order, id)
        from their 'batterRecord';
      * type 1 events are pitches. Each one is joined to its trajectory entry
        in 'ptsOptions' via 'ptsPitchId' == 'pitchId'; pitches without a
        trajectory entry are skipped.

    Args:
        pa_data (dict): One entry of the 'textRelays' list.
        pitcher_lookup (dict): Maps pitcher code -> lineup entry (uses 'name'
            and 'hitType').
        away_code (str): Team code of the away team.
        home_code (str): Team code of the home team.

    Returns:
        list[dict]: For every matched pitch, the trajectory fields plus
        'plate_z_ft', game state, pitcher/batter info, pitch details,
        'is_outside_boundary' and 'zone_5x5_id'. When the plate height cannot
        be calculated, 'plate_z_ft' is None, 'zone_5x5_id' is 0 and
        'is_outside_boundary' is False.
    """
    processed_pitches_in_pa = []

    # Map pitchId (e.g., "251007_151509") -> full trajectory dict.
    # 'or []' also covers a 'ptsOptions' that is present but null.
    trajectory_map = {
        pitch.get('pitchId'): pitch
        for pitch in (pa_data.get('ptsOptions') or [])
    }

    # Batter context, updated by Type 8 events
    current_batter_name = 'N/A'
    current_batter_lineup_pos = 'N/A'
    current_batter_id = 'N/A'
    is_batter_home = pa_data.get('homeOrAway') == "1"

    # Team codes are fixed for the whole PA: the batting side is given by
    # 'homeOrAway', the pitching side is the other team.
    batter_team_code = home_code if is_batter_home else away_code
    pitcher_team_code = away_code if is_batter_home else home_code

    for detail in (pa_data.get('textOptions') or []):
        event_type = detail.get('type')

        # EVENT TYPE 8: BATTER CHANGE (context update only, not a pitch)
        if event_type == 8:
            batter_record = detail.get('batterRecord') or {}
            current_batter_name = batter_record.get('name', 'N/A')
            current_batter_lineup_pos = batter_record.get('batOrder', 'N/A')
            current_batter_id = batter_record.get('pcode', 'N/A')
            continue

        # Anything other than a pitch event is ignored.
        if event_type != 1:
            continue

        # EVENT TYPE 1: PITCH EVENT
        pitch = trajectory_map.get(detail.get('ptsPitchId'))
        if pitch is None:
            # Present in textOptions but without kinematic data: not a
            # tracked pitch, so there is nothing to compute.
            continue

        # --- A. TRAJECTORY DATA & CALCULATIONS ---
        pitch_summary = dict(pitch)

        pitch_summary['plate_z_ft'] = None
        zone_5x5_id = 0
        is_outside_boundary = False

        # calculate_plate_height returns None (missing keys) or a dict without
        # 'z_plate' (unsolvable trajectory); keep the defaults in both cases.
        calculation_result = calculate_plate_height(pitch) or {}
        calculated_z = calculation_result.get('z_plate')
        if calculated_z is not None:
            zone_results = classify_5x5_zone(
                pitch.get('crossPlateX', 0.0), calculated_z,
                pitch.get('topSz', 3.3), pitch.get('bottomSz', 1.6)
            )
            pitch_summary['plate_z_ft'] = round(calculated_z, 4)
            zone_5x5_id = zone_results['zone_5x5_id']
            is_outside_boundary = zone_results['is_outside_boundary']

        # --- B. CONTEXTUAL & PITCH DATA ---
        # The pitcher is read per pitch from the game state, so a pitching
        # change in the middle of a PA is picked up.
        currentGameState = detail.get('currentGameState') or {}
        pitcher_id = currentGameState.get('pitcher')
        pitcher_data = pitcher_lookup.get(pitcher_id, {})

        # 1. Game State
        pitch_summary['home_score'] = currentGameState.get('homeScore')
        pitch_summary['away_score'] = currentGameState.get('awayScore')
        pitch_summary['strike'] = currentGameState.get('strike')
        pitch_summary['ball'] = currentGameState.get('ball')
        pitch_summary['out'] = currentGameState.get('out')
        pitch_summary['base1'] = currentGameState.get('base1')
        pitch_summary['base2'] = currentGameState.get('base2')
        pitch_summary['base3'] = currentGameState.get('base3')

        # 2. Pitcher Info (from currentGameState + lookup)
        pitch_summary['pitcher_id'] = pitcher_id
        # 'hitType' has the form X투X타; a leading '우' (right) means a
        # right-handed thrower. Missing / null values default to 'L'.
        hit_type = pitcher_data.get('hitType') or ''
        pitch_summary['pitcher_stance'] = 'R' if hit_type.startswith('우') else 'L'
        pitch_summary['pitcher_team_code'] = pitcher_team_code
        pitch_summary['pitcher_name'] = pitcher_data.get('name', 'N/A')

        # 3. Batter Info (from the most recent Type 8 event)
        pitch_summary['batter_id'] = current_batter_id
        pitch_summary['batter_team_code'] = batter_team_code
        pitch_summary['batter_lineup_pos'] = current_batter_lineup_pos
        pitch_summary['batter_name'] = current_batter_name

        # 4. Pitch Details
        # True when any base field differs from "0".
        pitch_summary['is_throwing_stretch'] = any(
            pitch_summary[base] != "0" for base in ('base1', 'base2', 'base3')
        )
        pitch_summary['pitch_type'] = detail.get('stuff', 'N/A')
        pitch_summary['pitch_speed_kph'] = detail.get('speed', 'N/A')
        pitch_summary['pitch_result'] = detail.get('pitchResult', 'N/A')
        pitch_summary['is_outside_boundary'] = is_outside_boundary
        pitch_summary['zone_5x5_id'] = zone_5x5_id

        processed_pitches_in_pa.append(pitch_summary)

    return processed_pitches_in_pa

def main_processing_script(game_id):
    """
    Runs the full pipeline for one game: collect the inning JSON, flatten every
    tracked pitch into a row, write the rows to CSV and return them as a DataFrame.

    Args:
        game_id (str): Game identifier, e.g. '20250930LTHH02025'.

    Returns:
        pd.DataFrame: One row per processed pitch. An empty DataFrame is returned
        (and no CSV is written) when no plate appearances or no pitches were found.
    """
    master_pa_list, initial_data = get_json_files(game_id)

    if not master_pa_list:
        print("No plate appearances found. Exiting.")
        return pd.DataFrame()

    # Built only after the early exit so a failed download does not also print a
    # misleading "error building pitcher lookup" message.
    pitcher_lookup = build_pitcher_lookup(initial_data)
    away_code, home_code = extract_team_codes(game_id)

    # 3. Process all Plate Appearances
    all_processed_pitches = []
    for pa_data in master_pa_list:
        all_processed_pitches.extend(
            process_plate_appearance(pa_data, pitcher_lookup, away_code, home_code)
        )

    print(f"Total pitches processed: {len(all_processed_pitches)}")

    # Without this guard the summary below would fail on all_processed_pitches[0]
    # and on selecting columns from an empty frame.
    if not all_processed_pitches:
        print("No tracked pitches found. Exiting.")
        return pd.DataFrame()

    # 4. Convert to Pandas DataFrame and Clean
    df = pd.DataFrame(all_processed_pitches)

    df.rename(columns={
        'pitchId': 'pitch_id',
        'crossPlateX': 'plate_x_ft',
        'ballcount': 'ball_count',
        'crossPlateY': 'plate_y_ft',
        'topSz': 'strikezone_top',
        'bottomSz': 'strikezone_btm',
        'stance': 'batter_stance',
        'inn': 'inning'
    }, inplace=True)

    # 5. Output the Results
    # to_csv does not create missing directories, so make sure the folder exists.
    os.makedirs(CSV_FOLDER, exist_ok=True)
    output_filename = f'./{CSV_FOLDER}/{game_id}{CSV_SUFFIX}.csv'
    df.to_csv(output_filename, index=False, encoding='utf-8')

    print(f"\n--- Final Processed DataFrame Saved to {output_filename} ---")
    print("5 rows with key columns:")
    key_columns = [
        'pitch_id', 'pitcher_id', 'batter_id', 'batter_stance', 'pitcher_name',
        'pitcher_stance', 'pitch_type', 'pitcher_team_code', 'batter_team_code'
    ]
    # Only show the columns that are actually present; a key missing from the
    # trajectory data must not turn the preview into a KeyError.
    print(df[[col for col in key_columns if col in df.columns]].head())
    print(f"No. of Columns: {len(all_processed_pitches[0])}")
    print(all_processed_pitches[0])

    return df
    
# --- Execution Block ---
# Runs the pipeline for one hard-coded game when this code is executed as the main
# module. The resulting DataFrame is kept in `final_df` for later inspection.
if __name__ == '__main__':
    game_id = '20250930LTHH02025'
    
    # Execute the main function
    final_df = main_processing_script(game_id)

Inning 1 JSON already exists. Loading from disk.
Max inning set to: 10
Inning 2 JSON already exists. Loading from disk.
Inning 3 JSON already exists. Loading from disk.
Inning 4 JSON already exists. Loading from disk.
Inning 5 JSON already exists. Loading from disk.
Inning 6 JSON already exists. Loading from disk.
Inning 7 JSON already exists. Loading from disk.
Inning 8 JSON already exists. Loading from disk.
Inning 9 JSON already exists. Loading from disk.
Inning 10 JSON already exists. Loading from disk.

Finished data collection. Total PA lists collected: 94
Total pitches processed: 268

--- Final Processed DataFrame Saved to ./pitch_processed/20250930LTHH02025_processed.csv ---
5 rows with key columns:
        pitch_id pitcher_id batter_id batter_stance pitcher_name  \
0  250930_183034      54755     52568             R          와이스   
1  250930_183050      54755     52568             R          와이스   
2  250930_183104      54755     52568             R          와이스   
3  250930_1

In [90]:
# Request entire 2025 season data
import os
import requests
import time
import json
import math
import random
import pandas as pd


# --- Configuration ---
# Folder in which get_json_files caches the raw per-inning JSON files.
JSON_FOLDER = 'pitch_raw'
# Folder used by main_processing_script for the processed per-game CSV.
CSV_FOLDER = 'pitch_processed'
# Suffixes appended to the file stems:
#   '<game_id>_<inning>_raw.json' and '<game_id>_processed.csv'.
JSON_SUFFIX = '_raw'
CSV_SUFFIX = '_processed'
# HTTP headers sent with every request issued by make_safe_request.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36',
    'Accept-Language': 'en-US,en;q=0.9',
}
# ---------------------

def _extract_game_data(data):
    """Safely extracts max_inning and PA list from the JSON data."""
    text_relay_data = data.get('result', {}).get('textRelayData', {})
    
    # max_inning needs to be extracted as an integer
    try:
        max_inning = int(text_relay_data.get('inn', 0))
    except (TypeError, ValueError):
        max_inning = 0
        
    pa_list = text_relay_data.get('textRelays', [])

    # REVERSE THE LIST FOR CHRONOLOGICAL ORDER
    pa_list.reverse() 
    
    # The full data object is needed later to build the pitcher lookup (from inn 1 data)
    return max_inning, pa_list, data

def make_safe_request(url, max_retries=5):
    """
    Performs a GET request with anti-blocking measures: a random delay before every
    attempt and an exponentially growing wait after each HTTP 429 response.

    Args:
        url (str): The URL to request.
        max_retries (int): Maximum number of attempts before giving up.

    Returns:
        The Response object when a 200 is received, or None on failure.
    """
    # Wait applied after a failed attempt; doubled after every 429 (2, 4, 8, ...).
    wait_time = 2
    attempts_made = 0

    for attempt in range(max_retries):
        attempts_made = attempt + 1
        # No point in sleeping after the final attempt: nothing follows it.
        is_last_attempt = attempts_made == max_retries

        try:
            # 1. Randomized Delay (The critical anti-blocking measure)
            time.sleep(random.uniform(2.5, 9.5))

            # 2. Make the request with standard headers
            response = requests.get(url, headers=HEADERS, timeout=30)

            # 3. Handle success
            if response.status_code == 200:
                return response

            # 4. Handle "Too Many Requests" (429) using exponential backoff
            if response.status_code == 429:
                if is_last_attempt:
                    print(f"[{url}] Received 429 on attempt {attempts_made}.")
                else:
                    print(f"[{url}] Received 429 on attempt {attempt + 1}. Waiting {wait_time}s and retrying.")
                    time.sleep(wait_time)
                    wait_time *= 2

            # 5. Handle other HTTP errors (404, 500, etc.)
            else:
                # Raises for 4xx/5xx; any other non-200 status simply falls through
                # to the next attempt.
                response.raise_for_status()

        except requests.exceptions.RequestException as e:
            print(f"[{url}] Request failed on attempt {attempt + 1}: {e}")
            if not is_last_attempt:
                # Wait before retrying on general network errors
                time.sleep(wait_time)

        except Exception as e:
            # Unexpected errors are not retried.
            print(f"[{url}] Unexpected error: {e}")
            break

    # Report the attempts actually made, which is fewer than max_retries after an
    # early break.
    print(f"[{url}] Failed to retrieve data after {attempts_made} attempts.")
    return None

def get_json_files(game_id):
    """
    Fetches game data by inning, checks for existing files, and merges PA data.

    Cached files in JSON_FOLDER are used when they hold a usable payload; otherwise
    the inning is requested through make_safe_request and saved. The number of
    innings to read comes from the inning-1 payload.

    Args:
        game_id (str): Game identifier used in the API URL and in cache file names.

    Returns:
        tuple: (master_json_list, initial_data) where master_json_list is the list
        of all PAs collected so far and initial_data is the full JSON for Inning 1
        (None when Inning 1 could not be obtained).
    """
    os.makedirs(JSON_FOLDER, exist_ok=True)
    inning_counter = 1
    max_inning = 1
    master_json_list = []
    initial_data = None

    while inning_counter <= max_inning:
        raw_file_path = os.path.join(JSON_FOLDER, f'{game_id}_{inning_counter}{JSON_SUFFIX}.json')
        data = None

        # 1. CHECK FOR EXISTING FILE
        if os.path.exists(raw_file_path):
            try:
                with open(raw_file_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)
            except json.JSONDecodeError as e:
                print(f"Error loading existing JSON file {raw_file_path}: {e}. Proceeding to refetch.")
                data = None
            else:
                if not data:
                    # A valid but empty payload ({}, [], null) is useless. It used to
                    # skip both the fetch and the processing step, so the inning
                    # counter never advanced and the loop never ended.
                    print(f"Existing JSON file {raw_file_path} is empty. Proceeding to refetch.")
                    data = None

        # 2. FETCH NEW DATA if necessary
        if data is None:
            print(f"Fetching Inning {inning_counter}...")
            url = f"https://api-gw.sports.naver.com/schedule/games/{game_id}/relay?inning={inning_counter}"

            response = make_safe_request(url)

            if response is None or response.status_code != 200:
                # The request failed or returned a non-200 status after max retries
                print(f"Failed to retrieve data for Inning {inning_counter} after all retries. Stopping.")
                break

            try:
                data = response.json()
            except ValueError:
                # json.JSONDecodeError is a ValueError subclass, so this also covers it.
                print(f"Error decoding JSON for Inning {inning_counter}. Skipping.")
                break  # Stop on corrupted response

            if not data:
                # Do not cache an empty payload; it would only be refetched next run.
                print(f"Inning {inning_counter} returned an empty payload. Stopping.")
                break

            # Save the Raw JSON for the inning
            with open(raw_file_path, 'w', encoding='utf-8') as f:
                json.dump(data, f, ensure_ascii=False, indent=4)
            print(f"  -> Successfully fetched and saved raw JSON to {raw_file_path}")

        # 3. PROCESS DATA (both loaded and fetched); data is non-empty here.
        current_max_inning, pa_list, full_json_data = _extract_game_data(data)

        if inning_counter == 1:
            # Update max_inning based on the first inning's data
            max_inning = current_max_inning
            initial_data = full_json_data  # Store the full data for pitcher lookup
            print(f"Max inning set to: {max_inning}")

        if max_inning == 0:
            print(f"Unable to get a valid max_inning (is 0). Stopping.")
            break

        if not pa_list:
            print(f"Inning {inning_counter} contains no Plate Appearances. Stopping.")
            break

        # Collect the 'textRelays' list for later merging
        master_json_list.extend(pa_list)
        inning_counter += 1

    print(f"\nFinished data collection. Total PA lists collected: {len(master_json_list)}")
    return master_json_list, initial_data
    
def extract_team_codes(game_id):
    """
    Splits the two team codes out of a game_id string.

    Characters 8-9 are returned as the away code and characters 10-11 as the home
    code, e.g. '20250930LTHH02025' -> ('LT', 'HH'). A game_id shorter than 12
    characters yields ('N/A', 'N/A').
    """
    has_both_codes = len(game_id) >= 12
    if has_both_codes:
        return game_id[8:10], game_id[10:12]

    # Unexpected format: too short to contain both codes
    return 'N/A', 'N/A'
    
def build_pitcher_lookup(full_inning_1_data):
    """
    Builds a pcode -> pitcher-record dictionary from the Inning 1 game object.

    Args:
        full_inning_1_data (dict): The full JSON object from the first inning fetch.

    Returns:
        dict: Maps each pitcher's 'pcode' to that pitcher's lineup record. Every
        stored record (the same dict object found in the input) gains a
        'stance_derived' key: 'R' when 'hitType' starts with '우', otherwise 'L'.
        An empty dict is returned when the expected lineup structure is missing.
    """
    try:
        relay = full_inning_1_data['result']['textRelayData']
        home_staff = relay['homeLineup']['pitcher']
        away_staff = relay['awayLineup']['pitcher']

        lookup = {}
        # Away pitchers first, then home pitchers
        for record in away_staff + home_staff:
            code = record.get('pcode')
            if not code:
                continue  # entries without a pcode cannot be looked up later

            # Store the stance once so it does not have to be re-derived per pitch
            starts_with_right = record.get('hitType', 'L').startswith('우')
            record['stance_derived'] = 'R' if starts_with_right else 'L'
            lookup[code] = record

        return lookup

    except (KeyError, TypeError) as e:
        print(f"Error building pitcher lookup from lineup data: {e}")
        return {}

# --- KINEMATICS AND ZONE CLASSIFICATION FUNCTIONS ---

def calculate_plate_height(pitch_data):
    """
    Solves the pitch trajectory for the time at which y reaches 0 and evaluates
    the vertical position z at that time.

    Args:
        pitch_data (dict): Must contain 'y0', 'vy0', 'ay', 'z0', 'vz0' and 'az'.

    Returns:
        dict | None: {"time_of_flight", "z_plate"} on success,
        {"error": "Invalid kinematics data."} when the quadratic has no real
        solution or 'ay' is zero, and None when a required key is missing.
    """
    required_keys = ('y0', 'vy0', 'ay', 'z0', 'vz0', 'az')
    try:
        y0, vy0, ay, z0, vz0, az = (pitch_data[key] for key in required_keys)
    except KeyError:
        # Stay silent here: this runs for every pitch during bulk processing
        return None

    # Step 1: flight time from 0 = (0.5 * ay) * t^2 + vy0 * t + y0
    quad_a = 0.5 * ay
    discriminant = (vy0**2) - (4 * quad_a * y0)

    if discriminant < 0 or quad_a == 0:
        return {"error": "Invalid kinematics data."}

    # The minus branch is the root used as the time to the plate
    flight_time = (-vy0 - math.sqrt(discriminant)) / (2 * quad_a)

    # Step 2: vertical position at that time
    height_at_plate = z0 + (vz0 * flight_time) + (0.5 * az * (flight_time**2))

    return {"time_of_flight": flight_time, "z_plate": height_at_plate}

def _grid_index(value, boundaries):
    """Returns how many leading boundaries `value` has reached: 0 when it is below
    the first boundary, len(boundaries) when it is at or beyond the last one."""
    return next(
        (i for i, boundary in enumerate(boundaries) if value < boundary),
        len(boundaries),
    )


def classify_5x5_zone(crossPlateX, calculated_z, topSz, bottomSz):
    """
    Classifies a pitch into one of 25 zones (11 to 55) based on a 5x5 grid
    (3x3 zone + 1 block shadow), and returns a flag if it is outside even the shadow.

    Args:
        crossPlateX (float): Horizontal position of the pitch, in feet from the
            centre of the plate.
        calculated_z (float): Vertical position of the pitch, in feet.
        topSz (float): Top of the strike zone, in feet.
        bottomSz (float): Bottom of the strike zone, in feet.

    Returns:
        dict: 'zone_5x5_id' is the string form of row * 10 + column, where row 1 is
        the lowest band and column 1 the most negative-x band. 'is_outside_boundary'
        is True when the pitch lies beyond the 5x5 grid in either direction.
        'raw_x_index' / 'raw_z_index' are the unclamped indices: 0 = below the
        grid, 1-5 = inside the grid, 6 = above the grid.
    """
    # 1. Define Standard Baseball Constants
    PLATE_WIDTH_FT = 1.4167  # 17 inches
    HALF_PLATE = PLATE_WIDTH_FT / 2  # 0.70835

    # 2. Define Zone Block Sizes
    X_BLOCK = PLATE_WIDTH_FT / 3
    Z_BLOCK = (topSz - bottomSz) / 3

    # 3. Define X Boundaries (6 boundaries create 5 zones)
    x_boundaries = [
        -HALF_PLATE - X_BLOCK,   # X1: Far Left Shadow boundary
        -HALF_PLATE,             # X2: Left edge of plate
        -HALF_PLATE + X_BLOCK,   # X3: Left-center boundary
        HALF_PLATE - X_BLOCK,    # X4: Right-center boundary
        HALF_PLATE,              # X5: Right edge of plate
        HALF_PLATE + X_BLOCK     # X6: Far Right Shadow boundary
    ]

    # 4. Define Z Boundaries (6 boundaries create 5 zones)
    z_boundaries = [
        bottomSz - Z_BLOCK,      # Z1: Far Low Shadow boundary
        bottomSz,                # Z2: Bottom of Zone
        bottomSz + Z_BLOCK,      # Z3: Low-mid boundary
        topSz - Z_BLOCK,         # Z4: Mid-high boundary
        topSz,                   # Z5: Top of Zone
        topSz + Z_BLOCK          # Z6: Far High Shadow boundary
    ]

    # Raw indices: 0 (below grid), 1 to 5 (inside grid), 6 (beyond grid).
    # The previous loop used 5 for "beyond the last boundary", which is also the
    # index of the valid fifth band, so pitches in the right or top shadow band
    # were wrongly flagged as outside.
    x_index = _grid_index(crossPlateX, x_boundaries)
    z_index = _grid_index(calculated_z, z_boundaries)

    # --- Determine Outside Boundary Flag ---
    beyond = len(x_boundaries)  # 6; both boundary lists have the same length
    is_outside_boundary = x_index in (0, beyond) or z_index in (0, beyond)

    # --- Calculate Final Zone ID (clamping index to 1-5) ---
    # Index 0 maps to band 1 and index 6 maps to band 5, so the id never becomes
    # 00 or 66 while the flag above still records the overflow.
    final_z_index = max(1, min(z_index, 5))
    final_x_index = max(1, min(x_index, 5))

    # Zone ID (e.g., Row 1 x 10 + Col 1 = 11, Row 5 x 10 + Col 5 = 55)
    zone_id = final_z_index * 10 + final_x_index

    return {
        "zone_5x5_id": str(zone_id),
        "is_outside_boundary": is_outside_boundary,
        "raw_x_index": x_index,
        "raw_z_index": z_index
    }
    
def process_plate_appearance(pa_data, pitcher_lookup, away_code, home_code):
    """
    Processes all events (type 8 for batter context, type 1 for pitch)
    within a single Plate Appearance (PA), correctly linking context to pitch data.

    Args:
        pa_data (dict): One plate appearance, holding 'textOptions' (events),
            'ptsOptions' (trajectory records) and 'homeOrAway'.
        pitcher_lookup (dict): Maps a pitcher code to that pitcher's record.
        away_code (str): Team code of the away team.
        home_code (str): Team code of the home team.

    Returns:
        list: One dictionary per type-1 event that has a matching trajectory
        record. Each starts as a copy of the trajectory record and is extended
        with game state, pitcher, batter and zone information.
    """
    processed_pitches_in_pa = []

    # Map of trajectory data keyed by pitchId (e.g. "251007_151509").
    # `or []` also covers a key that is present but null.
    trajectory_map = {
        pitch.get('pitchId'): pitch
        for pitch in (pa_data.get('ptsOptions') or [])
    }

    # Contextual variables to be updated by Type 8 events
    current_batter_name = 'N/A'
    current_batter_lineup_pos = 'N/A'
    current_batter_id = 'N/A'
    is_batter_home = pa_data.get('homeOrAway') == "1"

    # Determine static PA team codes
    batter_team_code = home_code if is_batter_home else away_code
    pitcher_team_code = away_code if is_batter_home else home_code

    # NOTE: the pitcher is deliberately resolved per pitch (from currentGameState)
    # rather than once per PA, so a pitcher change during a PA is picked up.

    # Iterate through Text Events (textOptions)
    for detail in (pa_data.get('textOptions') or []):
        event_type = detail.get('type')

        # EVENT TYPE 8: BATTER CHANGE (Context Update)
        if event_type == 8:
            batter_record = detail.get('batterRecord') or {}
            current_batter_name = batter_record.get('name', 'N/A')
            current_batter_lineup_pos = batter_record.get('batOrder', 'N/A')
            current_batter_id = batter_record.get('pcode', 'N/A')
            continue  # Do not process this event as a pitch

        # Only EVENT TYPE 1 (pitch) is processed below
        if event_type != 1:
            continue

        pitch = trajectory_map.get(detail.get('ptsPitchId'))
        if pitch is None:
            # The pitch exists in textOptions but has no trajectory record; skip it.
            continue

        # --- A. MERGE DATA & CALCULATIONS ---

        # Start from a copy of the trajectory data (ptsOptions)
        pitch_summary = dict(pitch)

        pitch_summary['plate_z_ft'] = None
        zone_5x5_id = 0
        is_outside_boundary = False

        # calculate_plate_height returns None for missing keys and an
        # {"error": ...} dict for unsolvable kinematics. The error dict is truthy,
        # so test for an actual z value; passing None on to the zone classifier
        # raises a TypeError.
        calculation_result = calculate_plate_height(pitch)
        calculated_z = calculation_result.get('z_plate') if calculation_result else None

        if calculated_z is not None:
            zone_results = classify_5x5_zone(
                pitch.get('crossPlateX', 0.0), calculated_z,
                pitch.get('topSz', 3.3), pitch.get('bottomSz', 1.6)
            )
            pitch_summary['plate_z_ft'] = round(calculated_z, 4)
            zone_5x5_id = zone_results['zone_5x5_id']
            is_outside_boundary = zone_results['is_outside_boundary']

        # --- B. ADD CONTEXTUAL & PITCH DATA ---

        game_state = detail.get('currentGameState') or {}
        pitcher_id = game_state.get('pitcher')
        pitcher_data = pitcher_lookup.get(pitcher_id, {})

        # 1. Game State
        pitch_summary['is_batter_home'] = is_batter_home
        pitch_summary['home_score'] = game_state.get('homeScore')
        pitch_summary['away_score'] = game_state.get('awayScore')
        pitch_summary['strike'] = game_state.get('strike')
        pitch_summary['ball'] = game_state.get('ball')
        pitch_summary['out'] = game_state.get('out')
        pitch_summary['base1'] = game_state.get('base1')
        pitch_summary['base2'] = game_state.get('base2')
        pitch_summary['base3'] = game_state.get('base3')

        # 2. Pitcher Info (From currentGameState)
        pitch_summary['pitcher_id'] = pitcher_id
        # 'R' when 'hitType' starts with '우', otherwise 'L'; a missing or null
        # value falls back to 'L'.
        hit_type = pitcher_data.get('hitType') or 'L'
        pitch_summary['pitcher_stance'] = 'R' if hit_type.startswith('우') else 'L'
        pitch_summary['pitcher_team_code'] = pitcher_team_code
        pitch_summary['pitcher_name'] = pitcher_data.get('name', 'N/A')

        # 3. Batter Info (From Type 8 event)
        pitch_summary['batter_id'] = current_batter_id
        pitch_summary['batter_team_code'] = batter_team_code
        pitch_summary['batter_lineup_pos'] = current_batter_lineup_pos
        pitch_summary['batter_name'] = current_batter_name

        # 4. Pitch Details
        # A base counts as occupied when its value is present and not "0".
        # NOTE(review): missing base state (None) is treated as empty instead of
        # occupied - confirm this matches the intended default.
        pitch_summary['is_throwing_stretch'] = any(
            pitch_summary[base] not in (None, "0", 0)
            for base in ('base1', 'base2', 'base3')
        )

        pitch_summary['pitch_type'] = detail.get('stuff', 'N/A')
        pitch_summary['pitch_speed_kph'] = detail.get('speed', 'N/A')
        pitch_summary['pitch_result'] = detail.get('pitchResult', 'N/A')
        pitch_summary['is_outside_boundary'] = is_outside_boundary
        pitch_summary['zone_5x5_id'] = zone_5x5_id

        processed_pitches_in_pa.append(pitch_summary)

    return processed_pitches_in_pa

def main_processing_script(game_id):
    """
    Runs the full pipeline for one game: collect the inning JSON, flatten every
    tracked pitch into a row, write the rows to CSV and return them as a DataFrame.

    Args:
        game_id (str): Game identifier, e.g. '20250929LTSK02025'.

    Returns:
        pd.DataFrame: One row per processed pitch. An empty DataFrame is returned
        (and no CSV is written) when no plate appearances or no pitches were found.
    """
    master_pa_list, initial_data = get_json_files(game_id)

    if not master_pa_list:
        print("No plate appearances found. Exiting.")
        return pd.DataFrame()

    # Built only after the early exit so a failed download does not also print a
    # misleading "error building pitcher lookup" message.
    pitcher_lookup = build_pitcher_lookup(initial_data)
    away_code, home_code = extract_team_codes(game_id)

    # 3. Process all Plate Appearances
    all_processed_pitches = []
    for pa_data in master_pa_list:
        all_processed_pitches.extend(
            process_plate_appearance(pa_data, pitcher_lookup, away_code, home_code)
        )

    print(f"Total pitches processed: {len(all_processed_pitches)}")

    # Without this guard the summary below would fail on all_processed_pitches[0]
    # and on selecting columns from an empty frame.
    if not all_processed_pitches:
        print("No tracked pitches found. Exiting.")
        return pd.DataFrame()

    # 4. Convert to Pandas DataFrame and Clean
    df = pd.DataFrame(all_processed_pitches)

    df.rename(columns={
        'pitchId': 'pitch_id',
        'crossPlateX': 'plate_x_ft',
        'ballcount': 'ball_count',
        'crossPlateY': 'plate_y_ft',
        'topSz': 'strikezone_top',
        'bottomSz': 'strikezone_btm',
        'stance': 'batter_stance',
        'inn': 'inning'
    }, inplace=True)

    # 5. Output the Results
    # to_csv does not create missing directories, so make sure the folder exists.
    os.makedirs(CSV_FOLDER, exist_ok=True)
    output_filename = f'./{CSV_FOLDER}/{game_id}{CSV_SUFFIX}.csv'
    df.to_csv(output_filename, index=False, encoding='utf-8')

    print(f"\n--- Final Processed DataFrame Saved to {output_filename} ---")
    print("5 rows with key columns:")
    key_columns = [
        'pitch_id', 'pitcher_id', 'batter_id', 'batter_stance', 'pitcher_name',
        'pitcher_stance', 'pitch_type', 'pitcher_team_code', 'batter_team_code'
    ]
    # Only show the columns that are actually present; a key missing from the
    # trajectory data must not turn the preview into a KeyError.
    print(df[[col for col in key_columns if col in df.columns]].head())

    # Dump the first processed pitch field by field as a schema sample
    first_pitch = all_processed_pitches[0]
    print(f"No. of Columns: {len(first_pitch)}")
    for key, value in first_pitch.items():
        print(f"{key}: {value}")

    return df
    
# --- Execution Block ---
# Runs the pipeline for one hard-coded game when this code is executed as the main
# module. The resulting DataFrame is kept in `final_df` for later inspection.
if __name__ == '__main__':
    game_id = '20250929LTSK02025'
    # Alternative game id, currently disabled:
    #game_id = '20250930LTHH02025'
    
    # Execute the main function
    final_df = main_processing_script(game_id)

Max inning set to: 9

Finished data collection. Total PA lists collected: 90
Total pitches processed: 276

--- Final Processed DataFrame Saved to ./pitch_processed/20250929LTSK02025_processed.csv ---
5 rows with key columns:
        pitch_id pitcher_id batter_id batter_stance pitcher_name  \
0  250929_183021      54833     52568             R          앤더슨   
1  250929_183037      54833     52568             R          앤더슨   
2  250929_183053      54833     52568             R          앤더슨   
3  250929_183110      54833     52568             R          앤더슨   
4  250929_183125      54833     52568             R          앤더슨   

  pitcher_stance pitch_type pitcher_team_code batter_team_code  
0              R         직구                SK               LT  
1              R         직구                SK               LT  
2              R         커브                SK               LT  
3              R         직구                SK               LT  
4              R         커브              

In [65]:
# fetching all innings json from url

import requests
import json
import os
import time

# --- Configuration ---
# Folder in which get_json_files caches the raw per-inning JSON files.
JSON_FOLDER = 'pitch_raw'
# Folder for the processed per-game CSV; not referenced by the code in this cell
# (main_processing_script uses it to build the output path).
CSV_FOLDER = 'pitch_processed'
# Suffixes appended to the file stems:
#   '<game_id>_<inning>_raw.json' and '<game_id>_processed.csv'.
JSON_SUFFIX = '_raw'
CSV_SUFFIX = '_processed'
# ---------------------

def _extract_game_data(data):
    """Safely extracts max_inning and PA list from the JSON data."""
    text_relay_data = data.get('result', {}).get('textRelayData', {})
    
    # max_inning needs to be extracted as an integer
    try:
        max_inning = int(text_relay_data.get('inn', 0))
    except (TypeError, ValueError):
        max_inning = 0
        
    pa_list = text_relay_data.get('textRelays', [])
    
    # The full data object is needed later to build the pitcher lookup (from inn 1 data)
    return max_inning, pa_list, data

def get_json_files(game_id):
    """
    Fetches game data by inning, checks for existing files, and merges PA data.

    Cached files in JSON_FOLDER are used when they hold a usable payload; otherwise
    the inning is downloaded and saved. The number of innings to read comes from
    the inning-1 payload.

    Args:
        game_id (str): Game identifier used in the API URL and in cache file names.

    Returns:
        tuple: (master_json_list, initial_data) where master_json_list is the list
        of all PAs collected so far and initial_data is the full JSON for Inning 1
        (None when Inning 1 could not be obtained).
    """
    os.makedirs(JSON_FOLDER, exist_ok=True)
    inning_counter = 1
    max_inning = 1
    master_json_list = []
    initial_data = None  # Store the full JSON from inning 1 to get pitcher lineups

    while inning_counter <= max_inning:
        raw_file_path = os.path.join(JSON_FOLDER, f'{game_id}_{inning_counter}{JSON_SUFFIX}.json')
        data = None  # Reset data object for each loop iteration
        fetched_now = False

        # 1. CHECK FOR EXISTING FILE
        if os.path.exists(raw_file_path):
            try:
                with open(raw_file_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)
            except json.JSONDecodeError as e:
                print(f"Error loading existing JSON file {raw_file_path}: {e}. Proceeding to refetch.")
                data = None
            else:
                if not data:
                    # A valid but empty payload ({}, [], null) is useless. It used to
                    # skip both the fetch and the processing step, so the inning
                    # counter never advanced and the loop never ended.
                    print(f"Existing JSON file {raw_file_path} is empty. Proceeding to refetch.")
                    data = None

        # 2. FETCH NEW DATA if necessary (or if file was corrupt)
        if data is None:
            print(f"Fetching Inning {inning_counter}...")
            url = f"https://api-gw.sports.naver.com/schedule/games/{game_id}/relay?inning={inning_counter}"

            try:
                r = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
                r.raise_for_status()
                data = r.json()
            except requests.exceptions.RequestException as e:
                print(f"Error fetching Inning {inning_counter}: {e}")
                break  # Stop on any fetching error
            except ValueError as e:
                # Invalid JSON raised as a plain ValueError (e.g. json.JSONDecodeError)
                # rather than as a RequestException.
                print(f"Error decoding JSON for Inning {inning_counter}: {e}")
                break

            if not data:
                # Do not cache an empty payload; it would only be refetched next run.
                print(f"Inning {inning_counter} returned an empty payload. Stopping.")
                break

            # Save the Raw JSON for the inning
            with open(raw_file_path, 'w', encoding='utf-8') as f:
                json.dump(data, f, ensure_ascii=False, indent=4)
            print(f"  -> Successfully fetched and saved raw JSON to {raw_file_path}")
            fetched_now = True

        # 3. PROCESS DATA (both loaded and fetched); data is non-empty here.
        current_max_inning, pa_list, full_json_data = _extract_game_data(data)

        if inning_counter == 1:
            # Update max_inning based on the first inning's data
            max_inning = current_max_inning
            initial_data = full_json_data  # Store the full data for pitcher lookup
            print(f"Max inning set to: {max_inning}")

        if max_inning == 0:
            print(f"Unable to get a valid max_inning (is 0). Stopping.")
            break

        if not pa_list:
            print(f"Inning {inning_counter} contains no Plate Appearances. Stopping.")
            break

        # Collect the 'textRelays' list for later merging
        master_json_list.extend(pa_list)
        inning_counter += 1

        # Short delay after a real download to respect the API limits. The old
        # check "file does not exist" was never true at this point because the
        # file had just been written, so the delay never ran.
        if fetched_now:
            time.sleep(0.5)

    print(f"\nFinished data collection. Total PA lists collected: {len(master_json_list)}")
    return master_json_list, initial_data

# Collect every plate appearance of one game; the second value is the full
# inning-1 JSON returned by get_json_files.
master_pa_list, pitcher_lineup_data = get_json_files('20250930LTHH02025')

Inning 1 JSON already exists. Loading from disk.
Max inning set to: 10
Inning 2 JSON already exists. Loading from disk.
Inning 3 JSON already exists. Loading from disk.
Inning 4 JSON already exists. Loading from disk.
Inning 5 JSON already exists. Loading from disk.
Inning 6 JSON already exists. Loading from disk.
Inning 7 JSON already exists. Loading from disk.
Inning 8 JSON already exists. Loading from disk.
Inning 9 JSON already exists. Loading from disk.
Inning 10 JSON already exists. Loading from disk.

Finished data collection. Total PA lists collected: 94
