In [None]:
import os
import json
import pandas as pd
from glob import glob
from tqdm.notebook import tqdm  # For progress tracking
import time

# perf_counter is monotonic and high-resolution, so the elapsed-time figures
# below stay meaningful even for very short runs.
start_time = time.perf_counter()

# Directory holding one JSON file per match (absolute path, machine-specific).
workspace_dir = r'C:\Users\abhay\OneDrive\Documents\Stuff 2.0\Programming\The Cricket Project'
json_dir = os.path.join(workspace_dir, 'Tests', 'tests_json')
files = glob(os.path.join(json_dir, '*.json'))

# Print the path being searched for debugging
print(f"Searching for JSON files in: {json_dir}")

if not files:
    raise FileNotFoundError("No JSON files found in Tests/tests_json/. Please check the path.")


def _match_fields(data, path):
    """Return the match-level columns that are repeated on every delivery row.

    Args:
        data: Parsed JSON document of one match (expected to be a dict).
        path: Path of the file the document was read from; its base name
            without the extension becomes ``match_id``.

    Returns:
        dict with keys ``match_id``, ``teams``, ``venue``, ``date``, ``winner``.
        Missing fields fall back to ``[]``, ``''``, ``''`` and ``None``.
    """
    info = data.get('info', {})
    # `or ['']` also covers a present-but-empty "dates" list, which would
    # otherwise raise IndexError and cause the whole file to be skipped.
    dates = info.get('dates') or ['']
    return {
        'match_id': os.path.splitext(os.path.basename(path))[0],
        'teams': info.get('teams', []),
        'venue': info.get('venue', ''),
        'date': dates[0],
        'winner': info.get('outcome', {}).get('winner', None),
    }


# One plain dict per delivery. Building a single DataFrame from this list at
# the end is far cheaper than creating and concatenating one frame per delivery.
delivery_records = []
processed_files = 0
skipped_files = 0

print(f"\nFound {len(files)} JSON files to process")

for f in tqdm(files, desc="Processing match files"):
    try:
        # Explicit UTF-8: the platform default encoding (e.g. cp1252 on
        # Windows) can fail on or mis-decode non-ASCII characters.
        with open(f, 'r', encoding='utf-8') as json_file:
            data = json.load(json_file)

        match_info = _match_fields(data, f)

        for innings_idx, innings in enumerate(data.get('innings', [])):
            # Best-effort per innings: a malformed innings is reported and
            # skipped without discarding the rest of the match.
            try:
                innings_fields = {
                    **match_info,
                    'innings': innings_idx + 1,
                    'batting_team': innings.get('team', ''),
                }
                for over in innings.get('overs', []):
                    over_num = over.get('over', 0)
                    for delivery in over.get('deliveries', []):
                        # New dict per row so the parsed JSON is not mutated;
                        # match/innings fields override same-named delivery keys.
                        delivery_records.append(
                            {**delivery, **innings_fields, 'over_number': over_num}
                        )
            except Exception as e:
                print(f"Error processing innings {innings_idx + 1} in match {match_info['match_id']}: {str(e)}")

        processed_files += 1

    except Exception as e:
        # Unreadable or structurally unexpected file: report it and move on.
        print(f"Error processing file {f}: {str(e)}")
        skipped_files += 1

if not delivery_records:
    raise ValueError("No data was processed successfully. Please check the JSON file format.")

# Flatten nested delivery dicts into dotted column names, one row per delivery.
Tests = pd.json_normalize(delivery_records)
total_deliveries = len(Tests)

print(f"Successfully processed {processed_files} files (skipped {skipped_files}) into a DataFrame with {total_deliveries} rows.")

# Calculate and display timing information
end_time = time.perf_counter()
total_time = end_time - start_time

print(f"\nProcessing completed in {total_time:.2f} seconds")
if processed_files:
    print(f"Average time per file: {total_time/processed_files:.2f} seconds")
print(f"Total deliveries processed: {total_deliveries:,}")
if total_time > 0:
    print(f"Processing speed: {total_deliveries/total_time:.2f} deliveries/second")

# Display the first few rows and basic information about the DataFrame
print("\nDataFrame Info:")
# DataFrame.info() prints its report itself and returns None, so it is not
# wrapped in print().
Tests.info()
print("\nFirst few rows:")
Tests.head()


Searching for JSON files in: C:\Users\abhay\OneDrive\Documents\Stuff 2.0\Programming\The Cricket Project\Tests\tests_json

Found 864 JSON files to process


Processing match files:   0%|          | 0/864 [00:00<?, ?it/s]