In [4]:
import pandas as pd
import json
import os
from pathlib import Path
from tqdm.auto import tqdm
import logging

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')
logger = logging.getLogger(__name__)

# Set up the correct workspace path for Windows
WORKSPACE = r"C:\Users\abhay\OneDrive\Documents\Stuff 2.0\Programming\The Cricket Project"
TESTS_JSON_PATH = os.path.join(WORKSPACE, "Tests", "tests_json")

# Convert to proper Windows path
TESTS_JSON_PATH = os.path.normpath(TESTS_JSON_PATH)

print(f"Looking for JSON files in: {TESTS_JSON_PATH}")

def process_delivery(delivery, match_info, innings_num, batting_team, over_num):
    """Process a single delivery and combine it with match information."""
    try:
        return {
            'match_id': match_info['match_type_number'],
            'team1': match_info['teams'][0],
            'team2': match_info['teams'][1],
            'venue': match_info['venue'],
            'date': match_info['dates'][0],
            'winner': match_info['outcome'].get('winner', None),
            'innings': innings_num,
            'batting_team': batting_team,
            'over_number': over_num,
            'batter': delivery['batter'],
            'bowler': delivery['bowler'],
            'non_striker': delivery['non_striker'],
            'runs_batter': delivery['runs']['batter'],
            'runs_extras': delivery['runs'].get('extras', 0),
            'runs_total': delivery['runs']['total'],
            'extras_type': list(delivery['extras'].keys())[0] if 'extras' in delivery and delivery['extras'] else None,
            'player_out': delivery.get('wicket', {}).get('player_out', None),
            'dismissal_kind': delivery.get('wicket', {}).get('kind', None),
            'fielders': ','.join(f['name'] for f in delivery.get('wicket', {}).get('fielders', [])) if 'wicket' in delivery and 'fielders' in delivery['wicket'] else None
        }
    except Exception as e:
        logger.error(f"Error processing delivery: {str(e)}")
        return None

def process_match(file_path):
    """Process a single match file and return its deliveries."""
    try:
        # Log file processing
        filename = os.path.basename(file_path)
        logger.info(f"Processing: {filename}")
        
        # Read and parse the JSON file
        with open(file_path, 'r', encoding='utf-8') as f:
            match_data = json.load(f)
            
        if 'info' not in match_data or 'innings' not in match_data:
            logger.error(f"Missing required fields in {filename}")
            return []
            
        match_info = match_data['info']
        deliveries = []
        
        # Process each innings
        for innings_num, innings in enumerate(match_data['innings'], 1):
            batting_team = innings['team']
            
            # Process each over
            for over in innings['overs']:
                over_num = over['over']
                
                # Process each delivery
                for delivery in over['deliveries']:
                    delivery_data = process_delivery(
                        delivery,
                        match_info,
                        innings_num,
                        batting_team,
                        over_num
                    )
                    if delivery_data:  # Only append if we got valid data
                        deliveries.append(delivery_data)
                        
        logger.info(f"Completed {filename} with {len(deliveries)} deliveries")
        return deliveries
        
    except Exception as e:
        logger.error(f"Error processing {file_path}: {str(e)}")
        return []

def load_match_data(json_dir=None):
    """
    Load all JSON files from the specified directory and combine them into a single DataFrame.
    Each JSON file represents a single match's ball-by-ball data.
    """
    if json_dir is None:
        json_dir = TESTS_JSON_PATH
        
    # Get list of JSON files
    json_files = [os.path.join(json_dir, f) for f in os.listdir(json_dir) if f.endswith('.json')]
    if not json_files:
        logger.warning(f"No JSON files found in {json_dir}")
        return pd.DataFrame()
        
    logger.info(f"Found {len(json_files)} JSON files")
    
    # Process all files
    all_deliveries = []
    for file_path in tqdm(json_files, desc="Processing matches"):
        deliveries = process_match(file_path)
        all_deliveries.extend(deliveries)
    
    if not all_deliveries:
        logger.warning("No valid match data found")
        return pd.DataFrame()
        
    logger.info(f"Successfully processed {len(all_deliveries)} deliveries")
    
    # Create DataFrame
    df = pd.DataFrame(all_deliveries)
    
    # Convert date to datetime
    df['date'] = pd.to_datetime(df['date'])
    
    logger.info(f"Data loading completed successfully. DataFrame shape: {df.shape}")
    return df

# Load and process the match data
Tests = load_match_data(TESTS_JSON_PATH)

# Display info about the loaded data
if not Tests.empty:
    print("\nDataset Info:")
    print(f"Number of matches: {Tests['match_id'].nunique()}")
    print(f"Date range: {Tests['date'].min()} to {Tests['date'].max()}")
    print(f"\nTotal deliveries: {len(Tests)}")
    print(f"Total innings: {Tests['innings'].nunique()}")
    
    print("\nTeams in dataset:")
    all_teams = sorted(set(Tests['team1'].unique()) | set(Tests['team2'].unique()))
    for team in all_teams:
        print(f"- {team}")
    
    print("\nSample data:")
    display(Tests.head())
    
    print("\nBasic Statistics:")
    print(f"Average runs per ball: {Tests['runs_total'].mean():.2f}")
    print(f"Total wickets: {Tests['player_out'].notna().sum()}")
    print(f"Total extras: {Tests['extras_type'].notna().sum()}")
else:
    print("\nNo data loaded. Please check the JSON files in the tests_json directory.")



2025-06-08 22:00:35,497 - Found 864 JSON files


Looking for JSON files in: C:\Users\abhay\OneDrive\Documents\Stuff 2.0\Programming\The Cricket Project\Tests\tests_json


Processing matches:   0%|          | 0/864 [00:00<?, ?it/s]

2025-06-08 22:00:35,507 - Processing: 1000851.json
2025-06-08 22:00:35,520 - Completed 1000851.json with 2499 deliveries
2025-06-08 22:00:35,522 - Processing: 1000853.json
2025-06-08 22:00:35,527 - Completed 1000853.json with 1165 deliveries
2025-06-08 22:00:35,529 - Processing: 1000855.json
2025-06-08 22:00:35,537 - Completed 1000855.json with 1952 deliveries
2025-06-08 22:00:35,539 - Processing: 1000881.json
2025-06-08 22:00:35,550 - Completed 1000881.json with 2227 deliveries
2025-06-08 22:00:35,551 - Processing: 1000883.json
2025-06-08 22:00:35,557 - Completed 1000883.json with 1949 deliveries
2025-06-08 22:00:35,558 - Processing: 1000885.json
2025-06-08 22:00:35,569 - Completed 1000885.json with 2156 deliveries
2025-06-08 22:00:35,571 - Processing: 1019985.json
2025-06-08 22:00:35,584 - Completed 1019985.json with 2394 deliveries
2025-06-08 22:00:35,585 - Processing: 1019987.json
2025-06-08 22:00:35,594 - Completed 1019987.json with 1504 deliveries
2025-06-08 22:00:35,596 - Proces


Dataset Info:
Number of matches: 864
Date range: 2001-12-19 00:00:00 to 2025-05-22 00:00:00

Total deliveries: 1673829
Total innings: 4

Teams in dataset:
- Australia
- Bangladesh
- England
- ICC World XI
- India
- Ireland
- New Zealand
- Pakistan
- South Africa
- Sri Lanka
- West Indies
- Zimbabwe

Sample data:


Unnamed: 0,match_id,team1,team2,venue,date,winner,innings,batting_team,over_number,batter,bowler,non_striker,runs_batter,runs_extras,runs_total,extras_type,player_out,dismissal_kind,fielders
0,2230,Australia,South Africa,Western Australia Cricket Association Ground,2016-11-03,South Africa,1,South Africa,0,SC Cook,MA Starc,D Elgar,0,0,0,,,,
1,2230,Australia,South Africa,Western Australia Cricket Association Ground,2016-11-03,South Africa,1,South Africa,0,SC Cook,MA Starc,D Elgar,0,0,0,,,,
2,2230,Australia,South Africa,Western Australia Cricket Association Ground,2016-11-03,South Africa,1,South Africa,0,SC Cook,MA Starc,D Elgar,0,0,0,,,,
3,2230,Australia,South Africa,Western Australia Cricket Association Ground,2016-11-03,South Africa,1,South Africa,0,SC Cook,MA Starc,D Elgar,0,0,0,,,,
4,2230,Australia,South Africa,Western Australia Cricket Association Ground,2016-11-03,South Africa,1,South Africa,0,HM Amla,MA Starc,D Elgar,0,0,0,,,,



Basic Statistics:
Average runs per ball: 0.54
Total wickets: 0
Total extras: 29857


In [5]:
Tests.tail()

Unnamed: 0,match_id,team1,team2,venue,date,winner,innings,batting_team,over_number,batter,bowler,non_striker,runs_batter,runs_extras,runs_total,extras_type,player_out,dismissal_kind,fielders
1673824,2217,Sri Lanka,Australia,Sinhalese Sports Club Ground,2016-08-13,Sri Lanka,4,Australia,43,JM Holland,MDK Perera,NM Lyon,0,0,0,,,,
1673825,2217,Sri Lanka,Australia,Sinhalese Sports Club Ground,2016-08-13,Sri Lanka,4,Australia,43,JM Holland,MDK Perera,NM Lyon,0,0,0,,,,
1673826,2217,Sri Lanka,Australia,Sinhalese Sports Club Ground,2016-08-13,Sri Lanka,4,Australia,43,JM Holland,MDK Perera,NM Lyon,0,0,0,,,,
1673827,2217,Sri Lanka,Australia,Sinhalese Sports Club Ground,2016-08-13,Sri Lanka,4,Australia,43,JM Holland,MDK Perera,NM Lyon,0,0,0,,,,
1673828,2217,Sri Lanka,Australia,Sinhalese Sports Club Ground,2016-08-13,Sri Lanka,4,Australia,44,NM Lyon,HMRKB Herath,JM Holland,0,0,0,,,,


In [6]:
# Convert date column to datetime for proper date operations
Tests['date'] = pd.to_datetime(Tests['date'])

# Find earliest and latest dates
earliest_date = Tests['date'].min()
latest_date = Tests['date'].max()

print(f"Date Range of Test Matches:")
print(f"Earliest match: {earliest_date.strftime('%Y-%m-%d')}")
print(f"Latest match:   {latest_date.strftime('%Y-%m-%d')}")
print(f"\nTotal timespan: {(latest_date - earliest_date).days} days")


Date Range of Test Matches:
Earliest match: 2001-12-19
Latest match:   2025-05-22

Total timespan: 8555 days
