# Match Data Validation Notebook

This notebook validates the schema consistency of collected match data.
It takes up to the first 100 matches per rank and checks for:
- Schema compliance
- Data completeness
- Value ranges
- Statistical summaries


In [None]:
import sys
import json
from pathlib import Path
import pandas as pd
import numpy as np
from pydantic import ValidationError

# Add parent directory to path
sys.path.append('..')

from src.storage.data_storage import DataStorage
from src.transformers.schema import MatchData, validate_match


## 1. Load Data


In [None]:
# Open the data store rooted one directory above the notebook and pull its
# aggregate statistics in a single call.
storage = DataStorage(base_path='../data')
stats = storage.get_statistics()

# Headline figure first, then one indented line per rank and per patch.
print("Data Statistics:")
print(f"Total matches: {stats['total_matches']}")

print(f"\nMatches by rank:")
rank_counts = stats['by_rank']
for rank, count in rank_counts.items():
    print(f"  {rank}: {count}")

print(f"\nMatches by patch:")
patch_counts = stats['by_patch']
for patch, count in patch_counts.items():
    print(f"  {patch}: {count}")


## 2. Schema Validation


In [None]:
def validate_sample(rank: str, sample_size: int = 100):
    """
    Validate a sample of matches for a specific rank.

    Loads the rank's matches through the module-level ``storage`` object,
    keeps at most ``sample_size`` entries from the front of the loaded
    sequence, and confirms that each kept entry is a ``MatchData`` instance.
    Progress, the valid/total count and up to five error messages are
    printed.

    Args:
        rank: Rank name passed to ``storage.load_matches``.
        sample_size: Maximum number of matches to check.

    Returns:
        The sampled matches (including any that failed the type check), or
        ``None`` when no matches were found or loading/validation raised.
    """
    print(f"\n=== Validating {rank} ===")

    try:
        # Load matches
        matches = storage.load_matches(rank, format='parquet')

        if not matches:
            print(f"No matches found for {rank}")
            return None

        # Sample: the first ``sample_size`` entries, or all of them if fewer.
        sample = matches[:min(sample_size, len(matches))]
        print(f"Loaded {len(sample)} matches for validation")

        # Explicit type check instead of ``assert``: assertions are removed
        # when Python runs with -O, and a bare AssertionError stringifies to
        # an empty message, which made the error listing useless.
        valid_count = 0
        errors = []

        for index, match in enumerate(sample):
            if isinstance(match, MatchData):
                valid_count += 1
            else:
                errors.append(
                    f"Match {index}: expected MatchData, "
                    f"got {type(match).__name__}"
                )

        print(f"Valid matches: {valid_count}/{len(sample)}")

        if errors:
            print(f"Errors found: {len(errors)}")
            for i, error in enumerate(errors[:5]):  # Show first 5
                print(f"  {i+1}. {error}")

        return sample

    except Exception as e:
        # Best-effort notebook helper: report the failure and let the
        # remaining ranks proceed instead of aborting the whole run.
        print(f"Failed to validate {rank}: {e}")
        return None

# Run the validator once per rank; only ranks that produced a non-empty
# sample are kept for the later checks.
ranks = [
    'IRON',
    'BRONZE',
    'SILVER',
    'GOLD',
    'PLATINUM',
    'DIAMOND',
    'MASTER',
    'GRANDMASTER',
    'CHALLENGER',
]
validation_results = {}

for rank in ranks:
    sample = validate_sample(rank, sample_size=100)
    if not sample:
        # None (load failure / no data) or an empty sample: nothing to keep.
        continue
    validation_results[rank] = sample


## 3. Data Completeness Check


In [None]:
def check_completeness(matches):
    """
    Check for missing or invalid data.

    For every match this verifies that:
      * neither ``blue_picks`` nor ``red_picks`` contains ``0``;
      * ``champion_stats`` has exactly 10 entries;
      * for each team, the ``dmg_share`` values and the ``gold_share`` values
        each sum to a value within [0.95, 1.05].

    The first five ``champion_stats`` entries are treated as the blue team
    and the last five as the red team. When the entry count is not 10 that
    split is not meaningful, so the share checks are skipped for that match
    and only the count problem is reported.

    Args:
        matches: Iterable of match objects exposing ``blue_picks``,
            ``red_picks`` and ``champion_stats``.

    Returns:
        A list of human-readable issue strings, in match order. Empty when
        no problems were found.
    """
    issues = []

    for i, match in enumerate(matches):
        # Check picks are not zero
        if 0 in match.blue_picks or 0 in match.red_picks:
            issues.append(f"Match {i}: Zero champion in picks")

        # Check we have 10 champion stats
        stats_count = len(match.champion_stats)
        if stats_count != 10:
            issues.append(f"Match {i}: Expected 10 champion stats, got {stats_count}")
            continue

        # Check shares sum to ~1.0 per team (both teams, not just blue).
        teams = (
            ("Blue", match.champion_stats[:5]),
            ("Red", match.champion_stats[5:]),
        )
        for team_name, team_stats in teams:
            dmg_share = sum(s.dmg_share for s in team_stats)
            gold_share = sum(s.gold_share for s in team_stats)

            if not (0.95 <= dmg_share <= 1.05):
                issues.append(
                    f"Match {i}: {team_name} damage share = {dmg_share:.2f} (expected ~1.0)"
                )

            if not (0.95 <= gold_share <= 1.05):
                issues.append(
                    f"Match {i}: {team_name} gold share = {gold_share:.2f} (expected ~1.0)"
                )

    return issues

print("\n=== Completeness Check ===")
for rank, matches in validation_results.items():
    issues = check_completeness(matches)
    print(f"\n{rank}: {len(issues)} issues found")
    # List at most five issues per rank; an empty list prints nothing.
    for issue in issues[:5]:
        print(f"  - {issue}")


## 4. Statistical Summary


In [None]:
def compute_statistics(matches):
    """
    Compute statistical summaries.

    Args:
        matches: Sized collection of match objects exposing ``blue_win``,
            ``blue_objectives`` / ``red_objectives`` (each with ``dragons``,
            ``barons``, ``towers``), ``champion_stats`` (entries with ``kda``
            and ``cs``) and ``derived_features`` (``ap_ad_ratio``,
            ``engage_score``, ``teamfight_synergy``).

    Returns:
        A dict with the keys ``blue_win_rate``, ``avg_dragons``,
        ``avg_barons``, ``avg_towers``, ``avg_kda``, ``avg_cs``,
        ``avg_ap_ad_ratio``, ``avg_engage_score`` and
        ``avg_teamfight_synergy``. Objective averages are per match with both
        teams combined; ``avg_kda`` / ``avg_cs`` are averaged over every
        champion-stat entry. A metric with no underlying data (for example
        when ``matches`` is empty) is NaN.
    """
    nan = float('nan')

    def _mean(values):
        # np.mean([]) emits a RuntimeWarning before returning NaN; return the
        # NaN directly so an empty sample stays quiet.
        return np.mean(values) if values else nan

    match_count = len(matches)
    blue_wins = sum(1 for m in matches if m.blue_win)
    # Flatten once; both per-player averages read from the same list.
    player_stats = [s for m in matches for s in m.champion_stats]

    return {
        # Guard the division: an empty sample has no defined win rate.
        'blue_win_rate': blue_wins / match_count if match_count else nan,
        'avg_dragons': _mean([m.blue_objectives.dragons + m.red_objectives.dragons for m in matches]),
        'avg_barons': _mean([m.blue_objectives.barons + m.red_objectives.barons for m in matches]),
        'avg_towers': _mean([m.blue_objectives.towers + m.red_objectives.towers for m in matches]),
        'avg_kda': _mean([s.kda for s in player_stats]),
        'avg_cs': _mean([s.cs for s in player_stats]),
        'avg_ap_ad_ratio': _mean([m.derived_features.ap_ad_ratio for m in matches]),
        'avg_engage_score': _mean([m.derived_features.engage_score for m in matches]),
        'avg_teamfight_synergy': _mean([m.derived_features.teamfight_synergy for m in matches]),
    }

print("\n=== Statistical Summary ===")

# Display order of the summary table. Passing this list to the DataFrame
# constructor (rather than indexing afterwards) keeps the cell from raising
# KeyError when no rank produced a sample: the result is then an empty frame
# that still carries these columns.
summary_columns = [
    'rank', 'sample_size', 'blue_win_rate', 'avg_dragons', 'avg_barons',
    'avg_towers', 'avg_kda', 'avg_cs', 'avg_ap_ad_ratio',
    'avg_engage_score', 'avg_teamfight_synergy',
]

# One row per validated rank: the computed metrics plus identifying fields.
summary_df_data = []

for rank, matches in validation_results.items():
    stats = compute_statistics(matches)
    stats['rank'] = rank
    stats['sample_size'] = len(matches)
    summary_df_data.append(stats)

summary_df = pd.DataFrame(summary_df_data, columns=summary_columns)

print(summary_df.to_string(index=False))


## 5. Validation Complete

If the checks above reported no validation errors or completeness issues, the dataset is ready for ML training.
