In [12]:
import json
from pathlib import Path
import pandas as pd

print("=== Combining All Metrics ===")

# Read the metadata table and the three per-metric tables from disk
player_metadata = pd.read_csv('../output/player_metadata.csv')
player_sprints = pd.read_csv('../output/player_sprints.csv')
player_runs = pd.read_csv('../output/player_runs.csv')
player_pressing = pd.read_csv('../output/player_pressing.csv')

# Metadata is the base table. Each metric table is left-joined onto it in
# turn on the (match_id, player_id) pair, so metadata rows without a match
# in a metric table are kept with nulls in that table's columns.
join_keys = ['match_id', 'player_id']
player_metrics = player_metadata.copy()
for metric_table in (player_sprints, player_runs, player_pressing):
    player_metrics = player_metrics.merge(metric_table, on=join_keys, how='left')

# Split the joined columns into those present in metadata and the rest
meta_cols = player_metadata.columns.tolist()
metric_cols = [
    column for column in player_metrics.columns
    if column not in meta_cols
]

# A row is kept unless *every* one of its metric columns is null
has_any_metrics = ~player_metrics[metric_cols].isna().all(axis=1)
print(f"Rows before metric filtering: {len(player_metrics)}")
print(f"Rows with at least one metric value: {has_any_metrics.sum()}")

player_metrics = player_metrics.loc[has_any_metrics].copy()

# Write the combined table next to the per-metric CSVs
output_path = Path('../output/player_metrics.csv')
player_metrics.to_csv(output_path, index=False)

print(f"\nSaved unified player metrics to {output_path}")
print(f"Shape: {player_metrics.shape}")
print(f"Total columns: {len(player_metrics.columns)}")
print(f"Player-match records: {len(player_metrics)}")

=== Combining All Metrics ===
Rows before metric filtering: 360
Rows with at least one metric value: 254

Saved unified player metrics to ../output/player_metrics.csv
Shape: (254, 73)
Total columns: 73
Player-match records: 254


In [13]:
# Directory that receives the generated schema JSON files; created on first
# run and left alone if it already exists.
schema_dir = Path("..") / "schemas"
schema_dir.mkdir(exist_ok=True)

# Exact dtype-name -> JSON type lookup. Dtypes not listed here are classified
# by family in `_json_type_for` instead of defaulting straight to "string".
TYPE_MAP = {
    "int64": "integer",
    "float64": "number",
    "bool": "boolean",
    "object": "string"
}


def _json_type_for(dtype) -> str:
    """Return the JSON type name ("integer", "number", "boolean", "string") for a dtype."""
    exact = TYPE_MAP.get(str(dtype))
    if exact is not None:
        return exact
    # Family checks cover other widths (int32, float32) and pandas nullable
    # extension dtypes (Int64, Float64, boolean), which an exact-name lookup
    # would otherwise report as "string".
    if pd.api.types.is_bool_dtype(dtype):
        return "boolean"
    if pd.api.types.is_integer_dtype(dtype):
        return "integer"
    if pd.api.types.is_float_dtype(dtype):
        return "number"
    return "string"


def generate_schema(df: pd.DataFrame, name: str, out_dir=None) -> None:
    """Write a JSON schema file describing the columns of a DataFrame.

    The schema holds the table name, a description derived from the name
    (underscores replaced by spaces), and one entry per column with its
    name, JSON type, and whether the column contains any missing value.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose columns are described.
    name : str
        Schema name; the file is written as ``<name>_schema.json``.
    out_dir : str or Path, optional
        Directory to write into. Defaults to the module-level ``schema_dir``.
        The directory is created if it does not exist.
    """
    target_dir = schema_dir if out_dir is None else Path(out_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    schema = {
        "name": name,
        "description": f"Schema for {name.replace('_', ' ')}",
        "columns": [
            {
                "name": col,
                "type": _json_type_for(df[col].dtype),
                # bool() converts the numpy bool so json can serialize it
                "nullable": bool(df[col].isna().any())
            }
            for col in df.columns
        ]
    }

    out_path = target_dir / f"{name}_schema.json"
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(schema, f, indent=4)

    print(f"✓ Saved schema: {out_path}")

# Re-read every exported table from disk, in a fixed order
sprints_df, runs_df, pressing_df, metrics_df, metadata_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../output/player_sprints.csv",
        "../output/player_runs.csv",
        "../output/player_pressing.csv",
        "../output/player_metrics.csv",
        "../output/player_metadata.csv",
    )
)

# Write one schema file per table, named after the table
for schema_name, table in (
    ("player_sprints", sprints_df),
    ("player_runs", runs_df),
    ("player_pressing", pressing_df),
    ("player_metrics", metrics_df),
    ("player_metadata", metadata_df),
):
    generate_schema(table, schema_name)

print("\n✓ All schemas generated")

✓ Saved schema: ../schemas/player_sprints_schema.json
✓ Saved schema: ../schemas/player_runs_schema.json
✓ Saved schema: ../schemas/player_pressing_schema.json
✓ Saved schema: ../schemas/player_metrics_schema.json
✓ Saved schema: ../schemas/player_metadata_schema.json

✓ All schemas generated
