# Step 2 Development - Extract and Merge Sports Data

This notebook will help us build step2.py incrementally.

## Objective
Extract specific fields from the per-endpoint data stored in step1.json and merge them by match ID.

In [None]:
# Import required libraries
import json
import logging
from datetime import datetime
import pytz
from pprint import pprint

# Set up paths
# Input file: loaded below into step1_data.
STEP1_JSON = "/root/6-4-2025/step1.json"
# Output file: the merged result is written here by the save cell.
STEP2_JSON = "/root/6-4-2025/step2.json"
# Timezone used when stamping the result's "timestamp" field.
TZ = pytz.timezone("America/New_York")

## Step 1: Load step1.json and explore its structure

In [None]:
# Load step1.json
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's default encoding (which is not always UTF-8).
with open(STEP1_JSON, encoding="utf-8") as f:
    step1_data = json.load(f)

# Check top-level keys
print("Top-level keys in step1.json:")
print(list(step1_data.keys())[:20])  # Show first 20 keys

In [None]:
# Check structure of live_matches
# Prints the section's keys, the number of results, and the first result
# (when there is one) so the shape of a live-match record can be inspected.
if 'live_matches' in step1_data:
    live_section = step1_data['live_matches']
    print("live_matches keys:", list(live_section.keys()))
    if 'results' in live_section:
        live_results = live_section['results']
        print(f"Number of live matches: {len(live_results)}")
        if live_results:
            print("\nFirst live match sample:")
            pprint(live_results[0])

In [None]:
# Check structure of match_details
# Prints the entry count and pretty-prints the entry stored under the first
# key, if any.
if 'match_details' in step1_data:
    details_section = step1_data['match_details']
    print(f"match_details has {len(details_section)} entries")
    # Get first match ID (None when the section is empty)
    first_match_id = (
        next(iter(details_section.keys()), None) if details_section else None
    )
    if first_match_id:
        print(f"\nSample match_details for match {first_match_id}:")
        pprint(details_section[first_match_id])

## Step 2: Define extraction functions for each endpoint

### Fields to extract:
- From `/match/detail_live` (Live Matches): `id`, `status_id`
- From `/match/recent/list` (Match Details): `id`, `home_team_id`, `away_team_id`, `competition_id`, `status_id`, `home_scores`, `away_scores`, `home_position`, `away_position`
- From `/odds/history` (Odds): Full structure organized by betting company IDs
- From `/team/additional/list` (Teams): `id`, `name`, `country_id`, `competition_id`
- From `/competition/additional/list` (Competitions): `id`, `name`
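- From `/country/list` (Countries): cached separately; the cells below do not extract country records and only count the unique `country_id` values found on team records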

In [None]:
# Initialize result structure
# "metadata" starts with every counter at 0; the totals are filled in by the
# metadata cell once all merging is done.
result = {
    "timestamp": datetime.now(TZ).isoformat(),
    "matches_by_id": {},
    "metadata": dict.fromkeys(
        (
            "processing_time",
            "total_matches",
            "total_teams",
            "total_competitions",
            "total_countries",
        ),
        0,
    ),
}

# Track unique entities (IDs seen while processing, used for the totals)
unique_teams, unique_competitions, unique_countries = set(), set(), set()

In [None]:
# Extract live matches (id and status_id only)
# The `or` fallbacks cover keys that are present but null in the JSON, which
# .get(key, default) alone would pass through as None and crash on below.
live_matches = (step1_data.get("live_matches") or {}).get("results") or []
print(f"Processing {len(live_matches)} live matches...")

for match in live_matches:
    raw_id = match.get("id")
    # Skip entries without a usable id. A null id must not be stringified:
    # str(None) is the truthy string "None" and would create a bogus match.
    if raw_id is None or raw_id == "":
        continue
    match_id = str(raw_id)
    result["matches_by_id"][match_id] = {
        "match_id": match_id,
        "status_id": match.get("status_id"),
    }

print(f"Extracted {len(result['matches_by_id'])} matches from live_matches")

In [None]:
# Build team lookup
# Each team_info entry is expected to be a dict wrapping a "results" list;
# only the first element of that list is used. Entries of any other shape
# are skipped.
teams_data = step1_data.get("team_info", {})
teams_lookup = {}

for team_id, team_wrapper in teams_data.items():
    if not isinstance(team_wrapper, dict) or "results" not in team_wrapper:
        continue
    team_list = team_wrapper["results"]
    if not isinstance(team_list, list) or not team_list:
        continue

    team = team_list[0]
    teams_lookup[team_id] = {
        field: team.get(field)
        for field in ("id", "name", "country_id", "competition_id")
    }

    # Track unique countries
    team_country = team.get("country_id")
    if team_country:
        unique_countries.add(team_country)

print(f"Built lookup for {len(teams_lookup)} teams")
print(f"Found {len(unique_countries)} unique countries")

In [None]:
# Build competition lookup
# Same wrapper shape as the team lookup: a dict holding a "results" list, of
# which only the first element is kept (reduced to id and name).
competitions_data = step1_data.get("competition_info", {})
competitions_lookup = {}

for comp_id, comp_wrapper in competitions_data.items():
    if not isinstance(comp_wrapper, dict) or "results" not in comp_wrapper:
        continue
    comp_list = comp_wrapper["results"]
    if not isinstance(comp_list, list) or not comp_list:
        continue

    comp = comp_list[0]
    competitions_lookup[comp_id] = {
        field: comp.get(field) for field in ("id", "name")
    }

print(f"Built lookup for {len(competitions_lookup)} competitions")

## Step 3: Process match details and merge with live matches

In [None]:
# Process match details
# Merges the selected fields of each match_details entry into
# result["matches_by_id"], creating the match entry when live_matches did not
# already provide one, and records the team/competition IDs that were seen.
match_details = step1_data.get("match_details", {})
print(f"Processing {len(match_details)} match details...")


def _as_id(value):
    """Return *value* as a string ID, or "" when it is None.

    A plain str() would turn a null ID into the truthy string "None", which
    would then be counted as a real team/competition.
    """
    return "" if value is None else str(value)


matches_processed = 0
for match_id, details_wrapper in match_details.items():
    # Expected shape: {"results": [details, ...]}; anything else is skipped.
    if not isinstance(details_wrapper, dict):
        continue
    details_list = details_wrapper.get("results")
    if not isinstance(details_list, list) or not details_list:
        continue
    details = details_list[0]

    # Create match entry if it doesn't exist
    match_data = result["matches_by_id"].setdefault(
        match_id, {"match_id": match_id}
    )

    # Extract ONLY the specified fields
    home_team_id = _as_id(details.get("home_team_id"))
    away_team_id = _as_id(details.get("away_team_id"))
    competition_id = _as_id(details.get("competition_id"))

    match_data["home_team_id"] = home_team_id
    match_data["away_team_id"] = away_team_id
    match_data["competition_id"] = competition_id

    # Details normally override the live status, but a missing status_id must
    # not erase the one already captured from live_matches.
    status_id = details.get("status_id")
    if status_id is not None or "status_id" not in match_data:
        match_data["status_id"] = status_id

    match_data["home_scores"] = details.get("home_scores")
    match_data["away_scores"] = details.get("away_scores")
    match_data["home_position"] = details.get("home_position")
    match_data["away_position"] = details.get("away_position")

    # Track unique teams and competitions (empty IDs are ignored)
    unique_teams.update(
        team_id for team_id in (home_team_id, away_team_id) if team_id
    )
    if competition_id:
        unique_competitions.add(competition_id)

    matches_processed += 1

print(f"Processed {matches_processed} match details")
print(f"Total unique teams: {len(unique_teams)}")
print(f"Total unique competitions: {len(unique_competitions)}")

In [None]:
# Add team info to matches
# Attaches the looked-up team and competition records to every match whose
# IDs are present in the lookups. A match is counted once in
# matches_with_teams when at least one of its teams (home or away) was
# attached; counting only home-team hits would under-report.
matches_with_teams = 0
for match_data in result["matches_by_id"].values():
    team_attached = False

    # Add home team info
    home_team_id = match_data.get("home_team_id")
    if home_team_id and home_team_id in teams_lookup:
        match_data["home_team"] = teams_lookup[home_team_id]
        team_attached = True

    # Add away team info
    away_team_id = match_data.get("away_team_id")
    if away_team_id and away_team_id in teams_lookup:
        match_data["away_team"] = teams_lookup[away_team_id]
        team_attached = True

    # Add competition info
    competition_id = match_data.get("competition_id")
    if competition_id and competition_id in competitions_lookup:
        match_data["competition"] = competitions_lookup[competition_id]

    if team_attached:
        matches_with_teams += 1

print(f"Added team info to {matches_with_teams} matches")

## Step 4: Add odds data

In [None]:
# Process odds data
# Odds are attached only to matches that already exist in the merged result;
# odds for unknown match IDs are ignored.
odds_data = step1_data.get("match_odds", {})
matches_with_odds = 0

merged_matches = result["matches_by_id"]
for match_id, odds in odds_data.items():
    if match_id not in merged_matches:
        continue
    # Keep the original odds structure with company IDs
    merged_matches[match_id]["odds"] = odds
    matches_with_odds += 1

print(f"Added odds data to {matches_with_odds} matches")

## Step 5: Finalize metadata and save

In [None]:
# Update metadata
# Fills in the four totals from the merged result and the tracking sets.
# NOTE(review): "processing_time" is not touched here and keeps its initial
# value.
result["metadata"].update({
    "total_matches": len(result["matches_by_id"]),
    "total_teams": len(unique_teams),
    "total_competitions": len(unique_competitions),
    "total_countries": len(unique_countries),
})

print("\nFinal summary:")
print(f"Total matches: {result['metadata']['total_matches']}")
print(f"Total teams: {result['metadata']['total_teams']}")
print(f"Total competitions: {result['metadata']['total_competitions']}")
print(f"Total countries: {result['metadata']['total_countries']}")

In [None]:
# Save to step2.json
# Serialize first so that a non-serializable value raises before the output
# file is opened (and truncated); streaming with json.dump into the open file
# would leave a partial, invalid step2.json behind on failure.
serialized = json.dumps(result, indent=2)
with open(STEP2_JSON, 'w', encoding="utf-8") as f:
    f.write(serialized)

print(f"\nData saved to {STEP2_JSON}")

In [None]:
# View a sample match to verify structure
# Pretty-prints the first match in the merged result, if there is one.
merged_matches = result["matches_by_id"]
if merged_matches:
    sample_match_id = next(iter(merged_matches))
    print(f"\nSample match structure (ID: {sample_match_id}):")
    pprint(merged_matches[sample_match_id])