# 01 - Data Ingestion

This notebook loads the raw CSV files from the Transfermarkt dataset, performs initial cleaning and type conversions, and stores everything in a SQLite database for SQL analysis.

**Data Source**: [Football Data from Transfermarkt](https://www.kaggle.com/datasets/davidcariboo/player-scores) (Kaggle)

## Pipeline
1. Load CSV files into pandas DataFrames
2. Inspect data shapes, dtypes, and missing values
3. Perform type conversions (dates, numerics)
4. Load all tables into SQLite database
5. Create indexes for query performance
6. Verify data integrity

In [None]:
import pandas as pd
import sqlite3
import os
import sys
from pathlib import Path

# Project paths.  Everything is derived from the parent of the current working
# directory (presumably the notebook runs from a folder one level below the
# project root -- confirm if the notebook is launched from elsewhere).
PROJECT_ROOT = Path("..").resolve()
DATA_RAW = PROJECT_ROOT.joinpath("data", "raw")
DATA_PROCESSED = PROJECT_ROOT.joinpath("data", "processed")
DB_PATH = DATA_PROCESSED.joinpath("football.db")

# Create the output directory (and any missing parents) if it is not there yet
DATA_PROCESSED.mkdir(parents=True, exist_ok=True)

# Echo the resolved locations and the raw CSVs found, as a quick visual check
raw_csv_paths = list(DATA_RAW.glob("*.csv"))
print(f"Raw data directory: {DATA_RAW}")
print(f"Database path: {DB_PATH}")
print(f"Files in raw directory: {raw_csv_paths}")

## 1. Load CSV Files

Load each expected CSV file from the Kaggle dataset into a pandas DataFrame. Files that are missing from the raw data directory are reported and skipped.

In [None]:
# Expected tables, mapped to their CSV file names (every file is "<table>.csv")
CSV_FILES = {
    table: f"{table}.csv"
    for table in (
        "appearances",
        "clubs",
        "competitions",
        "games",
        "players",
        "player_valuations",
        "transfers",
        "club_games",
        "game_events",
    )
}

# Read every CSV that is present; a missing file is reported and skipped
# rather than treated as an error.
dataframes = {}
for table_name, csv_name in CSV_FILES.items():
    csv_path = DATA_RAW / csv_name
    if not csv_path.exists():
        print(f"  {table_name:.<30} FILE NOT FOUND: {csv_path}")
        continue

    frame = pd.read_csv(csv_path, low_memory=False)
    dataframes[table_name] = frame
    n_rows, n_cols = frame.shape
    print(f"  {table_name:.<30} {n_rows:>10,} rows x {n_cols:>3} cols")

print(f"\nLoaded {len(dataframes)} / {len(CSV_FILES)} tables.")

## 2. Data Inspection

Quick overview of each table: columns, data types, missing values.

In [None]:
for name, df in dataframes.items():
    n_rows, n_cols = df.shape
    rule = "=" * 60

    # Banner with the table name and its dimensions
    print(f"\n{rule}")
    print(f"TABLE: {name} ({n_rows:,} rows, {n_cols} columns)")
    print(rule)

    # One summary row per column: dtype, null statistics, and the value from
    # the first data row (None when the table has no rows).
    null_counts = df.isnull().sum()
    first_row = df.iloc[0] if len(df) else None
    info_df = pd.DataFrame({
        "dtype": df.dtypes,
        "non_null": df.count(),
        "null_count": null_counts,
        "null_pct": (null_counts / len(df) * 100).round(1),
        "sample": first_row,
    })
    display(info_df)

## 3. Data Type Conversions

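Parse date columns and ensure numeric columns have the correct types. Values that cannot be parsed are coerced to missing (`NaT` for dates, `NaN` for numerics) rather than raising an error.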

In [None]:
# Columns holding dates, keyed by table name
DATE_COLUMNS = {
    "games": ["date"],
    "appearances": ["date"],
    "players": ["date_of_birth", "contract_expiration_date"],
    "player_valuations": ["date"],
    "transfers": ["transfer_date"],
}

for table, cols in DATE_COLUMNS.items():
    # Tables that were not loaded are skipped
    if table not in dataframes:
        continue
    frame = dataframes[table]

    for col in cols:
        # Columns absent from the loaded table are skipped as well
        if col not in frame.columns:
            continue

        nulls_before = frame[col].isnull().sum()
        # errors="coerce" turns unparseable values into NaT instead of raising
        frame[col] = pd.to_datetime(frame[col], errors="coerce")
        newly_null = frame[col].isnull().sum() - nulls_before

        print(f"  {table}.{col}: converted to datetime ({newly_null} unparseable values set to NaT)")

print("\nDate conversions complete.")

In [None]:
# Columns that must be numeric, keyed by table name
NUMERIC_COLUMNS = {
    "players": ["market_value_in_eur", "highest_market_value_in_eur", "height_in_cm"],
    "player_valuations": ["market_value_in_eur"],
    "transfers": ["transfer_fee", "market_value_in_eur"],
    "clubs": ["total_market_value", "squad_size", "average_age"],
    "games": ["home_club_goals", "away_club_goals", "attendance"],
    "appearances": ["goals", "assists", "minutes_played", "yellow_cards", "red_cards"],
}

for table, cols in NUMERIC_COLUMNS.items():
    # Tables that were not loaded are skipped
    if table not in dataframes:
        continue
    frame = dataframes[table]

    for col in cols:
        # Columns absent from the loaded table are skipped as well
        if col not in frame.columns:
            continue

        nulls_before = frame[col].isnull().sum()
        # errors="coerce" turns unparseable values into NaN instead of raising
        frame[col] = pd.to_numeric(frame[col], errors="coerce")
        coerced = frame[col].isnull().sum() - nulls_before

        # Report how many values were lost to coercion so that silent data
        # loss is visible, mirroring the report printed for date columns.
        print(f"  {table}.{col}: converted to numeric"
              f" ({coerced} unparseable values set to NaN)")

print("\nNumeric type conversions complete.")

## 4. Load into SQLite Database

Store all DataFrames in a SQLite database for SQL analysis in subsequent notebooks.

In [None]:
# Remove existing database to start fresh
if DB_PATH.exists():
    DB_PATH.unlink()
    print("Removed existing database.")

conn = sqlite3.connect(DB_PATH)
try:
    for name, df in dataframes.items():
        # One SQLite table per DataFrame; the DataFrame index is not stored
        df.to_sql(name, conn, if_exists="replace", index=False)

        # Read the row count back from the database to confirm the load
        row_count = pd.read_sql_query(f"SELECT COUNT(*) as cnt FROM [{name}]", conn)
        print(f"  {name:.<30} {row_count['cnt'].iloc[0]:>10,} rows loaded")
finally:
    # Always release the connection, even if a table fails to load, so the
    # database file is not left held open by this kernel.
    conn.close()

print(f"\nDatabase created at: {DB_PATH}")
print(f"Database size: {DB_PATH.stat().st_size / 1024 / 1024:.1f} MB")

## 5. Create Indexes

Add indexes for frequently queried columns to improve query performance.

In [None]:
# (index name, table, column) for every index to create
INDEX_SPECS = [
    # Player valuations -- heavily queried
    ("idx_pv_player_id", "player_valuations", "player_id"),
    ("idx_pv_date", "player_valuations", "date"),
    ("idx_pv_club_comp", "player_valuations", "player_club_domestic_competition_id"),

    # Appearances
    ("idx_app_player_id", "appearances", "player_id"),
    ("idx_app_game_id", "appearances", "game_id"),

    # Games
    ("idx_games_season", "games", "season"),
    ("idx_games_competition", "games", "competition_id"),
    ("idx_games_date", "games", "date"),

    # Transfers
    ("idx_transfers_player", "transfers", "player_id"),
    ("idx_transfers_date", "transfers", "transfer_date"),
    ("idx_transfers_to_club", "transfers", "to_club_id"),
    ("idx_transfers_from_club", "transfers", "from_club_id"),

    # Players
    ("idx_players_club", "players", "current_club_id"),
    ("idx_players_position", "players", "position"),

    # Clubs
    ("idx_clubs_competition", "clubs", "domestic_competition_id"),

    # Club games
    ("idx_cg_club", "club_games", "club_id"),
    ("idx_cg_game", "club_games", "game_id"),

    # Game events
    ("idx_ge_game", "game_events", "game_id"),
    ("idx_ge_player", "game_events", "player_id"),
]

# The CREATE INDEX statements, one per spec and in the same order
indexes = [
    f"CREATE INDEX IF NOT EXISTS {idx_name} ON {table}({column})"
    for idx_name, table, column in INDEX_SPECS
]

conn = sqlite3.connect(DB_PATH)
created = 0
try:
    cursor = conn.cursor()

    for (idx_name, table, column), idx_sql in zip(INDEX_SPECS, indexes):
        # PRAGMA table_info yields one row per column (name in position 1) and
        # no rows at all for a table that does not exist, so one lookup covers
        # both a missing table and a missing column.
        existing_columns = {
            row[1] for row in cursor.execute(f"PRAGMA table_info({table})")
        }
        if column not in existing_columns:
            # The loading cells skip absent files/columns; do the same here
            # instead of failing with "no such table" / "no such column".
            print(f"  Skipped index: {idx_name} ({table}.{column} not found)")
            continue

        cursor.execute(idx_sql)
        created += 1
        print(f"  Created index: {idx_name}")

    conn.commit()
finally:
    # Always release the connection, even if an index statement fails
    conn.close()

print(f"\n{created} indexes created.")
print(f"Database size after indexing: {DB_PATH.stat().st_size / 1024 / 1024:.1f} MB")

## 6. Verification

Run basic integrity checks to ensure data was loaded correctly.

In [None]:
# Put the parent of the working directory on the import path so that the
# `notebooks.utils` package can be imported.  The membership check keeps the
# cell idempotent: re-running it does not add a duplicate sys.path entry.
project_root_str = str(Path("..").resolve())
if project_root_str not in sys.path:
    sys.path.insert(0, project_root_str)
from notebooks.utils.db_helpers import get_connection, run_query, table_info

# Show all tables with row counts
print("Database Tables:")
print("=" * 40)
display(table_info())

In [None]:
# Quick sanity checks: each entry is (title, SQL query)
checks = [
    ("Top 5 leagues exist", """
        SELECT competition_id, name 
        FROM competitions 
        WHERE competition_id IN ('GB1', 'ES1', 'IT1', 'L1', 'FR1')
        ORDER BY name
    """),
    ("Players have market values", """
        SELECT 
            COUNT(*) as total_players,
            SUM(CASE WHEN market_value_in_eur IS NOT NULL THEN 1 ELSE 0 END) as with_value,
            ROUND(AVG(market_value_in_eur), 0) as avg_value
        FROM players
    """),
    ("Transfers have fees", """
        SELECT 
            COUNT(*) as total_transfers,
            SUM(CASE WHEN transfer_fee > 0 THEN 1 ELSE 0 END) as with_fee,
            ROUND(MAX(transfer_fee), 0) as max_fee
        FROM transfers
    """),
    ("Valuation date range", """
        SELECT 
            MIN(date) as earliest,
            MAX(date) as latest,
            COUNT(DISTINCT player_id) as unique_players
        FROM player_valuations
    """),
]

# Titles of checks whose query came back with no rows
empty_checks = []

for title, query in checks:
    print(f"\n{title}:")
    result = run_query(query)
    display(result)

    # Flag results that report themselves as empty.  Assumes run_query returns
    # a pandas DataFrame (which exposes `.empty`) -- TODO confirm; any other
    # return type is displayed but never flagged.  The aggregate queries always
    # yield one row, so in practice this guards the league lookup.
    if getattr(result, "empty", False):
        empty_checks.append(title)

# Only the queries themselves ran -- nothing above validates the values, so the
# closing message must not claim that the checks "passed".
if empty_checks:
    print(f"\nWARNING: no rows returned for: {', '.join(empty_checks)}")
else:
    print("\nSanity queries finished. Review the results above before using the database for analysis.")