# 02 - Exploratory Data Analysis

Deep-dive into the Transfermarkt dataset: distributions, trends, correlations, and key patterns.

In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sqlite3
import warnings
warnings.filterwarnings('ignore')
from pathlib import Path

# Dark plotting theme applied to every figure created after this cell.
plt.rcParams.update({
    'figure.facecolor': '#0a0a0f', 'axes.facecolor': '#111118',
    'axes.edgecolor': '#1e1e2e', 'axes.labelcolor': '#8888a0',
    'text.color': '#e8e8f0', 'xtick.color': '#555568', 'ytick.color': '#555568',
    'grid.color': '#1e1e2e', 'grid.alpha': 0.5, 'font.family': 'monospace',
    'axes.titlecolor': '#fb8b1e', 'axes.titleweight': 'bold', 'axes.titlesize': 12,
})

# Shared colour palette; COLORS is indexed positionally by the per-league plots.
ORANGE, CYAN, RED = '#fb8b1e', '#4af6c3', '#ff433d'
COLORS = [ORANGE, CYAN, '#3b82f6', '#a855f7', '#22d3ee']

# League codes mapped to the display names used in legends.
LEAGUE_NAMES = {'GB1': 'Premier League', 'ES1': 'La Liga', 'IT1': 'Serie A',
                'L1': 'Bundesliga', 'FR1': 'Ligue 1'}
BIG5 = list(LEAGUE_NAMES.keys())

DB_PATH = Path('..') / 'data' / 'processed' / 'football.db'
# sqlite3.connect() creates a brand-new empty database when the file is missing,
# which would only surface later as confusing "no such table" errors (and leave
# a stray empty .db file behind). Fail early with a clear message instead.
if not DB_PATH.is_file():
    raise FileNotFoundError(f"SQLite database not found: {DB_PATH.resolve()}")
conn = sqlite3.connect(DB_PATH)
print(f"Connected: {DB_PATH}")


## 1. Dataset Overview

In [None]:

# Print each table listed in sqlite_master together with its row count.
table_listing = pd.read_sql("SELECT name FROM sqlite_master WHERE type='table'", conn)
tables = table_listing['name'].tolist()

header = f"{'TABLE':<30} {'ROWS':>12}"
print(header)
print('-' * 44)
for table_name in sorted(tables):
    # One COUNT(*) query per table; the name is bracket-quoted in the SQL text.
    count_frame = pd.read_sql(f"SELECT COUNT(*) as n FROM [{table_name}]", conn)
    row_count = count_frame['n'].iloc[0]
    print(f"{table_name:<30} {row_count:>12,}")


## 2. Transfer Fee Distribution

In [None]:

# Load every transfer with a recorded fee above EUR 1M.
SQL = '''
    SELECT transfer_fee, market_value_in_eur, player_name,
           from_club_name, to_club_name, transfer_season
    FROM transfers
    WHERE transfer_fee > 1000000
'''
df_transfers = pd.read_sql(SQL, conn)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: fee histogram in M EUR; log-scaled counts keep the long tail visible.
axes[0].hist(df_transfers['transfer_fee'] / 1e6, bins=50, color=ORANGE, alpha=0.8, edgecolor='none')
axes[0].set_xlabel('Transfer Fee (M EUR)')
axes[0].set_ylabel('Count')
axes[0].set_title('Transfer Fee Distribution (>1M)')
axes[0].set_yscale('log')

# Right panel: the 15 largest fees, one bar per transfer.
top15 = df_transfers.nlargest(15, 'transfer_fee')
# Bars are placed at integer positions rather than at the name strings:
# matplotlib's categorical axis maps equal strings to the same position, so a
# player appearing twice in the top 15 would otherwise have overlapping bars.
bar_positions = list(range(len(top15)))
# Missing names are replaced so tick labels are always strings.
bar_labels = top15['player_name'].fillna('Unknown').str[:20]
axes[1].barh(bar_positions, top15['transfer_fee'] / 1e6, color=ORANGE, alpha=0.85)
axes[1].set_yticks(bar_positions)
axes[1].set_yticklabels(bar_labels)
axes[1].set_xlabel('Transfer Fee (M EUR)')
axes[1].set_title('Top 15 Transfer Fees (All Time)')
axes[1].invert_yaxis()  # largest fee at the top

plt.tight_layout()
plt.savefig('../data/processed/fig_transfer_dist.png', dpi=120, bbox_inches='tight', facecolor='#0a0a0f')
plt.show()
print(f"Transfers >1M: {len(df_transfers):,} | Max: EUR {df_transfers['transfer_fee'].max()/1e6:.0f}M")


## 3. Market Value Trends by League (Big 5)

In [None]:

# Yearly league totals and averages for the five leagues since 2012.
#
# The aggregation uses only the LATEST valuation of each player within each
# year. Summing every valuation row would count a player once per valuation
# update, inflating the "total" by however many updates happened that year.
# NOTE(review): assumes player_valuations can hold several rows per player per
# year and at most one row per (player_id, date) - confirm against the data.
SQL = '''
    WITH latest AS (
        SELECT
            player_id,
            strftime('%Y', date) AS yr,
            MAX(date) AS last_date
        FROM player_valuations
        WHERE player_club_domestic_competition_id IN ('GB1','ES1','IT1','L1','FR1')
          AND market_value_in_eur > 0
          AND date >= '2012-01-01'
        GROUP BY player_id, yr
    )
    SELECT
        CAST(latest.yr AS INTEGER) as year,
        pv.player_club_domestic_competition_id as league_id,
        COUNT(DISTINCT pv.player_id) as player_count,
        SUM(pv.market_value_in_eur) / 1e9 as total_value_bn,
        AVG(pv.market_value_in_eur) / 1e6 as avg_value_m
    FROM player_valuations pv
    JOIN latest
      ON pv.player_id = latest.player_id
     AND pv.date = latest.last_date
    WHERE pv.player_club_domestic_competition_id IN ('GB1','ES1','IT1','L1','FR1')
      AND pv.market_value_in_eur > 0
    GROUP BY year, league_id
    ORDER BY year, league_id
'''
df_mv = pd.read_sql(SQL, conn)
df_mv['league'] = df_mv['league_id'].map(LEAGUE_NAMES)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
for i, (lid, grp) in enumerate(df_mv.groupby('league_id')):
    label = LEAGUE_NAMES.get(lid, lid)
    # Wrap around the palette so an unexpected extra group cannot raise IndexError.
    color = COLORS[i % len(COLORS)]
    axes[0].plot(grp['year'], grp['total_value_bn'], label=label, color=color, linewidth=2)
    axes[1].plot(grp['year'], grp['avg_value_m'], label=label, color=color, linewidth=2)

for ax, title, ylabel in zip(axes,
    ['Total Market Value (Bn EUR) by League', 'Average Player Value (M EUR) by League'],
    ['Total Value (Bn EUR)', 'Avg Value (M EUR)']):
    ax.set_title(title)
    ax.set_xlabel('Year')
    ax.set_ylabel(ylabel)
    ax.legend(fontsize=9)
    ax.grid(True)

plt.tight_layout()
plt.savefig('../data/processed/fig_market_trends.png', dpi=120, bbox_inches='tight', facecolor='#0a0a0f')
plt.show()


## 4. Age-Value Depreciation Curves by Position

In [None]:

# One row per valuation since 2015: the player's age in whole years at the
# valuation date, their position, and the value in M EUR.
SQL = '''
    SELECT
        CAST((julianday(pv.date) - julianday(p.date_of_birth)) / 365.25 AS INTEGER) as age,
        p.position,
        pv.market_value_in_eur / 1e6 as value_m
    FROM player_valuations pv
    JOIN players p ON pv.player_id = p.player_id
    WHERE p.position IN ('Attack', 'Midfield', 'Defender', 'Goalkeeper')
      AND p.date_of_birth IS NOT NULL
      AND pv.market_value_in_eur > 0
      AND pv.date >= '2015-01-01'
'''
df_ages = pd.read_sql(SQL, conn)
# Restrict to ages 16 through 40, both ends inclusive.
df_ages = df_ages[df_ages['age'].between(16, 40)]
# Median value for every (position, age) pair, flattened back to columns.
age_curves = (
    df_ages
    .groupby(['position', 'age'])['value_m']
    .median()
    .reset_index()
)

pos_colors = {'Attack': ORANGE, 'Midfield': CYAN, 'Defender': '#3b82f6', 'Goalkeeper': '#a855f7'}
fig, ax = plt.subplots(figsize=(12, 6))
for pos, color in pos_colors.items():
    curve = age_curves.loc[age_curves['position'] == pos]
    ax.plot(curve['age'], curve['value_m'], label=pos, color=color, linewidth=2.5)

ax.set_title('Median Market Value by Age and Position (Actuarial Depreciation Curve)')
ax.set_xlabel('Age')
ax.set_ylabel('Median Market Value (M EUR)')
ax.legend()
ax.grid(True)
# Dashed vertical reference line at age 26.
ax.axvline(x=26, color=ORANGE, linestyle='--', alpha=0.4)
plt.tight_layout()
plt.savefig('../data/processed/fig_age_curves.png', dpi=120, bbox_inches='tight', facecolor='#0a0a0f')
plt.show()


## 5. Value Distribution (Log Scale)

In [None]:

# Positive market values (in M EUR) recorded on or after 2023-01-01.
SQL = '''
    SELECT market_value_in_eur / 1e6 as value_m
    FROM player_valuations
    WHERE market_value_in_eur > 0
      AND date >= '2023-01-01'
'''
df_vals = pd.read_sql(SQL, conn)

values = df_vals['value_m']
lo, hi = values.min(), values.max()
# A log x-axis needs log-spaced bin edges: with equal-width linear bins almost
# all observations land in the first few bins, which the log axis then stretches
# while the remaining bins are squeezed together on the right. geomspace needs
# positive endpoints, which the SQL filter (value > 0) guarantees; fall back to
# the plain bin count for empty or constant data.
if not values.empty and lo < hi:
    bins = np.geomspace(lo, hi, 101)  # 101 edges -> 100 bins
else:
    bins = 100

fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(values, bins=bins, color=CYAN, alpha=0.8, edgecolor='none')
ax.set_xscale('log')
ax.set_title('Player Market Value Distribution (Log Scale, 2023+)')
ax.set_xlabel('Market Value (M EUR)')
ax.set_ylabel('Count')
ax.grid(True)
plt.tight_layout()
plt.savefig('../data/processed/fig_value_dist.png', dpi=120, bbox_inches='tight', facecolor='#0a0a0f')
plt.show()

pct = values.quantile([0.5, 0.75, 0.9, 0.95, 0.99])
# A plain loop rather than a list comprehension: as the last expression of a
# cell, the comprehension would make Jupyter display "[None, None, ...]".
for p, v in pct.items():
    print(f"  {round(p * 100)}th pct: {v:.2f}M")


In [None]:
# Close the SQLite connection opened in the setup cell; cells that query the
# database must be run before this one.
conn.close()
print('EDA complete.')