# 03 - SQL Analysis

Demonstrates CTEs, window functions, correlated subqueries, and financial metrics in pure SQL.

In [None]:

import pandas as pd
import sqlite3
import sys
from pathlib import Path

# Location of the SQLite database, relative to the current working directory.
DB_PATH = Path('..') / 'data' / 'processed' / 'football.db'

# sqlite3.connect() creates a new, empty database file when the target does not
# exist, which would only surface later as confusing "no such table" errors.
# Fail fast with a clear message instead.
if not DB_PATH.is_file():
    raise FileNotFoundError(f"SQLite database not found: {DB_PATH.resolve()}")

# Shared connection used by every query cell below.
conn = sqlite3.connect(DB_PATH)
print("Connected.")

def show(title, sql, n=10):
    """Run a SQL query, print a titled banner, and preview the result.

    Args:
        title: Heading printed between two rules of 60 '=' characters.
        sql: Query text executed through ``pd.read_sql`` on the module-level
            ``conn`` connection.
        n: Number of leading rows passed to ``display``; defaults to 10.

    Returns:
        The complete query result as a DataFrame (not just the preview).
    """
    # Execute first so that a failing query prints no banner.
    result = pd.read_sql(sql, conn)
    print(f"\n{'='*60}\n  {title}\n{'='*60}")
    preview = result.head(n)
    display(preview)
    return result


## 1. YoY Growth -- Window Function (LAG)

In [None]:
# Yearly total market value per league with year-over-year growth via LAG().
# - `yearly` sums every positive valuation row dated 2012-01-01 or later for the
#   five listed competition ids, grouped by calendar year and league, in billions.
# - LAG(total_value_bn) OVER (PARTITION BY league_id ORDER BY year) reads the
#   previous year's total of the same league; the first year of each league
#   therefore has a NULL growth value.
# - NULLIF(..., 0) makes a zero previous-year total yield NULL growth.
# - Rows are ordered by league, newest year first, and capped at 25 by LIMIT;
#   show() previews its default of 10 rows.
# NOTE(review): SUM() adds all valuation rows within a year, so a player valued
# more than once in a year would be counted more than once -- confirm intended.
# NOTE(review): strftime('%Y', date) presumably relies on `date` being stored in
# a format SQLite's date functions understand (e.g. 'YYYY-MM-DD') -- TODO confirm.
show('Year-over-Year Market Value Growth (LAG + Window)', '''
    WITH yearly AS (
        SELECT
            CAST(strftime('%Y', date) AS INTEGER) as year,
            player_club_domestic_competition_id as league_id,
            SUM(market_value_in_eur) / 1e9 as total_value_bn
        FROM player_valuations
        WHERE player_club_domestic_competition_id IN ('GB1','ES1','IT1','L1','FR1')
          AND market_value_in_eur > 0
          AND date >= '2012-01-01'
        GROUP BY year, league_id
    )
    SELECT year, league_id,
        ROUND(total_value_bn, 2) as total_value_bn,
        ROUND(
            (total_value_bn - LAG(total_value_bn) OVER (PARTITION BY league_id ORDER BY year))
            * 100.0
            / NULLIF(LAG(total_value_bn) OVER (PARTITION BY league_id ORDER BY year), 0),
        1) as yoy_growth_pct
    FROM yearly
    ORDER BY league_id, year DESC
    LIMIT 25
''')

## 2. Transfer ROI -- Correlated Subquery + CASE

In [None]:
# Compares each transfer fee with the player's market value up to a year later.
# - `transfer_roi` keeps transfers with a fee above 10,000,000 and, through a
#   correlated subquery, attaches the most recent valuation dated after the
#   transfer and no later than 365 days after it (ORDER BY pv.date DESC LIMIT 1).
# - Transfers with no valuation in that window are dropped
#   (WHERE mv_after_1yr IS NOT NULL).
# - roi_pct is (value - fee) / fee as a percentage; fee_m and value_1yr_m are
#   the same amounts divided by 1e6.
# - roi_category buckets the value relative to the fee using strict `>`
#   comparisons, so a value exactly equal to the fee lands in 'Moderate Loss'.
# - The 20 largest fees are returned.
# NOTE(review): the date comparisons assume `pv.date` and `t.transfer_date` are
# ISO 'YYYY-MM-DD' text so that string order matches date order -- TODO confirm.
show('Transfer ROI: Market Value Change 1yr After Transfer', '''
    WITH transfer_roi AS (
        SELECT
            t.player_name, t.transfer_fee, t.transfer_date,
            t.from_club_name, t.to_club_name,
            (SELECT pv.market_value_in_eur
             FROM player_valuations pv
             WHERE pv.player_id = t.player_id
               AND pv.date > t.transfer_date
               AND pv.date <= DATE(t.transfer_date, '+365 days')
             ORDER BY pv.date DESC LIMIT 1) AS mv_after_1yr
        FROM transfers t
        WHERE t.transfer_fee > 10000000
    )
    SELECT
        player_name,
        ROUND(transfer_fee / 1e6, 1) as fee_m,
        ROUND(mv_after_1yr / 1e6, 1) as value_1yr_m,
        ROUND((mv_after_1yr - transfer_fee) * 100.0 / transfer_fee, 1) as roi_pct,
        CASE
            WHEN mv_after_1yr > transfer_fee * 1.5 THEN 'Excellent (>50%)'
            WHEN mv_after_1yr > transfer_fee THEN 'Positive'
            WHEN mv_after_1yr > transfer_fee * 0.7 THEN 'Moderate Loss'
            ELSE 'Significant Loss'
        END as roi_category
    FROM transfer_roi
    WHERE mv_after_1yr IS NOT NULL
    ORDER BY fee_m DESC LIMIT 20
''')

## 3. Peak Age by Position -- PERCENT_RANK + ROW_NUMBER

In [None]:
# Finds, for each position, the age with the highest average market value.
# - `age_values` joins valuations to players and derives the age at valuation
#   time as whole years: the julianday difference divided by 365.25, truncated
#   by CAST(... AS INTEGER). Only positive valuations dated 2015-01-01 or later,
#   the four listed positions, and ages 17 through 38 are kept.
# - `ranked` converts the average to millions and adds two window columns per
#   position: PERCENT_RANK() over ascending average value (scaled to 0-100) and
#   ROW_NUMBER() over descending average value.
# - The final SELECT keeps only value_rank = 1, i.e. one peak-age row per
#   position, ordered by that peak average value.
# NOTE(review): because only the top-ranked row per position is kept,
# value_pct_rank will normally be 100.0 for every returned row -- confirm the
# column is meant to be shown here.
show('Peak Value Age by Position', '''
    WITH age_values AS (
        SELECT
            p.position,
            CAST((julianday(pv.date) - julianday(p.date_of_birth)) / 365.25 AS INTEGER) as age,
            AVG(pv.market_value_in_eur) as avg_value
        FROM player_valuations pv
        JOIN players p ON pv.player_id = p.player_id
        WHERE p.position IN ('Attack', 'Midfield', 'Defender', 'Goalkeeper')
          AND p.date_of_birth IS NOT NULL
          AND pv.market_value_in_eur > 0
          AND pv.date >= '2015-01-01'
        GROUP BY p.position, age
        HAVING age BETWEEN 17 AND 38
    ),
    ranked AS (
        SELECT position, age,
            ROUND(avg_value / 1e6, 2) as avg_value_m,
            ROUND(PERCENT_RANK() OVER (PARTITION BY position ORDER BY avg_value) * 100, 1) as value_pct_rank,
            ROW_NUMBER() OVER (PARTITION BY position ORDER BY avg_value DESC) as value_rank
        FROM age_values
    )
    SELECT position, age, avg_value_m, value_pct_rank
    FROM ranked WHERE value_rank = 1
    ORDER BY avg_value_m DESC
''')

## 4. Sharpe Ratio -- Financial SQL (Variance, Std Dev in pure SQL)

In [None]:
# Mean, volatility and mean/volatility ratio of monthly value changes per league.
# - `monthly_values`: average positive valuation per league and 'YYYY-MM' month,
#   for the five listed competition ids from 2015-01-01 onwards.
# - `monthly_returns`: percentage change versus the previous available month of
#   the same league (LAG over the month string); the first month of each league
#   is NULL and is filtered out before aggregating.
# - `league_stats`: computes the mean and the population standard deviation
#   once, as SQRT(E[x^2] - E[x]^2). That difference can come out marginally
#   negative through floating-point rounding, in which case SQRT would yield
#   NULL; the scalar MAX(..., 0.0) clamps it to zero first.
# - The final SELECT rounds for display; NULLIF(stddev_return, 0) keeps a zero
#   volatility from producing a division by zero (the ratio becomes NULL).
# The ratio shown is mean / standard deviation with no risk-free rate
# subtracted and no annualisation.
# SQRT() is one of SQLite's optional built-in math functions, so the query
# needs a SQLite build that includes them.
show('Risk-Adjusted Returns (Sharpe Ratio) by League', '''
    WITH monthly_values AS (
        SELECT
            player_club_domestic_competition_id as league_id,
            strftime('%Y-%m', date) as month,
            AVG(market_value_in_eur) as avg_value
        FROM player_valuations
        WHERE player_club_domestic_competition_id IN ('GB1','ES1','IT1','L1','FR1')
          AND market_value_in_eur > 0 AND date >= '2015-01-01'
        GROUP BY league_id, month
    ),
    monthly_returns AS (
        SELECT league_id, month,
            (avg_value - LAG(avg_value) OVER (PARTITION BY league_id ORDER BY month))
            * 100.0 / NULLIF(LAG(avg_value) OVER (PARTITION BY league_id ORDER BY month), 0)
            as monthly_return_pct
        FROM monthly_values
    ),
    league_stats AS (
        SELECT league_id,
            AVG(monthly_return_pct) as mean_return,
            SQRT(MAX(
                (SUM(monthly_return_pct * monthly_return_pct) / COUNT(*))
                - (AVG(monthly_return_pct) * AVG(monthly_return_pct)),
                0.0
            )) as stddev_return
        FROM monthly_returns
        WHERE monthly_return_pct IS NOT NULL
        GROUP BY league_id
    )
    SELECT league_id,
        ROUND(mean_return, 3) as avg_monthly_return,
        ROUND(stddev_return, 3) as volatility,
        ROUND(mean_return / NULLIF(stddev_return, 0), 3) as sharpe_ratio
    FROM league_stats
    ORDER BY sharpe_ratio DESC
''')

## 5. Club Net Spend -- FULL OUTER JOIN + Multi-CTE

In [None]:
# Transfer fees paid and received per club since 2018-01-01, and the net spend.
# - `club_spend` totals positive fees by buying club (to_club_name).
# - `club_receipts` totals positive fees by selling club (from_club_name).
# - FULL OUTER JOIN keeps clubs that appear on only one side; COALESCE fills the
#   missing side with 0 and picks whichever club name is present.
# - net_spend_m is spent minus received, so a positive value means the club paid
#   out more than it took in (the previous received - spent form reported the
#   opposite sign under the "net spend" label).
# - Amounts are in millions; the 20 biggest spenders are returned.
# FULL OUTER JOIN is only supported by SQLite 3.39.0 and later.
show('Club Net Transfer Spend (2018+)', '''
    WITH club_spend AS (
        SELECT to_club_name as club, SUM(transfer_fee) as total_spent
        FROM transfers WHERE transfer_fee > 0 AND transfer_date >= '2018-01-01'
        GROUP BY to_club_name
    ),
    club_receipts AS (
        SELECT from_club_name as club, SUM(transfer_fee) as total_received
        FROM transfers WHERE transfer_fee > 0 AND transfer_date >= '2018-01-01'
        GROUP BY from_club_name
    )
    SELECT COALESCE(s.club, r.club) as club,
        ROUND(COALESCE(s.total_spent, 0) / 1e6, 1) as spent_m,
        ROUND(COALESCE(r.total_received, 0) / 1e6, 1) as received_m,
        ROUND((COALESCE(s.total_spent, 0) - COALESCE(r.total_received, 0)) / 1e6, 1) as net_spend_m
    FROM club_spend s
    FULL OUTER JOIN club_receipts r ON s.club = r.club
    ORDER BY spent_m DESC LIMIT 20
''')

In [None]:
# Release the SQLite connection now that all queries have run.
conn.close()
print('SQL analysis complete.')