# Milestone 1 — SQL Analysis (SQLite)



In [1]:
# Setup: imports + data path
import os
import sqlite3
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
warnings.filterwarnings("ignore")
%matplotlib inline

# Data location.
# The folder below is specific to one machine. To run the notebook elsewhere,
# set the OLYMPICS_DATA_DIR environment variable to the folder that holds the
# CSV files; when it is unset (or empty) the original path is used unchanged.
DEFAULT_DATA_DIR = r"C:\Users\Adithya\Downloads\Bibhu\SQL\athlete_events\data"
data_dir = Path(os.environ.get("OLYMPICS_DATA_DIR") or DEFAULT_DATA_DIR).expanduser()
athletes_fp = data_dir / "athlete_events.csv"
noc_fp = data_dir / "noc_regions.csv"  # defined for later use; not read in this cell

print("Using data path:", athletes_fp)
# Fail early with an actionable message instead of a bare pandas read error.
if not athletes_fp.exists():
    raise FileNotFoundError(f"athlete_events.csv not found at {athletes_fp}. Place the CSV in this folder and re-run the notebook.")

# Load the CSV into pandas. low_memory=False makes pandas infer each column's
# dtype from the whole file rather than chunk by chunk.
df = pd.read_csv(athletes_fp, low_memory=False)
print("Loaded CSV with shape:", df.shape)

# Helper columns used by the rest of the notebook:
#   Year     -> normalised to an integer dtype
#   is_medal -> True where the Medal column is non-null
df['Year'] = df['Year'].astype(int)
df['is_medal'] = df['Medal'].notna()


Using data path: C:\Users\Adithya\Downloads\Bibhu\SQL\athlete_events\data\athlete_events.csv
Loaded CSV with shape: (271116, 15)


## Create in-memory SQLite DB and load dataframe as a SQL table

In [2]:
# Open a SQLite database that lives only in memory (nothing is written to disk)
# and copy the DataFrame into it as the `olympics` table.
#   index=False         -> the DataFrame index is not stored as a column
#   if_exists="replace" -> an existing `olympics` table is dropped and recreated
conn = sqlite3.connect(":memory:")
df.to_sql(name="olympics", con=conn, if_exists="replace", index=False)
print("Table olympics created in SQLite in-memory DB")

Table olympics created in SQLite in-memory DB


## SQL: Medals over time (example query)

In [3]:
# Medal rows per Olympic year.
# pandas is already imported in the setup cell, so it is not re-imported here.
# COUNT(Medal) counts table rows whose Medal is non-null, so the totals are
# per table row (presumably one row per athlete per event — TODO confirm
# against the dataset description), not per medal event.
query = '''
SELECT Year, COUNT(Medal) AS total_medals
FROM olympics
WHERE Medal IS NOT NULL
GROUP BY Year
ORDER BY Year;
'''
medals_over_time = pd.read_sql_query(query, conn)
medals_over_time.head()

Unnamed: 0,Year,total_medals
0,1896,143
1,1900,604
2,1904,486
3,1906,458
4,1908,831


## SQL: Top 10 sports by medal count

In [4]:
# Ten sports with the most medal rows.
# The secondary sort key (Sport) makes the result deterministic: without it,
# SQLite may return sports with equal medal_count in any order, so a tie at
# the LIMIT boundary could change which sports appear between runs.
query = '''
SELECT Sport, COUNT(Medal) AS medal_count
FROM olympics
WHERE Medal IS NOT NULL
GROUP BY Sport
ORDER BY medal_count DESC, Sport ASC
LIMIT 10;
'''
top_sports_sql = pd.read_sql_query(query, conn)
top_sports_sql

Unnamed: 0,Sport,medal_count
0,Athletics,3969
1,Swimming,3048
2,Rowing,2945
3,Gymnastics,2256
4,Fencing,1743
5,Football,1571
6,Ice Hockey,1530
7,Hockey,1528
8,Wrestling,1296
9,Cycling,1263


## SQL: Top 10 NOCs by medal count

In [5]:
# Ten NOCs (National Olympic Committee codes) with the most medal rows.
# The secondary sort key (NOC) makes the result deterministic: without it,
# SQLite may return NOCs with equal medal_count in any order, so a tie at
# the LIMIT boundary could change which NOCs appear between runs.
query = '''
SELECT NOC, COUNT(Medal) AS medal_count
FROM olympics
WHERE Medal IS NOT NULL
GROUP BY NOC
ORDER BY medal_count DESC, NOC ASC
LIMIT 10;
'''
top_nocs_sql = pd.read_sql_query(query, conn)
top_nocs_sql

Unnamed: 0,NOC,medal_count
0,USA,5637
1,URS,2503
2,GER,2165
3,GBR,2068
4,FRA,1777
5,ITA,1637
6,SWE,1536
7,CAN,1352
8,AUS,1320
9,RUS,1165


## SQL: Gender participation per year (athlete counts)

In [6]:
# Distinct athletes per (Year, Sex).
# COUNT(DISTINCT ID) counts each ID once per group even when it appears on
# several rows (assumes ID uniquely identifies an athlete — TODO confirm).
# Ordering by Sex as well as Year fixes the order of the F/M rows within a
# year; with ORDER BY Year alone that order is not guaranteed by SQLite.
query = '''
SELECT Year, Sex, COUNT(DISTINCT ID) AS athletes_count
FROM olympics
GROUP BY Year, Sex
ORDER BY Year, Sex;
'''
gender_participation = pd.read_sql_query(query, conn)
gender_participation.head()

Unnamed: 0,Year,Sex,athletes_count
0,1896,M,176
1,1900,F,23
2,1900,M,1201
3,1904,F,6
4,1904,M,644


## Notes
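- These queries run on an **in-memory** SQLite database created from the CSV; nothing is written to disk, and the database is rebuilt each time the notebook is re-run.
- Medal counts are row counts: a team medal is counted once for every athlete row that carries it, which is why team sports such as Football and Ice Hockey rank so high.
- For larger or external databases, use PostgreSQL and the loader script instead.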