## Step 1: Load raw data and inspect

In [1]:
import pandas as pd

# Load scraped data
df = pd.read_csv("../data/raw/supply_raw.csv")

# Inspect the structure
df.head()
df.info()
df.describe(include='all')


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 233 entries, 0 to 232
Data columns (total 12 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   #                        233 non-null    int64  
 1   Country (or dependency)  233 non-null    object 
 2   Population 2025          233 non-null    object 
 3   Yearly Change            233 non-null    object 
 4   Net Change               233 non-null    object 
 5   Density (P/KmÂ²)         233 non-null    object 
 6   Land Area (KmÂ²)         233 non-null    object 
 7   Migrants (net)           233 non-null    object 
 8   Fert. Rate               233 non-null    float64
 9   Median Age               233 non-null    float64
 10  Urban Pop %              210 non-null    object 
 11  World Share              233 non-null    object 
dtypes: float64(2), int64(1), object(9)
memory usage: 22.0+ KB


Unnamed: 0,#,Country (or dependency),Population 2025,Yearly Change,Net Change,Density (P/KmÂ²),Land Area (KmÂ²),Migrants (net),Fert. Rate,Median Age,Urban Pop %,World Share
count,233.0,233,233.0,233,233.0,233.0,233.0,233,233.0,233.0,210,233
unique,,233,233.0,177,231.0,164.0,224.0,233,,,191,165
top,,India,1463865525.0,0.47%,5.0,4.0,460.0,"â495,753",,,86%,0.13%
freq,,1,1.0,4,2.0,8.0,3.0,1,,,2,8
mean,117.0,,,,,,,,2.306094,31.896996,,
std,67.405489,,,,,,,,1.134316,9.860936,,
min,1.0,,,,,,,,0.69,14.5,,
25%,59.0,,,,,,,,1.47,22.9,,
50%,117.0,,,,,,,,1.94,32.8,,
75%,175.0,,,,,,,,2.95,39.8,,


## Step 2: Cleaning

In [3]:
# Standardize column names
df.columns = df.columns.str.lower().str.replace(" ", "_")

# Drop rows with any missing values
df = df.dropna()

# Convert numeric columns (Worldometers stores numbers as strings)
for col in df.columns:
    df[col] = pd.to_numeric(df[col], errors='ignore')

df.head()


  df[col] = pd.to_numeric(df[col], errors='ignore')


Unnamed: 0,#,country_(or_dependency),population_2025,yearly_change,net_change,density_(p/kmâ²),land_area_(kmâ²),migrants_(net),fert._rate,median_age,urban_pop_%,world_share
0,1,India,1463865525,0.89%,12929734,492,2973190,"â495,753",1.94,28.8,37.1%,17.78%
1,2,China,1416096094,â0.23%,"â3,225,184",151,9388211,"â268,126",1.02,40.1,67.5%,17.20%
2,3,United States,347275807,0.54%,1849236,38,9147420,1230663,1.62,38.5,82.8%,4.22%
3,4,Indonesia,285721236,0.79%,2233305,158,1811570,"â39,509",2.1,30.4,59.6%,3.47%
4,5,Pakistan,255219554,1.57%,3950390,331,770880,"â1,235,336",3.5,20.6,34.4%,3.10%


## Step 3: Save processed data

In [4]:
df.to_csv("../data/processed/supply_data.csv", index=False)
df.head()


Unnamed: 0,#,country_(or_dependency),population_2025,yearly_change,net_change,density_(p/kmâ²),land_area_(kmâ²),migrants_(net),fert._rate,median_age,urban_pop_%,world_share
0,1,India,1463865525,0.89%,12929734,492,2973190,"â495,753",1.94,28.8,37.1%,17.78%
1,2,China,1416096094,â0.23%,"â3,225,184",151,9388211,"â268,126",1.02,40.1,67.5%,17.20%
2,3,United States,347275807,0.54%,1849236,38,9147420,1230663,1.62,38.5,82.8%,4.22%
3,4,Indonesia,285721236,0.79%,2233305,158,1811570,"â39,509",2.1,30.4,59.6%,3.47%
4,5,Pakistan,255219554,1.57%,3950390,331,770880,"â1,235,336",3.5,20.6,34.4%,3.10%
