In [12]:
# !pip install --quiet pandas numpy matplotlib seaborn plotly missingno nbformat jupyterlab ipywidgets
# !pip install --quiet scipy

In [13]:
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import missingno as msno
from scipy import stats
import plotly.graph_objects as go
from IPython.display import display, HTML

# --- Notebook-wide configuration ---
# Seed NumPy's legacy global random generator.
np.random.seed(7)
# Use seaborn's "whitegrid" style.
sns.set(style="whitegrid")
# Let pandas print up to 2000 columns at a display width of 180 characters.
pd.set_option("display.width", 180)
pd.set_option("display.max_columns", 2000)


In [14]:
# 1) Load the raw training data (path is relative to the notebook's working directory)
df = pd.read_csv("../data/train_data.csv")

# 2) Simple clean
# Column names: trim, lower-case, and replace spaces with underscores.
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")
for c in ["country", "status"]:
    if c in df.columns:
        # astype(str) on its own turns a missing value into the text "nan"
        # ("Nan" after title-casing), which isna() no longer reports. Mask the
        # cleaned text so cells that were missing stay missing.
        was_present = df[c].notna()
        df[c] = df[c].astype(str).str.strip().str.title().where(was_present)

# 3) Convert numbers (bad text -> NaN)
num_cols = [c for c in df.columns if c not in ["country","status"]]
for c in num_cols:
    df[c] = pd.to_numeric(df[c], errors="coerce")

print("✅ Dataset cleaned")
print("Shape:", df.shape)

display(df.head(300))

# Missing-value count per column, largest first.
print("\n Top columns with missing values")
display(df.isna().sum().sort_values(ascending=False).to_frame("Missing Values"))

# Persist the cleaned frame next to the notebook.
df.to_csv("life_expectancy_clean.csv", index=False)

# BASIC INFO
print("=== df.info() ===")
df.info()
display(df.describe(include='all').T.head(25))



✅ Dataset cleaned
Shape: (2497, 22)


Unnamed: 0,country,year,status,life_expectancy,adult_mortality,infant_deaths,alcohol,percentage_expenditure,hepatitis_b,measles,bmi,under-five_deaths,polio,total_expenditure,diphtheria,hiv/aids,gdp,population,thinness__1-19_years,thinness_5-9_years,income_composition_of_resources,schooling
0,Hungary,2009,Developed,74.2,162.0,0,11.46,1281.155944,,1,61.1,1,99.0,7.55,99.0,0.1,12967.165430,12265.0,1.8,1.8,0.816,15.3
1,Singapore,2010,Developed,82.0,61.0,0,1.84,4540.543752,96.0,50,31.8,0,96.0,3.96,96.0,0.1,46569.679510,,2.1,2.1,0.889,14.5
2,New Zealand,2008,Developed,81.0,75.0,0,9.49,6761.288966,9.0,0,63.8,0,89.0,1.70,89.0,0.1,31287.778650,,0.3,0.3,0.894,19.5
3,Honduras,2000,Developing,71.0,174.0,6,2.61,28.808311,93.0,0,38.8,8,88.0,6.63,94.0,1.7,188.783165,6524283.0,2.8,2.7,0.551,9.8
4,Egypt,2002,Developing,68.7,177.0,61,0.15,0.000000,97.0,653,51.8,75,97.0,5.97,97.0,0.1,,,3.4,3.4,0.617,11.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,Gambia,2003,Developing,57.0,297.0,3,2.47,0.000000,94.0,119,19.7,6,87.0,4.22,87.0,2.7,,,9.6,9.6,0.395,7.3
296,Uzbekistan,2004,Developing,67.8,183.0,24,1.59,34.418872,99.0,75,37.0,29,99.0,5.11,99.0,0.3,465.119887,2586435.0,3.2,3.2,0.613,11.6
297,Japan,2010,Developed,83.0,62.0,3,6.90,863.006149,,450,26.9,4,98.0,9.58,97.0,0.1,4457.676390,1287.0,1.9,1.6,0.879,15.1
298,Cameroon,2008,Developing,54.2,382.0,56,5.90,68.707304,84.0,495,24.9,87,82.0,5.18,84.0,6.7,1233.524316,18978.0,6.4,6.5,0.466,8.8



 Top columns with missing values


Unnamed: 0,Missing Values
population,540
hepatitis_b,481
gdp,365
total_expenditure,191
alcohol,164
income_composition_of_resources,141
schooling,137
thinness__1-19_years,31
thinness_5-9_years,31
bmi,31


=== df.info() ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2497 entries, 0 to 2496
Data columns (total 22 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   country                          2497 non-null   object 
 1   year                             2497 non-null   int64  
 2   status                           2497 non-null   object 
 3   life_expectancy                  2488 non-null   float64
 4   adult_mortality                  2488 non-null   float64
 5   infant_deaths                    2497 non-null   int64  
 6   alcohol                          2333 non-null   float64
 7   percentage_expenditure           2497 non-null   float64
 8   hepatitis_b                      2016 non-null   float64
 9   measles                          2497 non-null   int64  
 10  bmi                              2466 non-null   float64
 11  under-five_deaths                2497 non-null   int64  
 12  po

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
country,2497.0,192.0,Hungary,16.0,,,,,,,
year,2497.0,,,,2007.479375,4.616338,2000.0,2003.0,2007.0,2011.0,2015.0
status,2497.0,2.0,Developing,2060.0,,,,,,,
life_expectancy,2488.0,,,,69.14377,9.517112,36.3,63.2,72.0,75.525,89.0
adult_mortality,2488.0,,,,164.647508,124.08001,1.0,74.0,144.0,227.0,723.0
infant_deaths,2497.0,,,,28.987185,107.624711,0.0,0.0,3.0,22.0,1800.0
alcohol,2333.0,,,,4.577008,4.056307,0.01,0.85,3.67,7.68,17.87
percentage_expenditure,2497.0,,,,731.54887,1987.075471,0.0,5.347718,64.398533,437.105966,19479.91161
hepatitis_b,2016.0,,,,81.008433,25.107816,1.0,77.0,92.0,96.0,99.0
measles,2497.0,,,,2431.130156,11740.711517,0.0,0.0,17.0,383.0,212183.0


In [15]:
# Per-country coverage: number of distinct years plus the first and last year seen.
cov = df.groupby("country")["year"].agg(
    years_present="nunique",
    first_year="min",
    last_year="max",
)
cov = cov.sort_values("years_present", ascending=False).reset_index()

# One bar per country, ordered from most to fewest years present.
fig_bar = px.bar(
    cov,
    x="country",
    y="years_present",
    title="Years of coverage per country",
    labels={"years_present": "# of Years"},
    hover_data=["first_year", "last_year"],
)
# Rotate and shrink the country tick labels and leave bottom margin for them.
fig_bar.update_layout(
    height=600,
    margin={"b": 200},
    xaxis={"tickangle": -60, "tickfont": {"size": 8}},
)
fig_bar.show()


In [16]:
# === Panel sanity + global mean/median + coverage heatmap (all countries) ===

# Fail fast if any column this section relies on is absent.
assert {"country", "year", "life_expectancy"}.issubset(df.columns), \
    "Need 'country', 'year', and 'life_expectancy' columns"

# duplicated() marks every repeat of a (country, year) key after its first occurrence.
is_repeat = df.duplicated(subset=["country", "year"])
dup_ct = int(is_repeat.sum())
print(f"Duplicate (country, year) rows: {dup_ct}")

# Collapse to ONE life_expectancy value per (country, year) by averaging any repeats.
df_unique = df.groupby(["country", "year"], as_index=False)["life_expectancy"].mean()

# Coverage summary: distinct years, first year and last year per country.
coverage = df_unique.groupby("country")["year"].agg(
    years_present="nunique",
    first_year="min",
    last_year="max",
)
coverage = coverage.sort_values("years_present", ascending=False).reset_index()
print("\nYears present per country (head):")
display(coverage.head(10))

# Wide country × year table of life expectancy, rows ordered by country.
pivot_le = df_unique.pivot(index="country", columns="year", values="life_expectancy")
pivot_le = pivot_le.sort_index()

def show_wide_table(dfp: pd.DataFrame):
    """Display ``dfp`` as an HTML table inside a horizontally scrollable ``<div>``.

    Floats are rendered with one decimal place and missing values as empty cells.

    Parameters
    ----------
    dfp : pd.DataFrame
        Frame to render (for example a wide country × year pivot).
    """
    # pandas writes `na_rep` (default "NaN") for missing cells instead of passing
    # them to a callable float_format, so blanks must be requested through na_rep.
    # The notna guard in the lambda is kept as a fallback.
    html = dfp.to_html(
        float_format=lambda x: f"{x:.1f}" if pd.notna(x) else "",
        na_rep="",
    )
    display(HTML(f'<div style="overflow-x:auto; max-width:100%;">{html}</div>'))

print("\n life expectancy by country × year :")
show_wide_table(pivot_le)

# 5) GLOBAL trend per year — mean vs median
# Each (country, year) pair counts once because df_unique is already de-duplicated.
global_trend = df_unique.groupby("year")["life_expectancy"].agg(
    global_mean="mean",
    global_median="median",
)
global_trend = global_trend.reset_index().sort_values("year")
print("\nGlobal life expectancy per year — mean vs median (first rows):")
display(global_trend.head(12))

# One line per statistic; the median is dashed to tell it apart from the mean.
fig = go.Figure()
for stat_col, trace_label, line_style in (
    ("global_mean", "Global Mean", None),
    ("global_median", "Global Median", dict(dash="dash")),
):
    fig.add_trace(go.Scatter(
        x=global_trend["year"],
        y=global_trend[stat_col],
        mode="lines+markers",
        name=trace_label,
        line=line_style,
    ))
fig.update_layout(
    template="plotly_white",
    title="Global Life Expectancy — Mean vs Median",
    xaxis_title="Year",
    yaxis_title="Life Expectancy",
)
fig.show()

# coverage heatmap (all countries × years)
countries = sorted(df["country"].dropna().unique().tolist())
years     = sorted(df["year"].dropna().unique().tolist())

# 1 where a (country, year) row exists, 0 where it does not.
# The pivot/reindex leaves absent combinations as NaN; without the fillna those
# cells are not drawn at all, so the colour defined for 0 below would never appear.
grid = (df[["country","year"]]
          .drop_duplicates()
          .assign(present=1)
          .pivot(index="country", columns="year", values="present")
          .reindex(index=countries, columns=years)
          .fillna(0))

fig_covmap = go.Figure(data=go.Heatmap(
    z=grid.values, x=years, y=countries,
    colorscale=[[0, "#f0f0f0"], [1, "#4472c4"]],  # light grey = absent, blue = present
    zmin=0, zmax=1, showscale=False
))
fig_covmap.update_layout(
    title="Coverage heatmap — Countries × Years (all countries)",
    xaxis_title="Year", yaxis_title="Country",
    template="plotly_white",
    # 18 px per country, clamped to the range 600–2200 px
    height=max(600, min(2200, 18 * len(countries))),
    margin=dict(l=160, r=20, t=50, b=40)
)
fig_covmap.update_xaxes(side="top")
fig_covmap.show()


Duplicate (country, year) rows: 0

Years present per country (head):


Unnamed: 0,country,years_present,first_year,last_year
0,Armenia,16,2000,2015
1,Cambodia,16,2000,2015
2,Cabo Verde,16,2000,2015
3,Chad,16,2000,2015
4,Czechia,16,2000,2015
5,Zambia,16,2000,2015
6,Timor-Leste,16,2000,2015
7,Vanuatu,16,2000,2015
8,Singapore,16,2000,2015
9,Sao Tome And Principe,16,2000,2015



 life expectancy by country × year :


year,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Afghanistan,54.8,55.3,56.2,56.7,57.0,57.3,57.3,57.5,58.1,58.6,58.8,,59.5,59.9,,65.0
Albania,72.6,73.6,73.3,72.8,73.0,73.5,74.2,75.9,75.3,,76.2,76.6,76.9,77.2,,
Algeria,,71.4,71.6,71.7,72.3,72.9,73.4,73.8,74.1,74.4,,74.9,75.1,75.3,75.4,75.6
Angola,45.3,45.7,46.5,46.8,47.1,47.4,47.7,48.2,,49.1,49.6,51.0,56.0,51.1,51.7,52.4
Antigua And Barbuda,73.6,,74.0,74.2,74.4,,74.8,75.0,75.2,75.4,75.6,75.7,75.9,76.1,76.2,76.4
Argentina,74.1,74.0,,,74.7,,75.2,74.8,75.4,75.6,75.5,75.7,75.9,76.0,76.2,76.3
Armenia,72.0,72.6,72.6,72.7,73.0,73.0,72.9,73.5,73.2,73.3,73.5,73.9,74.4,74.4,74.6,74.8
Australia,79.5,,79.9,83.0,86.0,81.0,81.2,81.3,81.3,81.7,81.9,82.0,82.3,82.5,82.7,
Austria,78.1,,,78.8,,,79.8,,84.0,82.0,84.0,88.0,88.0,,81.4,81.5
Azerbaijan,66.6,67.5,67.8,67.8,68.4,68.4,69.2,73.0,73.0,,,71.6,71.9,,72.5,72.7



Global life expectancy per year — mean vs median (first rows):


Unnamed: 0,year,global_mean,global_median
0,2000,66.825466,71.1
1,2001,67.324026,71.25
2,2002,67.065161,71.2
3,2003,67.490323,71.1
4,2004,67.621795,71.45
5,2005,67.893902,71.3
6,2006,68.776582,72.15
7,2007,68.83481,72.25
8,2008,69.264336,72.5
9,2009,69.430247,72.25


life_expectancy: 0–120 years (no human lives beyond ~120)

adult_mortality: ≥0 (rate, can be high but not negative)

infant_deaths / under-five_deaths: ≥0 (counts, can be zero)

alcohol: 0–20 (litres per capita; very high values occur, but anything above 20 is implausible)

percentage_expenditure: ≥0 (can be huge %, not bounded strictly)

hepatitis_b / polio / diphtheria: 0–100 (vaccine coverage, %); measles: ≥0 (cannot be negative)

bmi: 0–70 (realistic BMI; above 70 is biologically implausible)

hiv/aids: ≥0 (death rate per 1000, can be large but not negative)

gdp: ≥0 (can be very large, not negative in this dataset context)

population: ≥0 (cannot be negative)

thinness (1–19y, 5–9y): 0–100 (share of children affected, % not >100)

income_composition_of_resources: 0–1 (index definition, 1 = max human development)

schooling: 0–25 (mean years of schooling; the largest observed values are around 20)

In [17]:
# === IDENTIFIERS + TARGET ===
id_cols = [c for c in ["country", "year", "status"] if c in df.columns]
target = "life_expectancy"  # regression target

# === BOUNDS: realistic (lo, hi) ranges; None means "no limit on that side" ===
bounds = {
    "life_expectancy": (0, 120),
    "schooling": (0, 25),
    "income_composition_of_resources": (0, 1),
    "bmi": (0, 70),
    "adult_mortality": (0, None),         # only lower bound (>=0)
    "infant_deaths": (0, None),
    "under-five_deaths": (0, None),
    "measles": (0, None),
    "polio": (0, 100),
    "diphtheria": (0, 100),
    "hepatitis_b": (0, 100),
    "gdp": (0, None),
    "population": (0, None),
    "total_expenditure": (0, 100),        # % of govt expenditure (if present)
    # Ranges listed in the notes above this cell that were not being checked:
    "alcohol": (0, 20),
    "percentage_expenditure": (0, None),
    "hiv/aids": (0, None),
    "thinness__1-19_years": (0, 100),
    "thinness_5-9_years": (0, 100),
}


def _out_of_bounds_mask(values, lo, hi):
    """Return a boolean mask of entries below ``lo`` or above ``hi``.

    A bound of None is not checked. Missing values compare False on both
    sides, so they are never flagged.
    """
    bad = pd.Series(False, index=values.index)
    if lo is not None:
        bad |= values < lo
    if hi is not None:
        bad |= values > hi
    return bad


# Single pass over the bounds: collect the flagged-row count per column and the
# offending rows themselves (identifier columns + the checked column).
oob_counts = []
oob_rows = {}
for c, (lo, hi) in bounds.items():
    if c not in df.columns:
        continue
    bad = _out_of_bounds_mask(df[c], lo, hi)
    n = int(bad.sum())
    if n == 0:
        continue
    oob_counts.append((c, n))
    flagged = df.loc[bad, id_cols + [c]]
    if id_cols:  # sort_values needs at least one key
        flagged = flagged.sort_values(id_cols)
    oob_rows[c] = flagged.copy()

# Summary table: one row per column that has out-of-range values
if oob_counts:
    fig = go.Figure(data=[go.Table(
        header=dict(values=["Column","Out-of-range rows"], align="left"),
        cells=dict(values=[[r[0] for r in oob_counts],
                           [r[1] for r in oob_counts]], align="left")
    )])
    fig.update_layout(title="Bounds — flagged counts")
    fig.show()
else:
    print("✅ No out-of-range values under the simple bounds.")

# Detail tables: ALL out-of-range rows, one table per flagged column
for col, bad_df in oob_rows.items():
    fig = go.Figure(data=[go.Table(
        header=dict(values=list(bad_df.columns), align="left"),
        cells=dict(values=[bad_df[k].tolist() for k in bad_df.columns], align="left")
    )])
    fig.update_layout(
        title=f"{col}: ALL out-of-range rows",
        height=500
    )
    fig.show()





Real zero → keep it (it is true information, like “0 measles deaths”).

Fake zero → replace it with NaN so that it can be imputed properly later (otherwise the regression treats it as a real value and learns wrong patterns).

In [18]:
# === Suspicious Zeros + Missingness Diagnostics ===

def miss_rate(s):
    """Return the fraction of entries in ``s`` that are missing."""
    is_missing = s.isna()
    return is_missing.mean()

def status_gap(s):
    """Spread of the missing-value rate of ``s`` across the ``status`` groups of ``df``.

    Parameters
    ----------
    s : pd.Series
        Series whose index aligns with the notebook-level ``df``.

    Returns
    -------
    float
        Largest minus smallest per-status missing rate; 0.0 when there are
        fewer than two status groups; NaN when ``df`` has no "status" column.
    """
    if "status" not in df.columns:
        return np.nan
    # Group the series that was actually passed in (aligned to df by index)
    # instead of re-reading df[s.name]; that lookup ignored the argument's data
    # and raised KeyError for a renamed or unnamed series.
    by_status = s.isna().groupby(df["status"]).mean()
    return (by_status.max() - by_status.min()) if len(by_status) > 1 else 0.0

def year_trend(s):
    """Spearman correlation between the missingness of ``s`` and ``df["year"]``.

    Returns NaN when ``df`` has no "year" column or when the missing-indicator
    is constant (all present or all missing), since no trend can be measured.
    """
    if "year" in df.columns:
        missing_flag = s.isna().astype(float)  # 1.0 where missing, 0.0 otherwise
        if missing_flag.nunique() > 1:
            return missing_flag.corr(df["year"], method="spearman")
    return np.nan

vaccine_cols = ["polio","diphtheria","hepatitis_b"]
count_cols   = ["infant_deaths","under-five_deaths","measles"]
cont_cols    = ["gdp","population","adult_mortality","bmi","total_expenditure",
                "income_composition_of_resources","schooling","percentage_expenditure"]


def _diagnostic_row(col, kind, zero_rate, action):
    """Build one row of the diagnostics table for ``df[col]``.

    The status gap and the year trend are each computed once and then
    formatted; NaN results are shown as "n/a".
    """
    gap = status_gap(df[col])
    rho = year_trend(df[col])
    return (
        col, kind, f"{zero_rate*100:.2f}%",
        f"{miss_rate(df[col])*100:.1f}%",
        f"{gap*100:.1f}%" if not pd.isna(gap) else "n/a",
        f"{rho:.2f}" if not pd.isna(rho) else "n/a",
        action,
    )


plan_zero = []

# vaccines: if Developed countries almost never have 0%, then 0 is suspicious
for c in vaccine_cols:
    if c in df.columns:
        dev_rate = np.nan
        if "status" in df.columns:
            dev = df.loc[df["status"]=="Developed", c]
            if len(dev):
                dev_rate = (dev==0).mean()
        zero_rate = (df[c]==0).mean()
        # flag only when zeros exist and under 2% of Developed rows are zero
        action = "treat 0 as missing" if (not np.isnan(dev_rate) and dev_rate < 0.02 and zero_rate>0) else "keep 0 as real"
        plan_zero.append(_diagnostic_row(c, "vaccine %", zero_rate, action))

# counts: 0 can be natural (no deaths/cases)
for c in count_cols:
    if c in df.columns:
        zero_rate = (df[c]==0).mean()
        plan_zero.append(_diagnostic_row(c, "count", zero_rate, "keep 0 as real"))

# continuous nonnegatives: 0 suspicious if median>0 and rare (<1%)
for c in cont_cols:
    if c in df.columns:
        s = df[c].dropna()
        if s.empty:
            continue
        # here the zero rate is taken over non-missing values only
        zero_rate = (s==0).mean()
        med = s.median()
        action = "treat 0 as missing" if (med>0 and 0<zero_rate<0.01) else "don't auto flag as of now"
        plan_zero.append(_diagnostic_row(c, "continuous", zero_rate, action))

plan_zero_df = pd.DataFrame(plan_zero, columns=["column","type","zero_rate","miss_rate","status_gap","year_trend(rho)","recommended_action"])

fig = go.Figure(data=[go.Table(
    header=dict(values=list(plan_zero_df.columns), align="left"),
    cells=dict(values=[plan_zero_df[c] for c in plan_zero_df.columns], align="left")
)])
fig.update_layout(title="Suspicious Zeros + Missingness Diagnostics")
fig.show()


<5% missing → trivial, simple imputation okay.

5–20% missing → impute carefully, check patterns.

More than 40% missing → column is often dropped (too unreliable).

percentage_expenditure: any noticeable share of zeros is suspicious → countries don’t truly spend 0% on health → these zeros should be treated as NaN in preprocessing.


In [19]:
# === Univariate Analysis ===
# Every numeric column except the year identifier.
num_cols = [c for c in df.select_dtypes(include="number").columns if c != "year"]

# Whether the per-year trend can be split by country status (checked once).
split_by_status = "status" in df.columns

for c in num_cols:
    # --- Distribution: histogram with a marginal box plot ---
    fig = px.histogram(
        df,
        x=c,
        nbins=40,
        marginal="box",
        opacity=0.7,
        labels={c: c},
        title=f"Univariate Distribution — {c}",
    )
    fig.update_traces(marker_color="royalblue")
    fig.show()

    # --- Yearly mean: one line per status when available, otherwise one global line ---
    if split_by_status:
        trend = df.groupby(["year", "status"])[c].mean().reset_index()
        fig = px.line(
            trend, x="year", y=c, color="status", markers=True,
            title=f"Trend of {c} over Years by Status",
            labels={"year": "Year", c: c, "status": "Country Status"},
        )
    else:
        trend = df.groupby("year")[c].mean().reset_index()
        fig = px.line(
            trend, x="year", y=c, markers=True,
            title=f"Trend of {c} over Years (Global)",
            labels={"year": "Year", c: c},
        )
    fig.update_layout(template="plotly_white")
    fig.show()


In [20]:
target = "life_expectancy"
min_obs = 10  # same minimum number of complete pairs the correlation-summary cell requires

# numeric features except IDs
num_cols = [c for c in df.select_dtypes(include="number").columns if c not in ["year"] and c != target]
has_status = "status" in df.columns

results = []

for c in num_cols:
    # keep rows where the feature and the target (and status, used for colouring) are present
    subset = df[[c, target, "status"]].dropna() if has_status else df[[c, target]].dropna()
    # An emptiness check is not enough: stats.pearsonr raises ValueError when
    # given fewer than two pairs, and a handful of points is not informative.
    if len(subset) < min_obs:
        continue

    # --- correlations (linear, rank-based, and rank-concordance) ---
    pear_r, pear_p = stats.pearsonr(subset[c], subset[target])
    spear_r, spear_p = stats.spearmanr(subset[c], subset[target])
    kend_r, kend_p = stats.kendalltau(subset[c], subset[target])

    results.append((c, pear_r, pear_p, spear_r, spear_p, kend_r, kend_p))

    # --- scatterplot with an OLS trendline; the title carries the three coefficients ---
    fig = px.scatter(subset, x=c, y=target,
                     color="status" if has_status else None,
                     opacity=0.6, trendline="ols",
                     title=(f"{target} vs {c} <br>"
                            f"Pearson: {pear_r:.2f} (p={pear_p:.3g}), "
                            f"Spearman: {spear_r:.2f} (p={spear_p:.3g}), "
                            f"Kendall: {kend_r:.2f} (p={kend_p:.3g})"),
                     labels={c:c, target:target})
    fig.show()

# === Summary correlation table ===
corr_df = pd.DataFrame(results,
                       columns=["feature","pearson_corr","pearson_p",
                                "spearman_corr","spearman_p",
                                "kendall_tau","kendall_p"])

fig = go.Figure(data=[go.Table(
    header=dict(values=list(corr_df.columns), align="left"),
    cells=dict(values=[corr_df[c] for c in corr_df.columns], align="left")
)])
fig.update_layout(title=f"Correlation Summary with {target}")
fig.show()


In [21]:
# === Correlation Summary ===

from scipy.stats import pearsonr, spearmanr, kendalltau, pointbiserialr

target = "life_expectancy"
min_obs = 10  # minimum number of complete pairs needed to report a correlation
results = []

# numeric features only (note: "year" is included here)
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
if target in num_cols:
    num_cols.remove(target)

for c in num_cols:
    # rows where both the feature and the target are present
    pair = df[[c, target]].dropna()
    if len(pair) < min_obs:  # too few values to correlate
        continue
    x = pair[c]
    y = pair[target]

    # Pearson
    pearson_corr, pearson_p = pearsonr(x, y)

    # Spearman
    spearman_corr, spearman_p = spearmanr(x, y)

    # Kendall Tau
    kendall_corr, kendall_p = kendalltau(x, y)

    results.append((c, pearson_corr, pearson_p,
                    spearman_corr, spearman_p,
                    kendall_corr, kendall_p))

# handle categorical: status (binary → point-biserial)
if "status" in df.columns:
    # encode Developed=1, Developing=0; any other label maps to NaN and is dropped below
    status_num = df["status"].map({"Developed":1, "Developing":0})
    common_idx = status_num.dropna().index.intersection(df[target].dropna().index)
    # Same threshold as the numeric features (this used to be `> 10`, which
    # silently required one more observation for status than for the rest).
    if len(common_idx) >= min_obs:
        r, p = pointbiserialr(status_num.loc[common_idx], df.loc[common_idx, target])
        # only the Pearson columns are filled for status
        results.append(("status (point-biserial)", r, p, np.nan, np.nan, np.nan, np.nan))

# format results
corr_df = pd.DataFrame(results, columns=[
    "feature","pearson_corr","pearson_p",
    "spearman_corr","spearman_p",
    "kendall_tau","kendall_p"
])

# sort by absolute Pearson correlation
corr_df["abs_pearson"] = corr_df["pearson_corr"].abs()
corr_df = corr_df.sort_values("abs_pearson", ascending=False).drop(columns="abs_pearson")

# display table
fig = go.Figure(data=[go.Table(
    header=dict(values=list(corr_df.columns), align="left"),
    cells=dict(values=[corr_df[c] for c in corr_df.columns], align="left")
)])
fig.update_layout(title=f"Correlation of Features with {target}")
fig.show()


In [22]:
# === Population-Weighted vs Unweighted Trends===
if {"life_expectancy","year","population"}.issubset(df.columns):
    # unweighted mean: every row with a life_expectancy value counts equally
    global_unweighted = (
        df.groupby("year", as_index=False)["life_expectancy"]
          .mean()
          .rename(columns={"life_expectancy":"life_expectancy_unweighted"})
    )

    # weighted mean (population-weighted), computed per year as
    # sum(life_expectancy * population) / sum(population).
    # Vectorised sums replace groupby(...).apply(np.average): apply over the
    # grouping columns is deprecated in recent pandas, and np.average raises
    # ZeroDivisionError when a year's weights sum to zero.
    # NOTE(review): rows without a population are excluded here but not from the
    # unweighted mean, so the two lines are not built from identical rows.
    weighted_src = df.dropna(subset=["life_expectancy","population"])
    weighted_sum = (
        (weighted_src["life_expectancy"] * weighted_src["population"])
        .groupby(weighted_src["year"])
        .sum()
    )
    total_weight = weighted_src.groupby("year")["population"].sum()
    global_weighted = (
        (weighted_sum / total_weight.where(total_weight != 0))  # zero total weight -> NaN
        .rename("life_expectancy_weighted")
        .reset_index()
    )

    # merge: keep only years present in both tables
    global_trends = pd.merge(global_unweighted, global_weighted, on="year", how="inner")

    # plot
    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=global_trends["year"], y=global_trends["life_expectancy_unweighted"],
        mode="lines+markers", name="Unweighted Mean", line=dict(color="royalblue")
    ))
    fig.add_trace(go.Scatter(
        x=global_trends["year"], y=global_trends["life_expectancy_weighted"],
        mode="lines+markers", name="Population-Weighted Mean", line=dict(color="darkorange")
    ))
    fig.update_layout(
        title="Global Life Expectancy — Unweighted vs Population-Weighted",
        xaxis_title="Year",
        yaxis_title="Life Expectancy",
        template="plotly_white"
    )
    fig.show()

    display(global_trends.head(10))
else:
    print("⚠️ Need life_expectancy, year, and population columns for weighted trends.")






Unnamed: 0,year,life_expectancy_unweighted,life_expectancy_weighted
0,2000,66.825466,66.427787
1,2001,67.324026,63.739871
2,2002,67.065161,64.92938
3,2003,67.490323,65.048306
4,2004,67.621795,65.527022
5,2005,67.893902,66.248151
6,2006,68.776582,67.596648
7,2007,68.83481,67.231948
8,2008,69.264336,66.434538
9,2009,69.430247,70.348008


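The blue line (unweighted) is higher in most years (2009 is an exception in the table above).
Many small, rich countries (for example in Europe) have high life expectancy.

But their populations are tiny → counting every country equally pushes the unweighted average up.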

The orange line (weighted) is lower.

Big countries (India, China, Nigeria) had lower life expectancy.

Since billions live there, they drag the weighted average down.

So →

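Blue (unweighted) = “the average country is doing quite well.”

Orange (weighted) = “the average human is doing worse, because big countries matter more.”

Caveat: 540 population values are missing and some look implausible (e.g. 1287 for Japan in 2010), so the weighted line is indicative only.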