# 02 — Analysis (regressions + figures)

This notebook:
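1. Loads `data/processed/ess_macro_merged.csv` (individual-level data) and `data/processed/country_trust_summary.csv` (country-level data)
2. Runs the regression specifications used in the paper
3. Produces figures and saves them to `results/figures/`
4. Exports the regression coefficients to `results/tables/regression_results.csv`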

Run `01_data_prep.ipynb` first.


In [None]:
# Imports
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import statsmodels.formula.api as smf
import statsmodels.api as sm


In [None]:
# Paths
# `Path` is already imported in the imports cell above.
#
# The project root is expected one level above the kernel's working directory.
# If the processed-data folder is not found there, fall back to the working
# directory itself so the notebook also runs when launched from the root.
cwd = Path().resolve()
ROOT = next(
    (base for base in (cwd.parent, cwd) if (base / "data" / "processed").is_dir()),
    cwd.parent,  # same value as before when neither candidate matches
)
DATA_PROCESSED = ROOT / "data" / "processed"
RESULTS_FIG = ROOT / "results" / "figures"
RESULTS_TAB = ROOT / "results" / "tables"

merged_file = DATA_PROCESSED / "ess_macro_merged.csv"
country_file = DATA_PROCESSED / "country_trust_summary.csv"

# Fail early, and before creating any output directory, so that a wrong working
# directory does not leave stray `results/` folders behind.
missing_inputs = [path for path in (merged_file, country_file) if not path.is_file()]
if missing_inputs:
    raise FileNotFoundError(
        "Missing processed input file(s): "
        + ", ".join(str(path) for path in missing_inputs)
        + ". Run 01_data_prep.ipynb first."
    )

RESULTS_FIG.mkdir(parents=True, exist_ok=True)
RESULTS_TAB.mkdir(parents=True, exist_ok=True)

df_merged = pd.read_csv(merged_file)
country_level = pd.read_csv(country_file)

print(df_merged.shape, country_level.shape)
df_merged.head()


In [None]:
# Quick correlations (country-level)
# Pairwise correlation matrix of the outcome and the two macro indicators.
corr_columns = ["trust_index", "gini", "gdp_pps"]
corr = country_level.loc[:, corr_columns].corr()
corr


In [None]:
# Model A: trust ~ gini + gdp + controls (clustered by country)
#
# Drop incomplete rows on every column the models or the clustering use, not
# only on the controls. The formula interface drops rows with missing values
# by default, so a NaN left in the outcome or a regressor would make the
# estimation sample shorter than the `groups` vector passed for clustering.
MODEL_COLUMNS = ["trust_index", "gini", "gdp_pps", "agea", "gndr", "eisced", "cntry"]
df_final = df_merged.dropna(subset=MODEL_COLUMNS)

model_A = smf.ols(
    "trust_index ~ gini + gdp_pps + agea + C(gndr) + C(eisced)",
    data=df_final
).fit(cov_type="cluster", cov_kwds={"groups": df_final["cntry"]})

print(model_A.summary())


In [None]:
# Model B: trust ~ gini + controls (no GDP)
# Same sample and country-clustered errors as Model A, with the GDP term left out.
formula_B = "trust_index ~ gini + agea + C(gndr) + C(eisced)"
ols_B = smf.ols(formula_B, data=df_final)
model_B = ols_B.fit(
    cov_type="cluster",
    cov_kwds={"groups": df_final["cntry"]},
)

print(model_B.summary())


In [None]:
# Model C: trust ~ gini + log(GDP) + controls
# `assign` returns a new frame, so the log-GDP column is added without
# touching the frame the earlier models were fitted on.
df_final = df_final.assign(log_gdp=np.log(df_final["gdp_pps"]))

formula_C = "trust_index ~ gini + log_gdp + agea + C(gndr) + C(eisced)"
ols_C = smf.ols(formula_C, data=df_final)
model_C = ols_C.fit(
    cov_type="cluster",
    cov_kwds={"groups": df_final["cntry"]},
)

print(model_C.summary())


In [None]:
# Country-level regressions (N = number of countries)
# Outcome shared by both specifications.
y = country_level["trust_index"]

# Model D: GDP in levels.
X = sm.add_constant(country_level[["gini", "gdp_pps"]])
model_D = sm.OLS(y, X).fit()
print(model_D.summary())

# Model E: GDP in logs. `assign` builds a new frame carrying the extra column.
country_level = country_level.assign(log_gdp=np.log(country_level["gdp_pps"]))
Xlog = sm.add_constant(country_level[["gini", "log_gdp"]])
model_E = sm.OLS(y, Xlog).fit()
print(model_E.summary())


In [None]:
# Figure 1: Trust vs Gini (country-level)
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(country_level["gini"], country_level["trust_index"])

# Straight line fitted by least squares, drawn over the observed Gini range.
slope, intercept = np.polyfit(country_level["gini"], country_level["trust_index"], 1)
gini_grid = np.linspace(country_level["gini"].min(), country_level["gini"].max(), 100)
ax.plot(gini_grid, slope * gini_grid + intercept)

# Label every point with its country code.
for gini_value, trust_value, code in zip(
    country_level["gini"], country_level["trust_index"], country_level["cntry"]
):
    ax.text(gini_value, trust_value, code)

ax.set_xlabel("Gini (Inequality)")
ax.set_ylabel("Average Trust")
ax.set_title("Country-Level Trust vs Inequality (2023)")

out = RESULTS_FIG / "trust_vs_gini.png"
fig.tight_layout()
fig.savefig(out, dpi=200)
plt.show()
print("Saved:", out)


In [None]:
# Figure 2: Trust vs GDP (country-level)
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(country_level["gdp_pps"], country_level["trust_index"])

# Straight line fitted by least squares, drawn over the observed GDP range.
slope, intercept = np.polyfit(country_level["gdp_pps"], country_level["trust_index"], 1)
gdp_grid = np.linspace(country_level["gdp_pps"].min(), country_level["gdp_pps"].max(), 100)
ax.plot(gdp_grid, slope * gdp_grid + intercept)

# Label every point with its country code.
for gdp_value, trust_value, code in zip(
    country_level["gdp_pps"], country_level["trust_index"], country_level["cntry"]
):
    ax.text(gdp_value, trust_value, code)

ax.set_xlabel("GDP per capita (PPS)")
ax.set_ylabel("Average Trust")
ax.set_title("Country-Level Trust vs GDP (2023)")

out = RESULTS_FIG / "trust_vs_gdp.png"
fig.tight_layout()
fig.savefig(out, dpi=200)
plt.show()
print("Saved:", out)


In [None]:
# Export regression tables (simple CSV summary)
def tidy_ols(res, model_name):
    """Flatten a fitted regression result into one row per coefficient.

    Parameters
    ----------
    res : object exposing ``params``, ``bse`` and ``pvalues`` as pandas Series
        The fitted result; term names are read from ``res.params.index``.
    model_name : str
        Label written to the ``model`` column of every row.

    Returns
    -------
    pandas.DataFrame
        Columns ``term``, ``coef``, ``se``, ``p`` and ``model``, in that order.
    """
    estimates = res.params
    summary = pd.DataFrame(
        {
            "term": estimates.index,
            "coef": estimates.to_numpy(),
            "se": res.bse.to_numpy(),
            "p": res.pvalues.to_numpy(),
        }
    )
    # Scalar assignment broadcasts the label to every coefficient row.
    summary["model"] = model_name
    return summary

# Label -> fitted result, in the order the rows should appear in the CSV.
fitted_models = {
    "A_full": model_A,
    "B_no_gdp": model_B,
    "C_log_gdp": model_C,
    "D_country": model_D,
    "E_country_log": model_E,
}

tables = pd.concat(
    [tidy_ols(result, label) for label, result in fitted_models.items()],
    ignore_index=True,
)

out = RESULTS_TAB / "regression_results.csv"
tables.to_csv(out, index=False)
print("Saved:", out)
tables.head(10)
