In [None]:
import pandas as pd
from pathlib import Path

# Name of the ESS extract to load (including the .csv extension)
file_name = "ees_round_11.csv"

# Desktop folder of the current user (resolved from the home directory)
desktop_path = Path.home() / "Desktop"

# Full file path
file_path = desktop_path / file_name

# Read CSV. low_memory=False makes pandas infer each column's dtype from the
# whole file at once, matching how the ESS file is read elsewhere in this notebook.
df = pd.read_csv(file_path, low_memory=False)

# Show first rows (bare last expression -> rich notebook display)
df.head()



In [None]:
import numpy as np

# The three ESS social-trust items that make up the index
trust_vars = ['ppltrst', 'pplfair', 'pplhlp']

# Recode the special missing codes (77, 88, 99) to NaN for all items in one pass
df[trust_vars] = df[trust_vars].replace([77, 88, 99], np.nan)

# Sanity check: after recoding, min/max of every item should lie within 0-10
print(df.loc[:, trust_vars].describe())

# Trust index: per-respondent mean across the items (NaN items are skipped)
df['trust_index'] = df[trust_vars].mean(axis="columns")

# Distribution of the resulting index
print(df['trust_index'].describe())


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# --- 1) Clean trust items and build trust_index ---
trust_vars = ["ppltrst", "pplfair", "pplhlp"]

# ESS special missing codes (77, 88, 99) -> NaN.
# Safe to re-run: once recoded, those codes no longer occur in the columns.
for v in trust_vars:
    df[v] = df[v].replace([77, 88, 99], np.nan)

# Trust index = equal-weight average of the 3 items. mean() skips NaN, so a
# respondent with at least one valid item still gets an index value.
df["trust_index"] = df[trust_vars].mean(axis=1)

# Keep only rows with country + trust_index
df_micro = df.dropna(subset=["cntry", "trust_index"]).copy()

# --- 2) Country-level trust summary (mean, SD, N), highest trust first ---
country_trust = (
    df_micro.groupby("cntry")
    .agg(
        mean_trust=("trust_index", "mean"),
        sd_trust=("trust_index", "std"),
        n=("trust_index", "size")
    )
    .reset_index()
    .sort_values("mean_trust", ascending=False)
)

print(country_trust.head(15))

# --- 3) Export the table (optional) ---
# Written to the current working directory.
country_trust.to_csv("country_trust_summary.csv", index=False)
print("Saved: country_trust_summary.csv")

# --- 4) Plot: average trust by country (optional but useful) ---
# Take the 20 highest-trust countries, then re-sort ascending so the highest
# bar ends up at the top of the horizontal bar chart.
top = country_trust.head(20).sort_values("mean_trust", ascending=True)

plt.figure(figsize=(10, 6))
plt.barh(top["cntry"], top["mean_trust"])
# Fixed: the label previously contained a mis-encoded en dash.
plt.xlabel("Average Trust Index (0–10)")
plt.ylabel("Country")
plt.title("Top 20 Countries by Average Trust (ESS Round 11)")
plt.tight_layout()
plt.show()


In [None]:
!pip install statsmodels
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf

# =========================
# 0) Paths (your uploaded files)
# =========================
# ESS_PATH, GINI_PATH and GDP_PATH were read below without being defined in
# this cell, which raised NameError on a fresh kernel. The file names mirror
# the ones used for the same inputs elsewhere in this notebook; adjust them if
# your files are named differently.
from pathlib import Path

DESKTOP = Path.home() / "Desktop"
ESS_PATH = DESKTOP / "ESS11e04_1.csv"
GINI_PATH = DESKTOP / "gini_path.tsv"
GDP_PATH = DESKTOP / "gdp_path.tsv"

# Snapshot of the already-loaded df (requires trust_index to exist on it)
df_clean = df[[
    "trust_index",
    "cntry",
    "agea",
    "gndr",
    "eisced",
    "hincfel"
]].copy()

df_clean = df_clean.dropna(subset=["trust_index", "cntry"])



# =========================
# 1) Load ESS microdata
# =========================
df = pd.read_csv(ESS_PATH, low_memory=False)

# Trust items (ESS special missings -> NaN), build equal-weight Trust Index
trust_vars = ["ppltrst", "pplfair", "pplhlp"]
for v in trust_vars:
    df[v] = df[v].replace([77, 88, 99], np.nan)

df["trust_index"] = df[trust_vars].mean(axis=1)

# Keep essential columns (controls optional; keep simple)
keep_cols = ["trust_index", "cntry", "gndr", "agea", "eisced", "hincfel"]
df_micro = df[keep_cols].copy()

# Missing-value codes depend on each variable's width in ESS, so one shared
# list is wrong: 77/88/99 are real ages in `agea` (its missing code is 999),
# while `gndr` uses 9 and `hincfel` uses 7/8/9.
# NOTE(review): codes taken from the ESS codebook -- confirm against the
# codebook of the exact file in use. eisced 0 / 55 are left as categories.
control_missing_codes = {
    "gndr": [9],
    "agea": [999],
    "eisced": [77, 88, 99],
    "hincfel": [7, 8, 9],
}
for c, codes in control_missing_codes.items():
    df_micro[c] = df_micro[c].replace(codes, np.nan)

# Drop rows missing key fields
df_micro = df_micro.dropna(subset=["trust_index", "cntry"])

# =========================
# 2) Load Eurostat Gini (2023) + GDP per capita PPS (2023)
# =========================
gini = pd.read_csv(GINI_PATH, sep="\t")
gdp  = pd.read_csv(GDP_PATH, sep="\t")

# Strip whitespace from col names (Eurostat has trailing spaces)
gini.columns = [c.strip() for c in gini.columns]
gdp.columns  = [c.strip() for c in gdp.columns]

# Extract geo code from the first "dimensions" column (last comma-separated part)
gini_dim = "freq,age,statinfo,geo\\TIME_PERIOD"
gdp_dim  = "freq,indic_ppp,ppp_cat18,geo\\TIME_PERIOD"

gini["geo"] = gini[gini_dim].str.split(",", expand=True).iloc[:, -1].str.strip()
gdp["geo"]  = gdp[gdp_dim].str.split(",", expand=True).iloc[:, -1].str.strip()

# Pull 2023 columns and convert to numeric
gini_2023 = gini[["geo", "2023"]].rename(columns={"2023": "gini_2023"}).copy()
gdp_2023  = gdp[["geo", "2023"]].rename(columns={"2023": "gdp_pps_2023"}).copy()

# Eurostat cells can carry status flags after the number (e.g. "30.1 b") and
# use ":" for missing. A plain to_numeric(errors="coerce") would turn flagged
# values into NaN and silently drop those countries, so pull out the leading
# number first; cells without any digits still become NaN.
number_pattern = r"([-+]?\d+(?:\.\d+)?)"
gini_2023["gini_2023"] = pd.to_numeric(
    gini_2023["gini_2023"].astype(str).str.extract(number_pattern, expand=False),
    errors="coerce",
)
gdp_2023["gdp_pps_2023"] = pd.to_numeric(
    gdp_2023["gdp_pps_2023"].astype(str).str.extract(number_pattern, expand=False),
    errors="coerce",
)

# Merge macro datasets (country-level); keep only geos with both indicators
macro = pd.merge(gini_2023, gdp_2023, on="geo", how="inner").dropna()

# =========================
# 3) Merge macro into ESS microdata
# =========================
df_merged = pd.merge(df_micro, macro, left_on="cntry", right_on="geo", how="inner")

print("ESS micro rows (clean):", df_micro.shape)
print("Macro rows:", macro.shape)
print("Merged rows:", df_merged.shape)
print("\nCountries in merged data:", df_merged["cntry"].nunique())

# =========================
# 4) Quick descriptive + baseline regression (unweighted)
# =========================
# (You can add/adjust controls as you like)
# Baseline: trust_index ~ gini_2023 + gdp_pps_2023
model = smf.ols("trust_index ~ gini_2023 + gdp_pps_2023", data=df_merged).fit(
    cov_type="cluster", cov_kwds={"groups": df_merged["cntry"]}
)

print("\n=== Baseline regression (clustered SE by country) ===")
print(model.summary())

# Optional: add basic individual controls (keep simple).
# Build the complete-case sample once so the model data and the cluster groups
# are guaranteed to refer to the same rows.
df_controls = df_merged.dropna(subset=["agea", "gndr", "eisced", "hincfel"])

model_controls = smf.ols(
    "trust_index ~ gini_2023 + gdp_pps_2023 + agea + C(gndr) + C(eisced) + C(hincfel)",
    data=df_controls
).fit(cov_type="cluster", cov_kwds={"groups": df_controls["cntry"]})

print("\n=== Regression + basic controls (clustered SE by country) ===")
print(model_controls.summary())


In [None]:
!pip install statsmodels
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf



# =========================
# 0) Paths (your uploaded files)
# =========================
import pandas as pd
from pathlib import Path

# Desktop path (auto-detects your home directory)
desktop_path = Path.home() / "Desktop"

# GINI_PATH / GDP_PATH must stay *paths*: they are handed to pd.read_csv in
# section 2. Previously they were bound to already-loaded DataFrames, which
# made those read_csv calls fail.
file_name = "gini_path.tsv"
file_path = desktop_path / file_name
GINI_PATH = file_path

file_name = "gdp_path.tsv"
file_path = desktop_path / file_name
GDP_PATH = file_path

# ESS_PATH was used below without being defined in this cell. The file name
# mirrors the one used for the ESS extract elsewhere in this notebook; adjust
# it if your file is named differently.
ESS_PATH = desktop_path / "ESS11e04_1.csv"

# Snapshot of the already-loaded df (requires trust_index to exist on it)
df_clean = df[[
    "trust_index",
    "cntry",
    "agea",
    "gndr",
    "eisced",
    "hincfel"
]].copy()

df_clean = df_clean.dropna(subset=["trust_index", "cntry"])



# =========================
# 1) Load ESS microdata
# =========================
df = pd.read_csv(ESS_PATH, low_memory=False)

# Trust items (ESS special missings -> NaN), build equal-weight Trust Index
trust_vars = ["ppltrst", "pplfair", "pplhlp"]
for v in trust_vars:
    df[v] = df[v].replace([77, 88, 99], np.nan)

df["trust_index"] = df[trust_vars].mean(axis=1)

# Keep essential columns (controls optional; keep simple)
keep_cols = ["trust_index", "cntry", "gndr", "agea", "eisced", "hincfel"]
df_micro = df[keep_cols].copy()

# Missing-value codes depend on each variable's width in ESS, so one shared
# list is wrong: 77/88/99 are real ages in `agea` (its missing code is 999),
# while `gndr` uses 9 and `hincfel` uses 7/8/9.
# NOTE(review): codes taken from the ESS codebook -- confirm against the
# codebook of the exact file in use. eisced 0 / 55 are left as categories.
control_missing_codes = {
    "gndr": [9],
    "agea": [999],
    "eisced": [77, 88, 99],
    "hincfel": [7, 8, 9],
}
for c, codes in control_missing_codes.items():
    df_micro[c] = df_micro[c].replace(codes, np.nan)

# Drop rows missing key fields
df_micro = df_micro.dropna(subset=["trust_index", "cntry"])

# =========================
# 2) Load Eurostat Gini (2023) + GDP per capita PPS (2023)
# =========================
gini = pd.read_csv(GINI_PATH, sep="\t")
gdp  = pd.read_csv(GDP_PATH, sep="\t")

# Strip whitespace from col names (Eurostat has trailing spaces)
gini.columns = [c.strip() for c in gini.columns]
gdp.columns  = [c.strip() for c in gdp.columns]

# Extract geo code from the first "dimensions" column (last comma-separated part)
gini_dim = "freq,age,statinfo,geo\\TIME_PERIOD"
gdp_dim  = "freq,indic_ppp,ppp_cat18,geo\\TIME_PERIOD"

gini["geo"] = gini[gini_dim].str.split(",", expand=True).iloc[:, -1].str.strip()
gdp["geo"]  = gdp[gdp_dim].str.split(",", expand=True).iloc[:, -1].str.strip()

# Pull 2023 columns and convert to numeric
gini_2023 = gini[["geo", "2023"]].rename(columns={"2023": "gini_2023"}).copy()
gdp_2023  = gdp[["geo", "2023"]].rename(columns={"2023": "gdp_pps_2023"}).copy()

# Eurostat cells can carry status flags after the number (e.g. "30.1 b") and
# use ":" for missing. A plain to_numeric(errors="coerce") would turn flagged
# values into NaN and silently drop those countries, so pull out the leading
# number first; cells without any digits still become NaN.
number_pattern = r"([-+]?\d+(?:\.\d+)?)"
gini_2023["gini_2023"] = pd.to_numeric(
    gini_2023["gini_2023"].astype(str).str.extract(number_pattern, expand=False),
    errors="coerce",
)
gdp_2023["gdp_pps_2023"] = pd.to_numeric(
    gdp_2023["gdp_pps_2023"].astype(str).str.extract(number_pattern, expand=False),
    errors="coerce",
)

# Merge macro datasets (country-level); keep only geos with both indicators
macro = pd.merge(gini_2023, gdp_2023, on="geo", how="inner").dropna()

# =========================
# 3) Merge macro into ESS microdata
# =========================
df_merged = pd.merge(df_micro, macro, left_on="cntry", right_on="geo", how="inner")

print("ESS micro rows (clean):", df_micro.shape)
print("Macro rows:", macro.shape)
print("Merged rows:", df_merged.shape)
print("\nCountries in merged data:", df_merged["cntry"].nunique())

# =========================
# 4) Quick descriptive + baseline regression (unweighted)
# =========================
# (You can add/adjust controls as you like)
# Baseline: trust_index ~ gini_2023 + gdp_pps_2023
model = smf.ols("trust_index ~ gini_2023 + gdp_pps_2023", data=df_merged).fit(
    cov_type="cluster", cov_kwds={"groups": df_merged["cntry"]}
)

print("\n=== Baseline regression (clustered SE by country) ===")
print(model.summary())

# Optional: add basic individual controls (keep simple).
# Build the complete-case sample once so the model data and the cluster groups
# are guaranteed to refer to the same rows.
df_controls = df_merged.dropna(subset=["agea", "gndr", "eisced", "hincfel"])

model_controls = smf.ols(
    "trust_index ~ gini_2023 + gdp_pps_2023 + agea + C(gndr) + C(eisced) + C(hincfel)",
    data=df_controls
).fit(cov_type="cluster", cov_kwds={"groups": df_controls["cntry"]})

print("\n=== Regression + basic controls (clustered SE by country) ===")
print(model_controls.summary())


In [None]:
file_name = "gdp_path.tsv"

# Desktop path (auto-detects your home directory)
desktop_path = Path.home() / "Desktop"

# Full file path
file_path = desktop_path / file_name

# Keep GDP_PATH a *path*: other cells in this notebook pass it to pd.read_csv.
# It used to be rebound to the loaded DataFrame, which breaks those reads.
GDP_PATH = file_path

# Read TSV file into a separately named preview frame
gdp_preview = pd.read_csv(GDP_PATH, sep="\t")

# Preview
print(gdp_preview.describe())

In [None]:
import pandas as pd
from pathlib import Path

# File name
file_name = "gini_path.tsv"

# Desktop path
desktop_path = Path.home() / "Desktop"

# Full file path
file_path = desktop_path / file_name

# Keep GINI_PATH a *path*: other cells in this notebook pass it to pd.read_csv.
# It used to be rebound to the loaded DataFrame, which breaks those reads.
GINI_PATH = file_path

# Read TSV file into a separately named preview frame
gini_preview = pd.read_csv(GINI_PATH, sep="\t")

# Preview. DataFrame.info() prints its report itself and returns None, so it
# is called directly; wrapping it in print() added a stray "None" line.
gini_preview.info()


In [None]:
# !pip install statsmodels  # uncomment if needed

import pandas as pd
import numpy as np
import re
from pathlib import Path
import statsmodels.formula.api as smf

# =========================
# 0) Input locations: all three files are read from the user's Desktop
# =========================
DESKTOP = Path.home().joinpath("Desktop")

ESS_PATH, GINI_PATH, GDP_PATH = (
    DESKTOP / name for name in ("ESS11e04_1.csv", "gini_path.tsv", "gdp_path.tsv")
)

# Reference year, stored as a string
YEAR = "2023"

# Echo the resolved locations so a wrong path is easy to spot
print(
    "Reading files from:",
    f"ESS : {ESS_PATH}",
    f"Gini: {GINI_PATH}",
    f"GDP : {GDP_PATH}",
    sep="\n",
)

# =========================
# Helper: Eurostat value -> numeric
# =========================
def to_num_eurostat(x):
    """Convert one raw table cell to a float.

    Returns NaN when the cell is missing (per ``pd.isna``), when its stripped
    text starts with ":", or when it contains no digits. Otherwise returns the
    first integer/decimal number found in the text (optional sign included),
    ignoring anything around it.
    """
    if pd.isna(x):
        return np.nan

    text = str(x).strip()
    if text.startswith(":"):
        return np.nan

    # First signed integer or decimal anywhere in the text
    match = re.search(r"[-+]?\d+(\.\d+)?", text)
    if match is None:
        return np.nan
    return float(match.group(0))

# =========================
# 1) Load ESS + Trust Index
# =========================
df = pd.read_csv(ESS_PATH, low_memory=False)

# Trust items: special missing codes (77, 88, 99) -> NaN
trust_vars = ["ppltrst", "pplfair", "pplhlp"]
for v in trust_vars:
    df[v] = df[v].replace([77, 88, 99], np.nan)

# Equal-weight mean of the items; NaN items are skipped row by row
df["trust_index"] = df[trust_vars].mean(axis=1)

df_micro = df[["trust_index", "cntry", "agea", "gndr", "eisced", "hincfel"]].copy()

# Clean special missings in the controls too. The codes depend on each
# variable's width in ESS, so one shared list is wrong: 77/88/99 are real ages
# in `agea` (its missing code is 999), while `gndr` uses 9 and `hincfel` uses
# 7/8/9.
# NOTE(review): codes taken from the ESS codebook -- confirm against the
# codebook of the exact file in use. eisced 0 / 55 are left as categories.
control_missing_codes = {
    "agea": [999],
    "gndr": [9],
    "eisced": [77, 88, 99],
    "hincfel": [7, 8, 9],
}
for c, codes in control_missing_codes.items():
    df_micro[c] = df_micro[c].replace(codes, np.nan)

df_micro = df_micro.dropna(subset=["trust_index", "cntry"])

# =========================
# 2) Load Eurostat TSVs + extract YEAR
# =========================
gini_raw = pd.read_csv(GINI_PATH, sep="\t")
gdp_raw  = pd.read_csv(GDP_PATH,  sep="\t")

# Column labels can carry surrounding whitespace; normalise before lookup
gini_raw.columns = [c.strip() for c in gini_raw.columns]
gdp_raw.columns  = [c.strip() for c in gdp_raw.columns]

# first column contains dimensions; geo is last comma-separated element
gini_raw["geo"] = gini_raw.iloc[:, 0].astype(str).str.split(",", expand=True).iloc[:, -1].str.strip()
gdp_raw["geo"]  = gdp_raw.iloc[:, 0].astype(str).str.split(",", expand=True).iloc[:, -1].str.strip()

# Fail with a readable message if the requested year is not in the download
if YEAR not in gini_raw.columns:
    raise ValueError(f"Gini file does not contain year {YEAR}. Last columns: {gini_raw.columns[-5:]}")
if YEAR not in gdp_raw.columns:
    raise ValueError(f"GDP file does not contain year {YEAR}. Last columns: {gdp_raw.columns[-5:]}")

gini_yr = gini_raw[["geo", YEAR]].rename(columns={YEAR: "gini"}).copy()
gdp_yr  = gdp_raw[["geo", YEAR]].rename(columns={YEAR: "gdp_pps"}).copy()

# Raw cells -> floats (NaN for missing / non-numeric cells)
gini_yr["gini"] = gini_yr["gini"].apply(to_num_eurostat)
gdp_yr["gdp_pps"] = gdp_yr["gdp_pps"].apply(to_num_eurostat)

# Country-level table: only geos that have both indicators
macro = pd.merge(gini_yr, gdp_yr, on="geo", how="inner").dropna(subset=["gini", "gdp_pps"])

# =========================
# 3) Merge macro into ESS
# =========================
df_merged = pd.merge(df_micro, macro, left_on="cntry", right_on="geo", how="inner")

print("\nShapes:")
print("ESS micro:", df_micro.shape)
print("Macro   :", macro.shape)
print("Merged  :", df_merged.shape)
print("Countries in merged:", df_merged["cntry"].nunique())

# =========================
# 4) Regressions (unweighted, clustered SE by country)
# =========================
m1 = smf.ols("trust_index ~ gini + gdp_pps", data=df_merged).fit(
    cov_type="cluster", cov_kwds={"groups": df_merged["cntry"]}
)
print("\n=== Model 1: trust ~ gini + gdp_pps (clustered by country) ===")
print(m1.summary())

# Complete cases on the controls, so model rows and cluster groups line up
df_ctrl = df_merged.dropna(subset=["agea", "gndr", "eisced", "hincfel"]).copy()

m2 = smf.ols(
    "trust_index ~ gini + gdp_pps + agea + C(gndr) + C(eisced) + C(hincfel)",
    data=df_ctrl
).fit(cov_type="cluster", cov_kwds={"groups": df_ctrl["cntry"]})

print("\n=== Model 2: + controls (clustered by country) ===")
print(m2.summary())


In [None]:
import pandas as pd
import numpy as np
import re
from pathlib import Path
import statsmodels.formula.api as smf

YEAR = "2023"

# Desktop path (resolved from the current user's home directory)
DESKTOP = Path.home() / "Desktop"
GINI_FILE = DESKTOP / "gini_path.tsv"
GDP_FILE  = DESKTOP / "gdp_path.tsv"

# 1) Use existing df (already loaded) + trust_index (already created)
df_clean = df[["trust_index", "cntry", "agea", "gndr", "eisced", "hincfel"]].copy()

# The controls in df still carry ESS missing-value codes: nothing in this cell
# recoded them, so e.g. agea == 999 would enter later regressions as a real
# age. Recode them here; this is a no-op if they were already recoded.
# NOTE(review): codes taken from the ESS codebook -- confirm against the
# codebook of the exact file in use. eisced 0 / 55 are left as categories.
control_missing_codes = {
    "agea": [999],
    "gndr": [9],
    "eisced": [77, 88, 99],
    "hincfel": [7, 8, 9],
}
for col, codes in control_missing_codes.items():
    df_clean[col] = df_clean[col].replace(codes, np.nan)

df_clean = df_clean.dropna(subset=["trust_index", "cntry"])

# 2) Load Eurostat TSVs from Desktop
gini_raw = pd.read_csv(GINI_FILE, sep="\t")
gdp_raw  = pd.read_csv(GDP_FILE,  sep="\t")

# Column labels can carry surrounding whitespace; normalise before lookup
gini_raw.columns = [c.strip() for c in gini_raw.columns]
gdp_raw.columns  = [c.strip() for c in gdp_raw.columns]

# Extract geo code from first column (Eurostat dimension column):
# it is the last comma-separated element
gini_raw["geo"] = gini_raw.iloc[:, 0].astype(str).str.split(",", expand=True).iloc[:, -1].str.strip()
gdp_raw["geo"]  = gdp_raw.iloc[:, 0].astype(str).str.split(",", expand=True).iloc[:, -1].str.strip()

# Eurostat numeric cleaner
def to_num(x):
    """Parse a single raw cell into a float, or NaN when it holds no number.

    NaN is returned for values ``pd.isna`` treats as missing, for text that
    (after stripping) begins with ":", and for text without any digits. In all
    other cases the first signed integer/decimal in the text is returned.
    """
    if not pd.isna(x):
        cleaned = str(x).strip()
        if not cleaned.startswith(":"):
            found = re.search(r"[-+]?\d+(\.\d+)?", cleaned)
            if found:
                return float(found.group(0))
    # Missing cell, ":" marker, or nothing numeric found
    return np.nan

# Keep only the YEAR column of each Eurostat table and parse it to floats
gini = (
    gini_raw.loc[:, ["geo", YEAR]]
    .rename(columns={YEAR: "gini"})
    .assign(gini=lambda frame: frame["gini"].apply(to_num))
)
gdp = (
    gdp_raw.loc[:, ["geo", YEAR]]
    .rename(columns={YEAR: "gdp_pps"})
    .assign(gdp_pps=lambda frame: frame["gdp_pps"].apply(to_num))
)

# Country-level macro table: only geos for which both indicators are present
macro = gini.merge(gdp, on="geo", how="inner").dropna(subset=["gini", "gdp_pps"])

# 3) Attach the macro indicators to every ESS row via the country code
df_merged = df_clean.merge(macro, left_on="cntry", right_on="geo", how="inner")

print("Merged shape:", df_merged.shape)
print("Countries:", df_merged["cntry"].nunique())
print(df_merged[["cntry", "trust_index", "gini", "gdp_pps"]].head())

# 4) Unweighted OLS with standard errors clustered on country
model = smf.ols(formula="trust_index ~ gini + gdp_pps", data=df_merged).fit(
    cov_type="cluster", cov_kwds=dict(groups=df_merged["cntry"])
)

print("\n=== trust_index ~ gini + gdp_pps (clustered by country) ===")
print(model.summary())

In [None]:
# Quick structural check of the merged table: dimensions, then column labels
print("Shape of merged dataset:", df_merged.shape)
print("\nColumns:", df_merged.columns, sep="\n")


In [None]:
# Pairwise correlations between the trust index and the two macro indicators
df_merged.loc[:, ["trust_index", "gini", "gdp_pps"]].corr()


In [None]:
# Baseline OLS of trust on the two macro indicators, with standard errors
# clustered on country.
model1 = smf.ols(formula="trust_index ~ gini + gdp_pps", data=df_merged).fit(
    cov_type="cluster", cov_kwds=dict(groups=df_merged["cntry"])
)

print(model1.summary())


In [None]:
# Complete cases on the individual-level controls used below.
# .copy() makes df_final an independent frame: a column is added to it later
# in this notebook, and assigning into the bare dropna() result can trigger
# pandas' SettingWithCopyWarning.
df_final = df_merged.dropna(subset=["agea", "gndr", "eisced"]).copy()

# OLS with macro indicators plus age, gender and education (the latter two as
# categorical terms); standard errors clustered on country.
model2 = smf.ols(
    "trust_index ~ gini + gdp_pps + agea + C(gndr) + C(eisced)",
    data=df_final
).fit(
    cov_type="cluster",
    cov_kwds={"groups": df_final["cntry"]}
)

print(model2.summary())


In [None]:
# Same controls as the fuller specification but without gdp_pps, again with
# standard errors clustered on country.
model_no_gdp = smf.ols(
    formula="trust_index ~ gini + agea + C(gndr) + C(eisced)", data=df_final
).fit(cov_type="cluster", cov_kwds=dict(groups=df_final["cntry"]))

print(model_no_gdp.summary())


In [None]:
# Mean trust index per country code (one row per country)
country_trust = df_merged.groupby("cntry", as_index=False)["trust_index"].mean()

# Attach the macro indicators by matching cntry to geo
country_macro = country_trust.merge(macro, left_on="cntry", right_on="geo")

country_macro.head()


In [None]:
!pip install seaborn
import matplotlib.pyplot as plt
import seaborn as sns

# Scatter of country means against Gini with a fitted regression line
plt.figure(figsize=(8, 6))
sns.regplot(x="gini", y="trust_index", data=country_macro)

# Label every point with its country code
for x_pos, y_pos, code in zip(
    country_macro["gini"], country_macro["trust_index"], country_macro["cntry"]
):
    plt.text(x_pos, y_pos, code)

plt.title("Country-Level Trust vs Inequality (2023)")
plt.xlabel("Gini (Inequality)")
plt.ylabel("Average Trust")
plt.show()


In [None]:
# Scatter of country means against GDP per capita with a fitted regression line
plt.figure(figsize=(8, 6))
sns.regplot(x="gdp_pps", y="trust_index", data=country_macro)

# Label every point with its country code
for x_pos, y_pos, code in zip(
    country_macro["gdp_pps"], country_macro["trust_index"], country_macro["cntry"]
):
    plt.text(x_pos, y_pos, code)

plt.title("Country-Level Trust vs GDP (2023)")
plt.xlabel("GDP per capita (PPS)")
plt.ylabel("Average Trust")
plt.show()


In [None]:
import numpy as np

# Natural log of GDP per capita. assign() returns a new frame instead of
# writing into df_final in place: df_final comes from a dropna() on another
# frame, where in-place column assignment can raise SettingWithCopyWarning.
# Re-running this cell simply recomputes the same column.
df_final = df_final.assign(log_gdp=np.log(df_final["gdp_pps"]))


In [None]:
# Specification using log GDP in place of the raw level; standard errors
# clustered on country.
model_log = smf.ols(
    formula="trust_index ~ gini + log_gdp + agea + C(gndr) + C(eisced)", data=df_final
).fit(cov_type="cluster", cov_kwds=dict(groups=df_final["cntry"]))

print(model_log.summary())


In [None]:
# Step 1: collapse the merged micro data to one row per country.
# trust_index is averaged; gini and gdp_pps take the first value per group.
country_level = (
    df_merged.groupby("cntry")
    .agg(
        trust_index=("trust_index", "mean"),
        gini=("gini", "first"),
        gdp_pps=("gdp_pps", "first"),
    )
    .reset_index()
)

print(country_level.head())
print("Number of countries:", len(country_level))


In [None]:
import statsmodels.api as sm

# Design matrix: intercept plus the two country-level predictors
X = sm.add_constant(country_level.loc[:, ["gini", "gdp_pps"]])

# Outcome: country mean of the trust index
y = country_level.loc[:, "trust_index"]

# Plain OLS on the country-level table
model_country = sm.OLS(endog=y, exog=X).fit()

print(model_country.summary())


In [None]:
import numpy as np

# Natural log of GDP per capita, stored as a new column on the country table
country_level["log_gdp"] = country_level["gdp_pps"].pipe(np.log)

# Design matrix: intercept, Gini and log GDP
X_log = sm.add_constant(country_level.loc[:, ["gini", "log_gdp"]])
y = country_level.loc[:, "trust_index"]

# Country-level OLS with the log-income specification
model_country_log = sm.OLS(endog=y, exog=X_log).fit()

print(model_country_log.summary())
