In [None]:
import pandas as pd
import plotly.graph_objects as go
import plotly.io as pio
import numpy as np
import seaborn as sns
import plotly.express as px
import matplotlib.ticker as mtick
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

### Loading the 2002, 2012 and 2022 .dta files

In [None]:
# Read the 2012 wave with value labels converted to categoricals, requesting every
# variable except ISCO88 and SPISCO88.
# NOTE(review): presumably these two columns are skipped because their labels cannot be
# converted to categoricals — confirm.
with pd.io.stata.StataReader("../data/GESIS/2012.dta", convert_categoricals=True) as rdr:
    cols = [c for c in rdr.variable_labels().keys() if c not in {"ISCO88", "SPISCO88"}]
    df_2012 = rdr.read(columns=cols)

# Same approach for the 2002 wave, requesting every variable except v241 and v247.
with pd.io.stata.StataReader("../data/GESIS/2002.dta", convert_categoricals=True) as rdr:
    cols = [c for c in rdr.variable_labels().keys() if c not in ["v241","v247"]]
    df_2002 = rdr.read(columns=cols)

# The 2022 wave is read in full with pandas' default read_stata settings.
df_2022 = pd.read_stata("../data/GESIS/2022.dta")


In [None]:
# Per-wave CSV files from ../data/efa_csv.
# NOTE(review): none of the *_clean frames is referenced in the cells visible in this
# section — confirm they are still needed here.
df_2002_clean=pd.read_csv("../data/efa_csv/2002.csv")
df_2012_clean=pd.read_csv("../data/efa_csv/2012.csv")
df_2022_clean=pd.read_csv("../data/efa_csv/2022.csv")

### Proper plot-1

In [None]:
def code_for_income_control_2012(x):
    """Collapse a 2012 income-management answer into one of four control categories.

    Returns the category string for the answers listed below, and None for
    None or for any other value.
    """
    # (accepted answer labels, resulting category), checked in this order
    recoding = (
        (('I manage all and give partner his share',),
         "Financial control is with the respondent"),
        (('Partner manages all and gives me my share',),
         "Financial control is with the partner"),
        (('We pool all money, each take out', 'We pool some money, rest separate'),
         "Financial control is shared"),
        (('We each keep own money separate',),
         "Financial control is separate"),
    )
    if x is None:
        return None
    for answers, category in recoding:
        if x in answers:
            return category
    return None


In [None]:
def code_for_income_control_2022(x):
    """Collapse a 2022 income-management answer (numbered labels) into a control category.

    Returns the category string for the answers listed below, and None for
    None or for any other value.
    """
    # (accepted answer labels, resulting category), checked in this order
    recoding = (
        (('1. I manage all and give partner his share',),
         "Financial control is with the respondent"),
        (('2. Partner manages all and gives me my share',),
         "Financial control is with the partner"),
        (('3. We pool all money, each take out', '4. We pool some money, rest separate'),
         "Financial control is shared"),
        (('5. We each keep own money separate',),
         "Financial control is separate"),
    )
    if x is None:
        return None
    for answers, category in recoding:
        if x in answers:
            return category
    return None


In [None]:
def code_for_income_control_2002(x):
    """Collapse a 2002 income-management answer into one of four control categories.

    Returns the category string for the answers listed below, and None for
    None or for any other value.
    """
    # (accepted answer labels, resulting category), checked in this order
    recoding = (
        (('I manage all the money',),
         "Financial control is with the respondent"),
        (("Spouse,partner manages money",),
         "Financial control is with the partner"),
        (('We pool all the money', 'We pool some money'),
         "Financial control is shared"),
        (('Each keep own money separate',),
         "Financial control is separate"),
    )
    if x is None:
        return None
    for answers, category in recoding:
        if x in answers:
            return category
    return None


In [None]:
# Add a harmonised "code_income_control" column to each wave, using the wave-specific
# recode function on that wave's income-management variable (v29 / V41 / v38).
df_2002["code_income_control"] = df_2002["v29"].apply(code_for_income_control_2002)
df_2012["code_income_control"] = df_2012["V41"].apply(code_for_income_control_2012)
df_2022["code_income_control"] = df_2022["v38"].apply(code_for_income_control_2022)

In [None]:
# 3x2 grid: one row per wave, one column per sex, showing the percentage of married
# respondents in each financial-control category.
fig, axes = plt.subplots(3, 2, figsize=(14, 12), sharey=True)

years = [2002, 2012, 2022]
dfs = [df_2002, df_2012, df_2022]
sex_labels = ["Male", "Female"]
sex_codes_2022 = ["1. Male", "2. Female"]

# Fixed bar order (alphabetical, i.e. what sort_index() yields when every category is
# present). Reindexing to it keeps each category at the same x position and palette
# colour in every panel, even when a category has no respondents in a subgroup.
control_order = [
    "Financial control is separate",
    "Financial control is shared",
    "Financial control is with the partner",
    "Financial control is with the respondent",
]

for row, (year, df) in enumerate(zip(years, dfs)):
    for col, sex in enumerate(sex_labels):
        ax = axes[row, col]

        # Each wave uses its own column names and value labels for sex / marital status.
        if year == 2002:
            sub = df[(df["v200"] == sex) & (df["v202"] == "Marr,liv as mar")]
        elif year == 2012:
            sub = df[(df["SEX"] == sex) & (df["MARITAL"] == "Married")]
        else:  # 2022
            sub = df[(df["SEX"] == sex_codes_2022[col]) & (df["MARITAL"] == '1. Married')]

        # Percentages are taken over respondents with a non-missing control category.
        vc = sub["code_income_control"].value_counts(normalize=True) * 100
        vc = vc.reindex(control_order, fill_value=0)

        sns.barplot(x=vc.index, y=vc.values, ax=ax, palette="Set2")
        ax.set_title(f"{year} - {sex}", fontsize=12, fontweight='bold')
        ax.set_ylabel("Percentage (%)" if col == 0 else "")
        ax.set_xlabel("")
        ax.yaxis.set_major_formatter(mtick.PercentFormatter(decimals=0))
        ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right', fontsize=9)

plt.tight_layout()
plt.show()

In [None]:
def fix_hh_hours(x):
    """Convert a household-work-hours answer to a number of hours.

    Parameters
    ----------
    x : value from a household-hours column; either a number or a value-label string.

    Returns
    -------
    None for the missing/not-applicable labels listed below, 0 for labels starting
    with "None", the leading integer for labels that start with digits
    (e.g. "1 hour" -> 1, "10 hours" -> 10, "95 hrs and more" -> 95), and the
    value unchanged when it is not a string.

    Raises
    ------
    ValueError
        For a string that is neither a known missing label nor parseable as an integer.
    """
    import re

    invalid_options = (
        "Don't know, BG: can't choose",
        'No answer, CA: no answer, refused',
        "Don't know, BG: can't choose, KR: don't know, refused",
        'NAP, no partner (3 (AR,AT,BE,CH,CL,ES,IS,KR,NL,NO,PL,SK,US:2,3; BG,CZ,PT:2,3,7;IL:3,7) in PARTLIV;TW:3-6 in MARITAL)',
        "-9. No answer; HR, IS: DK/NA; ES, HU: Can't Ch/NA; LT: NA/DK/Hard to say; PL: Hard to say/NA; TW: Can't choose",
        "-9. No answer; HR, IS: DK/NA; ES, HU, TW: Can't Ch/NA; LT: NA/DK/Hard to say; PL: Hard to say/NA",
        '-4. NAP, no partn. or not liv. with part. (c.2,3,-7 PARTLIV; US: c.2 (if neither married nor cohabit. with partner), 3 PARTL',
    )
    if x in invalid_options:
        return None
    if isinstance(x, str):
        x = x.strip()
        if x.startswith("None"):
            return 0
        # Take the whole leading run of digits. Matching on the first character only
        # would turn "10 hours" into 1 and "25" into 2.
        leading_digits = re.match(r"\d+", x)
        if leading_digits:
            return int(leading_digits.group())
        # Unknown label: int() raises ValueError so it is noticed rather than silently kept.
        return int(x)
    return x


# Clean the respondent / spouse household-hours columns of every wave and bin them.
# include_lowest=True keeps respondents reporting 0 hours in the "0–20" bin; with the
# default right-closed intervals starting at 0 they would otherwise become NaN.
hh_hour_bins = [0, 20, 40, 60, 80, 100]
hh_hour_labels = ["0–20", "21–40", "41–60", "61–80", "81+"]

# (frame, respondent hours column, spouse hours column) for each wave
hh_hour_columns = [
    (df_2002, "v36", "v37"),
    (df_2012, "V37", "V39"),
    (df_2022, "v34", "v36"),
]

for frame, own_col, spouse_col in hh_hour_columns:
    frame[own_col] = frame[own_col].apply(fix_hh_hours)
    frame[spouse_col] = frame[spouse_col].apply(fix_hh_hours)

    frame["hh_bin"] = pd.cut(
        frame[own_col],
        bins=hh_hour_bins,
        labels=hh_hour_labels,
        include_lowest=True,
    )
    frame["spouse_hh_bin"] = pd.cut(
        frame[spouse_col],
        bins=hh_hour_bins,
        labels=hh_hour_labels,
        include_lowest=True,
    )


In [None]:
# Tag each wave's frame with a label stored as a plain attribute (not a column).
# NOTE(review): such an attribute is presumably not carried over to copies or slices —
# confirm that any consumer reads it from these exact objects.
df_2002.name = "df_2002"
df_2012.name = "df_2012"
df_2022.name = "df_2022"

In [None]:
# Among married respondents who report that ONE person controls the money (either the
# respondent or the partner), compute what percentage of those respondents are male
# and what percentage are female, separately for each wave.
years = [2002, 2012, 2022]
dfs = [df_2002, df_2012, df_2022]

# (sex column, marital column, male label, female label, married label) per wave;
# any year not listed falls back to the 2022 coding.
wave_coding = {
    2002: ("v200", "v202", "Male", "Female", "Marr,liv as mar"),
    2012: ("SEX", "MARITAL", "Male", "Female", "Married"),
}
coding_2022 = ("SEX", "MARITAL", "1. Male", "2. Female", "1. Married")

# keep only single-person control
single_person_control = [
    "Financial control is with the partner",
    "Financial control is with the respondent",
]

results = []

for year, df in zip(years, dfs):
    sex_col, marital_col, male_label, female_label, married_label = wave_coding.get(year, coding_2022)

    male = df[(df[sex_col] == male_label) & (df[marital_col] == married_label)]
    female = df[(df[sex_col] == female_label) & (df[marital_col] == married_label)]

    male_single = male[male["code_income_control"].isin(single_person_control)]
    female_single = female[female["code_income_control"].isin(single_person_control)]

    n_single = len(male_single) + len(female_single)
    results.append({
        "Year": year,
        "Male share": len(male_single) / n_single * 100,
        "Female share": len(female_single) / n_single * 100
    })

res_df = pd.DataFrame(results)
res_df

In [None]:
# Grouped bar chart: one group per wave, bars for the male and female share computed above.
res_df.set_index("Year")[["Male share", "Female share"]].plot(
    kind="bar",
    figsize=(8, 5)
)

plt.ylabel("Share among single-person control (%)")
# Shares are already on a 0-100 scale, which is PercentFormatter's default.
plt.gca().yaxis.set_major_formatter(mtick.PercentFormatter())
plt.title("Who controls finances when control is with one person? \n P(Male|Single person control) ~= P(Female|Single person control)")
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()


In [None]:
# V65 distribution among respondents whose partner controls the finances.
# Only the last expression of a cell is displayed, so this first table is computed and discarded.
df_2012[df_2012["code_income_control"].isin(["Financial control is with the partner"])][["V65"]].value_counts()
# DEGREE distribution among respondents who control the finances themselves.
# NOTE(review): the two lines tabulate different columns (V65 vs DEGREE) — confirm this is intended.
df_2012[df_2012["code_income_control"].isin(["Financial control is with the respondent"])][["DEGREE"]].value_counts()

# No correlation between education and financial control

In [None]:
# MAINSTAT distribution among 2012 respondents whose partner controls the finances.
df_2012[df_2012["code_income_control"].isin(["Financial control is with the partner"])][["MAINSTAT"]].value_counts() #MAINSTAT
# df_2012[df_2012["code_income_control"].isin(["Financial control is with the respondent"])][["MAINSTAT"]].value_counts() #SPMAINST

In [None]:
# Financial-control categories among 2012 male respondents whose MAINSTAT is "unemployed and looking for a job".
df_2012[(df_2012["MAINSTAT"] == "Unemployed and looking for a job, HR: incl never had a job") & (df_2012["SEX"] == "Male")]["code_income_control"].value_counts()

In [None]:
def code_for_higher_income_2012(x):
    """Collapse a 2012 relative-income answer (V50) into a harmonised category.

    Surrounding whitespace in the label is ignored. Returns one of
    "Respondent has higher income", "Partner has higher income",
    "Respondent has no income", "Partner has no income" or "Same income"
    (the same categories the 2002 recode produces), and None for
    non-string values, the "NAP, no partner" label and any other label.
    """
    if not isinstance(x, str):
        return None
    answer = x.strip()
    if answer == 'NAP, no partner (3 (AT,BE,CH,CL,ES,IN,IS,KR,NL,NO,PL,SK,US:2,3;AR,BG,CZ,PT:2,3,7;IL:3,7) in PARTLIV;TW:3-6 in MARITAL)':
        return None
    elif answer in ["I have a much higher income", "I have a higher income"]:
        # The label is compared after strip(); a trailing tab in the literal used to
        # make "I have a higher income" unmatchable.
        return "Respondent has higher income"
    elif answer in ['My spouse/ partner has a higher income', 'My spouse/ partner has a much higher income']:
        return "Partner has higher income"
    elif answer == "I have no income":
        return "Respondent has no income"
    elif answer == 'My spouse/ partner has no income':
        return "Partner has no income"
    elif answer == "We have about the same income":
        # Kept consistent with code_for_higher_income_2002, which maps this answer too.
        return "Same income"
    else:
        return None

def code_for_higher_income_2002(x):
    """Collapse a 2002 relative-income answer into a harmonised category.

    Returns the category for the labels listed below and None for None or any
    other value.
    """
    # answer label -> harmonised category
    categories = {
        "I have much higher income": "Respondent has higher income",
        "I have a higher income": "Respondent has higher income",
        "Spouse has higher income": "Partner has higher income",
        "Spouse has much higher income": "Partner has higher income",
        "I have no income": "Respondent has no income",
        "Spouse has no income": "Partner has no income",
        "We have about the same income": "Same income",
    }
    if x is None:
        return None
    for label, category in categories.items():
        if x == label:
            return category
    return None

In [None]:
# Add the harmonised "code_higher_income" column to the 2012 and 2002 waves
# (no 2022 recode is applied in this cell).
df_2012["code_higher_income"] = df_2012["V50"].apply(code_for_higher_income_2012)
df_2002["code_higher_income"] = df_2002["v43"].apply(code_for_higher_income_2002)

The cell below tests whether there is a relation between having the higher income and holding financial control.
We saw that financial control is shared in most cases, so the hypothesis is that men believe in equality and share financial control.
We also want to check whether income has any effect on financial control: men are generally paid more, so in the cases where financial control is not shared, that may be because of the higher income.
If that is not the case either, i.e. having a higher income has no effect on financial control, the result is consistent with men believing in equality in financial control.

In [None]:
# Value labels that mean "not applicable / no substantive answer" for V50 and V41.
bad = "NAP, no partner (3 (AT,BE,CH,CL,ES,IN,IS,KR,NL,NO,PL,SK,US:2,3;AR,BG,CZ,PT:2,3,7;IL:3,7) in PARTLIV;TW:3-6 in MARITAL)"
bad_v41 = {
    "Don't know, KR: don't know, refused",
    "No answer, CA: no answer, refused",
}

# Married men of the 2012 wave with a substantive answer on both questions.
# The mask is built from df_2012 and applied to df_2012: indexing df_2002 with it would
# select rows of the wrong survey, and the 2012 sex column is "SEX", as used in the
# other 2012 cells.
keep_rows = (
    (df_2012["MARITAL"] == "Married")
    & (df_2012["SEX"] == "Male")
    & (df_2012["V50"] != bad)
    & (~df_2012["V41"].isin(bad_v41))
    & (df_2012["V41"] != bad)
)
base = df_2012.loc[keep_rows].copy()

# Drop category levels that no longer occur after filtering.
base["V50"] = base["V50"].cat.remove_unused_categories()
base["V41"] = base["V41"].cat.remove_unused_categories()

# Contingency table: rows = who earns more, columns = who controls the finances.
tab = (
    base.groupby("code_higher_income")["code_income_control"]
        .value_counts()
        .unstack(fill_value=0)
)
tab

In [None]:
from scipy.stats import chi2_contingency

# Chi-square test of independence on the income-by-control contingency table `tab`.
chi2, p, dof, expected = chi2_contingency(tab)

print(f"Chi-square: {chi2:.2f}")
print(f"p-value: {p:.4f}")


In [None]:
# Row percentages: within each income category, the share of each control category.
tab_pct = tab.div(tab.sum(axis=1), axis=0) * 100
tab_pct.round(1)

## Conclusion of testing whether men believe in visible equality (income and financial control)

Most of the time, financial control is shared. Income sways financial control only modestly, shifting it towards the partner who earns more. Even when men earn more, financial control is mostly shared.

## To Do - Make the below for all three years

In [None]:
# Plot for mean household work_hours by gender

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import pycountry

# Column and value labels used to split the 2012 wave by sex.
GENDER_COL = "SEX"
MALE_VALUE = "Male"
FEMALE_VALUE = "Female"

# 1) Clean + extract ISO-2
# Keep rows with household hours (V37), country (V4) and sex present, and make V37 numeric.
df_clean = df_2012.dropna(subset=["V37", "V4", GENDER_COL]).copy()
df_clean["V37"] = pd.to_numeric(df_clean["V37"], errors="coerce")
df_clean = df_clean.dropna(subset=["V37"])

# The text before the first "-" in V4 is used as the ISO-2 country code.
df_clean["iso2"] = df_clean["V4"].str.split("-", n=1).str[0].str.strip()

# 2) Convert ISO-2 -> ISO-3
def iso2_to_iso3(code):
    """Return the alpha-3 code pycountry reports for an alpha-2 code, or None if the lookup fails for any reason."""
    try:
        return pycountry.countries.get(alpha_2=code).alpha_3
    except Exception:
        return None

df_clean["iso3"] = df_clean["iso2"].apply(iso2_to_iso3)
# Rows whose code could not be converted are dropped.
df_clean = df_clean.dropna(subset=["iso3"])

# 3) Split male/female
df_male = df_clean[df_clean[GENDER_COL] == MALE_VALUE].copy()
df_female = df_clean[df_clean[GENDER_COL] == FEMALE_VALUE].copy()

# 4) Aggregate mean hours by country
country_mean_male = (
    df_male.groupby("iso3", as_index=False)["V37"]
    .mean()
    .rename(columns={"V37": "mean_hours"})
)

country_mean_female = (
    df_female.groupby("iso3", as_index=False)["V37"]
    .mean()
    .rename(columns={"V37": "mean_hours"})
)

# 5) Force SAME color scale for both plots
vmin = min(country_mean_male["mean_hours"].min(), country_mean_female["mean_hours"].min())
vmax = max(country_mean_male["mean_hours"].max(), country_mean_female["mean_hours"].max())

# 6) Create side-by-side subplots
fig = make_subplots(
    rows=1, cols=2,
    specs=[[{"type": "geo"}, {"type": "geo"}]],
    subplot_titles=("Male: Mean Household Work Hours", "Female: Mean Household Work Hours")
)

# Male choropleth
fig.add_trace(
    go.Choropleth(
        locations=country_mean_male["iso3"],
        z=country_mean_male["mean_hours"],
        colorscale="Viridis",
        zmin=vmin,
        zmax=vmax,
        colorbar=dict(title="Hours", x=0.46),
        name="Male"
    ),
    row=1, col=1
)

# Female choropleth
fig.add_trace(
    go.Choropleth(
        locations=country_mean_female["iso3"],
        z=country_mean_female["mean_hours"],
        colorscale="Viridis",
        zmin=vmin,
        zmax=vmax,
        colorbar=dict(title="Hours", x=1.02),
        name="Female"
    ),
    row=1, col=2
)

# Update layout
fig.update_geos(showframe=False, showcoastlines=True, projection_type="natural earth")
fig.update_layout(
    title_text="Mean Household Work Hours by Gender (2012)",
    height=600,
    width=1400
)

fig.show()

In [None]:
def fix_work_hours(x):
    """Convert a paid-work-hours answer to a number of hours.

    Parameters
    ----------
    x : value from a work-hours column; either a number or a value-label string.

    Returns
    -------
    None for the missing/not-applicable labels listed below, 0 for labels starting
    with "None", 1 for labels starting with "One", the leading integer for labels
    that start with digits (e.g. "12 hours" -> 12, "96 hrs and more" -> 96), and
    the value unchanged when it is not a string.

    Raises
    ------
    ValueError
        For a string that is neither a known missing label nor parseable as an integer.
    """
    import re

    invalid_options = (
        '-9. No answer',
        "-8. Don't know",
        '-4. NAP (code 2 or 3 in WORK)',
        '-1. DK: Not available',
        '-6. IL: Invalid answers',
        '-4. NAP (code -4, 2 or 3 in SPWORK)',
        'Refused; TW: time varies',
        "Don't know",
        'No answer',
        'TW: time varies',
        'NAP (code 0,2,3 (AR,VE: 0,3) in SPWORK); not available: BG,GB',
        'NAP (code 2,3 (AR,DK,VE: code 3) in WORK)',
    )
    if x in invalid_options:
        return None
    if isinstance(x, str):
        x = x.strip()
        if x.startswith("None"):
            return 0
        if x.startswith("One"):
            return 1
        # Take the whole leading run of digits. A chain of startswith() checks on
        # selected prefixes would turn "12 hours" into 1 and "25" into 2.
        leading_digits = re.match(r"\d+", x)
        if leading_digits:
            return int(leading_digits.group())
        # Unknown label: int() raises ValueError so it is noticed rather than silently kept.
        return int(x)
    return x


In [None]:
# Clean the work-hours columns of every wave in place with fix_work_hours
# (v71 and v240 in 2002; WRKHRS and SPWRKHRS in 2012 and 2022).
df_2002["v71"] = df_2002["v71"].apply(fix_work_hours)
df_2002["v240"] = df_2002["v240"].apply(fix_work_hours)

df_2012["WRKHRS"] = df_2012["WRKHRS"].apply(fix_work_hours)
df_2012["SPWRKHRS"] = df_2012["SPWRKHRS"].apply(fix_work_hours)

df_2022["WRKHRS"] = df_2022["WRKHRS"].apply(fix_work_hours)
df_2022["SPWRKHRS"] = df_2022["SPWRKHRS"].apply(fix_work_hours)

In [None]:
# The six attitude items of each wave. .copy() makes these independent frames, so the
# columns added to them later do not trigger chained-assignment (SettingWithCopy)
# behaviour against the full survey frames.
df_2002_egal = df_2002[["v4", "v5", "v6", "v7", "v8", "v11"]].copy()
df_2012_egal = df_2012[["V5", "V6", "V7", "V8", "V9", "V11"]].copy()
df_2022_egal = df_2022[["v1", "v2", "v3", "v4", "v5", "v6"]].copy()

In [None]:
# Tag each attitude-item frame with a label stored as a plain attribute (not a column).
# NOTE(review): later cells rebind these variables to the return value of
# add_numeric_and_egal_columns — confirm the attribute is still needed afterwards.
df_2002_egal.name = "df_2002_egal"
df_2012_egal.name = "df_2012_egal"
df_2022_egal.name = "df_2022_egal"

In [None]:
import numpy as np

# Likert label -> score, for both the plain labels and the numbered ("1. ...") labels.
# "Strongly agree" scores 5 and "Strongly disagree" scores 1.
likert_map = {
    "Strongly agree": 5,
    "Agree": 4,
    "Neither agree nor disagree": 3,
    "Disagree": 2,
    "Strongly disagree": 1,
    "1. Strongly agree": 5,
    "2. Agree": 4,
    "3. Neither agree nor disagree": 3,
    "4. Disagree": 2,
    "5. Strongly disagree": 1,
}

def add_numeric_and_egal_columns(df, cols, reverse_cols, invalid_values=None,
                                 num_suffix="_num", egal_suffix="_egal"):
    """
    Add numeric and egalitarian-coded versions of Likert items to `df`.

    The original item columns are left untouched, but `df` itself is modified
    in place (new columns are added) and also returned.

    Adds, for every column in `cols`:
      - <col>_num  : float score 1..5 from `likert_map`
                     (NaN for missing values, anything in `invalid_values`,
                     and any label not in `likert_map`)
      - <col>_egal : egalitarian-coded score where higher = more egalitarian
                     (6 - score if col is in `reverse_cols`, else the score)

    Both new columns are always float64, also when the item column is categorical.
    """
    invalid_values = [] if invalid_values is None else list(invalid_values)

    for col in cols:
        # Work on plain Python objects: mapping a categorical column can yield a
        # categorical result, which breaks arithmetic further down.
        answers = df[col].astype(object)
        if invalid_values:
            answers = answers.where(~answers.isin(invalid_values))

        num_col = f"{col}{num_suffix}"
        df[num_col] = answers.map(likert_map).astype(float)

        egal_col = f"{col}{egal_suffix}"
        if col in reverse_cols:
            # Reverse the 1..5 scale; NaN stays NaN.
            df[egal_col] = 6 - df[num_col]
        else:
            df[egal_col] = df[num_col]
    return df


In [None]:
%%capture
# Item columns per wave, and the subset passed as reverse_cols (scored as 6 - value).
# In every wave the first item is the only one that is not reverse-coded.
cols_2002 = ['v4', 'v5', 'v6', 'v7', 'v8', 'v11']
reverse_2002 = ['v5', 'v6', 'v7', 'v8', 'v11']

cols_2012 = ['V5', 'V6', 'V7', 'V8', 'V9', 'V11']
reverse_2012 = ['V6', 'V7', 'V8', 'V9', 'V11']

cols_2022 = ['v1', 'v2', 'v3', 'v4', 'v5', 'v6']
reverse_2022 = ['v2', 'v3', 'v4', 'v5', 'v6']

# Add <col>_num and <col>_egal columns to each wave's item frame.
df_2002_egal = add_numeric_and_egal_columns(
    df_2002_egal,
    cols=cols_2002,
    reverse_cols=reverse_2002,
    invalid_values=[],
)
df_2012_egal = add_numeric_and_egal_columns(
    df_2012_egal,
    cols=cols_2012,
    reverse_cols=reverse_2012,
    invalid_values=[],
)
df_2022_egal = add_numeric_and_egal_columns(
    df_2022_egal,
    cols=cols_2022,
    reverse_cols=reverse_2022,
    invalid_values=[],
)


In [None]:
# Keep only the egalitarian-coded columns of each wave. .copy() gives independent
# frames, so columns can be added to them later without chained-assignment problems.
df_2002_only_egal = df_2002_egal[[col for col in df_2002_egal.columns if col.endswith("egal")]].copy()
df_2012_only_egal = df_2012_egal[[col for col in df_2012_egal.columns if col.endswith("egal")]].copy()
df_2022_only_egal = df_2022_egal[[col for col in df_2022_egal.columns if col.endswith("egal")]].copy()

In [None]:
%%capture
# Get ONLY the _egal columns (which are already numeric after add_numeric_and_egal_columns).
# .copy() makes each frame independent of df_*_egal, so the z-score and eg_score columns
# added below are real column assignments rather than writes to a slice.
df_2002_only_egal = df_2002_egal[[col for col in df_2002_egal.columns if col.endswith("_egal")]].copy()
df_2012_only_egal = df_2012_egal[[col for col in df_2012_egal.columns if col.endswith("_egal")]].copy()
df_2022_only_egal = df_2022_egal[[col for col in df_2022_egal.columns if col.endswith("_egal")]].copy()

# Now compute z-scores and mean
dfs = [df_2002_only_egal, df_2012_only_egal, df_2022_only_egal]

for df in dfs:
    # Select the items explicitly by suffix, so no other column can be mistaken for an item.
    egal_cols = [col for col in df.columns if col.endswith("_egal")]
    z_items = []

    for col in egal_cols:
        z_col = col + "_z"
        z_items.append(z_col)
        # Convert to numeric first, just in case
        df[z_col] = pd.to_numeric(df[col], errors='coerce')
        # Standardise within this wave's frame (mean 0, sample std 1).
        df[z_col] = (df[z_col] - df[z_col].mean()) / df[z_col].std()

    # Respondent score = mean of the available standardised items (NaNs are skipped).
    df["eg_score"] = df[z_items].mean(axis=1)

## Validation of Equality Score using Cronbach's Alpha

In [None]:
import numpy as np
import pandas as pd

def cronbach_alpha(df_items):
    """Cronbach's alpha for a set of items (one column per item, one row per respondent).

    Every column is coerced to numeric (unparseable values become NaN) and rows
    with any missing item are dropped before computing
    alpha = k / (k - 1) * (1 - sum(item variances) / variance(row totals)),
    using sample variances (ddof=1).
    """
    # Coerce to numbers, then keep complete cases only.
    numeric_items = df_items.apply(pd.to_numeric, errors="coerce")
    complete_cases = numeric_items.dropna(axis=0, how="any")

    n_items = complete_cases.shape[1]
    sum_of_item_variances = complete_cases.var(axis=0, ddof=1).sum()
    variance_of_totals = complete_cases.sum(axis=1).var(ddof=1)

    return (n_items / (n_items - 1)) * (1 - sum_of_item_variances / variance_of_totals)

# Reliability of the egalitarian-coded items, computed for the 2022 wave only.
alpha = cronbach_alpha(df_2022_egal[[col for col in df_2022_egal.columns if col.endswith("_egal")]])
print(f"Cronbach's alpha: {alpha:.3f}")

In [None]:
# Attach the respondent's sex (v200 in 2002); assignment aligns on the row index.
df_2002_only_egal["sex"] = df_2002["v200"]

In [None]:
# Attach the respondent's sex for the 2012 and 2022 waves (column SEX in both).
df_2012_only_egal["sex"] = df_2012["SEX"]
df_2022_only_egal["sex"] = df_2022["SEX"]

In [None]:
# Harmonise the numbered 2022 labels to the "Male"/"Female" labels used by the other waves.
# Any value not in the mapping becomes NaN. The cell is not idempotent: run a second
# time, "Male"/"Female" are no longer keys of the mapping and turn into NaN.
mapping = {"1. Male": "Male",
           "2. Female": "Female"}

df_2022_only_egal["sex"] = df_2022_only_egal["sex"].map(mapping)

In [None]:
# Visual check of the 2022 frame after the sex labels were harmonised.
df_2022_only_egal

## Validation of Equality Index using Data

In [None]:
# Overlaid histograms of eg_score for men and women, one panel per wave,
# with a dashed vertical line at each group's mean.
men_color = '#4C72B0'
women_color = '#DD8452'

fig, axes = plt.subplots(1, 3, figsize=(18, 5), sharey=True)

years = [2002, 2012, 2022]
dfs = [df_2002_only_egal, df_2012_only_egal, df_2022_only_egal]

for ax, year, df in zip(axes, years, dfs):
    men_mean = df.loc[df['sex'] == 'Male', 'eg_score'].mean()
    women_mean = df.loc[df['sex'] == 'Female', 'eg_score'].mean()
    
    ax.hist(
        df.loc[df['sex'] == 'Male', 'eg_score'].dropna(),
        bins=30, alpha=0.6, label='Men', color=men_color
    )
    
    ax.hist(
        df.loc[df['sex'] == 'Female', 'eg_score'].dropna(),
        bins=30, alpha=0.6, label='Women', color=women_color
    )
    
    # Group means
    ax.axvline(men_mean, linestyle='--', linewidth=2, color=men_color)
    ax.axvline(women_mean, linestyle='--', linewidth=2, color=women_color)
    
    ax.set_xlabel('Egalitarian Attitude Score')
    ax.set_title(f'{year}', fontsize=14, fontweight='bold')
    ax.legend()

# The y axis is shared, so only the first panel is labelled.
axes[0].set_ylabel('Number of respondents')
fig.suptitle('Egalitarian Attitudes by Gender Across Years', fontsize=16, y=1.02)
plt.tight_layout()
plt.show()

In [None]:
# Kernel density estimates of eg_score for men and women, one panel per wave.
fig, axes = plt.subplots(1, 3, figsize=(18, 5), sharey=True)

years = [2002, 2012, 2022]
dfs = [df_2002_only_egal, df_2012_only_egal, df_2022_only_egal]

for ax, year, df in zip(axes, years, dfs):
    df.loc[df['sex'] == 'Male', 'eg_score'].plot(
        kind='kde', label='Men', ax=ax
    )
    
    df.loc[df['sex'] == 'Female', 'eg_score'].plot(
        kind='kde', label='Women', ax=ax
    )
    
    ax.set_xlabel('Egalitarian Attitude Score')
    ax.set_title(f'{year}', fontsize=14, fontweight='bold')
    ax.legend()

# The y axis is shared, so only the first panel is labelled.
axes[0].set_ylabel('Density')
fig.suptitle('Kernel Density of Egalitarian Attitudes by Gender Across Years', fontsize=16, y=1.02)
plt.tight_layout()
plt.show()

In [None]:
%%capture
# Attach each respondent's household work hours (v36 / V37 / v34, the columns cleaned
# earlier with fix_hh_hours); assignment aligns on the row index.
df_2002_only_egal["hh_wrk_hrs"] = df_2002["v36"]
df_2012_only_egal["hh_wrk_hrs"] = df_2012["V37"]
df_2022_only_egal["hh_wrk_hrs"] = df_2022["v34"]

In [None]:
%%capture
# Cast household work hours to float so they can be plotted and correlated.
df_2002_only_egal["hh_wrk_hrs"] = df_2002_only_egal["hh_wrk_hrs"].astype(float)
df_2012_only_egal["hh_wrk_hrs"] = df_2012_only_egal["hh_wrk_hrs"].astype(float)
df_2022_only_egal["hh_wrk_hrs"] = df_2022_only_egal["hh_wrk_hrs"].astype(float)

In [None]:
# 2x3 scatter grid of eg_score against household work hours:
# rows = sex (Male, Female), columns = wave; all axes are shared.
men_color = '#4C72B0'
women_color = '#DD8452'

fig, axes = plt.subplots(2, 3, figsize=(18, 8), sharex=True, sharey=True)

years = [2002, 2012, 2022]
dfs = [df_2002_only_egal, df_2012_only_egal, df_2022_only_egal]
sex_rows = ["Male", "Female"]
colors = {"Male": men_color, "Female": women_color}

for col, (year, df) in enumerate(zip(years, dfs)):
    # Only respondents with all three values present are plotted.
    sub = df.dropna(subset=["eg_score", "hh_wrk_hrs", "sex"])
    for row, sex in enumerate(sex_rows):
        ax = axes[row, col]
        ax.scatter(
            sub.loc[sub["sex"] == sex, "eg_score"],
            sub.loc[sub["sex"] == sex, "hh_wrk_hrs"],
            color=colors[sex], alpha=0.5, s=25, label=sex
        )
        # Label only the bottom row and the left column.
        if row == 1:
            ax.set_xlabel("Egalitarian Attitude Score")
        if col == 0:
            ax.set_ylabel("Household work hours")
        ax.set_title(f"{year} — {sex}", fontsize=12, fontweight="bold")
        ax.grid(alpha=0.2)

fig.suptitle("Egalitarian Attitudes vs Household Work Hours (separate by sex & year)", fontsize=16, y=0.98)
plt.tight_layout()
plt.show()

## Correlation between eg_score and household work hours by sex and year

In [None]:
# Correlation between eg_score and household work hours, computed separately for
# each wave and sex on respondents with all three values present.
years = [2002, 2012, 2022]
dfs = [df_2002_only_egal, df_2012_only_egal, df_2022_only_egal]
sexes = ["Male", "Female"]

rows = []
for year, df in zip(years, dfs):
    complete = df.dropna(subset=["eg_score", "hh_wrk_hrs", "sex"])
    for sex in sexes:
        of_sex = complete[complete["sex"] == sex]
        rows.append({
            "year": year,
            "sex": sex,
            "correlation": of_sex["eg_score"].corr(of_sex["hh_wrk_hrs"]),
        })

corr_df = pd.DataFrame(rows)
corr_df

## Conclusion
For females,
the correlation is negative: the more egalitarian women's attitudes are, the fewer household work hours they report, possibly because they resist traditional gender norms.

For men,
the correlation coefficient stays close to zero, indicating no association between egalitarian attitudes and household work hours.

In [None]:
%%capture
# Attach work hours to each frame (v240 in 2002, WRKHRS in 2012 and 2022).
# NOTE(review): presumably v240 is the 2002 counterpart of WRKHRS — confirm against the codebook.
df_2002_only_egal["wrk_hrs"] = df_2002["v240"]
df_2012_only_egal["wrk_hrs"] = df_2012["WRKHRS"]
df_2022_only_egal["wrk_hrs"] = df_2022["WRKHRS"]

## Three way correlation between eg_score, hh work hours and paid work hours
To check whether the increase in eg_score and the reduction in household work hours simply reflect higher education and more paid work.

In [None]:
# Pairwise correlations between eg_score, household hours and work hours for 2022 men.
df_2022_only_egal[df_2022_only_egal["sex"] == "Male"][["eg_score", "hh_wrk_hrs", "wrk_hrs"]].corr()

In [None]:
# Pairwise correlations between eg_score, household hours and work hours for 2022 women.
df_2022_only_egal[df_2022_only_egal["sex"] == "Female"][["eg_score", "hh_wrk_hrs", "wrk_hrs"]].corr()

In [None]:
%%capture
# Set the country column per source df
# NOTE(review): the 2002 and 2022 column names are marked "replace if different" —
# confirm they exist in the loaded frames.
country_cols = {
    2002: 'C_ALPHAN',        # replace if different
    2012: 'V4',        # e.g., "DE-Germany"
    2022: 'c_alphan',   # replace if different (or 'CNTRY')
}

# Copy country codes into the egal-only frames
df_2002_only_egal['country'] = df_2002[country_cols[2002]]

# For 2012 only the part before the first "-" is kept.
df_2012_only_egal['country'] = (
    df_2012[country_cols[2012]].str.split('-', n=1).str[0].str.strip()  # ISO2 from "DE-Germany"
)

df_2022_only_egal['country'] = df_2022[country_cols[2022]]

## Plotting mean eg_scores by country

In [None]:
import pycountry

# NOTE(review): iso2_to_iso3 is also defined in the earlier household-hours map cell;
# this definition replaces it when the cell runs.
def iso2_to_iso3(code):
    """Return the alpha-3 code pycountry reports for an alpha-2 code, or None if the lookup fails for any reason."""
    try:
        return pycountry.countries.get(alpha_2=code).alpha_3
    except Exception:
        return None

def plot_mean_eg_map(df, title):
    """Show a world choropleth of the mean eg_score per country.

    `df` needs a "country" column and an "eg_score" column. The country value is cut at
    the first "-" and converted with iso2_to_iso3; rows that cannot be converted are
    dropped. `df` itself is not modified (a copy is used).

    Side effect: sets plotly's default renderer to "browser" for the whole session.
    """
    tmp = df.copy()
    # handle values like "DE-Germany" or bare ISO2
    tmp["iso2"] = tmp["country"].astype(str).str.split("-", n=1).str[0].str.strip()
    tmp["iso3"] = tmp["iso2"].apply(iso2_to_iso3)
    tmp = tmp.dropna(subset=["iso3"])
    country_mean = tmp.groupby("iso3", as_index=False)["eg_score"].mean()

    fig = px.choropleth(
        country_mean,
        locations="iso3",
        locationmode="ISO-3",
        color="eg_score",
        color_continuous_scale="Viridis",
        labels={"eg_score": "Mean egalitarian score"},
        title=title,
    )
    fig.update_layout(geo=dict(showframe=False, showcoastlines=True))
    pio.renderers.default = "browser"
    fig.show()

# One map per wave; each map has its own colour scale.
plot_mean_eg_map(df_2002_only_egal, "Mean egalitarian score (2002)")
plot_mean_eg_map(df_2012_only_egal, "Mean egalitarian score (2012)")
plot_mean_eg_map(df_2022_only_egal, "Mean egalitarian score (2022)")

In [None]:
# Clean age column in 2002 and make numeric
# Age labels that carry no usable age.
invalid_age = {
    "Don't know",
    "No answer",
    "Refused",
    "Not available",
}

def to_numeric_age_2002(val):
    """Convert a 2002 age value (number or label) to an integer age.

    Returns the leading integer of the value's text (so "89 or more" -> 89 and
    34.0 -> 34). Returns None for None, for the labels in `invalid_age`, and for
    anything that does not START with a digit — in particular negative missing
    codes such as "-9. No answer" or -9, which must not be read as age 9.
    """
    import re

    if val is None:
        return None
    s = str(val).strip()
    if s in invalid_age:
        return None
    # keep leading digits only (handles categories like "89 or more");
    # re.match anchors at the start, unlike re.search.
    m = re.match(r"\d+", s)
    return int(m.group()) if m else None

# Numeric age for 2002 (source column v201); the second line shows raw vs parsed values as a spot check.
df_2002["age_num"] = df_2002["v201"].apply(to_numeric_age_2002).astype("float")
df_2002[["v201", "age_num"]].head()

In [None]:
# Age labels that carry no usable age.
invalid_age_2012 = {
    "Don't know",
    "No answer",
    "Refused",
    "Not available",
}

def to_numeric_age_2012(val):
    """Convert a 2012 age value (number or label) to an integer age.

    Returns the leading integer of the value's text (so "89 or more" -> 89 and
    34.0 -> 34). Returns None for None, for the labels in `invalid_age_2012`, and
    for anything that does not START with a digit — in particular negative missing
    codes such as "-9. No answer" or -9, which must not be read as age 9.
    """
    import re

    if val is None:
        return None
    s = str(val).strip()
    if s in invalid_age_2012:
        return None
    # grabs leading digits only, e.g. "89 or more"; re.match anchors at the start.
    m = re.match(r"\d+", s)
    return int(m.group()) if m else None

# Numeric age for 2012 (source column AGE); the second line shows raw vs parsed values as a spot check.
df_2012["age_num"] = df_2012["AGE"].apply(to_numeric_age_2012).astype("float")
df_2012[["AGE", "age_num"]].head()

In [None]:
# Age labels that carry no usable age.
invalid_age_2022 = {
    "Don't know",
    "No answer",
    "Refused",
    "Not available",
}

def to_numeric_age_2022(val):
    """Convert a 2022 age value (number or label) to an integer age.

    Returns the leading integer of the value's text (so "89 or more" -> 89 and
    34.0 -> 34). Returns None for None, for the labels in `invalid_age_2022`, and
    for anything that does not START with a digit. The last rule matters for this
    wave's numbered missing labels (e.g. "-9. No answer"), which a search for the
    first digit anywhere in the text would turn into age 9.
    """
    import re

    if val is None:
        return None
    s = str(val).strip()
    if s in invalid_age_2022:
        return None
    # handles values like "89 or more"; re.match anchors at the start of the text.
    m = re.match(r"\d+", s)
    return int(m.group()) if m else None

# Numeric age for 2022 (source column AGE); the second line shows raw vs parsed values as a spot check.
df_2022["age_num"] = df_2022["AGE"].apply(to_numeric_age_2022).astype(float)
df_2022[["AGE", "age_num"]].head()

In [None]:
# Attach the numeric age to each egal-only frame; assignment aligns on the row index.
df_2002_only_egal["age"] = df_2002["age_num"]
df_2012_only_egal["age"] = df_2012["age_num"]
df_2022_only_egal["age"] = df_2022["age_num"]

## To-do (Put this in Validation section)

## Checking Correlation between age and eg_score 
Expected result: eg_score decreases with age.

In [None]:
# 2x2 correlation matrix of age and eg_score for each wave (2002, 2012, 2022 in that order).
print(df_2002_only_egal[["age", "eg_score"]].corr())
print(df_2012_only_egal[["age", "eg_score"]].corr())
print(df_2022_only_egal[["age", "eg_score"]].corr())

In [None]:
# Inspect the 2022 respondents whose parsed age is exactly 15.
df_2022_only_egal[df_2022_only_egal["age"] == 15]

In [None]:
# Bin age into categories for all three datasets
df_2002_only_egal["age_bin"] = pd.cut(
    df_2002_only_egal["age"],
    bins=[0, 17, 25, 35, 45, 55, 65, 75, 100],
    labels=["<18","18-25", "26-35", "36-45", "46-55", "56-65", "66-75", "75+"]
)

df_2012_only_egal["age_bin"] = pd.cut(
    df_2012_only_egal["age"],
    bins=[0, 17, 25, 35, 45, 55, 65, 75, 100],
    labels=["<18", "18-25", "26-35", "36-45", "46-55", "56-65", "66-75", "75+"]
)

df_2022_only_egal["age_bin"] = pd.cut(
    df_2022_only_egal["age"],
    bins=[0, 17, 25, 35, 45, 55, 65, 75, 100],
    labels=["<18", "18-25", "26-35", "36-45", "46-55", "56-65", "66-75", "75+"]
)


In [None]:
# Compute mean eg_score by age_bin for each year
eg_2002 = df_2002_only_egal[df_2002_only_egal["sex"] == "Male"].groupby("age_bin")["eg_score"].mean()
eg_2012 = df_2012_only_egal[df_2012_only_egal["sex"] == "Male"].groupby("age_bin")["eg_score"].mean()
eg_2022 = df_2022_only_egal[df_2022_only_egal["sex"] == "Male"].groupby("age_bin")["eg_score"].mean()

# Combine into one DataFrame
df_eg_score_by_age_male = pd.DataFrame({
    "2002": eg_2002,
    "2012": eg_2012,
    "2022": eg_2022
})

df_eg_score_by_age_male

In [None]:
# Compute mean eg_score by age_bin for each year
eg_2002 = df_2002_only_egal[df_2002_only_egal["sex"] == "Female"].groupby("age_bin")["eg_score"].mean()
eg_2012 = df_2012_only_egal[df_2012_only_egal["sex"] == "Female"].groupby("age_bin")["eg_score"].mean()
eg_2022 = df_2022_only_egal[df_2022_only_egal["sex"] == "Female"].groupby("age_bin")["eg_score"].mean()

# Combine into one DataFrame
df_eg_score_by_age_female = pd.DataFrame({
    "2002": eg_2002,
    "2012": eg_2012,
    "2022": eg_2022
})

df_eg_score_by_age_female

In [None]:
# Make sure every item column (*_egal) and standardised column (*_z) is stored as float.
dfs = [df_2002_only_egal, df_2012_only_egal, df_2022_only_egal]

for df in dfs:
    float_cols = [col for col in df.columns if col.endswith(("_egal", "_z"))]
    for col in float_cols:
        df[col] = df[col].astype(float)

In [None]:
# Overall mean eg_score per wave, as a one-row table.
# NOTE(review): eg_score is the mean of items that were z-scored within each wave's own
# frame, so these per-wave means are presumably close to 0 by construction and not
# comparable across years — confirm whether standardising on pooled data was intended.
eg_score_by_year = pd.DataFrame({
    "2002": [df_2002_only_egal["eg_score"].mean()],
    "2012": [df_2012_only_egal["eg_score"].mean()],
    "2022": [df_2022_only_egal["eg_score"].mean()]
})
eg_score_by_year

## To do- Put below the section of three way correlation

## Making a regression model to check the association between eg_scores and wrk_hrs

In [None]:
import statsmodels.formula.api as smf

# 2002 wave: paid working hours regressed on egalitarianism, sex and their interaction.
# wrk_hrs is coerced to numeric (unparseable values -> NaN), incomplete rows are
# dropped, and sex is dummy-coded (0 = Female, 1 = Male).
data = (
    df_2002_only_egal[["wrk_hrs", "eg_score", "sex"]]
    .assign(wrk_hrs=lambda d: pd.to_numeric(d["wrk_hrs"], errors="coerce"))
    .dropna()
    .assign(sex_numeric=lambda d: (d["sex"] == "Male").astype(int))
)

model = smf.ols("wrk_hrs ~ eg_score * sex_numeric", data=data).fit()

print(model.summary())

In [None]:
import statsmodels.formula.api as smf

# 2002 wave: housework hours regressed on egalitarianism, sex and their interaction.
# hh_wrk_hrs is coerced to numeric (unparseable values -> NaN), incomplete rows are
# dropped, and sex is dummy-coded (0 = Female, 1 = Male).
data = (
    df_2002_only_egal[["hh_wrk_hrs", "eg_score", "sex"]]
    .assign(hh_wrk_hrs=lambda d: pd.to_numeric(d["hh_wrk_hrs"], errors="coerce"))
    .dropna()
    .assign(sex_numeric=lambda d: (d["sex"] == "Male").astype(int))
)

model = smf.ols("hh_wrk_hrs ~ eg_score * sex_numeric", data=data).fit()

print(model.summary())

In [None]:
import statsmodels.formula.api as smf

# 2002 wave: paid hours on egalitarianism x sex, controlling for housework hours.
# Both hour variables are coerced to numeric, rows missing any model variable are
# dropped, and sex is dummy-coded (0 = Female, 1 = Male).
model_vars = ["wrk_hrs", "hh_wrk_hrs", "eg_score", "sex"]
data = (
    df_2002_only_egal[model_vars]
    .assign(
        wrk_hrs=lambda d: pd.to_numeric(d["wrk_hrs"], errors="coerce"),
        hh_wrk_hrs=lambda d: pd.to_numeric(d["hh_wrk_hrs"], errors="coerce"),
    )
    .dropna(subset=model_vars)
    .assign(sex_numeric=lambda d: (d["sex"] == "Male").astype(int))
)

# hh_wrk_hrs enters as an additional independent variable
model = smf.ols("wrk_hrs ~ eg_score * sex_numeric + hh_wrk_hrs", data=data).fit()

print(model.summary())

In [None]:
import statsmodels.formula.api as smf

# 2002 wave: housework hours on egalitarianism x sex, controlling for paid hours.
# Both hour variables are coerced to numeric, rows missing any model variable are
# dropped, and sex is dummy-coded (0 = Female, 1 = Male).
model_vars = ["wrk_hrs", "hh_wrk_hrs", "eg_score", "sex"]
data = (
    df_2002_only_egal[model_vars]
    .assign(
        wrk_hrs=lambda d: pd.to_numeric(d["wrk_hrs"], errors="coerce"),
        hh_wrk_hrs=lambda d: pd.to_numeric(d["hh_wrk_hrs"], errors="coerce"),
    )
    .dropna(subset=model_vars)
    .assign(sex_numeric=lambda d: (d["sex"] == "Male").astype(int))
)

# wrk_hrs enters as an additional independent variable
model = smf.ols("hh_wrk_hrs ~ eg_score * sex_numeric + wrk_hrs", data=data).fit()

print(model.summary())

In [None]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf

# -----------------------------
# Settings
# -----------------------------
# Survey waves to analyse and the analysis frame for each.
# Note: in this cell `dfs` is a dict keyed by year (other cells bind `dfs` to a list).
years = [2002, 2012, 2022]
dfs = {
    2002: df_2002_only_egal,
    2012: df_2012_only_egal,
    2022: df_2022_only_egal
}

# Models you want to run (same spec each year)
# Keys are the labels written to the "model" column of the results table;
# values are formula strings passed to smf.ols.
MODEL_SPECS = {
    "M1_paid_on_egal": "wrk_hrs ~ eg_score * sex_numeric",
    "M2_house_on_egal": "hh_wrk_hrs ~ eg_score * sex_numeric",
    "M3_paid_on_egal_plus_house": "wrk_hrs ~ eg_score * sex_numeric + hh_wrk_hrs",
    "M4_house_on_egal_plus_paid": "hh_wrk_hrs ~ eg_score * sex_numeric + wrk_hrs",
}

# Minimal output: only what you need to interpret + compare
# NOTE(review): KEEP_TERMS is not referenced by the rest of this cell — confirm it is still needed.
KEEP_TERMS = ["eg_score", "sex_numeric", "eg_score:sex_numeric", "hh_wrk_hrs", "wrk_hrs"]

# -----------------------------
# Helper to run & collect output
# -----------------------------
def run_year_models(df, year):
    """
    Fit every formula in MODEL_SPECS on one survey wave and summarise the fits.

    Parameters
    ----------
    df : pandas.DataFrame
        Analysis frame with "sex", "eg_score", "wrk_hrs" and "hh_wrk_hrs" columns.
        The frame is copied; the caller's object is not modified.
    year : int
        Wave label written to the "year" column of the result.

    Returns
    -------
    pandas.DataFrame
        One row per model with n, R², the eg_score coefficient, the
        eg_score:sex_numeric interaction, the implied eg_score slopes for women
        (sex_numeric = 0) and men (sex_numeric = 1), the coefficients of the
        hour covariates when they appear on the right-hand side, and p-values.
    """
    import re  # local import: `re` is not imported before this cell

    df = df.copy()

    # Ensure numeric (coerce invalid to NaN)
    for c in ("wrk_hrs", "hh_wrk_hrs", "eg_score"):
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors="coerce")

    # Encode sex (0 = Female, 1 = Male)
    df = df.dropna(subset=["sex"])
    df["sex_numeric"] = (df["sex"] == "Male").astype(int)

    out_rows = []

    for model_name, formula in MODEL_SPECS.items():
        # Variables are matched as whole identifiers. A plain substring test
        # ("wrk_hrs" in formula) is also true for "hh_wrk_hrs", which made the
        # housework-only model drop every row with missing paid hours.
        formula_vars = set(re.findall(r"[A-Za-z_]\w*", formula))
        required = ["eg_score", "sex_numeric"] + [
            v for v in ("wrk_hrs", "hh_wrk_hrs") if v in formula_vars
        ]

        data = df.dropna(subset=required).copy()

        # Fit
        res = smf.ols(formula, data=data).fit()

        params = res.params
        pvals = res.pvalues

        # Women slope on eg_score = beta_eg
        # Men slope on eg_score   = beta_eg + beta_interaction
        beta_eg = params.get("eg_score", np.nan)
        beta_int = params.get("eg_score:sex_numeric", 0.0)  # if term absent, treat as 0
        slope_women = beta_eg
        slope_men = beta_eg + beta_int

        out_rows.append({
            "year": year,
            "model": model_name,
            "n": int(res.nobs),
            "r2": float(res.rsquared),
            "beta_eg_score": float(beta_eg) if pd.notnull(beta_eg) else np.nan,
            "beta_interaction": float(params.get("eg_score:sex_numeric", np.nan)),
            "slope_eg_women": float(slope_women) if pd.notnull(slope_women) else np.nan,
            "slope_eg_men": float(slope_men) if pd.notnull(slope_men) else np.nan,
            "p_eg_score": float(pvals.get("eg_score", np.nan)),
            "p_interaction": float(pvals.get("eg_score:sex_numeric", np.nan)),
            # The hour variables only have a coefficient when they are predictors;
            # when one is the response (or absent) the lookup falls back to NaN.
            "beta_hh_wrk_hrs": float(params.get("hh_wrk_hrs", np.nan)),
            "p_hh_wrk_hrs": float(pvals.get("hh_wrk_hrs", np.nan)),
            "beta_wrk_hrs": float(params.get("wrk_hrs", np.nan)),
            "p_wrk_hrs": float(pvals.get("wrk_hrs", np.nan)),
        })

    return pd.DataFrame(out_rows)

# -----------------------------
# Run all years & show once
# -----------------------------
all_results = [run_year_models(dfs[y], y) for y in years]

results_df = pd.concat(all_results, ignore_index=True)

# Presentation order: identifiers, fit statistics, coefficients, then p-values
column_order = [
    "year", "model", "n", "r2",
    "beta_eg_score", "beta_interaction",
    "slope_eg_women", "slope_eg_men",
    "beta_hh_wrk_hrs", "beta_wrk_hrs",
    "p_eg_score", "p_interaction", "p_hh_wrk_hrs", "p_wrk_hrs",
]
results_df = (
    results_df[column_order]
    .sort_values(["model", "year"])
    .reset_index(drop=True)
)

# Display
results_df

In [None]:
# Median splits for the 2002 wave: above-median egalitarianism, below-median housework
df_2002_only_egal["high_egal"] = df_2002_only_egal["eg_score"].gt(
    df_2002_only_egal["eg_score"].median()
)
df_2002_only_egal["low_housework"] = df_2002_only_egal["hh_wrk_hrs"].lt(
    df_2002_only_egal["hh_wrk_hrs"].median()
)
# Men who score high on egalitarianism yet report below-median housework
df_2002_only_egal.loc[
    df_2002_only_egal["high_egal"]
    & df_2002_only_egal["low_housework"]
    & (df_2002_only_egal["sex"] == "Male")
]

In [None]:
# Median splits for the 2012 wave: above-median egalitarianism, below-median housework
df_2012_only_egal["high_egal"] = df_2012_only_egal["eg_score"].gt(
    df_2012_only_egal["eg_score"].median()
)
df_2012_only_egal["low_housework"] = df_2012_only_egal["hh_wrk_hrs"].lt(
    df_2012_only_egal["hh_wrk_hrs"].median()
)

In [None]:
# Median splits for the 2022 wave: above-median egalitarianism, below-median housework
df_2022_only_egal["high_egal"] = df_2022_only_egal["eg_score"].gt(
    df_2022_only_egal["eg_score"].median()
)
df_2022_only_egal["low_housework"] = df_2022_only_egal["hh_wrk_hrs"].lt(
    df_2022_only_egal["hh_wrk_hrs"].median()
)

In [None]:
# Copy each wave's raw education variable (named differently per wave) into a
# common "highest_education" column. Assignment aligns on index labels, so this
# assumes the analysis frames share row labels with the raw frames — TODO confirm.
df_2002_only_egal["highest_education"] = df_2002["v205"]
df_2012_only_egal["highest_education"] = df_2012["DEGREE"]
df_2022_only_egal["highest_education"] = df_2022["EDULEVEL"]

In [None]:
import numpy as np
import pandas as pd
import re

def clean_education(df, year, educ_col, new_col="educ_4"):
    """
    Creates a harmonized education variable across waves.

    Keeps the original column and adds two new ones to a copy of ``df``:
    ``new_col`` (0-3, NaN when unknown) and ``new_col + "_label"``.

        0 = No/Primary, 1 = Secondary, 2 = Post-sec / Short tertiary, 3 = University+

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the raw education labels; it is not modified.
    year : int
        Survey wave (2002, 2012 or 2022); selects the label scheme.
    educ_col : str
        Name of the column with the raw education labels.
    new_col : str, default "educ_4"
        Name of the harmonized numeric column.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the two harmonized columns added.

    Raises
    ------
    ValueError
        If ``year`` is not one of 2002, 2012, 2022.
    """
    d = df.copy()
    s = d[educ_col].astype(str).str.strip()

    # Treat missing-ish / no-answer categories as NaN
    missing_patterns = [
        r"^-9", r"^-8", r"no answer", r"don't know", r"cant choose", r"can't choose",
        r"not classifiable", r"not codable"
    ]
    miss_re = re.compile("|".join(missing_patterns), flags=re.IGNORECASE)
    s = s.mask(s.str.contains(miss_re), np.nan)

    # --- 2002: "University degree completed", "Higher secondary completed", etc. ---
    if year == 2002:
        def map_2002(x):
            if pd.isna(x): return np.nan
            x = x.lower()
            if "university degree completed" in x:
                return 3
            if "above higher sec" in x:
                return 2
            # NOTE(review): "above lowest qualification" ranks below "higher
            # secondary completed" in the ISSP DEGREE scale, so it belongs with
            # Secondary rather than with post-secondary — confirm against the codebook.
            if "higher secondary completed" in x or "above lowest" in x:
                return 1
            if "no formal" in x or "lowest formal" in x:
                return 0
            return np.nan

        d[new_col] = s.map(map_2002)

    # --- 2012: Primary / lower secondary / upper secondary / tertiary levels ---
    elif year == 2012:
        def map_2012(x):
            if pd.isna(x): return np.nan
            x = x.lower()
            # Most specific labels first: a post-secondary label whose text
            # mentions "upper secondary programs" must not fall into Secondary.
            if "post secondary, non-tertiary" in x or "lower level tertiary" in x:
                # includes short-cycle / technical tertiary
                return 2
            if ("upper level tertiary" in x or "master" in x or "phd" in x
                    or re.search(r"\bdr\b", x)):
                return 3
            if "lower secondary" in x or "upper secondary" in x:
                return 1
            if "no formal education" in x or "primary" in x:
                return 0
            return np.nan

        d[new_col] = s.map(map_2012)

    # --- 2022: numeric-coded labels 0..8 embedded in strings ---
    elif year == 2022:
        # Extract leading integer if present (e.g., "6. Lower level tertiary, BA")
        code = pd.to_numeric(s.str.extract(r"^\s*([0-9]+)")[0], errors="coerce")

        # Map numeric codes to 4 bins
        # 0-1: none/primary
        # 2-3: secondary
        # 4-5: post-secondary non-tertiary / short-cycle tertiary
        # 6-8: BA+
        d[new_col] = np.select(
            [
                code.isna(),
                code.isin([0, 1]),
                code.isin([2, 3]),
                code.isin([4, 5]),
                code.isin([6, 7, 8]),
            ],
            [np.nan, 0, 1, 2, 3],
            default=np.nan
        )

    else:
        raise ValueError("year must be one of {2002, 2012, 2022}")

    # Optional: add readable labels too
    labels = {
        0: "No/Primary",
        1: "Secondary",
        2: "Post-sec / Short tertiary",
        3: "University+",
    }
    d[new_col + "_label"] = d[new_col].map(labels)

    return d

In [None]:
# clean_education returns a modified copy, so each name is rebound to the result
# (which now carries the "educ_4" and "educ_4_label" columns).
df_2002_only_egal = clean_education(df_2002_only_egal, 2002, educ_col="highest_education", new_col="educ_4")
df_2012_only_egal = clean_education(df_2012_only_egal, 2012, educ_col="highest_education", new_col="educ_4")
df_2022_only_egal = clean_education(df_2022_only_egal, 2022, educ_col="highest_education", new_col="educ_4")


## Change in eg_scores by education

In [None]:
# Display order of the harmonized education categories (lowest to highest)
order = ["No/Primary", "Secondary", "Post-sec / Short tertiary", "University+"]

waves = {
    "2002": df_2002_only_egal,
    "2012": df_2012_only_egal,
    "2022": df_2022_only_egal,
}

# Every education label observed in any wave
idx = pd.Index(df_2002_only_egal["educ_4_label"].dropna().unique())
for frame in (df_2012_only_egal, df_2022_only_egal):
    idx = idx.union(frame["educ_4_label"].dropna().unique())

# Mean egalitarianism score per education category, one column per wave
means_by_educ = pd.DataFrame(index=idx, columns=list(waves))
for wave, frame in waves.items():
    means_by_educ[wave] = frame.groupby("educ_4_label")["eg_score"].mean()

means_by_educ = means_by_educ.reindex(order)
means_by_educ

In [None]:
# 2002: mean housework hours of men with educ_4 >= 2 who are NOT high-egalitarian
df_2002_only_egal.loc[
    (df_2002_only_egal["educ_4"] >= 2)
    & ~df_2002_only_egal["high_egal"]
    & (df_2002_only_egal["sex"] == "Male"),
    "hh_wrk_hrs",
].mean()

In [None]:
# 2002: mean housework hours of men with educ_4 < 2 who are NOT high-egalitarian
df_2002_only_egal.loc[
    (df_2002_only_egal["educ_4"] < 2)
    & ~df_2002_only_egal["high_egal"]
    & (df_2002_only_egal["sex"] == "Male"),
    "hh_wrk_hrs",
].mean()

In [None]:
# 2002: mean housework hours of high-egalitarian women with educ_4 >= 2
df_2002_only_egal.loc[
    (df_2002_only_egal["educ_4"] >= 2)
    & df_2002_only_egal["high_egal"]
    & (df_2002_only_egal["sex"] == "Female"),
    "hh_wrk_hrs",
].mean()

In [None]:
# 2002: mean housework hours of high-egalitarian women with educ_4 < 2
df_2002_only_egal.loc[
    (df_2002_only_egal["educ_4"] < 2)
    & df_2002_only_egal["high_egal"]
    & (df_2002_only_egal["sex"] == "Female"),
    "hh_wrk_hrs",
].mean()

In [None]:
%%capture
def hh_confusion(df):
    """
    Mean housework hours in a 2x2 layout: sex (rows) by egalitarianism split (columns).

    Expects "sex", "high_egal" and "hh_wrk_hrs" columns. Rows are returned in
    the order Male, Female; the boolean column headers are renamed to
    "Low egal" / "High egal".
    """
    table = df.pivot_table(
        values="hh_wrk_hrs",
        index="sex",
        columns="high_egal",
        aggfunc="mean",
    )
    table = table.rename(columns={False: "Low egal", True: "High egal"})
    return table.loc[["Male", "Female"]]

# One sex x egalitarianism table per wave (%%capture at the top of this cell hides the output)
cm_2002, cm_2012, cm_2022 = map(
    hh_confusion,
    (df_2002_only_egal, df_2012_only_egal, df_2022_only_egal),
)

cm_2002, cm_2012, cm_2022

In [None]:
%%capture
def hh_confusion_by_educ(df):
    """
    Mean housework hours in a 2x2 layout: sex (rows) by education split (columns).

    Expects "sex", "educ_4" and "hh_wrk_hrs" columns. The columns are
    "Educ <2" and "Educ ≥2"; rows are returned in the order Male, Female.

    Rows with a missing educ_4 are excluded: NaN >= 2 evaluates to False, so
    without the filter respondents with unknown education would be averaged
    into the "Educ <2" column.
    """
    known = df[df["educ_4"].notna()]
    return (
        known.pivot_table(
            index="sex",
            columns=known["educ_4"] >= 2,
            values="hh_wrk_hrs",
            aggfunc="mean"
        )
        .rename(columns={True: "Educ ≥2", False: "Educ <2"})
        .loc[["Male", "Female"]]
    )

# One sex x education table per wave (%%capture at the top of this cell hides the output)
cm_2002_educ, cm_2012_educ, cm_2022_educ = map(
    hh_confusion_by_educ,
    (df_2002_only_egal, df_2012_only_egal, df_2022_only_egal),
)

cm_2002_educ, cm_2012_educ, cm_2022_educ


In [None]:
# Correlation matrix (DataFrame.corr defaults to Pearson) between housework
# hours and the 4-level education code in the 2002 wave
df_2002_only_egal[["hh_wrk_hrs", "educ_4"]].corr()

In [None]:
# Copy the income-control coding from the raw wave frames into the analysis frames.
# Assignment aligns on index labels — assumes both share row labels, TODO confirm.
df_2002_only_egal["code_income_control"] = df_2002["code_income_control"]
df_2012_only_egal["code_income_control"] = df_2012["code_income_control"]
df_2022_only_egal["code_income_control"] = df_2022["code_income_control"]

In [None]:
# Copy the higher-income coding from the raw wave frames (index-aligned assignment).
# The 2022 column is filled with NaN so all three frames expose the same column.
df_2002_only_egal["code_higher_income"] = df_2002["code_higher_income"]
df_2012_only_egal["code_higher_income"] = df_2012["code_higher_income"]
df_2022_only_egal["code_higher_income"] = np.nan #2022 has no higher income data

In [None]:
# Column names, dtypes and non-null counts of the 2002 analysis frame
df_2002_only_egal.info()

In [None]:
# Column names, dtypes and non-null counts of the 2012 analysis frame
df_2012_only_egal.info()

In [None]:
# Column names, dtypes and non-null counts of the 2022 analysis frame
df_2022_only_egal.info()

In [None]:
# Frequency of each raw marital-status value in the 2012 wave
df_2012["MARITAL"].value_counts()

In [None]:
dfs = [df_2002, df_2012, df_2022]
years = [2002, 2012, 2022]

# List the raw marital-status categories of each wave; the 2002 file names the
# variable "v202", the later waves "MARITAL".
for df, year in zip(dfs, years):
    marital_col = "v202" if year == 2002 else "MARITAL"
    print(f"================df_{year}==================")
    print(df[marital_col].cat.categories)

In [None]:
# Copy each wave's raw marital-status variable into a common "marital" column
# (index-aligned assignment; the labels are harmonized in a later cell).
df_2002_only_egal["marital"] = df_2002["v202"]
df_2012_only_egal["marital"] = df_2012["MARITAL"]
df_2022_only_egal["marital"] = df_2022["MARITAL"]

In [None]:
# # Clean and standardize marital status across all years
# def clean_marital_status(val, year):
#     """
#     Standardizes marital status categories across years.
#     Returns: Married, Widowed, Divorced, Separated, Single, or None
#     """
#     if pd.isna(val):
#         return None
    
#     val_str = str(val).lower()
    
#     # Handle refused/no answer
#     if 'refused' in val_str or 'no answer' in val_str or val_str.startswith('-'):
#         return None
    
#     # Married (including civil partnership)
#     if 'married' in val_str or 'marr' in val_str or 'civil partnership' in val_str:
#         if 'never' not in val_str and 'separated' not in val_str and 'divorced' not in val_str:
#             return "Married"
    
#     # Widowed
#     if 'widow' in val_str or 'died' in val_str:
#         return "Widowed"
    
#     # Divorced
#     if 'divorced' in val_str or 'legally separated' in val_str:
#         return "Divorced"
    
#     # Separated (but still legally married)
#     if 'separated' in val_str and 'divorced' not in val_str and 'legally separated' not in val_str:
#         return "Separated"
    
#     # Single
#     if 'single' in val_str or 'never' in val_str:
#         return "Single"
    
#     return None

# # Apply to all three dataframes
# df_2002_only_egal['marital'] = df_2002_only_egal['marital'].apply(lambda x: clean_marital_status(x, 2002))
# df_2012_only_egal['marital'] = df_2012_only_egal['marital'].apply(lambda x: clean_marital_status(x, 2012))
# df_2022_only_egal['marital'] = df_2022_only_egal['marital'].apply(lambda x: clean_marital_status(x, 2022))

# # Check the distribution
# print("2002 Marital Status Distribution:")
# print(df_2002_only_egal['marital'].value_counts())
# print("\n2012 Marital Status Distribution:")
# print(df_2012_only_egal['marital'].value_counts())
# print("\n2022 Marital Status Distribution:")
# print(df_2022_only_egal['marital'].value_counts())



# Clean and standardize marital status across all years
def clean_marital_status(val, year):
    """
    Standardizes marital status categories across years.

    Matching is done on lower-cased substrings. Word stems ("marr", "separ",
    "nev", "divorc") are used so that abbreviated labels such as
    "Separ,but marr" or "Single,nev marr" are not mistaken for "Married".

    Parameters
    ----------
    val : str, category value or missing
        Raw marital-status label.
    year : int
        Survey wave. Accepted for call-site compatibility; the same rules are
        applied to every wave.

    Returns
    -------
    str or None
        One of "Married", "Civil partnership", "Widowed", "Divorced",
        "Separated", "Single"; None for missing, refused, negative-coded or
        unrecognized values.
    """
    if pd.isna(val):
        return None

    val_str = str(val).lower()

    # Handle refused/no answer
    if 'refused' in val_str or 'no answer' in val_str or val_str.startswith('-'):
        return None

    in_civil_partnership = 'civil partnership' in val_str
    never = 'nev' in val_str          # "never", "nev"
    separated = 'separ' in val_str    # "separated", "separ"
    divorced = 'divorc' in val_str    # "divorced"

    # Civil partnership (separate from marriage)
    if in_civil_partnership and not never and not separated:
        return "Civil partnership"

    # Married (but NOT civil partnership, never married, separated or divorced)
    if 'marr' in val_str and not in_civil_partnership:
        if not (never or separated or divorced):
            return "Married"

    # Widowed
    if 'widow' in val_str or 'died' in val_str:
        return "Widowed"

    # Divorced (a legal separation counts as divorced)
    if divorced or 'legally separ' in val_str:
        return "Divorced"

    # Separated (but still legally married/in partnership)
    if separated:
        return "Separated"

    # Single
    if 'single' in val_str or never:
        return "Single"

    return None

# Harmonize the "marital" column of every wave, then print its distribution
marital_waves = (
    (2002, df_2002_only_egal),
    (2012, df_2012_only_egal),
    (2022, df_2022_only_egal),
)

for wave_year, frame in marital_waves:
    frame["marital"] = frame["marital"].apply(clean_marital_status, args=(wave_year,))

# Check the distribution (a blank line separates the waves after the first)
for position, (wave_year, frame) in enumerate(marital_waves):
    separator = "" if position == 0 else "\n"
    print(f"{separator}{wave_year} Marital Status Distribution:")
    print(frame["marital"].value_counts())

In [None]:
# Frequency of each raw v358 value in the 2002 wave (v358 is copied into "urban_rural" below)
df_2002["v358"].value_counts()

In [None]:
# Reload the analysis frames from the exported CSVs. These files are written by the
# to_csv calls in the last cell of this notebook, so a fresh run depends on a previous export.
# NOTE(review): this replaces the in-memory frames (including columns added earlier in
# the session) with whatever the CSVs contain, and read_csv gives each frame a new
# default integer index — confirm this is intended at this point of the notebook.
df_2002_only_egal = pd.read_csv("../data/final_csv/2002_final.csv")
df_2012_only_egal = pd.read_csv("../data/final_csv/2012_final.csv")
df_2022_only_egal = pd.read_csv("../data/final_csv/2022_final.csv")

In [None]:
# Attach each wave's raw urban/rural variable under a common "urban_rural" name.
# NOTE(review): assignment aligns on index labels, and the *_only_egal frames were
# just re-read from CSV — confirm their index still matches df_2002/df_2012/df_2022.
df_2002_only_egal["urban_rural"] = df_2002["v358"]
df_2012_only_egal["urban_rural"] = df_2012["URBRURAL"]
df_2022_only_egal["urban_rural"] = df_2022["URBRURAL"]

In [None]:
# List the raw urban/rural categories of each wave before they are harmonized.
# The third wave is 2022 (it was mislabelled 2002), and the list of years gets its
# own name so the loop variable `year` no longer overwrites it.
dfs = [df_2002_only_egal, df_2012_only_egal, df_2022_only_egal]
years = [2002, 2012, 2022]

for df, year in zip(dfs, years):
    print("="*20, f"df_{year}_only_egal", "="*20)
    print(df["urban_rural"].cat.categories)

In [None]:
import pandas as pd
import re

# Harmonized settlement types, from most to least urban
URBAN_RURAL_ORDER = ["Urban", "Suburban", "Town", "Rural"]

def clean_urban_rural(val):
    """
    Map a raw type-of-community label to one of URBAN_RURAL_ORDER.

    Matching is done on lower-cased substrings. Returns None for missing
    values, negative-coded answers, DK/refused/NAP, "other" answers and
    labels that match no rule.
    """
    if pd.isna(val):
        return None

    s = str(val).strip().lower()

    # Missing / DK / No answer / NAP / coded negatives
    if s.startswith("-"):
        return None
    if any(k in s for k in ["no answer", "don't know", "dont know", "refused", "not available", "nap"]):
        return None
    if "other" in s:   # "Other", "Other answer"
        return None

    is_suburban = "suburb" in s or "outskirts" in s

    # IMPORTANT ORDERING:
    # Town/small city first so it doesn't get caught by generic "city".
    # The suburb exclusion is parenthesised so it applies to "town" as well:
    # `a or b and c` parses as `a or (b and c)`, which let a label containing
    # both "suburb" and "town" be classified as Town.
    if ("town" in s or "small city" in s) and not is_suburban:
        return "Town"

    # Suburban
    if is_suburban:
        return "Suburban"

    # Urban: big/large city, or an explicit "urban" (suburban was handled above)
    if "big city" in s or "large city" in s or "urban" in s:
        return "Urban"

    # Rural
    if any(k in s for k in ["country village", "village", "farm", "home in the country", "countryside", "country"]):
        return "Rural"

    return None

# Harmonize the "urban_rural" column of every wave, then print its distribution
urban_rural_waves = (
    (2002, df_2002_only_egal),
    (2012, df_2012_only_egal),
    (2022, df_2022_only_egal),
)

for _, frame in urban_rural_waves:
    frame['urban_rural'] = frame['urban_rural'].apply(clean_urban_rural)

# Check the distribution (a blank line separates the waves after the first)
for position, (wave_year, frame) in enumerate(urban_rural_waves):
    separator = "" if position == 0 else "\n"
    print(f"{separator}{wave_year} Urban/Rural Distribution:")
    print(frame['urban_rural'].value_counts())

In [None]:
import re
import numpy as np
import pandas as pd
from pandas.api.types import CategoricalDtype

# Harmonized employment-status categories, in the order used for the ordered categorical
UNIFORM_ORDER = [
    "Paid work",
    "Unemployed",
    "Education",
    "Apprentice/Trainee",
    "Sick/Disabled",
    "Retired",
    "Domestic work",
    "Help family member",
    "Military/Community service",
    "Other",
    "DK/No answer",
    "NAP",
]

cat_type = CategoricalDtype(categories=UNIFORM_ORDER, ordered=True)

# Compiled once at import time; all label patterns are case-insensitive.
RE_NEGCODE = re.compile(r"^\s*-\d+\s*[\.\:]")  # "-9. No answer", "-4. NAP ..."
RE_NAP = re.compile(r"\bnap\b|\bnot applicable\b", re.I)
RE_DKNA = re.compile(r"don't know|dont know|no answer|refused|not available", re.I)

RE_UNEMP = re.compile(r"\bunemploy(ed|ment)?\b", re.I)
RE_PAID = re.compile(
    r"\bin paid work\b|\bemployed\b|\bself[-\s]?employ(ed|ment)?\b|\bfull[-\s]?time\b|\bpart[-\s]?time\b|\bf\-t\b|\bp\-t\b|\bmain job\b",
    re.I
)
RE_HELPFAM = re.compile(r"help(ing)?\s+family", re.I)
RE_EDU = re.compile(r"\bin education\b|\bstud(ent|t)?\b|\bschool\b|\beduc(at|ation)\b|\bvocat", re.I)
RE_APPR = re.compile(r"\bapprentice\b|\btrainee\b", re.I)
RE_SICK = re.compile(r"permanently sick|\bdisabled\b", re.I)
RE_RET = re.compile(r"\bretired\b", re.I)
RE_DOM = re.compile(r"\bdomestic work\b|\bhousewife\b|\bhome duties\b", re.I)
RE_MIL = re.compile(r"military service|community service|compulsory military", re.I)
# Kept for reference: labels matching RE_OTHER and labels matching nothing both map to "Other".
RE_OTHER = re.compile(r"\bother\b|\both\b|not in (the )?labou?r force", re.I)

# First matching rule wins. "Unemployed" is tested early and "Paid work" last so
# that no more specific status is swallowed by the broad employment pattern.
_STATUS_RULES = (
    (RE_NAP, "NAP"),
    (RE_DKNA, "DK/No answer"),
    (RE_UNEMP, "Unemployed"),
    (RE_HELPFAM, "Help family member"),
    (RE_EDU, "Education"),
    (RE_APPR, "Apprentice/Trainee"),
    (RE_SICK, "Sick/Disabled"),
    (RE_RET, "Retired"),
    (RE_DOM, "Domestic work"),
    (RE_MIL, "Military/Community service"),
    (RE_PAID, "Paid work"),
)

def _std_status(x) -> str:
    """
    Map a raw employment-status label to one of the UNIFORM_ORDER categories.

    Missing values are reported as "DK/No answer". Negative-coded labels
    ("-9. ...") become "NAP" when the text says so and "DK/No answer"
    otherwise. Any label that matches no rule is returned as "Other".
    """
    if pd.isna(x):
        return "DK/No answer"

    text = str(x).strip()

    # Negative codes are resolved before any label matching
    if RE_NEGCODE.match(text):
        return "NAP" if RE_NAP.search(text) else "DK/No answer"

    lowered = text.lower()
    for pattern, label in _STATUS_RULES:
        if pattern.search(lowered):
            return label

    return "Other"

# Add harmonized "*_std" columns next to the raw status columns (originals are kept)
_status_frames = {
    2002: df_2002_only_egal,
    2012: df_2012_only_egal,
    2022: df_2022_only_egal,
}

for _df in _status_frames.values():
    for _raw_col in ("work_status", "spouse_work_status"):
        _df[f"{_raw_col}_std"] = pd.Categorical(_df[_raw_col].map(_std_status), dtype=cat_type)

# Debug check: raw labels mentioning "unemployed" that did not end up as "Unemployed"
for y, _df in _status_frames.items():
    _mentions_unemployed = (
        _df["work_status"].astype(str).str.lower().str.contains("unemployed", na=False)
    )
    bad = _df[_mentions_unemployed & (_df["work_status_std"] != "Unemployed")]
    print(f"{y}: unemployed mis-mapped rows = {len(bad)}")


In [None]:
# Distribution of the harmonized work-status categories in the 2002 wave
df_2002_only_egal["work_status_std"].value_counts()

In [None]:
# Remove the raw status columns from every wave, in place
for _frame in (df_2002_only_egal, df_2012_only_egal, df_2022_only_egal):
    _frame.drop(columns=["work_status", "spouse_work_status"], inplace=True)

In [None]:
# Convert category columns to strings for better CSV compatibility
# and ensure proper data types, then write one CSV per wave.
#
# Only columns that are actually present are converted: the raw "work_status" /
# "spouse_work_status" columns are dropped in the preceding cell, and casting
# them unconditionally raised a KeyError when the notebook was run top to bottom.
# NOTE(review): astype(str) turns missing values into literal strings such as
# "nan" / "None" — confirm that is acceptable for the exported files.
STRING_EXPORT_COLS = [
    "sex",
    "age_bin",
    "highest_education",
    "marital",
    "work_status",
    "spouse_work_status",
    "urban_rural",
]

export_frames = {
    "2002": df_2002_only_egal,
    "2012": df_2012_only_egal,
    "2022": df_2022_only_egal,
}

# Paid working hours are coerced to numeric for the 2002 wave only
df_2002_only_egal['wrk_hrs'] = pd.to_numeric(df_2002_only_egal['wrk_hrs'], errors='coerce')

for wave, frame in export_frames.items():
    for col in STRING_EXPORT_COLS:
        if col in frame.columns:
            frame[col] = frame[col].astype(str)

    # Save as CSV files
    frame.to_csv(f'../data/final_csv/{wave}_final.csv', index=False)

print("Files saved successfully!")
for wave, frame in export_frames.items():
    print(f"df_{wave}_only_egal: {len(frame)} rows, {len(frame.columns)} columns")