## Imports

In [3]:
import pandas as pd
import plotly.graph_objects as go
import plotly.io as pio
import numpy as np
import seaborn as sns
import plotly.express as px
import matplotlib.ticker as mtick
import matplotlib.pyplot as plt
import re
import warnings
warnings.filterwarnings('ignore')

## Data loading and extracting

Loading .dta and mapping files

In [4]:
# Lookup table read from CSV; later cells use its var_02 / var_12 / var_22 and
# COMMON_VAR columns to align variable names across the three waves.
df_mapping = pd.read_csv("../data/common_question_mapping.csv")
# 2002 wave: list the file's variables, leave out v241 and v247, and read the
# rest with value labels applied (convert_categoricals=True).
# NOTE(review): the reason these two variables are skipped is not visible here — confirm.
with pd.io.stata.StataReader("../data/GESIS/2002.dta", convert_categoricals=True) as rdr:
    cols = [c for c in rdr.variable_labels().keys() if c not in ["v241","v247"]]
    dta_2002 = rdr.read(columns=cols)
# 2012 wave: same approach, leaving out ISCO88 and SPISCO88.
with pd.io.stata.StataReader("../data/GESIS/2012.dta", convert_categoricals=True) as rdr:
    cols = [c for c in rdr.variable_labels().keys() if c not in {"ISCO88", "SPISCO88"}]
    dta_2012 = rdr.read(columns=cols)


# 2022 wave: read in full with pandas' default read_stata settings.
dta_2022 = pd.read_stata("../data/GESIS/2022.dta")

Extracting the relevant columns from each survey dataset

In [5]:
# Work on copies so the raw frames loaded from disk stay untouched.
df_2002, df_2012, df_2022 = dta_2002.copy(), dta_2012.copy(), dta_2022.copy()

all_common_vars = df_mapping['COMMON_VAR'].dropna().unique()


def _select_mapped_columns(raw, mapping_col):
    """Select and rename the columns of one wave according to ``df_mapping``.

    Parameters
    ----------
    raw : DataFrame holding one wave's data.
    mapping_col : name of the ``df_mapping`` column listing that wave's variable names.

    Returns
    -------
    (present, rename_map, subset) where ``present`` lists the mapped variable
    names that exist in ``raw``, ``rename_map`` maps every non-null wave name
    to its COMMON_VAR, and ``subset`` is a renamed copy of ``raw[present]``.
    """
    has_wave_name = df_mapping[mapping_col].notna()
    wave_names = df_mapping.loc[has_wave_name, mapping_col]
    present = [name for name in wave_names if name in raw.columns]
    rename_map = dict(zip(wave_names, df_mapping.loc[has_wave_name, 'COMMON_VAR']))
    subset = raw[present].copy().rename(columns=rename_map)
    return present, rename_map, subset


cols_2002, rename_map_2002, df_2002_subset = _select_mapped_columns(df_2002, 'var_02')
cols_2012, rename_map_2012, df_2012_subset = _select_mapped_columns(df_2012, 'var_12')
cols_2022, rename_map_2022, df_2022_subset = _select_mapped_columns(df_2022, 'var_22')

# Start each wave from an empty frame that shares the subset's row index.
df_2002 = pd.DataFrame(index=df_2002_subset.index)
df_2012 = pd.DataFrame(index=df_2012_subset.index)
df_2022 = pd.DataFrame(index=df_2022_subset.index)

# Give every wave the same column set, in the same order; a common variable
# that a wave does not provide becomes a column filled with None.
for common_var in all_common_vars:
    for aligned, subset in (
        (df_2002, df_2002_subset),
        (df_2012, df_2012_subset),
        (df_2022, df_2022_subset),
    ):
        aligned[common_var] = subset[common_var] if common_var in subset.columns else None

for label, frame in (("df_2002", df_2002), ("df_2012", df_2012), ("df_2022", df_2022)):
    print(f"{label}: {frame.shape}")

df_2002: (46638, 47)
df_2012: (61754, 47)
df_2022: (45762, 47)


## Equality Score data extraction

In [6]:
# Raw item names used for the equality score; they are named differently in each wave.
cols_to_add_2002 = ['v4', 'v5', 'v6', 'v7', 'v8', 'v11']
cols_to_add_2012 = ['V5', 'V6', 'V7', 'V8', 'V9', 'V11']
cols_to_add_2022 = ['v1', 'v2', 'v3', 'v4', 'v5', 'v6']

# Copy each listed item from the raw wave into the harmonised frame,
# silently skipping names the raw wave does not contain.
for target, raw, wanted in (
    (df_2002, dta_2002, cols_to_add_2002),
    (df_2012, dta_2012, cols_to_add_2012),
    (df_2022, dta_2022, cols_to_add_2022),
):
    for col in wanted:
        if col in raw.columns:
            target[col] = raw[col]

In [7]:
# Answer label -> numeric score (5 = strongly agree ... 1 = strongly disagree),
# with and without the leading "N. " code prefix.
likert_map = {
    "Strongly agree": 5,
    "Agree": 4,
    "Neither agree nor disagree": 3,
    "Disagree": 2,
    "Strongly disagree": 1,
    "1. Strongly agree": 5,
    "2. Agree": 4,
    "3. Neither agree nor disagree": 3,
    "4. Disagree": 2,
    "5. Strongly disagree": 1,
}

def add_numeric_and_egal_columns(df, cols, reverse_cols, invalid_values=None,
                                 num_suffix="_num", egal_suffix="_egal"):
    """
    Add an egalitarian-coded numeric column for each Likert item.

    The original columns are kept intact and ``df`` is modified in place.
    For every ``col`` in ``cols`` one column ``<col><egal_suffix>`` is added:
    the 1..5 score from ``likert_map`` (NaN for labels not in the map, for
    missing values and for anything listed in ``invalid_values``), reverse-coded
    as ``6 - score`` when ``col`` is in ``reverse_cols`` so that a higher value
    always means "more egalitarian".

    Parameters
    ----------
    df : DataFrame containing the columns named in ``cols``.
    cols : iterable of column names to score.
    reverse_cols : container of column names whose score must be reversed.
    invalid_values : optional list of raw values to treat as missing.
    num_suffix : kept for backward compatibility only; no ``<col>_num`` column
        is written, so an existing column with that name is left alone.
    egal_suffix : suffix of the column that is added.

    Returns
    -------
    The same ``df`` object, for convenient re-assignment.
    """
    has_invalid = invalid_values is not None and len(invalid_values) > 0

    for col in cols:
        # Plain object values: mapping a categorical column can return another
        # categorical, which breaks numeric aggregation later on.
        answers = df[col].astype(object)
        if has_invalid:
            answers = answers.where(~answers.isin(invalid_values))

        score = answers.map(likert_map)
        # NaN propagates through the subtraction, so no per-row check is needed.
        df[f"{col}{egal_suffix}"] = (6 - score) if col in reverse_cols else score
    return df

In [8]:
# Equality-score items per wave, and the subset of them that is reverse-coded.
cols_2002 = ['v4', 'v5', 'v6', 'v7', 'v8', 'v11']
reverse_2002 = ['v5', 'v6', 'v7', 'v8', 'v11']

cols_2012 = ['V5', 'V6', 'V7', 'V8', 'V9', 'V11']
reverse_2012 = ['V6', 'V7', 'V8', 'V9', 'V11']

cols_2022 = ['v1', 'v2', 'v3', 'v4', 'v5', 'v6']
reverse_2022 = ['v2', 'v3', 'v4', 'v5', 'v6']

# No extra invalid values are passed: labels missing from likert_map become NaN anyway.
df_2002 = add_numeric_and_egal_columns(df_2002, cols_2002, reverse_2002, [])
df_2012 = add_numeric_and_egal_columns(df_2012, cols_2012, reverse_2012, [])
df_2022 = add_numeric_and_egal_columns(df_2022, cols_2022, reverse_2022, [])

## Cleaning and pre-processing

### `urban_rural`

In [9]:
URBAN_RURAL_ORDER = ["Urban", "Suburban", "Town", "Rural"]

def clean_urban_rural(val):
    """Collapse a raw place-of-living label into Urban / Suburban / Town / Rural.

    Matching is done on the stripped, lower-cased text. Returns None for
    missing input, for negative-coded or non-response labels, for "other"
    answers and for anything that matches no rule.
    """
    if pd.isna(val):
        return None

    text = str(val).strip().lower()

    def mentions(*needles):
        # True when at least one needle occurs anywhere in the label.
        return any(needle in text for needle in needles)

    unusable = (
        text.startswith("-")
        or mentions("no answer", "don't know", "dont know", "refused", "not available", "nap")
        or mentions("other")
    )
    if unusable:
        return None

    # "Town" must be tested before "Suburban", but a label that mentions a
    # suburb is never a town.
    if mentions("town", "small city") and not mentions("suburb"):
        return "Town"
    if mentions("suburb", "outskirts"):
        return "Suburban"
    if mentions("big city", "large city", "urban"):
        return "Urban"
    if mentions("country village", "village", "farm", "home in the country", "countryside", "country"):
        return "Rural"
    return None

# Recode urban_rural in place for every wave with the shared cleaner.
for frame in (df_2002, df_2012, df_2022):
    frame['urban_rural'] = frame['urban_rural'].apply(clean_urban_rural)

### `spouse_work_status` and `work_status`

In [10]:
import re
import numpy as np
import pandas as pd
from pandas.api.types import CategoricalDtype

UNIFORM_ORDER = [
    "Paid work",
    "Unemployed",
    "Education",
    "Apprentice/Trainee",
    "Sick/Disabled",
    "Retired",
    "Domestic work",
    "Help family member",
    "Military/Community service",
    "Other",
    "DK/No answer",
    "NAP",
]

cat_type = CategoricalDtype(categories=UNIFORM_ORDER, ordered=True)

RE_NEGCODE = re.compile(r"^\s*-\d+\s*[\.\:]")
RE_NAP     = re.compile(r"\bnap\b|\bnot applicable\b", re.I)
RE_DKNA    = re.compile(r"don't know|dont know|no answer|refused|not available", re.I)

RE_UNEMP   = re.compile(r"\bunemploy(ed|ment)?\b", re.I)
RE_PAID    = re.compile(
    r"\bin paid work\b|\bemployed\b|\bself[-\s]?employ(ed|ment)?\b|\bfull[-\s]?time\b|\bpart[-\s]?time\b|\bf\-t\b|\bp\-t\b|\bmain job\b",
    re.I
)
RE_HELPFAM = re.compile(r"help(ing)?\s+family", re.I)
RE_EDU     = re.compile(r"\bin education\b|\bstud(ent|t)?\b|\bschool\b|\beduc(at|ation)\b|\bvocat", re.I)
RE_APPR    = re.compile(r"\bapprentice\b|\btrainee\b", re.I)
RE_SICK    = re.compile(r"permanently sick|\bdisabled\b", re.I)
RE_RET     = re.compile(r"\bretired\b", re.I)
RE_DOM     = re.compile(r"\bdomestic work\b|\bhousewife\b|\bhome duties\b", re.I)
RE_MIL     = re.compile(r"military service|community service|compulsory military", re.I)
RE_OTHER   = re.compile(r"\bother\b|\both\b|not in (the )?labou?r force", re.I)

def _std_status(x) -> str:
    if pd.isna(x):
        return "DK/No answer"

    s = str(x).strip()

    if RE_NEGCODE.match(s):
        if RE_NAP.search(s):
            return "NAP"
        return "DK/No answer"

    s_l = s.lower()

    if RE_NAP.search(s_l):
        return "NAP"
    if RE_DKNA.search(s_l):
        return "DK/No answer"
    if RE_UNEMP.search(s_l):
        return "Unemployed"
    if RE_HELPFAM.search(s_l):
        return "Help family member"
    if RE_EDU.search(s_l):
        return "Education"
    if RE_APPR.search(s_l):
        return "Apprentice/Trainee"
    if RE_SICK.search(s_l):
        return "Sick/Disabled"
    if RE_RET.search(s_l):
        return "Retired"
    if RE_DOM.search(s_l):
        return "Domestic work"
    if RE_MIL.search(s_l):
        return "Military/Community service"
    if RE_PAID.search(s_l):
        return "Paid work"
    if RE_OTHER.search(s_l):
        return "Other"

    return "Other"

# Add a harmonised "<column>_std" companion for the respondent's and the
# spouse's work status in every wave; the raw columns are left as they are.
for _df in (df_2002, df_2012, df_2022):
    for _status_col in ("work_status", "spouse_work_status"):
        _df[f"{_status_col}_std"] = _df[_status_col].map(_std_status)


### `sex`

In [11]:
# Raw sex labels (with and without the numeric code prefix) -> harmonised label.
# Non-response labels map to NaN, and so does any label not listed here.
mapping = {"1. Male": "Male",
           "2. Female": "Female",
           "Male": "Male",
           "Female":"Female",
           "No answer":np.nan,
           '-9. No answer':np.nan}

# The result is deliberately NOT cast with .astype(str): that cast turns every
# missing value into the literal string "nan", which isna()/dropna() no longer
# detect and which shows up as a third "sex" in counts and group-bys.
# astype(object) first keeps the mapping elementwise for categorical columns.
for _sex_frame in (df_2002, df_2012, df_2022):
    _sex_frame["sex"] = _sex_frame["sex"].astype(object).map(mapping)

### `code_income_control`

In [12]:
def code_for_income_control_2002(x):
    """Recode a 2002 income-management label into a harmonised control category.

    Returns None for None and for any label that is not one of the five
    substantive answers.
    """
    if x is None:
        return None

    recode = (
        ('I manage all the money', "Financial control is with the respondent"),
        ("Spouse,partner manages money", "Financial control is with the partner"),
        ('We pool all the money', "Financial control is shared"),
        ('We pool some money', "Financial control is shared"),
        ('Each keep own money separate', "Financial control is separate"),
    )
    for raw_label, harmonised in recode:
        if x == raw_label:
            return harmonised
    return None


def code_for_income_control_2012(x):
    """Recode a 2012 income-management label into a harmonised control category.

    Returns None for None and for any label that is not one of the five
    substantive answers.
    """
    if x is None:
        return None

    recode = (
        ('I manage all and give partner his share', "Financial control is with the respondent"),
        ('Partner manages all and gives me my share', "Financial control is with the partner"),
        ('We pool all money, each take out', "Financial control is shared"),
        ('We pool some money, rest separate', "Financial control is shared"),
        ('We each keep own money separate', "Financial control is separate"),
    )
    for raw_label, harmonised in recode:
        if x == raw_label:
            return harmonised
    return None


def code_for_income_control_2022(x):
    """Recode a 2022 income-management label into a harmonised control category.

    The 2022 labels carry a numeric prefix ("1. ", "2. ", ...). Returns None
    for None and for any label that is not one of the five substantive answers.
    """
    if x is None:
        return None

    recode = (
        ('1. I manage all and give partner his share', "Financial control is with the respondent"),
        ('2. Partner manages all and gives me my share', "Financial control is with the partner"),
        ('3. We pool all money, each take out', "Financial control is shared"),
        ('4. We pool some money, rest separate', "Financial control is shared"),
        ('5. We each keep own money separate', "Financial control is separate"),
    )
    for raw_label, harmonised in recode:
        if x == raw_label:
            return harmonised
    return None

# Each wave has its own answer wording, hence its own recoding function.
for frame, recode in (
    (df_2002, code_for_income_control_2002),
    (df_2012, code_for_income_control_2012),
    (df_2022, code_for_income_control_2022),
):
    frame['code_income_control'] = frame['code_income_control'].apply(recode)

### `code_higher_income`

In [13]:
def code_for_higher_income_2012(x):
    """Recode a 2012 "who has the higher income" label into a harmonised category.

    Labels are compared after stripping surrounding whitespace, so a label
    matches whether or not it carries a stray trailing tab or space.
    "We have about the same income" maps to "Same income", as in the 2002
    recoding. Returns None for non-string input (None, NaN), for the
    "NAP, no partner ..." label and for any other unlisted label.
    """
    if not isinstance(x, str):
        return None

    recode = {
        "I have a much higher income": "Respondent has higher income",
        "I have a higher income": "Respondent has higher income",
        "My spouse/ partner has a higher income": "Partner has higher income",
        "My spouse/ partner has a much higher income": "Partner has higher income",
        "I have no income": "Respondent has no income",
        "My spouse/ partner has no income": "Partner has no income",
        "We have about the same income": "Same income",
    }
    return recode.get(x.strip())

def code_for_higher_income_2002(x):
    """Recode a 2002 "who has the higher income" label into a harmonised category.

    Returns None for None and for any label outside the listed answers.
    """
    if x is None:
        return None

    recode = (
        ("I have much higher income", "Respondent has higher income"),
        ("I have a higher income", "Respondent has higher income"),
        ("Spouse has higher income", "Partner has higher income"),
        ("Spouse has much higher income", "Partner has higher income"),
        ("I have no income", "Respondent has no income"),
        ("Spouse has no income", "Partner has no income"),
        ("We have about the same income", "Same income"),
    )
    for raw_label, harmonised in recode:
        if x == raw_label:
            return harmonised
    return None
    
# The higher-income question is not available for 2022, so only the 2002 and
# 2012 waves are recoded here.
for frame, recode in (
    (df_2002, code_for_higher_income_2002),
    (df_2012, code_for_higher_income_2012),
):
    frame['code_higher_income'] = frame['code_higher_income'].apply(recode)

### `marital`

In [14]:
def clean_marital_status(val):
    """Collapse a raw marital-status label into one harmonised category.

    Possible results: "Civil partnership", "Married", "Widowed", "Divorced",
    "Separated", "Single", or None for missing / non-response / unrecognised
    labels. Matching is substring-based on the lower-cased label and the
    rules are applied in the order written below.
    """
    if pd.isna(val):
        return None

    text = str(val).lower()

    def has(*needles):
        # True when any of the needles occurs in the label.
        return any(needle in text for needle in needles)

    if has('refused', 'no answer') or text.startswith('-'):
        return None

    if has('civil partnership') and not has('never', 'separate'):
        return "Civil partnership"
    # 'marr' also covers 'married'; qualifiers such as "never married" or
    # "married but separated" are left for the rules further down.
    if has('marr') and not has('civil partnership', 'never', 'separated', 'divorced'):
        return "Married"
    if has('widow', 'died'):
        return "Widowed"
    if has('divorced', 'legally separated'):
        return "Divorced"
    if has('separated'):
        return "Separated"
    if has('single', 'never'):
        return "Single"
    return None

# Harmonise marital status in place for every wave.
for frame in (df_2002, df_2012, df_2022):
    frame["marital"] = frame["marital"].apply(clean_marital_status)

### `SPWRKHRS`, `wrk_hrs`, `SP_HH_FAM`, `HH_FAM`, `SP_HH` and `hh_wrk_hrs`

In [15]:
def clean_work_hours(val):
    """Turn a raw weekly-hours value or label into an integer capped at 95.

    Returns None for missing input, negative-coded labels, non-response
    labels and labels without any number. "No hours" style labels give 0,
    "one hour" style labels give 1, and anything at or above the top code
    gives 95.
    """
    if pd.isna(val):
        return None

    text = str(val).strip().lower()

    # Negative codes mark missing data.
    if re.match(r"^\s*-\d+", text):
        return None

    non_response = (
        "nap", "no answer", "don't know", "refused", "not available",
        "can't choose", "time varies", "does not apply", "not applicable",
    )
    if any(marker in text for marker in non_response):
        return None

    zero_markers = ("none, no hour", "none, no hours", "no hour", "no hours", "0. none")
    if any(marker in text for marker in zero_markers):
        return 0

    if "one hour" in text or text.startswith("1. 1 hour") or text == "1 hour":
        return 1

    # Top-coded answers ("96 hrs and more" and the like) collapse to 95.
    top_coded = ("96", "95", "hours and more", "hrs a more")
    if any(marker in text for marker in top_coded):
        return 95

    found = re.search(r"(\d+\.?\d*)", text)
    if found is None:
        return None

    hours = float(found.group(1))
    if hours <= 95:
        return int(hours)
    return 95

hour_vars = ["SPWRKHRS", "wrk_hrs", "SP_HH_FAM", "HH_FAM", "SP_HH", "hh_wrk_hrs"]
# All three waves go through the same cleaner, so no per-wave label is needed
# (the previous wave list mislabelled the 2022 frame as 2002 and was never read).
# The cast to float turns the cleaner's None into NaN.
for var in hour_vars:
    for df in (df_2002, df_2012, df_2022):
        df[var] = df[var].apply(clean_work_hours).astype(float)

### `educ_4`

In [16]:
def clean_education_educ_4(val, year):
    """
    Standardize education into 4 harmonized categories across waves.
    Returns label string directly.
    """
    if pd.isna(val):
        return None
    
    val_str = str(val).lower()
    
    # Handle missing values
    missing_patterns = [
        r"no answer", r"don't know", r"cant choose", r"can't choose",
        r"not classifiable", r"not codable"
    ]
    miss_re = re.compile("|".join(missing_patterns), flags=re.IGNORECASE)
    if miss_re.search(val_str) or val_str.startswith('-'):
        return None

    # --- 2002: "University degree completed", "Higher secondary completed", etc. ---
    if year == 2002:
        if "no formal" in val_str or "lowest formal" in val_str:
            return "No/Primary"
        if "higher secondary completed" in val_str:
            return "Secondary"
        if "above lowest" in val_str or "above higher sec" in val_str:
            return "Post-sec / Short tertiary"
        if "university degree completed" in val_str:
            return "University+"
        return None

    # --- 2012: Primary / lower secondary / upper secondary / tertiary levels ---
    elif year == 2012:
        if "no formal education" in val_str or "primary" in val_str:
            return "No/Primary"
        if "lower secondary" in val_str or "upper secondary" in val_str:
            return "Secondary"
        if "post secondary, non-tertiary" in val_str or "lower level tertiary" in val_str:
            return "Post-sec / Short tertiary"
        if "upper level tertiary" in val_str or "master" in val_str or "dr" in val_str or "phd" in val_str:
            return "University+"
        return None

    # --- 2022: numeric-coded labels 0..8 embedded in strings ---
    elif year == 2022:
        match = re.search(r"^\s*([0-9]+)", val_str)
        if match:
            code = int(match.group(1))
            if code in [0, 1]:
                return "No/Primary"
            elif code in [2, 3]:
                return "Secondary"
            elif code in [4, 5]:
                return "Post-sec / Short tertiary"
            elif code in [6, 7, 8]:
                return "University+"
        return None

In [17]:
# Harmonise the education label of each wave with that wave's own rule set.
for frame, wave in ((df_2002, 2002), (df_2012, 2012), (df_2022, 2022)):
    frame['educ_4_label'] = frame['educ_4_label'].apply(
        lambda x, wave=wave: clean_education_educ_4(x, wave)
    )

In [18]:
# Ordinal code for each harmonised education level (0 = lowest, 3 = highest).
edu_map = {
    "No/Primary": 0,
    "Secondary": 1,
    "Post-sec / Short tertiary": 2,
    "University+": 3
}
# Labels that are missing (or not in edu_map) give NaN in educ_4.
for frame in (df_2002, df_2012, df_2022):
    frame["educ_4"] = frame["educ_4_label"].map(edu_map)

### `SP_DEGREE`

In [19]:
def clean_spouse_education(df, year, educ_col, new_col):
    """Add a harmonised 4-level spouse education code and label to a copy of ``df``.

    Parameters
    ----------
    df : DataFrame with the raw education column; it is not modified.
    year : 2002, 2012 or 2022 — selects the wave-specific recoding rules.
    educ_col : name of the raw education column.
    new_col : name of the numeric level column to create (0..3, NaN if unknown).

    Returns
    -------
    A copy of ``df`` with ``new_col`` and ``new_col + "_label"`` added.

    Raises
    ------
    ValueError if ``year`` is not one of the three supported waves.
    """
    result = df.copy()
    raw = result[educ_col].astype(str).str.strip()

    # Blank out non-response labels and negative missing codes before recoding.
    missing_re = re.compile(
        "|".join([
            r"^-9", r"^-8", r"^-4", r"^-1", r"no answer", r"don't know", r"cant choose",
            r"can't choose", r"not classifiable", r"not codable", r"nap", r"not available",
        ]),
        flags=re.IGNORECASE,
    )
    raw = raw.mask(raw.str.contains(missing_re, na=False), np.nan)

    # Ordered keyword rules for the two waves with text labels; first match wins.
    rules_2002 = (
        (("no formal", "lowest formal"), 0),
        (("higher secondary completed",), 1),
        (("above lowest", "above higher sec"), 2),
        (("university degree completed",), 3),
    )
    rules_2012 = (
        (("no formal education", "primary"), 0),
        (("lower secondary", "upper secondary"), 1),
        (("post secondary, non-tertiary", "lower level tertiary", "short-cycle tertiary"), 2),
        (("upper level tertiary", "master", "dr.", "phd"), 3),
    )

    if year == 2002 or year == 2012:
        keyword_rules = rules_2002 if year == 2002 else rules_2012

        def to_level(label):
            if pd.isna(label):
                return np.nan
            lowered = label.lower()
            for keywords, level in keyword_rules:
                if any(keyword in lowered for keyword in keywords):
                    return level
            return np.nan

        result[new_col] = raw.map(to_level)

    elif year == 2022:
        # 2022 labels start with a numeric code; band the code into the four levels.
        code = pd.to_numeric(raw.str.extract(r"^\s*([0-9]+)")[0], errors="coerce")
        code_bands = (([0, 1], 0), ([2, 3], 1), ([4, 5], 2), ([6, 7, 8], 3))
        conditions = [code.isna()] + [code.isin(codes) for codes, _ in code_bands]
        choices = [np.nan] + [level for _, level in code_bands]
        result[new_col] = np.select(conditions, choices, default=np.nan)

    else:
        raise ValueError("year must be one of {2002, 2012, 2022}")

    level_labels = {
        0: "No/Primary",
        1: "Secondary",
        2: "Post-sec / Short tertiary",
        3: "University+",
    }
    result[new_col + "_label"] = result[new_col].map(level_labels)

    return result

# clean_spouse_education returns a new frame, so each wave name is rebound
# to the result.
df_2002, df_2012, df_2022 = (
    clean_spouse_education(frame, wave, "SP_DEGREE", "SP_DEGREE_clean")
    for frame, wave in ((df_2002, 2002), (df_2012, 2012), (df_2022, 2022))
)

# Overwrite the raw column with the harmonised label and drop the two helper columns.
for frame in (df_2002, df_2012, df_2022):
    frame["SP_DEGREE"] = frame["SP_DEGREE_clean_label"]
    frame.drop(["SP_DEGREE_clean", "SP_DEGREE_clean_label"], axis=1, inplace=True)

### `LIVWOMAR`,`MEWH`,`HW_FULFIL`,`WO_WANT`,`WW_FAM_SUFFER`,`WW_CHILD_SUFFER` and `WW_WARM` (Likert-scale variables)

In [20]:
def clean_likert_5(val):
    """Normalise a 5-point agreement label to its canonical wording.

    Accepts plain labels as well as labels with a "1." .. "5." prefix.
    Returns None for missing input, non-response labels and anything that
    matches no rule.
    """
    if pd.isna(val):
        return None

    text = str(val).lower()

    for marker in ('no answer', "don't know", 'refused', 'nap', "can't choose", 'not available'):
        if marker in text:
            return None

    has_disagree = 'disagree' in text
    # Evaluated in order; the first rule whose condition holds decides the label.
    ordered_rules = (
        ('strongly agree' in text or text.startswith('1.'), "Strongly agree"),
        (('agree' in text and not has_disagree and 'neither' not in text) or text.startswith('2.'), "Agree"),
        ('neither' in text or text.startswith('3.'), "Neither agree nor disagree"),
        ((has_disagree and 'strongly' not in text) or text.startswith('4.'), "Disagree"),
        ('strongly disagree' in text or text.startswith('5.'), "Strongly disagree"),
    )
    return next((label for matched, label in ordered_rules if matched), None)

In [21]:
likert_vars = ['LIVWOMAR','MEWH','HW_FULFIL','WO_WANT','WW_FAM_SUFFER','WW_CHILD_SUFFER','WW_WARM']
# The same cleaner serves every wave, so no per-wave label is needed (the
# previous wave list mislabelled the 2022 frame as 2002 and was never read).
for var in likert_vars:
    for df in (df_2002, df_2012, df_2022):
        df[var] = df[var].apply(clean_likert_5)

### `TOPBOT`

In [22]:
def clean_topbot(val):
    """Convert a top-bottom self-placement value or label to an integer 1..10.

    Labels mentioning the lowest/bottom rung give 1, labels mentioning the
    highest/top rung give 10, otherwise the first number in the label is
    used when it lies between 1 and 10. Returns None for missing input,
    non-response labels and anything else.
    """
    if pd.isna(val):
        return None

    text = str(val).lower()

    if any(marker in text for marker in ("don't know", 'no answer', 'refused', 'not available')):
        return None

    # End-point labels are recognised by wording or by their "01" / "10" code.
    if text.startswith('1.') or any(token in text for token in ('lowest', 'bottom', '01')):
        return 1
    if any(token in text for token in ('highest', 'top', '10')):
        return 10

    digits = re.search(r'(\d+)', text)
    if digits is None:
        return None

    rung = int(digits.group(1))
    return rung if 1 <= rung <= 10 else None

In [23]:
# Convert the top-bottom self-placement to a 1..10 value in every wave.
for frame in (df_2002, df_2012, df_2022):
    frame["TOPBOT"] = frame["TOPBOT"].apply(clean_topbot)

### `WWYKS` and `WWYKUS`

In [24]:
def clean_work_preference(val):
    """Normalise a "should women work" answer to one of four canonical labels.

    Returns "Women should decide", "Work full-time", "Work part-time" or
    "Stay at home"; None for missing input, non-response labels and
    anything that matches no rule.
    """
    if pd.isna(val):
        return None

    text = str(val).lower()

    if any(marker in text for marker in ('no answer', "don't know", "can't choose")):
        return None

    # Evaluated in order; the first rule whose condition holds decides the label.
    ordered_rules = (
        ('women should decide' in text or 'women shld decide' in text or text.startswith('4.'),
         "Women should decide"),
        ('full-time' in text or 'full time' in text or text.startswith('1.'), "Work full-time"),
        ('part-time' in text or 'part time' in text or text.startswith('2.'), "Work part-time"),
        ('stay at home' in text or text.startswith('3.'), "Stay at home"),
    )
    return next((label for matched, label in ordered_rules if matched), None)

In [25]:
# Both work-preference items share the same answer scale and cleaner.
for pref_var in ("WWYKS", "WWYKUS"):
    for frame in (df_2002, df_2012, df_2022):
        frame[pref_var] = frame[pref_var].apply(clean_work_preference)

### `MOMORFAF`

In [26]:
def clean_parent_suit(val):
    """Normalise a "which parent is better suited" answer to a canonical label.

    Handles the explicit mothers/fathers wording, labels with a "1." .. "5."
    prefix, and agree/disagree style wording. Returns None for missing input,
    non-response labels and anything that matches no rule.

    NOTE(review): agree/disagree wording is folded onto the suitability scale
    exactly as coded below (e.g. "strongly agree" -> "Equally suited"); the
    rationale for that correspondence is not visible here — confirm.
    """
    if pd.isna(val):
        return None

    text = str(val).lower()

    if any(marker in text for marker in ('no answer', "don't know", "can't choose")):
        return None

    plain_agree = (
        'agree' in text
        and 'strongly' not in text
        and 'neither' not in text
        and 'disagree' not in text
    )
    # Evaluated in order; the first rule whose condition holds decides the label.
    ordered_rules = (
        ('strongly agree' in text or 'mothers and fathers are equally' in text or text.startswith('3.'),
         "Equally suited"),
        (plain_agree or 'mothers are somewhat better' in text or text.startswith('2.'),
         "Mothers somewhat better"),
        ('mothers are much better' in text or text.startswith('1.'), "Mothers much better"),
        ('fathers are somewhat better' in text or text.startswith('4.'), "Fathers somewhat better"),
        ('strongly disagree' in text or 'fathers are much better' in text or text.startswith('5.'),
         "Fathers much better"),
        ('neither' in text or 'disagree' in text, "Equally suited"),
    )
    return next((label for matched, label in ordered_rules if matched), None)

# Harmonise the parent-suitability item in every wave.
for frame in (df_2002, df_2012, df_2022):
    frame["MOMORFAF"] = frame["MOMORFAF"].apply(clean_parent_suit)

### `HHTODD`, `HHCHILDR`, `HHADULT` and `HOMPOP`

In [27]:
def clean_household_count(val):
    """Convert a household-composition value or label to a non-negative integer.

    "No children/toddlers/adults/persons" gives 0, "one child/toddler/adult/
    person" gives 1, otherwise the first number in the label is returned.
    Returns None for missing input, for non-response labels, for labels
    without a number, and for negative-coded labels such as
    "-8. Don't know" (so the digit of a missing code is never read as a count).
    """
    if pd.isna(val):
        return None

    val_str = str(val).strip().lower()

    # A leading minus sign marks a missing-data code, never a real count.
    if re.match(r"-\s*\d", val_str):
        return None

    non_response = [
        'no answer', 'refused', 'nap', 'not available',
        "don't know", 'dont know', "can't choose",
    ]
    if any(x in val_str for x in non_response):
        return None

    if any(x in val_str for x in ['no children', 'no toddlers', 'no adults', 'no persons']):
        return 0

    if any(x in val_str for x in ['one child', 'one toddler', 'one adult', 'one person']):
        return 1

    match = re.search(r'(\d+)', val_str)
    if match:
        return int(match.group(1))

    return None

In [28]:
household_count_vars = ["HHTODD", "HHCHILDR", "HHADULT", "HOMPOP"]

# The same cleaner serves every wave, so no per-wave label is needed (the
# previous wave list mislabelled the 2022 frame as 2002 and was never read).
# The cast to float turns the cleaner's None into NaN.
for var in household_count_vars:
    for df in (df_2002, df_2012, df_2022):
        df[var] = df[var].apply(clean_household_count).astype(float)

In [29]:
# Create HHADULT for 2012 by subtracting children and toddlers from the home
# population. This overwrites whatever HHADULT held for 2012 before.
df_2012['HHADULT'] = df_2012['HOMPOP'] - df_2012['HHCHILDR'] - df_2012['HHTODD']
# Keep only households with at least one adult. Rows where any of the three
# inputs is missing have a NaN count, fail the comparison and are dropped too.
# .copy() makes the filtered frame independent, so the column assignments in
# later cells write to a real frame rather than to a slice of the old one.
df_2012 = df_2012[df_2012['HHADULT'] > 0].copy()

### `FAM_DIF`, `DIFF_CONC_WORK`, `HH_TIRED` and `WORK_TIRED`

In [30]:
def clean_frequency(val):
    """Normalise a "how often" answer to its canonical wording.

    Returns "Several times a week", "Several times a month", "Once or twice"
    or "Never". Missing input, non-response labels and unrecognised labels
    all give "NAP".
    """
    if pd.isna(val):
        return "NAP"

    text = str(val).lower()

    non_response = ('no answer', "don't know", 'refused', "doesn't apply", 'nap', 'not available', "can't choose")
    if any(marker in text for marker in non_response):
        return "NAP"

    # (wording, numeric prefix, canonical label), tried in this order.
    scale = (
        ('several times a week', '1.', "Several times a week"),
        ('several times a month', '2.', "Several times a month"),
        ('once or twice', '3.', "Once or twice"),
        ('never', '4.', "Never"),
    )
    for wording, prefix, label in scale:
        if wording in text or text.startswith(prefix):
            return label
    return "NAP"

In [31]:
frequency_vars = ["FAM_DIF", "DIFF_CONC_WORK", "HH_TIRED", "WORK_TIRED"]
# The 2002 columns are cast to the pandas string dtype before cleaning;
# the later waves are cleaned first and cast to str afterwards.
for var in frequency_vars:
    for df, year in zip([df_2002, df_2012, df_2022], [2002, 2012, 2022]):
        is_2002 = year == 2002
        raw_column = df[var].astype('string') if is_2002 else df[var]
        cleaned = raw_column.apply(clean_frequency)
        df[var] = cleaned if is_2002 else cleaned.astype(str)

### `SHARE_HH`

In [32]:
def clean_fairness_share(val):
    """Normalise a "fair share of housework" answer to its canonical wording.

    Returns one of "Much more than fair share", "Bit more than fair share",
    "Fair share", "Bit less than fair share", "Much less than fair share".
    Missing input, non-response labels and unrecognised labels give "NAP".
    """
    if pd.isna(val):
        return "NAP"

    text = str(val).lower()

    if any(marker in text for marker in ('no answer', "don't know", 'refused', 'nap', "can't choose")):
        return "NAP"

    # A bare "fair share" only counts when no more/less qualifier is present.
    unqualified_fair = 'fair share' in text and 'more' not in text and 'less' not in text
    # Evaluated in order; the first rule whose condition holds decides the label.
    ordered_rules = (
        ('much more' in text or text.startswith('1.'), "Much more than fair share"),
        ('bit more' in text or text.startswith('2.'), "Bit more than fair share"),
        ('roughly my fair share' in text or unqualified_fair or text.startswith('3.'), "Fair share"),
        ('bit less' in text or text.startswith('4.'), "Bit less than fair share"),
        ('much less' in text or text.startswith('5.'), "Much less than fair share"),
    )
    return next((label for matched, label in ordered_rules if matched), "NAP")

# 2002 is cast to the pandas string dtype before cleaning; the later waves
# are cleaned first and cast afterwards.
share_2002 = df_2002["SHARE_HH"].astype('string')
df_2002["SHARE_HH"] = share_2002.apply(clean_fairness_share)
for frame in (df_2012, df_2022):
    frame["SHARE_HH"] = frame["SHARE_HH"].apply(clean_fairness_share).astype('string')


### `DIV_HH_COOK`,`DIV_HH_CLEAN`,`DIV_HH_GROC`,`DIV_HH_CARE` and `DIV_HH_LAUND`

In [33]:
def clean_task_div(val):
    """Normalise a "who does this household task" answer to a canonical label.

    Returns "Always respondent", "Usually respondent", "About equal",
    "Usually partner", "Always partner" or "Third person". Missing input,
    non-response labels and unrecognised labels give "NAP".
    """
    if pd.isna(val):
        return "NAP"

    text = str(val).lower()

    if any(marker in text for marker in ('no answer', "don't know", 'refused', 'nap', "can't choose")):
        return "NAP"

    about_partner = 'spouse' in text or 'partner' in text
    # Evaluated in order; "Third person" is checked before the respondent /
    # partner rules.
    ordered_rules = (
        ('third person' in text or text.startswith('6.'), "Third person"),
        ('always me' in text or 'always respondent' in text or text.startswith('1.'), "Always respondent"),
        ('usually me' in text or 'usually respondent' in text or text.startswith('2.'), "Usually respondent"),
        ('about equal' in text or 'both together' in text or 'both equally' in text or text.startswith('3.'),
         "About equal"),
        (('usually' in text and about_partner) or text.startswith('4.'), "Usually partner"),
        (('always' in text and about_partner) or text.startswith('5.'), "Always partner"),
    )
    return next((label for matched, label in ordered_rules if matched), "NAP")

In [34]:
div_hh_vars = ['DIV_HH_COOK','DIV_HH_CLEAN','DIV_HH_GROC','DIV_HH_CARE','DIV_HH_LAUND']

# Wave labels must line up with the frames: the third frame is the 2022 wave
# (it was previously labelled 2002, which sent it down the 2002-only branch).
# 2002 columns are cast to the pandas string dtype before cleaning; the later
# waves are cleaned first and cast to str afterwards.
for var in div_hh_vars:
    for df, year in zip([df_2002, df_2012, df_2022], [2002, 2012, 2022]):
        if year == 2002:
            df[var] = df[var].astype('string')
            df[var] = df[var].apply(clean_task_div)
        else:
            df[var] = df[var].apply(clean_task_div).astype(str)

### `LIFE_HAP`

In [35]:
def clean_happiness(val):
    """Normalise a 7-point happiness answer to its canonical wording.

    Accepts plain labels and labels with a "1." .. "7." prefix. Returns None
    for missing input, non-response labels and anything that matches no rule.
    """
    if pd.isna(val):
        return None

    text = str(val).lower()

    if any(marker in text for marker in ('no answer', "don't know", "can't choose")):
        return None

    # (wording, numeric prefix, canonical label), tried in this order.
    scale = (
        ('completely happy', '1.', "Completely happy"),
        ('very happy', '2.', "Very happy"),
        ('fairly happy', '3.', "Fairly happy"),
        ('neither', '4.', "Neither happy nor unhappy"),
        ('fairly unhappy', '5.', "Fairly unhappy"),
        ('very unhappy', '6.', "Very unhappy"),
        ('completely unhappy', '7.', "Completely unhappy"),
    )
    for wording, prefix, label in scale:
        if wording in text or text.startswith(prefix):
            return label
    return None

# Harmonise the life-happiness item in every wave.
for frame in (df_2002, df_2012, df_2022):
    frame["LIFE_HAP"] = frame["LIFE_HAP"].apply(clean_happiness)

### `HH_WEEKEND`

In [36]:
def clean_weekend_decision(val):
    """Harmonise the "who decides on weekend activities" answer.

    Returns one of "Always respondent", "Usually respondent", "About equal",
    "Usually partner", "Always partner" or "Third person". Missing values,
    non-substantive answers and unrecognised text are all reported as "NAP".
    """
    if pd.isna(val):
        return "NAP"

    answer = str(val).lower()

    # Non-substantive answers collapse into the NAP category.
    for marker in ('no answer', "don't know", 'refused', 'nap', "can't choose"):
        if marker in answer:
            return "NAP"

    def has(*phrases):
        # True when at least one of the phrases occurs in the answer text.
        return any(phrase in answer for phrase in phrases)

    names_partner = has('spouse', 'partner')

    # Rules are order-sensitive: the first matching rule wins.
    if has('always me', 'mostly me') or answer.startswith('1.'):
        return "Always respondent"
    if has('usually me') or answer.startswith('2.'):
        return "Usually respondent"
    if has('we decide together', 'about equal', 'both together') or answer.startswith('3.'):
        return "About equal"
    if (has('usually') and names_partner) or answer.startswith('4.'):
        return "Usually partner"
    if (has('always') and names_partner) or has('mostly my spouse') or answer.startswith('5.'):
        return "Always partner"
    if has('third person', 'someone else') or answer.startswith('6.'):
        return "Third person"
    # "Sometimes me, sometimes partner" style wordings count as shared.
    if has('sometimes'):
        return "About equal"
    return "NAP"
# The 2002 wave is cast to the pandas string dtype before cleaning; the later
# waves are cleaned first and cast to the string dtype afterwards.
df_2002["HH_WEEKEND"] = (
    df_2002["HH_WEEKEND"]
    .astype('string')
    .apply(clean_weekend_decision)
)
for wave_df in (df_2012, df_2022):
    cleaned_weekend = wave_df["HH_WEEKEND"].apply(clean_weekend_decision)
    wave_df["HH_WEEKEND"] = cleaned_weekend.astype('string')

### `COHAB`

In [37]:
def clean_cohab(val):
    """Harmonise a partner / cohabitation answer into three categories.

    Parameters
    ----------
    val : object
        Raw answer (label text, optionally with a numeric prefix such as
        "1."), or a missing value.

    Returns
    -------
    str or None
        "Partner, same household", "Partner, different household" or
        "No partner"; None for missing values, non-substantive answers
        (no answer / refused / not available) and unrecognised text.
    """
    if pd.isna(val):
        return None

    val_str = str(val).lower()

    if any(x in val_str for x in ['no answer', 'refused', 'not available']):
        return None

    # Test the "different household" wording first: a label such as
    # "yes, have partner; don't live in same household" also contains
    # "same household" and would otherwise be counted as cohabiting.
    if ('yes' in val_str and "don't live" in val_str) or val_str.startswith('2.'):
        return "Partner, different household"

    if ('yes' in val_str and 'same household' in val_str) or val_str.startswith('1.'):
        return "Partner, same household"

    if 'no partner' in val_str or val_str.startswith('3.'):
        return "No partner"

    # Bare yes/no answers carry no household information; a plain "yes" is
    # counted as a partner in the same household.
    if val_str == 'yes':
        return "Partner, same household"
    if val_str == 'no':
        return "No partner"

    return None

# Harmonise the cohabitation item in every survey wave.
for wave_df in (df_2002, df_2012, df_2022):
    wave_df["COHAB"] = wave_df["COHAB"].apply(clean_cohab)

### `C_ALPHAN`

In [38]:
def clean_country(val):
    """Reduce a raw country value to a two-character code.

    The value is upper-cased. If it starts with two letters, optionally
    preceded by a numeric prefix such as "36. ", those two letters are
    returned. Otherwise the first two characters are returned, or None when
    the text is shorter than two characters. Missing values give None.
    """
    if pd.isna(val):
        return None

    code_text = str(val).upper()

    # Optional "<digits>." prefix followed by two capital letters at the start.
    hit = re.match(r'(\d+\.\s*)?([A-Z]{2})', code_text)
    if hit is not None:
        return hit.group(2)

    if len(code_text) < 2:
        return None
    return code_text[:2]

# Standardise the country code in every survey wave.
for wave_df in (df_2002, df_2012, df_2022):
    wave_df["C_ALPHAN"] = wave_df["C_ALPHAN"].apply(clean_country)

### `age`

In [39]:
# Answer labels that mark a missing age (matched case-insensitively, also
# when they are embedded in a longer label such as "99. No answer").
invalid_age = {
    "Don't know",
    "No answer",
    "Refused",
    "Not available",
}

def clean_age(val):
    """Extract the respondent's age as an integer.

    Parameters
    ----------
    val : object
        Raw age value: a number, a label containing a number, or a
        missing-value label.

    Returns
    -------
    int or None
        The first integer found in the text form of ``val``. None when
        ``val`` is None, contains one of the ``invalid_age`` labels, holds
        no digits at all, or when that first number is negative (negative
        numbers cannot be ages and are treated as missing-value codes).
    """
    if val is None:
        return None
    s = str(val)

    # Substring match so that labels carrying a numeric code in front of the
    # missing-value text are not mistaken for an age.
    lowered = s.lower()
    if any(label.lower() in lowered for label in invalid_age):
        return None

    # Capture an optional leading minus so "-9" is seen as -9 rather than 9.
    m = re.search(r"-?\d+", s)
    if m is None:
        return None
    age = int(m.group())
    return age if age >= 0 else None

# Parse the age column in every survey wave and store it as float
# (unparseable ages become NaN).
for wave_df in (df_2002, df_2012, df_2022):
    parsed_age = wave_df["age"].apply(clean_age)
    wave_df["age"] = parsed_age.astype(float)

Create age bins

In [40]:
# Shared bin edges and labels; pd.cut uses right-closed intervals by default,
# so the bins are (0, 17], (17, 25], ..., (75, 100]. Ages outside (0, 100]
# get no bin.
age_bin_edges = [0, 17, 25, 35, 45, 55, 65, 75, 100]
age_bin_labels = ["<18", "18-25", "26-35", "36-45", "46-55", "56-65", "66-75", "75+"]

for wave_df in (df_2002, df_2012, df_2022):
    wave_df["age_bin"] = pd.cut(
        wave_df["age"],
        bins=age_bin_edges,
        labels=age_bin_labels,
    )

### Remove rows with more than 60% null values

In [41]:
# Keep only the rows in which at most 60 % of the columns are missing.
df_2002 = df_2002.loc[df_2002.isna().mean(axis=1).le(0.6)]
df_2012 = df_2012.loc[df_2012.isna().mean(axis=1).le(0.6)]
df_2022 = df_2022.loc[df_2022.isna().mean(axis=1).le(0.6)]

In [42]:
# Report (rows, columns) of every wave after the sparse-row filter.
for frame_name, wave_df in (("df_2002", df_2002), ("df_2012", df_2012), ("df_2022", df_2022)):
    print(f"{frame_name}: {wave_df.shape}")

df_2002: (46618, 63)
df_2012: (58464, 63)
df_2022: (45762, 63)


### Remove rows where any of the columns used to construct the equality score is null

In [43]:
# Drop, in place, every row that lacks one of the wave-specific *_egal items.
for wave_df, egal_items in (
    (df_2002, ['v4_egal', 'v5_egal', 'v6_egal', 'v7_egal', 'v8_egal', 'v11_egal']),
    (df_2012, ['V5_egal', 'V6_egal', 'V7_egal', 'V8_egal', 'V9_egal', 'V11_egal']),
    (df_2022, ['v1_egal', 'v2_egal', 'v3_egal', 'v4_egal', 'v5_egal', 'v6_egal']),
):
    wave_df.dropna(subset=egal_items, inplace=True)

In [44]:
# Report (rows, columns) of every wave after dropping incomplete *_egal rows.
for frame_name, wave_df in (("df_2002", df_2002), ("df_2012", df_2012), ("df_2022", df_2022)):
    print(f"{frame_name}: {wave_df.shape}")

df_2002: (40395, 63)
df_2012: (49884, 63)
df_2022: (40768, 63)


### Remove rows where household adult population is more than total household population

In [45]:
# Drop, in place, the rows whose HOMPOP value is smaller than their HHADULT
# value; frames lacking either column are left untouched.
for df in (df_2002, df_2012, df_2022):
    if {"HOMPOP", "HHADULT"}.issubset(df.columns):
        too_many_adults = df["HOMPOP"] < df["HHADULT"]
        df.drop(index=df[too_many_adults].index, inplace=True)

In [46]:
# Report (rows, columns) of every wave after the household consistency check.
for frame_name, wave_df in (("df_2002", df_2002), ("df_2012", df_2012), ("df_2022", df_2022)):
    print(f"{frame_name}: {wave_df.shape}")

df_2002: (40374, 63)
df_2012: (49884, 63)
df_2022: (40714, 63)


In [47]:
# Write each cleaned wave to its own CSV file, without the index column.
cleaned_exports = {
    "../data/cleaned_csv/2002.csv": df_2002,
    "../data/cleaned_csv/2012.csv": df_2012,
    "../data/cleaned_csv/2022.csv": df_2022,
}
for csv_path, wave_df in cleaned_exports.items():
    wave_df.to_csv(csv_path, index=False)