In [94]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import sklearn
from plotly.subplots import make_subplots
import plotly.express as px

In [95]:
# Read the 2002, 2012 and 2022 CSV exports into one DataFrame each.
df_2002, df_2012, df_2022 = map(
    pd.read_csv,
    (
        "../data/efa_csv/2002.csv",
        "../data/efa_csv/2012.csv",
        "../data/efa_csv/2022.csv",
    ),
)

In [96]:
# Print (rows, columns) for each yearly frame, in chronological order.
for frame in (df_2002, df_2012, df_2022):
    print(frame.shape)

(23730, 67)
(28013, 67)
(20493, 67)


### Summary Statistics of some descriptive variables 


In [97]:
def _summarise_year(frame, year):
    """Build one row of headline descriptive statistics for a single year's frame.

    Gender shares are expressed relative to *all* rows of `frame` (rows with a
    missing `sex` stay in the denominator). Age statistics and the country
    count ignore missing values.
    """
    n_rows = len(frame)
    n_female = len(frame[frame['sex'] == 'Female'])
    n_male = len(frame[frame['sex'] == 'Male'])

    ages = frame.dropna(subset=["age"])['age']
    countries = frame.dropna(subset=["COUNTRY"])['COUNTRY']

    return {
        'Year': year,
        'Total Respondents': n_rows,
        'Female %': f"{(n_female/n_rows*100):.1f}%",
        'Male %': f"{(n_male/n_rows*100):.1f}%",
        'Mean Age': f"{ages.mean():.1f}",
        'Median Age': f"{ages.median():.0f}",
        'Age Std Dev': f"{ages.std():.1f}",
        'Num Countries': countries.nunique(),
    }


# One summary row per year, collected into a small comparison table.
summary_stats = [
    _summarise_year(frame, year)
    for frame, year in zip((df_2002, df_2012, df_2022), (2002, 2012, 2022))
]

summary_df = pd.DataFrame(summary_stats)
summary_df

Unnamed: 0,Year,Total Respondents,Female %,Male %,Mean Age,Median Age,Age Std Dev,Num Countries
0,2002,23730,55.1%,44.9%,47.9,47,14.2,35
1,2012,28013,51.5%,48.5%,50.7,50,14.4,43
2,2022,20493,52.5%,47.5%,53.8,54,14.8,34


## Exploratory Data Analysis (EDA) on Factor Scores

In this section, we perform basic exploratory data analysis on the dataset, which includes the following variables:

- Sex 
- Age
- Country
- Marital Status
- Work Hours of Respondent and Spouse (Household work and Paid work)
- Education Level of Respondent
- Income sharing between partners
- Number of adults and children in the family
- Item-level responses to survey questions on a Likert-type scale

The analyses include:
- Distribution of variables
- Group-wise comparisons of variables (e.g., by gender, education, or country)
- Summary statistics such as mean, standard deviation, and confidence intervals


In [98]:
# Fixed colour per gender, shared by the gender-split figures below.
gender_color_map = dict(Female="#f678a7", Male="#add2e4")

In [99]:
# Inspect which labels (including missing values) occur in the 2002 `sex` column.
df_2002.loc[:, 'sex'].unique()

array(['Female', 'Male', nan], dtype=object)

## Sex

In [100]:
# Share of each gender per year, drawn as one bar chart per year.
gender_order = ["Female", "Male"]
fig = make_subplots(rows=1, cols=3, subplot_titles=("2002", "2012", "2022"))

for col_no, frame in enumerate((df_2002, df_2012, df_2022), start=1):
    # value_counts skips missing values, so the shares are relative to the
    # rows that have a recorded sex.
    share = frame["sex"].value_counts(normalize=True).mul(100).reindex(gender_order)
    bars = go.Bar(
        x=share.index,
        y=share.values,
        text=[f"{pct:.1f}%" for pct in share.values],
        textposition="outside",
        marker=dict(color=[gender_color_map[label] for label in share.index]),
    )
    fig.add_trace(bars, row=1, col=col_no)

fig.update_layout(
    title='Gender Distribution across years',
    height=500,
    width=1500,
    bargap=0.25,
    uniformtext_minsize=8,
    showlegend=False,
)
fig.update_xaxes(title_text="Gender", categoryorder="array", categoryarray=gender_order)
fig.update_yaxes(title_text="Percentage", range=[0, 100])
fig.show()

## Age

In [101]:
# Distribution of respondents over age groups, one bar chart per year.
#
# "<18" is listed so that every non-missing age_bin value that enters the
# percentage denominator also gets a bar (otherwise the bars silently sum to
# less than 100%).
# NOTE(review): assumes under-18 respondents are labelled "<18" in `age_bin`;
# confirm against the preprocessing step.
age_labels = ["<18", "18-25", "26-35", "36-45", "46-55", "56-65", "66-75", "75+"]

fig = make_subplots(rows=1, cols=3, subplot_titles=("2002", "2012", "2022"))
for idx, (data, year) in enumerate([(df_2002, "2002"), (df_2012, "2012"), (df_2022, "2022")], start=1):
    clean = data.dropna(subset=["age_bin"])
    counts = clean["age_bin"].value_counts()
    total = counts.sum()
    # fill_value=0 makes an age group with no respondents render as "0.0%"
    # instead of "nan%".
    percentages = (counts / total * 100).reindex(age_labels, fill_value=0)
    fig.add_trace(
        go.Bar(
            x=percentages.index,
            y=percentages.values,
            marker_color="#4c72b0",
            text=[f"{v:.1f}%" for v in percentages.values],
            textposition="outside"
        ),
        row=1, col=idx
    )
fig.update_layout(
    height=600, width=1500,
    showlegend=False,
    uniformtext_minsize=8,
    bargap=0.25,
    title='Age Distribution Across Years'
)
# categoryorder="array" only takes effect together with an explicit categoryarray.
fig.update_xaxes(
    categoryorder="array",
    categoryarray=age_labels,
    tickangle=45,
    title_text="Age Group"
)
fig.update_yaxes(title_text="Percentage")
fig.show()

In [102]:
# Overlaid age histograms for women and men, one panel per year.
fig = make_subplots(rows=1, cols=3, subplot_titles=("2002", "2012", "2022"))

# With barmode='overlay' the traces are drawn on top of each other, so they
# must be semi-transparent; at opacity=1 the trace drawn last (Male) hides
# the Female bars wherever the two overlap.
OVERLAY_OPACITY = 0.6

for idx, (data, year) in enumerate([(df_2002, "2002"), (df_2012, "2012"), (df_2022, "2022")], start=1):
    clean = data.dropna(subset=["sex", "age"])

    for gender in ["Female", "Male"]:
        gender_data = clean[clean["sex"] == gender]["age"]
        fig.add_trace(
            go.Histogram(
                x=gender_data,
                name=gender,
                marker_color=gender_color_map[gender],
                opacity=OVERLAY_OPACITY,
                # One legend entry per gender: only the first panel contributes.
                showlegend=(idx == 1),
                nbinsx=30
            ),
            row=1, col=idx
        )

fig.update_layout(
    height=400, width=1500,
    barmode='overlay',
    title='Age Distribution by Gender Across Years'
)
fig.update_xaxes(title_text="Age")
fig.update_yaxes(title_text="Count")
fig.show()

In [103]:
# Within each age group, the female/male split (stacked to 100%), per year.
age_labels = ["18-25", "26-35", "36-45", "46-55", "56-65", "66-75", "75+"]

fig = make_subplots(rows=1, cols=3, subplot_titles=("2002", "2012", "2022"))

for col_no, frame in enumerate((df_2002, df_2012, df_2022), start=1):
    complete = frame.dropna(subset=["sex", "age_bin"])

    # Rows = age group, columns = sex, cells = respondent counts; then
    # normalise every row to percentages and put the rows in display order.
    counts = complete.groupby(['age_bin', 'sex']).size().unstack(fill_value=0)
    row_pct = counts.div(counts.sum(axis=1), axis=0).mul(100).reindex(age_labels)

    for sex in ('Female', 'Male'):
        if sex not in row_pct.columns:
            continue
        fig.add_trace(
            go.Bar(
                x=row_pct.index,
                y=row_pct[sex],
                name=sex,
                marker_color=gender_color_map[sex],
                showlegend=(col_no == 1),
            ),
            row=1, col=col_no,
        )

fig.update_layout(
    title='Gender Distribution by Age Group Across Years',
    barmode='stack',
    height=500,
    width=1500,
)
fig.update_xaxes(tickangle=45, title_text="Age Group")
fig.update_yaxes(range=[0, 100], title_text="Percentage")
fig.show()

## Country

In [104]:
import pycountry

def get_country_name(country_code):
    """Return pycountry's name for a two-letter (alpha-2) country code.

    Falls back to returning `country_code` unchanged when pycountry has no
    entry for it or rejects the value, so unknown codes still get a label.

    Only lookup-related failures are handled; unrelated errors (for example a
    missing `pycountry` import) and interrupts propagate instead of being
    silently swallowed as a bare `except:` would do.
    """
    try:
        match = pycountry.countries.get(alpha_2=country_code)
    except (LookupError, AttributeError, TypeError):
        # Lookup failed or the code is not something pycountry can look up.
        return country_code
    # `get` may return None for an unknown code rather than raising.
    return match.name if match is not None else country_code

# One treemap per year: tile area = number of respondents per country.
fig = make_subplots(
    rows=1,
    cols=3,
    specs=[[{"type": "treemap"} for _ in range(3)]],
    subplot_titles=("2002", "2012", "2022"),
)

treemap_hover = (
    "<b>%{label}</b><br>"
    "Respondents: %{value}<br>"
    "Percentage: %{customdata:.1f}%"
    "<extra></extra>"
)

for col_no, frame in enumerate((df_2002, df_2012, df_2022), start=1):
    with_country = frame.dropna(subset=["COUNTRY"])

    # Largest country first; shares are relative to rows with a known country.
    per_country = with_country['COUNTRY'].value_counts().sort_values(ascending=False)
    share = per_country / per_country.sum() * 100
    names = [get_country_name(code) for code in per_country.index]

    tile_text = [
        f"<b>{name}</b><br>{pct:.1f}%"
        for name, pct in zip(names, share.values)
    ]
    treemap = go.Treemap(
        labels=names,
        parents=[""] * len(names),  # flat hierarchy: every country is a root tile
        values=per_country.values,
        marker=dict(
            colors=per_country.values,
            colorscale="Viridis",
            line=dict(width=2, color="white"),
        ),
        text=tile_text,
        textposition="middle center",
        textfont=dict(size=11, color="white"),
        hovertemplate=treemap_hover,
        customdata=share.values,
    )
    fig.add_trace(treemap, row=1, col=col_no)

fig.show()


In [105]:
# Percentage of respondents per country (alphabetical by code), per year.
fig = make_subplots(rows=1, cols=3, subplot_titles=("2002", "2012", "2022"))

for col_no, frame in enumerate((df_2002, df_2012, df_2022), start=1):
    # value_counts ignores missing countries; normalize=True divides by the
    # number of rows that do have a country.
    share = frame['COUNTRY'].value_counts(normalize=True).mul(100).sort_index()

    bars = go.Bar(
        x=share.index,
        y=share.values,
        text=[f"{pct:.1f}%" for pct in share.values],
        textposition="outside",
        marker_color="#4c72b0",
    )
    fig.add_trace(bars, row=1, col=col_no)

fig.update_layout(
    title='Respondent Distribution by Country Across Years',
    height=600,
    width=2400,
    margin=dict(b=150),
    showlegend=False,
)
fig.update_xaxes(tickangle=90, title_text="Country")
fig.update_yaxes(title_text="Percentage")
fig.show()

In [106]:
# Mean respondent age per country (alphabetical by code), per year.
fig = make_subplots(rows=1, cols=3, subplot_titles=("2002", "2012", "2022"))

for col_no, frame in enumerate((df_2002, df_2012, df_2022), start=1):
    mean_age = (
        frame
        .dropna(subset=["age", "COUNTRY"])  # countries without any recorded age drop out
        .groupby('COUNTRY')['age']
        .mean()
        .sort_index()
    )

    bars = go.Bar(
        x=mean_age.index,
        y=mean_age.values,
        text=[f"{age:.1f}" for age in mean_age.values],
        textposition="outside",
        marker_color="#4c72b0",
    )
    fig.add_trace(bars, row=1, col=col_no)

fig.update_layout(
    title='Average Age by Country Across Years',
    height=600,
    width=2400,
    margin=dict(b=150),
    showlegend=False,
)
fig.update_xaxes(tickangle=90, title_text="Country")
fig.update_yaxes(title_text="Average Age")
fig.show()

In [107]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Female/male split within each country (stacked to 100%), per year, with a
# dashed reference line at parity.
fig = make_subplots(rows=1, cols=3, subplot_titles=("2002", "2012", "2022"))

for col_no, frame in enumerate((df_2002, df_2012, df_2022), start=1):
    complete = frame.dropna(subset=["sex", "COUNTRY"])

    # Rows = country, columns = sex; each row is normalised to percentages.
    counts = complete.groupby(['COUNTRY', 'sex']).size().unstack(fill_value=0)
    row_pct = counts.div(counts.sum(axis=1), axis=0).mul(100).sort_index()

    for sex in ('Female', 'Male'):
        if sex not in row_pct.columns:
            continue
        fig.add_trace(
            go.Bar(
                x=row_pct.index,
                y=row_pct[sex],
                name=sex,
                marker_color=gender_color_map[sex],
                showlegend=(col_no == 1),
            ),
            row=1,
            col=col_no,
        )

# 50% marker in every panel.
for panel in range(1, 4):
    fig.add_hline(
        y=50,
        row=1,
        col=panel,
        line_dash="dash",
        line_color="black",
        line_width=2,
        annotation_text="50%",
        annotation_position="top left",
        annotation_font_size=10,
    )

fig.update_layout(
    title='Gender Distribution by Country Across Years',
    barmode='stack',
    bargap=0.25,
    uniformtext_minsize=8,
    height=600,
    width=2400,
    margin=dict(b=150),
)
fig.update_xaxes(title_text="Country", tickangle=90)
fig.update_yaxes(range=[0, 100], title_text="Percentage")

fig.show()


## Marital status

In [108]:
# Marital-status distribution within each gender (percentages), per year.
fig = make_subplots(rows=1, cols=3, subplot_titles=("2002", "2012", "2022"))

marital_order = ['Single', 'Married', 'Separated', 'Divorced', 'Widowed']
marital_order_12_22 = ['Single', 'Married', 'Civil partnership', 'Divorced', 'Widowed']
# NOTE(review): a label that does not exactly match the spelling used in the
# `marital` column is plotted as 0% - confirm 'Civil partnership' against the data.

# The 2012 and 2022 frames use a different category list than 2002.
order_by_year = {
    "2002": marital_order,
    "2012": marital_order_12_22,
    "2022": marital_order_12_22,
}

for col_no, (year, frame) in enumerate(
    zip(("2002", "2012", "2022"), (df_2002, df_2012, df_2022)), start=1
):
    complete = frame.dropna(subset=["marital", "sex"])

    for gender in ("Female", "Male"):
        statuses = complete.loc[complete["sex"] == gender, "marital"]
        share = (
            statuses
            .value_counts(normalize=True)
            .mul(100)
            .reindex(order_by_year[year], fill_value=0)
        )

        fig.add_trace(
            go.Bar(
                name=gender,
                x=share.index,
                y=share.values,
                text=[f"{pct:.1f}%" for pct in share.values],
                textposition="outside",
                marker_color=gender_color_map[gender],
                opacity=0.7,
                showlegend=(col_no == 1),
            ),
            row=1, col=col_no,
        )

fig.update_layout(
    title='Marital Status Distribution by Gender Across Years',
    barmode='group',
    height=500,
    width=1800,
)
fig.update_xaxes(tickangle=45, title_text="Marital Status")
fig.update_yaxes(title_text="Percentage")
fig.show()

In [109]:

# Female/male split within each marital-status category (stacked to 100%), per year.
fig = make_subplots(rows=1, cols=3, subplot_titles=("2002", "2012", "2022"))

# Display order of the categories. 'Separated' is kept for 2002 so no observed
# category is dropped from the figure.
marital_order = ['Single', 'Married', 'Separated', 'Divorced', 'Widowed']
marital_order_12_22 = ['Single', 'Married', 'Civil partnership', 'Divorced', 'Widowed']


def _resolve_labels(wanted, available):
    """Return `wanted` with each label swapped for its case-insensitive match in `available`.

    This guards against capitalisation differences (e.g. 'Civil partnership'
    vs 'Civil Partnership') silently producing an empty category. Labels
    without any match are returned unchanged.
    """
    by_folded = {str(label).casefold(): label for label in available}
    return [by_folded.get(label.casefold(), label) for label in wanted]


for idx, (data, year) in enumerate([(df_2002, "2002"), (df_2012, "2012"), (df_2022, "2022")], start=1):
    clean = data.dropna(subset=["marital", "sex"])

    # Rows = marital status, columns = sex; each row is normalised to percentages.
    marital_sex = clean.groupby(['marital', 'sex']).size().unstack(fill_value=0)
    marital_sex_pct = marital_sex.div(marital_sex.sum(axis=1), axis=0) * 100

    wanted_order = marital_order_12_22 if year in ("2012", "2022") else marital_order
    marital_sex_pct = marital_sex_pct.reindex(
        _resolve_labels(wanted_order, marital_sex_pct.index)
    )

    for gender in ['Female', 'Male']:
        if gender in marital_sex_pct.columns:
            fig.add_trace(
                go.Bar(
                    name=gender,
                    x=marital_sex_pct.index,
                    y=marital_sex_pct[gender],
                    marker_color=gender_color_map[gender],
                    showlegend=(idx == 1)
                ),
                row=1, col=idx
            )

fig.update_layout(
    height=500, width=1800,
    barmode='stack',
    title='Gender Distribution within Each Marital Status Category Across Years'
)
fig.update_xaxes(title_text="Marital Status", tickangle=45)
fig.update_yaxes(title_text="Percentage", range=[0, 100])
fig.show()

In [110]:

# Marital-status mix within each age group (stacked percentages), per year.
age_labels = ["<18","18-25", "26-35", "36-45", "46-55", "56-65", "66-75", "75+"]
marital_order = ['Single', 'Married', 'Separated', 'Divorced', 'Widowed']
marital_colors = dict(
    zip(marital_order, ['#4c78a8', '#f28e2b', '#59a14f', '#e15759', '#b07aa1'])
)
# NOTE(review): statuses outside `marital_order` are dropped from the columns
# below but still count in the row totals, so a stack can stop short of 100%.

fig = make_subplots(rows=1, cols=3, subplot_titles=("2002", "2012", "2022"))

for col_no, frame in enumerate((df_2002, df_2012, df_2022), start=1):
    # Ordered categorical so the age groups come out in label order.
    complete = frame.dropna(subset=["age_bin", "marital"]).assign(
        age_bin=lambda d: pd.Categorical(d['age_bin'], categories=age_labels, ordered=True)
    )

    counts = complete.groupby(['age_bin', 'marital']).size().unstack(fill_value=0)
    row_pct = (
        counts
        .div(counts.sum(axis=1), axis=0)
        .mul(100)
        .reindex(marital_order, axis=1, fill_value=0)
    )

    # After the column reindex every status in marital_order is present.
    for status in marital_order:
        status_pct = row_pct[status]
        fig.add_trace(
            go.Bar(
                x=row_pct.index,
                y=status_pct,
                name=status,
                text=[f"{v:.1f}%" for v in status_pct],
                textposition="inside",
                marker_color=marital_colors[status],
                showlegend=(col_no == 1),
            ),
            row=1, col=col_no,
        )

fig.update_layout(
    title='Marital Status Distribution by Age Group Across Years',
    legend_title_text='Marital Status',
    barmode='stack',
    height=600,
    width=2000,
)
fig.update_xaxes(tickangle=45, title_text="Age Group")
fig.update_yaxes(range=[0, 100], title_text="Percentage")
fig.show()









The plot above shows respondents under 18 who are recorded as married, divorced, or widowed. Because the legal age of marriage in most countries is 18 or higher, we remove respondents under 18 from our analysis.

In [111]:
# Keep respondents aged 18 or older. Rows with a missing age fail the
# comparison and are dropped as well.
df_2002 = df_2002.loc[df_2002['age'].ge(18)]
df_2012 = df_2012.loc[df_2012['age'].ge(18)]
df_2022 = df_2022.loc[df_2022['age'].ge(18)]

In [112]:
# From here on every frame contains only respondents whose marital status is 'Married'.
df_2002 = df_2002.loc[df_2002['marital'].eq('Married')]
df_2012 = df_2012.loc[df_2012['marital'].eq('Married')]
df_2022 = df_2022.loc[df_2022['marital'].eq('Married')]

### Household members

In [113]:
# Create HHADULT for 2012 by subtracting children and toddlers from home population,
# then keep only households with at least one adult.
#
# `assign` builds a new frame instead of writing a column into `df_2012`, which
# at this point is the result of boolean filtering; setting a column on such a
# frame triggers pandas' SettingWithCopyWarning.
# Rows where any of the three inputs is missing get a missing HHADULT and are
# removed by the `> 0` filter.
df_2012 = (
    df_2012
    .assign(HHADULT=lambda d: d['HOMPOP'] - d['HHCHILDR'] - d['HHTODD'])
    .loc[lambda d: d['HHADULT'] > 0]
)

In [114]:
# Distribution of the number of adults per household, per year.
fig = make_subplots(rows=1, cols=3, subplot_titles=("2002", "2012", "2022"))

# Every HHADULT value (>= 1) seen in any year, sorted ascending.
x_categories = sorted({
    value
    for frame in (df_2002, df_2012, df_2022)
    for value in frame['HHADULT'].dropna().unique()
    if value >= 1
})

for col_no, frame in enumerate((df_2002, df_2012, df_2022), start=1):
    # Percentages are relative to rows with a recorded HHADULT.
    share = (
        frame['HHADULT']
        .value_counts(normalize=True)
        .mul(100)
        .reindex(x_categories, fill_value=0)
    )
    # Only household sizes that actually occur in this year get a bar.
    observed = share[share > 0]

    fig.add_trace(
        go.Bar(
            x=observed.index,
            y=observed.values,
            text=[f"{pct:.1f}%" for pct in observed.values],
            textposition="outside",
            marker_color="#4c72b0",
        ),
        row=1, col=col_no,
    )

fig.update_layout(
    title='Distribution of Number of Adults in Household Across Years',
    height=500,
    width=1500,
    showlegend=False,
)
fig.update_xaxes(categoryorder="array", categoryarray=x_categories, title_text="Number of Adults")
fig.update_yaxes(title_text="Percentage")
fig.show()

Apparently, the dataset contains households with more than 15 adults.

In [115]:
# Household-composition columns for 2002 rows reporting more than 15 adults.
df_2002.loc[df_2002['HHADULT'] > 15, ['HOMPOP', 'HHADULT', 'HHCHILDR', 'HHTODD']]

Unnamed: 0,HOMPOP,HHADULT,HHCHILDR,HHTODD
23204,24.0,17.0,6.0,1.0


In [116]:
# Household-composition columns for 2012 rows reporting more than 30 adults.
df_2012.loc[df_2012['HHADULT'] > 30, ['HOMPOP', 'HHADULT', 'HHCHILDR', 'HHTODD']]

Unnamed: 0,HOMPOP,HHADULT,HHCHILDR,HHTODD
13544,37.0,36.0,1.0,0.0


In [117]:
# Household-composition columns for 2022 rows reporting more than 20 adults.
df_2022.loc[df_2022['HHADULT'] > 20, ['HOMPOP', 'HHADULT', 'HHCHILDR', 'HHTODD']]

Unnamed: 0,HOMPOP,HHADULT,HHCHILDR,HHTODD


Clearly these are erroneous values and will not be considered in the analysis

### Education

In [118]:
# Female/male split within each education level (stacked to 100%), per year.
educ_order = ['No/Primary', 'Secondary', 'Post-sec / Short tertiary', 'University+']

fig = make_subplots(rows=1, cols=3, subplot_titles=("2002", "2012", "2022"))

for col_no, frame in enumerate((df_2002, df_2012, df_2022), start=1):
    complete = frame.dropna(subset=["educ_4_label", "sex"])

    # Rows = education level, columns = sex; rows normalised to percentages.
    # Levels that do not occur are shown as 0.
    counts = complete.groupby(['educ_4_label', 'sex']).size().unstack(fill_value=0)
    row_pct = (
        counts
        .div(counts.sum(axis=1), axis=0)
        .mul(100)
        .reindex(educ_order, fill_value=0)
    )

    for gender in ('Female', 'Male'):
        if gender not in row_pct.columns:
            continue
        fig.add_trace(
            go.Bar(
                x=row_pct.index,
                y=row_pct[gender],
                name=gender,
                marker_color=gender_color_map[gender],
                showlegend=(col_no == 1),
            ),
            row=1, col=col_no,
        )

fig.update_layout(
    title='Gender Distribution within Each Education Level Across Years',
    barmode='stack',
    height=500,
    width=1800,
)
fig.update_xaxes(tickangle=45, title_text="Education Level")
fig.update_yaxes(range=[0, 100], title_text="Percentage")
fig.show()

In [119]:
# Education-level distribution within each gender (grouped bars), per year.
educ_order = ['No/Primary', 'Secondary', 'Post-sec / Short tertiary', 'University+']

fig = make_subplots(rows=1, cols=3, subplot_titles=("2002", "2012", "2022"))

for col_no, frame in enumerate((df_2002, df_2012, df_2022), start=1):
    complete = frame.dropna(subset=["educ_4_label", "sex"])

    for gender in ("Female", "Male"):
        levels = complete.loc[complete["sex"] == gender, "educ_4_label"]
        # Share of this gender's respondents at each level; absent levels -> 0.
        share = levels.value_counts(normalize=True).mul(100).reindex(educ_order, fill_value=0)

        fig.add_trace(
            go.Bar(
                name=gender,
                x=share.index,
                y=share.values,
                text=[f"{pct:.1f}%" for pct in share.values],
                textposition="outside",
                marker_color=gender_color_map[gender],
                opacity=0.7,
                showlegend=(col_no == 1),
            ),
            row=1, col=col_no,
        )

fig.update_layout(
    title='Education Level Distribution by Gender Across Years',
    barmode='group',
    height=500,
    width=1800,
)
fig.update_xaxes(tickangle=45, title_text="Education Level")
fig.update_yaxes(title_text="Percentage")
fig.show()

In [120]:
# Overall education-level distribution, per year.
educ_order = ['No/Primary', 'Secondary', 'Post-sec / Short tertiary', 'University+']

fig = make_subplots(rows=1, cols=3, subplot_titles=("2002", "2012", "2022"))

for col_no, frame in enumerate((df_2002, df_2012, df_2022), start=1):
    # Missing labels are ignored by value_counts; absent levels are shown as 0.
    share = (
        frame['educ_4_label']
        .value_counts(normalize=True)
        .mul(100)
        .reindex(educ_order, fill_value=0)
    )

    fig.add_trace(
        go.Bar(
            x=share.index,
            y=share.values,
            text=[f"{pct:.1f}%" for pct in share.values],
            textposition="outside",
            marker_color="#4c72b0",
        ),
        row=1, col=col_no,
    )

fig.update_layout(
    title='Education Level Distribution Across Years',
    height=500,
    width=1800,
    showlegend=False,
)
fig.update_xaxes(tickangle=45, title_text="Education Level")
fig.update_yaxes(title_text="Percentage")
fig.show()

In [121]:
# Box plots of age for each education level, per year.
educ_order = ['No/Primary', 'Secondary', 'Post-sec / Short tertiary', 'University+']

fig = make_subplots(rows=1, cols=3, subplot_titles=("2002", "2012", "2022"))

for col_no, (year, frame) in enumerate(
    zip(("2002", "2012", "2022"), (df_2002, df_2012, df_2022)), start=1
):
    # Ordered categorical + sort so the boxes appear in educ_order.
    ordered = (
        frame
        .dropna(subset=["age", "educ_4_label"])
        .assign(
            educ_4_label=lambda d: pd.Categorical(
                d['educ_4_label'], categories=educ_order, ordered=True
            )
        )
        .sort_values('educ_4_label')
    )

    boxes = go.Box(
        x=ordered['educ_4_label'],
        y=ordered['age'],
        name=year,
        marker_color='#4c72b0',
        showlegend=False,
    )
    fig.add_trace(boxes, row=1, col=col_no)

fig.update_layout(
    title='Age Distribution by Education Level Across Years',
    height=600,
    width=2200,
)
fig.update_xaxes(title_text="Education Level")
fig.update_yaxes(title_text="Age")
fig.show()

In [122]:

# Box plots of age per education level; rows = gender, columns = year.
fig = make_subplots(rows=2, cols=3, subplot_titles=(
    "2002 - Female", "2012 - Female", "2022 - Female",
    "2002 - Male", "2012 - Male", "2022 - Male"
), vertical_spacing=0.15)

educ_order = ['No/Primary', 'Secondary', 'Post-sec / Short tertiary', 'University+']
gender_colors = {"Female": "#ff69b4", "Male": "#4169e1"}

for gender_idx, gender in enumerate(['Female', 'Male'], start=1):
    for col_idx, (data, year) in enumerate([(df_2002, "2002"), (df_2012, "2012"), (df_2022, "2022")], start=1):
        clean = data.dropna(subset=["age", "educ_4_label", "sex"])
        # Filter first, then build the ordered categorical via `assign`, so no
        # column is written into a filtered slice (avoids SettingWithCopyWarning).
        clean = (
            clean[clean['sex'] == gender]
            .assign(
                educ_4_label=lambda d: pd.Categorical(
                    d['educ_4_label'], categories=educ_order, ordered=True
                )
            )
            .sort_values('educ_4_label')
        )

        fig.add_trace(
            go.Box(
                y=clean['age'],
                x=clean['educ_4_label'],
                name=gender,
                marker_color=gender_colors[gender],
                # One legend entry per gender: the first column of each row.
                # (Restricting this to the first row only left Male without
                # any legend entry.)
                showlegend=(col_idx == 1),
                hovertemplate=f"{gender}<br>Education=%{{x}}<br>Age=%{{y}}<extra>{year}</extra>"
            ),
            row=gender_idx, col=col_idx
        )

fig.update_layout(
    height=1500, width=2200,
    title='Age Distribution by Education Level and Gender Across Years',
)
fig.update_xaxes(title_text="Education Level", tickangle=45)
fig.update_yaxes(title_text="Age")
fig.show()

In [123]:
# Age-group mix within each education level (stacked percentages), per year.
age_labels = ["18-25", "26-35", "36-45", "46-55", "56-65", "66-75", "75+"]
educ_order = ['No/Primary', 'Secondary', 'Post-sec / Short tertiary', 'University+']
age_colors = dict(
    zip(
        age_labels,
        ['#4c78a8', '#f28e2b', '#59a14f', '#e15759', '#b07aa1', '#9c755f', '#edc948'],
    )
)

fig = make_subplots(rows=1, cols=3, subplot_titles=("2002", "2012", "2022"))

for col_no, frame in enumerate((df_2002, df_2012, df_2022), start=1):
    # Both grouping keys become ordered categoricals with fixed category lists.
    complete = frame.dropna(subset=["age_bin", "educ_4_label"]).assign(
        age_bin=lambda d: pd.Categorical(d['age_bin'], categories=age_labels, ordered=True),
        educ_4_label=lambda d: pd.Categorical(
            d['educ_4_label'], categories=educ_order, ordered=True
        ),
    )

    # Rows = education level, columns = age group; rows normalised to fractions.
    counts = complete.groupby(['educ_4_label', 'age_bin']).size().unstack(fill_value=0)
    row_frac = counts.div(counts.sum(axis=1), axis=0).reindex(educ_order)

    for age_group in age_labels:
        fig.add_trace(
            go.Bar(
                x=row_frac.index,
                y=row_frac[age_group] * 100,  # fraction -> percentage
                name=age_group,
                marker_color=age_colors[age_group],
                showlegend=(col_no == 1),
            ),
            row=1, col=col_no,
        )

fig.update_layout(
    title='Age Group Distribution within Education Levels Across Years',
    legend_title_text='Age Group',
    barmode='stack',
    height=550,
    width=2000,
)
fig.update_xaxes(tickangle=45, title_text="Education Level")
fig.update_yaxes(range=[0, 100], title_text="Percentage")
fig.show()









### TOPBOT: self-rated social status on a scale from 1 (lowest status) to 10 (highest status)

In [124]:
## This variable was encoded in the opposite direction for Norway, hence we reverse it
_TOPBOT_REVERSED_FLAG = 'topbot_norway_reversed'


def _reverse_norway_topbot(frame):
    """Return a copy of `frame` with TOPBOT replaced by `11 - TOPBOT` where COUNTRY == 'NO'.

    Missing TOPBOT values are left untouched. The result is tagged in
    `DataFrame.attrs`, and an already-tagged frame is returned unchanged, so
    re-running this cell does not flip the scale back. Working on a copy also
    avoids writing into a frame that was produced by boolean filtering.
    """
    if frame.attrs.get(_TOPBOT_REVERSED_FLAG):
        return frame
    reversed_frame = frame.copy()
    to_flip = (reversed_frame['COUNTRY'] == 'NO') & reversed_frame['TOPBOT'].notna()
    reversed_frame.loc[to_flip, 'TOPBOT'] = 11 - reversed_frame.loc[to_flip, 'TOPBOT']
    reversed_frame.attrs[_TOPBOT_REVERSED_FLAG] = True
    return reversed_frame


df_2002, df_2012, df_2022 = (
    _reverse_norway_topbot(frame) for frame in (df_2002, df_2012, df_2022)
)


In [125]:
# Percentage histogram of TOPBOT per year, with the mean marked by a dashed line.
fig = make_subplots(rows=1, cols=3, subplot_titles=("2002", "2012", "2022"))

# One bin per integer score: edges at 0.5, 1.5, ..., 10.5.
bin_config = {"start": 0.5, "end": 10.5, "size": 1}
color_main = "#4c72b0"
marker_mean = {"color": "#e15759", "width": 2, "dash": "dash"}

for col_no, (year, frame) in enumerate(
    zip(("2002", "2012", "2022"), (df_2002, df_2012, df_2022)), start=1
):
    scores = frame.dropna(subset=["TOPBOT"])["TOPBOT"]
    mean_val = scores.mean()
    x_axis = f"x{col_no}"  # x-axis of this panel; y is given in paper coordinates

    histogram = go.Histogram(
        x=scores,
        xbins=bin_config,
        histnorm="percent",
        texttemplate="%{y:.1f}%",
        hovertemplate="TOPBOT=%{x}<br>%{y:.1f}% of respondents<extra>" + year + "</extra>",
        marker_color=color_main,
        opacity=0.75,
        showlegend=False,
    )
    fig.add_trace(histogram, row=1, col=col_no)

    # Full-height vertical line at the mean, plus a label just above the plot.
    fig.add_shape(
        type="line",
        xref=x_axis, yref="paper",
        x0=mean_val, x1=mean_val, y0=0, y1=1,
        line=marker_mean,
    )
    fig.add_annotation(
        xref=x_axis, yref="paper",
        x=mean_val, y=1.05,
        text=f"Mean {mean_val:.2f}",
        font=dict(color=marker_mean["color"]),
        showarrow=False,
    )

fig.update_layout(
    title="TOPBOT distribution",
    height=700,
    width=2000,
    bargap=0.05,
    margin=dict(t=60, b=50),
)
fig.update_xaxes(dtick=1, title_text="TOPBOT score")
fig.update_yaxes(range=[0, 100], title_text="Percentage of respondents")
fig.show()


In [126]:
# Box plots of the TOPBOT score for each education level, per year.
educ_order = ['No/Primary', 'Secondary', 'Post-sec / Short tertiary', 'University+']
educ_colors = dict(zip(educ_order, ['#4c78a8', '#f28e2b', '#59a14f', '#b07aa1']))

fig = make_subplots(rows=1, cols=3, subplot_titles=("2002", "2012", "2022"))

for col_no, (year, frame) in enumerate(
    zip(("2002", "2012", "2022"), (df_2002, df_2012, df_2022)), start=1
):
    complete = frame.dropna(subset=["TOPBOT", "educ_4_label"]).assign(
        educ_4_label=lambda d: pd.Categorical(
            d['educ_4_label'], categories=educ_order, ordered=True
        )
    )

    for level in educ_order:
        level_rows = complete[complete['educ_4_label'] == level]
        if level_rows.empty:
            # No respondents at this level in this year: no box.
            continue
        box = go.Box(
            y=level_rows['TOPBOT'],
            name=level,
            boxmean=True,  # also draw the mean inside each box
            marker_color=educ_colors[level],
            showlegend=(col_no == 1),
            hovertemplate=f"Education={level}<br>TOPBOT=%{{y}}<extra>{year}</extra>",
        )
        fig.add_trace(box, row=1, col=col_no)

fig.update_layout(
    title="Social Status score distribution by education level",
    legend_title_text='Education level',
    boxmode='group',
    height=600,
    width=2000,
    margin=dict(t=60, b=80),
)
fig.update_yaxes(range=[0, 10], title_text="Social Status score")
fig.show()


In [127]:
# Horizontal bars of the mean TOPBOT score per country (sorted by mean), per year.
bar_color = "#4c72b0"

fig = make_subplots(rows=1, cols=3, subplot_titles=("2002", "2012", "2022"))

for col_no, (year, frame) in enumerate(
    zip(("2002", "2012", "2022"), (df_2002, df_2012, df_2022)), start=1
):
    country_mean = (
        frame
        .dropna(subset=["TOPBOT", "COUNTRY"])
        .groupby('COUNTRY')['TOPBOT']
        .mean()
        .sort_values()
    )

    bars = go.Bar(
        orientation='h',
        x=country_mean.values,
        y=country_mean.index,
        text=[f"{score:.2f}" for score in country_mean.values],
        textposition='outside',
        hovertemplate="Country=%{y}<br>Mean TOPBOT=%{x:.2f}<extra>" + year + "</extra>",
        marker_color=bar_color,
        showlegend=False,
    )
    fig.add_trace(bars, row=1, col=col_no)

fig.update_layout(
    title="Mean TOPBOT by country",
    height=900,
    width=2600,
    bargap=0.2,
    margin=dict(t=60, b=40, r=120),
)
fig.update_xaxes(range=[0, 10], title_text="Mean TOPBOT")
fig.update_yaxes(title_text="Country")
fig.show()


### Determination of the Number of Factors

The number of factors to retain was determined using two commonly used criteria:

- **Kaiser Criterion**:  
  According to the Kaiser criterion, only factors with eigenvalues greater than 1 should be retained. In our analysis, two factors had eigenvalues exceeding 1.

- **Scree Plot Inspection**:  
  Visual inspection of the scree plot shows a clear elbow at the second factor, after which the eigenvalues level off. 

Based on both the Kaiser criterion and the scree plot, **two latent dimensions** are retained for the data.
