In [1]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np

In [2]:
df_2002 = pd.read_csv('../data/clean_csv/2002.csv')
df_2012 = pd.read_csv('../data/clean_csv/2012.csv')
df_2022 = pd.read_csv('../data/clean_csv/2022.csv')

In [3]:
cols = ['urban_rural', 'TOPBOT', 'spouse_work_status', 'SPWRKHRS', 'sex', 'C_ALPHAN', 'LIVWOMAR', 'WWYKS', 'WWYKUS', 'SP_DEGREE', 'MOMORFAF', 'MEWH', 'HHTODD', 'HHCHILDR', 'HHADULT', 'FAM_DIF', 'code_higher_income', 'SHARE_HH', 'HW_FULFIL', 'WO_WANT', 'WW_FAM_SUFFER', 'WW_CHILD_SUFFER', 'WW_WARM', 'DIV_HH_COOK', 'DIV_HH_CLEAN', 'DIV_HH_GROC', 'DIV_HH_CARE', 'DIV_HH_LAUND', 'code_income_control', 'SP_HH_FAM', 'SP_HH', 'LIFE_HAP', 'DIFF_CONC_WORK', 'HH_TIRED', 'HH_FAM', 'WORK_TIRED', 'hh_work_hrs', 'HH_WEEKEND', 'work_status', 'COHAB', 'marital', 'HOMPOP', 'wrk_hrs', 'highest_education', 'COUNTRY', 'age', 'CASEID']


remove_missing = [i for i in df_2002.columns if 'egal' in i]

# Remove rows where the first 6 'egal' columns are all null
print(f"2002 - Before removing rows with all null in first 6 'egal' columns: {len(df_2002)} rows")
df_2002 = df_2002[~df_2002[remove_missing[:6]].isnull().any(axis=1)]
print(f"2002 - After: {len(df_2002)} rows")

print(f"\n2012 - Before removing rows with all null in first 6 'egal' columns: {len(df_2012)} rows")
remove_missing_2012 = [i for i in df_2012.columns if 'egal' in i]
df_2012 = df_2012[~df_2012[remove_missing_2012[:6]].isnull().any(axis=1)]
print(f"2012 - After: {len(df_2012)} rows")

print(f"\n2022 - Before removing rows with all null in first 6 'egal' columns: {len(df_2022)} rows")
remove_missing_2022 = [i for i in df_2022.columns if 'egal' in i]
df_2022 = df_2022[~df_2022[remove_missing_2022[:6]].isnull().any(axis=1)]
print(f"2022 - After: {len(df_2022)} rows")




df_2002.drop(columns=[i for i in df_2002.columns if 'egal' in i], inplace=True)
df_2002.drop(columns=['work_status', 'spouse_work_status'], inplace=True)
# print(set(cols).difference(set(df_2002.columns)))
# print((set(df_2002.columns)).difference(set(cols)))

df_2012.drop(columns=[i for i in df_2012.columns if 'egal' in i], inplace=True)
df_2012.drop(columns=[ 'work_status', 'spouse_work_status'], inplace=True)
# print(set(cols).difference(set(df_2012.columns)))
# print((set(df_2012.columns)).difference(set(cols)))

df_2022.drop(columns=[i for i in df_2022.columns if 'egal' in i], inplace=True)
df_2022.drop(columns=[ 'work_status', 'spouse_work_status'], inplace=True)
# print(set(cols).difference(set(df_2022.columns)))
# print((set(df_2022.columns)).difference(set(cols)))


df_2002_married = df_2002[df_2002['marital'] == 'Married']
df_2012_married = df_2012[df_2012['marital'] == 'Married']
df_2022_married = df_2022[df_2022['marital'] == 'Married']

df_2002_married = df_2002_married[df_2002_married['age']>=18]
df_2012_married = df_2012_married[df_2012_married['age']>=18]
df_2022_married = df_2022_married[df_2022_married['age']>=18]

df_2002_married = df_2002_married[(df_2002_married['sex']== 'Male') | (df_2002_married['sex']== 'Female')]
df_2012_married = df_2012_married[(df_2012_married['sex']== 'Male') | (df_2012_married['sex']== 'Female')]
df_2022_married = df_2022_married[(df_2022_married['sex']== 'Male') | (df_2022_married['sex']== 'Female')]

df_2002_married['TOPBOT'] = df_2002_married[['TOPBOT', 'C_ALPHAN']].apply(lambda x: 11-x[0] if x[1] == 'NO' else x[0], axis=1)
df_2012_married['TOPBOT'] = df_2012_married[['TOPBOT', 'C_ALPHAN']].apply(lambda x: 11-x[0] if x[1] == 'NO' else x[0], axis=1)
df_2022_married['TOPBOT'] = df_2022_married[['TOPBOT', 'C_ALPHAN']].apply(lambda x: 11-x[0] if x[1] == 'NO' else x[0], axis=1)

df_2002_married['code_income_control'] = df_2002_married['code_income_control'].apply(lambda x: x.split(' ')[-1] if pd.notna(x) else x)
df_2012_married['code_income_control'] = df_2012_married['code_income_control'].apply(lambda x: x.split(' ')[-1] if pd.notna(x) else x)
df_2022_married['code_income_control'] = df_2022_married['code_income_control'].apply(lambda x: x.split(' ')[-1] if pd.notna(x) else x)


def normalize_column(series):
    """
    Normalize a numeric series to 0-1 range using min-max scaling.
    """
    return (series - series.min()) / (series.max() - series.min())


nan_pct = df_2002_married.isnull().sum(axis=1) / len(df_2002_married.columns)
df_2002_married = df_2002_married[nan_pct < 0.6].copy()

nan_pct = df_2012_married.isnull().sum(axis=1) / len(df_2012_married.columns)
df_2012_married = df_2012_married[nan_pct < 0.6].copy()

nan_pct = df_2022_married.isnull().sum(axis=1) / len(df_2022_married.columns)
df_2022_married = df_2022_married[nan_pct < 0.6].copy()

# Apply to eg_score columns
df_2002_married['eg_score_norm'] = normalize_column(df_2002_married['eg_score'])
df_2012_married['eg_score_norm'] = normalize_column(df_2012_married['eg_score'])
df_2022_married['eg_score_norm'] = normalize_column(df_2022_married['eg_score'])

2002 - Before removing rows with all null in first 6 'egal' columns: 46638 rows
2002 - After: 40395 rows

2012 - Before removing rows with all null in first 6 'egal' columns: 61754 rows
2012 - After: 52736 rows

2022 - Before removing rows with all null in first 6 'egal' columns: 45762 rows
2022 - After: 40768 rows


  df_2002_married['TOPBOT'] = df_2002_married[['TOPBOT', 'C_ALPHAN']].apply(lambda x: 11-x[0] if x[1] == 'NO' else x[0], axis=1)
  df_2012_married['TOPBOT'] = df_2012_married[['TOPBOT', 'C_ALPHAN']].apply(lambda x: 11-x[0] if x[1] == 'NO' else x[0], axis=1)
  df_2022_married['TOPBOT'] = df_2022_married[['TOPBOT', 'C_ALPHAN']].apply(lambda x: 11-x[0] if x[1] == 'NO' else x[0], axis=1)


In [4]:
df_2002_married.shape, df_2012_married.shape, df_2022_married.shape

((23734, 55), (29719, 55), (20510, 55))

In [5]:
# Combine datasets with year labels
df_2002_married['year'] = 2002
df_2012_married['year'] = 2012
df_2022_married['year'] = 2022

df_combined = pd.concat([df_2002_married, df_2012_married, df_2022_married], ignore_index=True)

In [6]:
df_combined['CASEID'].nunique(), df_combined.shape

(73963, (73963, 56))

<!-- ## 1. Temporal Evolution of Financial Control by Gender -->

In [11]:
# # Income control over time by gender
# income_control_temporal = df_combined.groupby(['year', 'sex'])['code_income_control'].value_counts(normalize=True).reset_index()
# income_control_temporal.columns = ['year', 'sex', 'income_control', 'proportion']

# # Create grouped bar chart
# fig1 = px.bar(income_control_temporal, 
#               x='year', 
#               y='proportion', 
#               color='income_control',
#               facet_col='sex',
#               barmode='group',
#               title='Financial Control Distribution by Gender Across Time',
#               labels={'proportion': 'Proportion', 'year': 'Year', 'income_control': 'Income Control'},
#               color_discrete_sequence=px.colors.qualitative.Set2,
#               height=500)

# fig1.update_layout(
#     font=dict(size=12, family='Arial'),
#     title_font_size=16,
#     plot_bgcolor='white',
#     paper_bgcolor='white',
#     legend_title_text='Income Control'
# )

# fig1.update_xaxes(showgrid=True, gridwidth=1, gridcolor='lightgray')
# fig1.update_yaxes(showgrid=True, gridwidth=1, gridcolor='lightgray')

# fig1.show()

## 1. Gender Distribution by Financial Control Type

In [7]:
# Calculate overall gender distribution by year
overall_gender = df_combined.groupby(['year', 'sex']).size().reset_index(name='total_count')
overall_gender['overall_percentage'] = overall_gender.groupby('year')['total_count'].transform(lambda x: x / x.sum() * 100)

# Calculate gender distribution for each income control type by year
control_gender = df_combined.groupby(['year', 'code_income_control', 'sex']).size().reset_index(name='count')
control_gender['percentage'] = control_gender.groupby(['year', 'code_income_control'])['count'].transform(lambda x: x / x.sum() * 100)

# Merge with overall gender distribution
control_gender = control_gender.merge(overall_gender[['year', 'sex', 'overall_percentage']], on=['year', 'sex'])

# Calculate normalized percentage (actual percentage divided by expected percentage)
control_gender['normalized_percentage'] = (control_gender['percentage'] / control_gender['overall_percentage']) * 100

# Create stacked bar chart with normalized values
fig2 = px.bar(control_gender, 
              x='code_income_control', 
              y='normalized_percentage',
              color='sex',
              facet_col='year',
              title='Gender Distribution Across Financial Control Types by Year (Normalized)<br><sub>Values > 100 indicate over-representation, < 100 under-representation</sub>',
              labels={'normalized_percentage': 'Normalized Percentage (100 = expected)', 'code_income_control': 'Financial Control Type', 'sex': 'Gender'},
              color_discrete_map={'Female': '#E377C2', 'Male': '#1F77B4'},
              text='normalized_percentage',
              height=500)

fig2.update_traces(texttemplate='%{text:.1f}', textposition='inside')

In [8]:
# Calculate overall gender distribution by year
overall_gender = df_combined.groupby(['year', 'sex']).size().reset_index(name='total_count')
overall_gender['overall_percentage'] = overall_gender.groupby('year')['total_count'].transform(lambda x: x / x.sum() * 100)

# Calculate overall distribution of each control type by year (baseline)
control_baseline = df_combined.groupby(['year', 'code_income_control']).size().reset_index(name='baseline_count')
control_baseline['baseline_percentage'] = control_baseline.groupby('year')['baseline_count'].transform(lambda x: x / x.sum() * 100)

# Calculate gender distribution for each income control type by year
control_gender = df_combined.groupby(['year', 'code_income_control', 'sex']).size().reset_index(name='count')
control_gender['percentage'] = control_gender.groupby(['year', 'code_income_control'])['count'].transform(lambda x: x / x.sum() * 100)

# Merge with overall gender distribution
control_gender = control_gender.merge(overall_gender[['year', 'sex', 'overall_percentage']], on=['year', 'sex'])

# Merge with baseline control type distribution
control_gender = control_gender.merge(control_baseline[['year', 'code_income_control', 'baseline_percentage']], on=['year', 'code_income_control'])

# Calculate double-normalized percentage: 
# First normalize by expected gender distribution, then weight by control type prevalence
control_gender['normalized_percentage'] = (control_gender['percentage'] / control_gender['overall_percentage']) * 100

# Alternative: Calculate expected count and compare to actual
control_gender['expected_count'] = (control_gender['overall_percentage'] / 100) * control_gender.groupby(['year', 'code_income_control'])['count'].transform('sum')
control_gender['representation_index'] = (control_gender['count'] / control_gender['expected_count']) * 100

# Create visualization with representation index
fig2 = px.bar(control_gender, 
              x='code_income_control', 
              y='representation_index',
              color='sex',
              facet_col='year',
              barmode='group',  # Changed to 'group' to show male/female side-by-side
              title='Gender Representation in Financial Control Types (Normalized)<br><sub>100 = proportional representation; >100 = over-represented; <100 = under-represented</sub>',
              labels={'representation_index': 'Representation Index (100 = expected)', 
                      'code_income_control': 'Financial Control Type', 
                      'sex': 'Gender'},
              color_discrete_map={'Female': '#E377C2', 'Male': '#1F77B4'},
              text='representation_index',
              height=500)

fig2.update_traces(texttemplate='%{text:.1f}', textposition='outside')

fig2.update_layout(
    font=dict(size=12, family='Arial'),
    title_font_size=16,
    plot_bgcolor='white',
    paper_bgcolor='white',
    legend_title_text='Gender'
)

fig2.update_xaxes(showgrid=True, gridwidth=1, gridcolor='lightgray', tickangle=-45)
fig2.update_yaxes(showgrid=True, gridwidth=1, gridcolor='lightgray')

# Add horizontal line at 100 (expected representation)
fig2.add_hline(y=100, line_dash="dash", line_color="gray", opacity=0.5)

fig2.show()

# Optional: Print summary statistics
print("\nRepresentation Index Summary:")
print(control_gender.groupby(['year', 'code_income_control', 'sex'])['representation_index'].first().unstack())


Representation Index Summary:
sex                           Female        Male
year code_income_control                        
2002 partner               77.988350  127.010073
     respondent           112.868952   84.208756
     separate             108.830975   89.163680
     shared               100.285939   99.649130
2012 partner               87.757647  113.044726
     respondent           110.142544   89.192706
     separate             101.290502   98.624918
     shared                99.009744  101.055158
2022 partner               75.506591  127.057112
     respondent           113.467732   85.122613
     separate             106.242558   93.104039
     shared                99.933687  100.073254


In [9]:
# Calculate overall education distribution by year and gender (baseline)
overall_education = df_combined.groupby(['year', 'sex', 'educ_4_label']).size().reset_index(name='total_count')
overall_education['overall_percentage'] = overall_education.groupby(['year', 'sex'])['total_count'].transform(lambda x: x / x.sum() * 100)

# Calculate education distribution for each TOPBOT level by year and gender
topbot_education = df_combined.groupby(['year', 'TOPBOT', 'sex', 'educ_4_label']).size().reset_index(name='count')
topbot_education['percentage'] = topbot_education.groupby(['year', 'TOPBOT', 'sex'])['count'].transform(lambda x: x / x.sum() * 100)

# Merge with overall education distribution
topbot_education = topbot_education.merge(overall_education[['year', 'sex', 'educ_4_label', 'overall_percentage']], 
                                          on=['year', 'sex', 'educ_4_label'])

# Calculate representation index
topbot_education['expected_count'] = (topbot_education['overall_percentage'] / 100) * topbot_education.groupby(['year', 'TOPBOT', 'sex'])['count'].transform('sum')
topbot_education['representation_index'] = (topbot_education['count'] / topbot_education['expected_count']) * 100

# Create visualization with representation index
fig_control_age_gender = px.bar(topbot_education,
                                 x='TOPBOT',
                                 y='representation_index',
                                 color='educ_4_label',
                                 facet_col='sex',
                                 facet_row='year',
                                 title='Education Level Representation by TOPBOT and Gender (Normalized)<br><sub>100 = expected; >100 = over-represented; <100 = under-represented</sub>',
                                 labels={'representation_index': 'Representation Index (100 = expected)',
                                         'TOPBOT': 'TOPBOT',
                                         'educ_4_label': 'Education Level'},
                                 color_discrete_sequence=px.colors.qualitative.Set1,
                                 height=900)

fig_control_age_gender.update_layout(
    font=dict(size=11, family='Arial'),
    title_font_size=16,
    plot_bgcolor='white',
    paper_bgcolor='white',
    legend_title_text='Education Level',
    barmode='group'
)

fig_control_age_gender.update_xaxes(showgrid=True, gridwidth=1, gridcolor='lightgray', tickangle=-45)
fig_control_age_gender.update_yaxes(showgrid=True, gridwidth=1, gridcolor='lightgray')

# Add horizontal line at 100 (expected representation)
fig_control_age_gender.add_hline(y=100, line_dash="dash", line_color="gray", opacity=0.5)

fig_control_age_gender.show()

## 3. Temporal Evolution of Highest education by Gender

In [15]:
# Pie charts showing education distribution by gender for each year
from plotly.subplots import make_subplots

# Prepare data
edu_pie_data = df_combined.groupby(['year', 'sex', 'educ_4_label']).size().reset_index(name='count')
edu_pie_total = df_combined.groupby(['year', 'sex']).size().reset_index(name='total')
edu_pie_data = edu_pie_data.merge(edu_pie_total, on=['year', 'sex'])
edu_pie_data['proportion'] = edu_pie_data['count'] / edu_pie_data['total']

# Create subplots - 2 rows (Male/Female) x 3 columns (Years)
fig_pie = make_subplots(
    rows=2, cols=3,
    specs=[[{'type': 'pie'}, {'type': 'pie'}, {'type': 'pie'}],
           [{'type': 'pie'}, {'type': 'pie'}, {'type': 'pie'}]],
    subplot_titles=('Male - 2002', 'Male - 2012', 'Male - 2022',
                    'Female - 2002', 'Female - 2012', 'Female - 2022'),
    vertical_spacing=0.15,
    horizontal_spacing=0.05
)

# Define colors for education levels (consistent across all pie charts)
edu_colors = {
    'No degree': '#d62728',
    'Lower secondary': '#ff7f0e',
    'Upper secondary': '#2ca02c',
    'University': '#1f77b4'
}

# Add pie charts
for row_idx, gender in enumerate(['Male', 'Female'], start=1):
    for col_idx, year in enumerate([2002, 2012, 2022], start=1):
        data_subset = edu_pie_data[(edu_pie_data['sex'] == gender) & (edu_pie_data['year'] == year)]
        
        # Sort by education level for consistency
        data_subset = data_subset.sort_values('educ_4_label')
        
        colors = [edu_colors.get(edu, '#gray') for edu in data_subset['educ_4_label']]
        
        fig_pie.add_trace(
            go.Pie(
                labels=data_subset['educ_4_label'],
                values=data_subset['count'],
                marker=dict(colors=colors),
                textposition='inside',
                textinfo='percent',
                textfont=dict(size=14, color='white'),
                insidetextorientation='radial',
                hovertemplate='<b>%{label}</b><br>Percentage: %{percent}<br>Count: %{value}<extra></extra>',
                showlegend=(row_idx == 1 and col_idx == 1)  # Show legend only once
            ),
            row=row_idx, col=col_idx
        )

fig_pie.update_layout(
    title_text='Education Level Distribution by Gender and Year',
    title_font_size=20,
    font=dict(size=13, family='Arial'),
    height=1000,
    showlegend=True,
    legend=dict(
        orientation="h",
        yanchor="top",
        y=-0.05,
        xanchor="center",
        x=0.5,
        font=dict(size=14)
    )
)

# Update subplot titles font size
for annotation in fig_pie['layout']['annotations']:
    annotation['font'] = dict(size=15)

fig_pie.show()

## EDUCATION LEVEL OF THE PARTNER WITH CONTROL - make different for male and female

In [16]:
# Filter for cases where one person has control (exclude shared and separate)
df_single_control = df_combined[
    (df_combined['code_income_control'] == 'respondent') 
].copy()

# Create a column for the education level of the person with control
df_single_control['controller_education'] = df_single_control.apply(
    lambda row: row['educ_4_label'] if row['code_income_control'] == 'respondent' 
                else row['SP_DEGREE'], 
    axis=1
)

# Create simplified control label
df_single_control['controller'] = df_single_control['code_income_control'].apply(
    lambda x: 'Respondent Controls' if x == 'respondent' else 'Partner Controls'
)

# Remove rows with missing education data
df_single_control = df_single_control[df_single_control['controller_education'].notna()]

# Plot 1: Education distribution of financial controllers across years
controller_edu_year = df_single_control.groupby(['controller_education', 'year']).size().reset_index(name='count')
controller_edu_year_total = df_single_control.groupby('year').size().reset_index(name='total')
controller_edu_year = controller_edu_year.merge(controller_edu_year_total, on='year')
controller_edu_year['proportion'] = controller_edu_year['count'] / controller_edu_year['total']

# Convert year to string for discrete x-axis
controller_edu_year['year'] = controller_edu_year['year'].astype(str)

fig_controller_edu_year = px.bar(controller_edu_year,
                                   x='year',
                                   y='proportion',
                                   color='controller_education',
                                   barmode='stack',
                                   title='Education Level of Person with Financial Control (2002-2022)<br><sub>Excluding Joint and Separate Control Cases</sub>',
                                   labels={'proportion': 'Proportion',
                                           'year': 'Year',
                                           'controller_education': 'Education Level'},
                                   color_discrete_sequence=px.colors.qualitative.Set3,
                                   height=600)

fig_controller_edu_year.update_layout(
    font=dict(size=12, family='Arial'),
    title_font_size=16,
    plot_bgcolor='white',
    paper_bgcolor='white'
)

fig_controller_edu_year.update_xaxes(showgrid=True, gridwidth=1, gridcolor='lightgray', tickangle=45)
fig_controller_edu_year.update_yaxes(showgrid=True, gridwidth=1, gridcolor='lightgray')

fig_controller_edu_year.show()

## Relationship Between Gender Equality Score and Education/Financial Control

In [17]:
# First, let's explore the eg_score_norm distribution and check for missing values
print("EG Score Statistics:")
print(df_combined['eg_score_norm'].describe())
print(f"\nMissing values: {df_combined['eg_score_norm'].isna().sum()}")
print(f"Total records: {len(df_combined)}")

# Create a cleaned dataset for analysis
df_eg_analysis = df_combined[df_combined['eg_score_norm'].notna()].copy()
print(f"\nRecords with valid EG score: {len(df_eg_analysis)}")

EG Score Statistics:
count    76832.000000
mean         0.534666
std          0.200677
min          0.000000
25%          0.388161
50%          0.516857
75%          0.669386
max          1.000000
Name: eg_score_norm, dtype: float64

Missing values: 1606
Total records: 78438

Records with valid EG score: 76832


### 1. EG Score by Education Level

In [18]:
# Calculate mean EG score by education level and year

# print("EG Score by Education Level and Year:")
# print(eg_by_education_year.to_string(index=False))

# # Line plot showing temporal evolution of EG scores by education level
# fig_eg_edu_temporal = px.line(eg_by_education_year,
#                                x='year',
#                                y='mean',
#                                color='educ_4_label',
#                                markers=True,
#                                title='Gender Equality Score Evolution by Education Level (2002-2022)<br><sub>Higher Score = More Equality</sub>',
#                                labels={'mean': 'Mean Gender Equality Score',
#                                        'year': 'Year',
#                                        'educ_4_label': 'Education Level'},
#                                color_discrete_sequence=px.colors.qualitative.Set2,
#                                height=500)

# fig_eg_edu_temporal.update_layout(
#     font=dict(size=12, family='Arial'),
#     title_font_size=16,
#     plot_bgcolor='white',
#     paper_bgcolor='white',
#     legend_title_text='Education Level'
# )

# fig_eg_edu_temporal.update_xaxes(showgrid=True, gridwidth=1, gridcolor='lightgray')
# fig_eg_edu_temporal.update_yaxes(showgrid=True, gridwidth=1, gridcolor='lightgray')

# fig_eg_edu_temporal.show()

In [19]:
# # Bar chart showing mean EG score by education level over time
# eg_by_education_year = df_eg_analysis.groupby(['year', 'educ_4_label'])['eg_score_norm'].agg(['mean', 'median', 'std', 'count']).reset_index()

# fig_eg_edu_bar = px.bar(eg_by_education_year,
#                         x='year',
#                         y='mean',
#                         color='educ_4_label',
#                         barmode='group',
#                         title='Average Gender Equality Score by Education Level Across Time<br><sub>Comparing Educational Groups Over Years</sub>',
#                         labels={'mean': 'Mean Gender Equality Score',
#                                 'year': 'Year',
#                                 'educ_4_label': 'Education Level'},
#                         color_discrete_sequence=px.colors.qualitative.Set2,
#                         text='mean',
#                         height=500)

# fig_eg_edu_bar.update_traces(texttemplate='%{text:.2f}', textposition='outside')

# fig_eg_edu_bar.update_layout(
#     font=dict(size=12, family='Arial'),
#     title_font_size=16,
#     plot_bgcolor='white',
#     paper_bgcolor='white',
#     legend_title_text='Education Level'
# )

# fig_eg_edu_bar.update_xaxes(showgrid=False)
# fig_eg_edu_bar.update_yaxes(showgrid=True, gridwidth=1, gridcolor='lightgray')

# fig_eg_edu_bar.show()

In [20]:
# Calculate overall education distribution by year (baseline)
overall_education_baseline = df_eg_analysis.groupby(['year', 'educ_4_label']).size().reset_index(name='total_count')
overall_education_baseline['overall_percentage'] = overall_education_baseline.groupby('year')['total_count'].transform(lambda x: x / x.sum() * 100)

# For each EG score bin, calculate education distribution
# First, create EG score bins
df_eg_analysis['eg_bin'] = pd.cut(df_eg_analysis['eg_score_norm'], bins=10)

# Calculate education distribution within each EG bin
eg_edu_distribution = df_eg_analysis.groupby(['year', 'eg_bin', 'educ_4_label']).size().reset_index(name='count')

# Merge with baseline
eg_edu_distribution = eg_edu_distribution.merge(
    overall_education_baseline[['year', 'educ_4_label', 'overall_percentage']], 
    on=['year', 'educ_4_label']
)

# Calculate representation index
eg_edu_distribution['expected_count'] = (eg_edu_distribution['overall_percentage'] / 100) * eg_edu_distribution.groupby(['year', 'eg_bin'])['count'].transform('sum')
eg_edu_distribution['representation_index'] = (eg_edu_distribution['count'] / eg_edu_distribution['expected_count']) * 100

# Create box plot with mean representation index
fig_eg_edu_box = px.box(eg_edu_distribution,
                        x='year',
                        y='representation_index',
                        color='educ_4_label',
                        title='Education Level Representation Across EG Score Ranges (Normalized)<br><sub>100 = expected; >100 = over-represented; <100 = under-represented</sub>',
                        labels={'representation_index': 'Representation Index (100 = expected)',
                                'year': 'Year',
                                'educ_4_label': 'Education Level'},
                        color_discrete_sequence=px.colors.qualitative.Set2,
                        height=500)

fig_eg_edu_box.update_layout(
    font=dict(size=12, family='Arial'),
    title_font_size=16,
    plot_bgcolor='white',
    paper_bgcolor='white',
    legend_title_text='Education Level'
)

fig_eg_edu_box.add_hline(y=100, line_dash="dash", line_color="gray", opacity=0.5)

fig_eg_edu_box.update_xaxes(showgrid=False)
fig_eg_edu_box.update_yaxes(showgrid=True, gridwidth=1, gridcolor='lightgray')

fig_eg_edu_box.show()

### 2. EG Score by Financial Control

In [21]:
# # Calculate mean EG score by financial control type and year
# eg_by_control = df_eg_analysis.groupby(['year', 'code_income_control'])['eg_score_norm'].agg(['mean', 'median', 'std', 'count']).reset_index()

# print("EG Score by Financial Control Type and Year:")
# print(eg_by_control.to_string(index=False))

# # Box plot showing distribution of EG scores by financial control over time
# fig_eg_control_box = px.box(df_eg_analysis,
#                              x='code_income_control',
#                              y='eg_score_norm',
#                              color='year',
#                              title='Gender Equality Score Distribution by Financial Control Type and Year<br><sub>Higher Score = More Equality</sub>',
#                              labels={'eg_score_norm': 'Gender Equality Score (Normalized)',
#                                      'code_income_control': 'Financial Control Type',
#                                      'year': 'Year'},
#                              color_discrete_sequence=px.colors.qualitative.Set2,
#                              height=500)

# fig_eg_control_box.update_layout(
#     font=dict(size=12, family='Arial'),
#     title_font_size=16,
#     plot_bgcolor='white',
#     paper_bgcolor='white',
#     legend_title_text='Year'
# )

# fig_eg_control_box.update_xaxes(showgrid=False, tickangle=-45)
# fig_eg_control_box.update_yaxes(showgrid=True, gridwidth=1, gridcolor='lightgray')

# fig_eg_control_box.show()

# do this by divide gender

In [22]:
# # Bar chart showing mean EG score by financial control over time
# fig_eg_control_bar = px.bar(eg_by_control,
#                              x='year',
#                              y='mean',
#                              color='code_income_control',
#                              barmode='group',
#                              title='Average Gender Equality Score by Financial Control Type Across Years<br><sub>Temporal Evolution of Equality Scores</sub>',
#                              labels={'mean': 'Mean Gender Equality Score',
#                                      'year': 'Year',
#                                      'code_income_control': 'Financial Control Type'},
#                              color_discrete_sequence=px.colors.qualitative.Pastel,
#                              text='mean',
#                              height=500)
# fig_eg_control_bar.update_traces(texttemplate='%{text:.2f}', textposition='outside')
# fig_eg_control_bar.update_traces(texttemplate='%{text:.3f}', textposition='outside')

# fig_eg_control_bar.update_layout(
#     font=dict(size=12, family='Arial'),
#     title_font_size=16,
#     plot_bgcolor='white',
#     legend_title_text='Financial Control',
#     showlegend=False
# )

# fig_eg_control_bar.update_xaxes(showgrid=False, tickangle=-45)
# fig_eg_control_bar.update_yaxes(showgrid=True, gridwidth=1, gridcolor='lightgray')

# fig_eg_control_bar.show()

In [23]:
# Create binned version of eg_score_norm
df_combined['eg_score_binned'] = pd.cut(df_combined['eg_score_norm'], 
                                         bins=10, 
                                         labels=[f'{i/10:.1f}-{(i+1)/10:.1f}' for i in range(10)])

# Calculate education distribution by EG score bins, gender, and year
eg_edu_binned = df_combined.groupby(['year', 'eg_score_binned', 'sex', 'educ_4_label']).size().reset_index(name='count')
eg_edu_binned_total = df_combined.groupby(['year', 'eg_score_binned', 'sex']).size().reset_index(name='total')
eg_edu_binned = eg_edu_binned.merge(eg_edu_binned_total, on=['year', 'eg_score_binned', 'sex'])
eg_edu_binned['percentage'] = eg_edu_binned['count'] / eg_edu_binned['total'] * 100

# Stacked bar chart faceted by gender and year
fig_eg_edu_binned = px.bar(eg_edu_binned,
                            x='eg_score_binned',
                            y='percentage',
                            color='educ_4_label',
                            facet_col='sex',
                            facet_row='year',
                            title='Education Level by Gender Equality Score (Binned) and Gender Across Years<br><sub>How Education Relates to Gender Equality Attitudes</sub>',
                            labels={'percentage': 'Percentage (%)',
                                    'eg_score_binned': 'Gender Equality Score Range',
                                    'educ_4_label': 'Education Level'},
                            color_discrete_sequence=px.colors.qualitative.Set2,
                            height=900)

fig_eg_edu_binned.update_layout(
    font=dict(size=11, family='Arial'),
    title_font_size=16,
    plot_bgcolor='white',
    paper_bgcolor='white',
    legend_title_text='Education Level',
    barmode='stack'
)

fig_eg_edu_binned.update_xaxes(showgrid=False, tickangle=-90)
fig_eg_edu_binned.update_yaxes(showgrid=True, gridwidth=1, gridcolor='lightgray')

fig_eg_edu_binned.show()







## Age Relationships: Education and Financial Control

### 1. Education Level Distribution by Age

In [24]:
# Create age groups for better visualization
df_combined['age_group'] = pd.cut(df_combined['age'], 
                                   bins=[18, 30, 40, 50, 60, 70, 100],
                                   labels=['18-30', '31-40', '41-50', '51-60', '61-70', '70+'])

# Calculate education distribution by age group, gender, and year
edu_by_age_gender = df_combined.groupby(['year', 'age_group', 'sex', 'educ_4_label']).size().reset_index(name='count')
edu_by_age_gender_total = df_combined.groupby(['year', 'age_group', 'sex']).size().reset_index(name='total')
edu_by_age_gender = edu_by_age_gender.merge(edu_by_age_gender_total, on=['year', 'age_group', 'sex'])
edu_by_age_gender['percentage'] = edu_by_age_gender['count'] / edu_by_age_gender['total'] * 100

# Create a combined column for x-axis grouping
edu_by_age_gender['age_gender'] = edu_by_age_gender['age_group'].astype(str) + ' - ' + edu_by_age_gender['sex'].str[0]

# Grouped bar chart with gender side-by-side, faceted by year
fig_edu_age = px.bar(edu_by_age_gender,
                     x='age_gender',
                     y='percentage',
                     color='educ_4_label',
                     facet_col='year',
                     barmode='stack',
                     title='Education Level Distribution by Age Group and Gender Across Years<br><sub>Male and Female Side-by-Side Comparison</sub>',
                     labels={'percentage': 'Percentage (%)',
                             'age_gender': 'Age Group - Gender',
                             'educ_4_label': 'Education Level'},
                     color_discrete_sequence=px.colors.qualitative.Set2,
                     height=500)

fig_edu_age.update_layout(
    font=dict(size=12, family='Arial'),
    title_font_size=16,
    plot_bgcolor='white',
    paper_bgcolor='white',
    legend_title_text='Education Level'
)

# Add vertical lines to separate age groups
age_groups_list = ['18-30', '31-40', '41-50', '51-60', '61-70', '70+']
for i in range(len(age_groups_list) - 1):
    # Add line between age groups (after Female of each group)
    fig_edu_age.add_vline(x=i * 2 + 1.5, line_width=2, line_dash="dash", line_color="gray", opacity=0.5)

fig_edu_age.update_xaxes(showgrid=False, tickangle=-90)
fig_edu_age.update_yaxes(showgrid=True, gridwidth=1, gridcolor='lightgray')

fig_edu_age.show()







### 2. Financial Control Distribution by Age

In [25]:
# # Calculate financial control distribution by age group and year
# control_by_age = df_combined.groupby(['year', 'age_group', 'code_income_control']).size().reset_index(name='count')
# control_by_age_total = df_combined.groupby(['year', 'age_group']).size().reset_index(name='total')
# control_by_age = control_by_age.merge(control_by_age_total, on=['year', 'age_group'])
# control_by_age['percentage'] = control_by_age['count'] / control_by_age['total'] * 100

# # Stacked bar chart faceted by year
# fig_control_age = px.bar(control_by_age,
#                          x='age_group',
#                          y='percentage',
#                          color='code_income_control',
#                          facet_col='year',
#                          title='Financial Control Distribution by Age Group Across Years<br><sub>How Control Arrangements Vary with Age Over Time</sub>',
#                          labels={'percentage': 'Percentage (%)',
#                                  'age_group': 'Age Group',
#                                  'code_income_control': 'Financial Control'},
#                          color_discrete_sequence=px.colors.qualitative.Set1,
#                          height=500)

# fig_control_age.update_layout(
#     font=dict(size=12, family='Arial'),
#     title_font_size=16,
#     plot_bgcolor='white',
#     paper_bgcolor='white',
#     legend_title_text='Financial Control',
#     barmode='stack'
# )

# fig_control_age.update_xaxes(showgrid=False)
# fig_control_age.update_yaxes(showgrid=True, gridwidth=1, gridcolor='lightgray')

# fig_control_age.show()

In [26]:
# # Line chart showing how control types evolve with age
# control_age_line = df_combined.groupby(['year', 'age', 'code_income_control']).size().reset_index(name='count')
# control_age_total = df_combined.groupby(['year', 'age']).size().reset_index(name='total')
# control_age_line = control_age_line.merge(control_age_total, on=['year', 'age'])
# control_age_line['percentage'] = control_age_line['count'] / control_age_line['total'] * 100

# # Filter to reasonable age range for smoother visualization
# control_age_line = control_age_line[(control_age_line['age'] >= 18) & (control_age_line['age'] <= 80)]

# fig_control_age_trend = px.line(control_age_line,
#                                  x='age',
#                                  y='percentage',
#                                  color='code_income_control',
#                                  facet_col='year',
#                                  title='Financial Control Patterns Across Age by Year<br><sub>Trends in Control Arrangements by Age Over Time</sub>',
#                                  labels={'percentage': 'Percentage (%)',
#                                          'age': 'Age',
#                                          'code_income_control': 'Financial Control'},
#                                  color_discrete_sequence=px.colors.qualitative.Set1,
#                                  height=500)

# fig_control_age_trend.update_layout(
#     font=dict(size=12, family='Arial'),
#     title_font_size=16,
#     plot_bgcolor='white',
#     paper_bgcolor='white',
#     legend=dict(
#         title='Financial Control',
#         orientation="h",
#         yanchor="top",
#         y=-0.15,
#         xanchor="center",
#         x=0.5
#     )
# )

# fig_control_age_trend.update_xaxes(showgrid=True, gridwidth=1, gridcolor='lightgray')
# fig_control_age_trend.update_yaxes(showgrid=True, gridwidth=1, gridcolor='lightgray')

# fig_control_age_trend.show()

- eg_score_norm updated
- analysis top_bottom

In [27]:
df_combined['TOPBOT'] = df_combined[['TOPBOT', 'C_ALPHAN']].apply(lambda x: 11-x[0] if x[1] == 'NO' else x[0], axis=1)


Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`

