In [1]:
import altair as alt
import pandas as pd
from pathlib import Path

# Route chart data through Altair's "vegafusion" data transformer.
alt.data_transformers.enable("vegafusion")

# Project root: the current working directory, or its parent when the kernel
# was started from inside a folder named "notebooks".
ROOT = Path.cwd().parent if Path.cwd().name == "notebooks" else Path.cwd()

# Locations of the processed input files read by the load step.
DATA_PATH = ROOT.joinpath("data", "processed")
main_data_path = DATA_PATH.joinpath("dashboard_main_data.parquet")
gap_data_path = DATA_PATH.joinpath("dashboard_rep_gap_data.csv")
gender_trend_data_path = DATA_PATH.joinpath("dashboard_gender_trend_data.csv")

# --- 2. Load DataFrames ---
# All three inputs are required by the cells that follow. A missing file is
# reported with a hint and then re-raised, so execution stops at the real
# cause instead of failing later with a NameError on an undefined DataFrame.
try:
    # Load the main dataset
    df_filtered = pd.read_parquet(main_data_path, engine='pyarrow')
    print(f"✅ Loaded 'df_filtered' ({len(df_filtered):,} rows)")

    # Load the gap dataset
    bio_by_year_continent = pd.read_csv(gap_data_path)
    print(f"✅ Loaded 'bio_by_year_continent' ({len(bio_by_year_continent):,} rows)")

    # Load the gender trend dataset
    combined_df = pd.read_csv(gender_trend_data_path)
    print(f"✅ Loaded 'combined_df' for gender trend chart ({len(combined_df):,} rows)")

except FileNotFoundError as e:
    print(f"❌ File not found: {e.filename}")
    print("Please ensure you ran the 'Save Data' cell in your other notebook.")
    raise

else:
    # --- 3. Create df_for_charts (needed by dashboard code) ---
    # Copy of the main dataset with an extra capitalised gender label column;
    # df_filtered itself is left untouched.
    df_for_charts = df_filtered.copy()
    df_for_charts['gender_group_display'] = df_for_charts['gender_group'].str.capitalize()
    print("✅ 'df_for_charts' created.")

✅ Loaded 'df_filtered' (536,909 rows)
✅ Loaded 'bio_by_year_continent' (77 rows)
✅ Loaded 'combined_df' for gender trend chart (229 rows)
✅ 'df_for_charts' created.


In [2]:
# --- Inputs for the 'gender_region_chart' variable ---
# Dropdown bound to the "continent_select" parameter. The options are every
# continent value found in 'combined_df' (alphabetical), with "All" appended
# last; "All" is also the initial selection.
# NOTE(review): assumes 'combined_df' contains rows whose continent is "All";
# otherwise the default selection matches nothing — confirm.
continent_dropdown = alt.binding_select(
    name="🌍 Continent: ",
    options=[
        *sorted(combined_df.loc[combined_df['continent'] != 'All', 'continent'].unique().tolist()),
        "All",
    ],
)
continent_param = alt.param(name="continent_select", value="All", bind=continent_dropdown)

# --- Gender share per year, filtered by the continent dropdown ---
# Fixed colour assignment: domain_gender[i] is drawn in range_gender[i].
domain_gender = ["Male", "Female", "Other (trans/non-binary)"]
range_gender  = ["#1f77b4", "#e377c2", "#2ca02c"]

# Shared base: registers the dropdown parameter, keeps only the rows of the
# selected continent, and defines the encodings used by both layers.
base = (
    alt.Chart(combined_df)
    .add_params(continent_param)
    .transform_filter("datum.continent == continent_select")
    .encode(
        x=alt.X(
            "creation_year:O",
            axis=alt.Axis(ticks=True, domain=False, grid=False, labelAngle=0),
            title=None,
        ),
        # The y axis is fully hidden; values are shown by the text layer instead.
        y=alt.Y(
            "share:Q",
            axis=alt.Axis(domain=False, grid=False, ticks=False, labels=False),
            title=None,
        ),
        color=alt.Color(
            "gender_group:N",
            scale=alt.Scale(domain=domain_gender, range=range_gender),
            title="Gender Group",
        ),
        tooltip=[
            alt.Tooltip("creation_year:O", title="Year"),
            alt.Tooltip("continent:N", title="Continent"),
            alt.Tooltip("gender_group:N", title="Gender"),
            alt.Tooltip("share:Q", title="% Share", format=".1f"),
        ],
    )
)

# --- Layers: line with point markers, plus a value label above each point ---
line = base.mark_line(strokeWidth=3, point=alt.OverlayMarkDef(size=80))
labels = base.encode(
    text=alt.Text("share:Q", format=".1f")
).mark_text(size=11, dy=-8, baseline="bottom", align="center")

# Only the size is set here; the title is attached later by the dashboard code.
gender_region_chart = (line + labels).properties(height=350, width=900)

print("✅ 'gender_region_chart' variable is now ready.")


✅ 'gender_region_chart' variable is now ready.


In [3]:
# Cell 3: Create Timeline Data for Cultural Context
import pandas as pd

# Timeline of cultural/political events used by the context chart.
# NOTE(review): the 'female_share' figures are typed in here, not computed
# from the loaded datasets — confirm they match the data.
timeline_data = pd.DataFrame({
    'year': [2016, 2017, 2019, 2020, 2022, 2024],
    'event': [
        "Clinton Campaign",
        "#MeToo Begins",
        "Peak Progress",
        "Harris VP + COVID",
        "Dobbs Decision",
        "Anti-DEI Backlash",
    ],
    'female_share': [28.0, 29.5, 32.0, 32.5, 33.0, 34.0],
    'description': [
        'First woman nominated by major party',
        'Peak feminist activism starts',
        'Fastest improvement period',
        'Stagnation begins',
        'Reproductive rights rollback',
        'Progress plateaus',
    ],
})

print("✅ Timeline data created for cultural context visualization")

✅ Timeline data created for cultural context visualization


In [7]:
# =========================================================================
# CELL 4: DASHBOARD ASSEMBLY (CORRECTED - Lists for text sections)
# =========================================================================

import altair as alt
import pandas as pd

# Write the exported dashboard into the project root resolved in the first
# cell (ROOT) rather than a machine-specific absolute Windows path, so the
# notebook runs unchanged on other machines and operating systems.
save_directory = ROOT
save_directory.mkdir(parents=True, exist_ok=True)

html_save_path = save_directory / "wikipedia_representation_dashboard_enhanced.html"

# Load intersectional data (odds ratios and birth-cohort comparison) from the
# "intersectional_analysis" subfolder of the processed-data directory.
INTERSECTIONAL_PATH = DATA_PATH / "intersectional_analysis"
odds_df = pd.read_csv(INTERSECTIONAL_PATH / "intersectional_odds_ratios.csv")
cohort_df = pd.read_csv(INTERSECTIONAL_PATH / "cohort_comparison.csv")

print("Building enhanced dashboard...")

# =========================================================
# STYLING CONFIGURATION
# =========================================================
# Colour for each gender group, keyed by the label shown in the charts.
GENDER_COLORS = dict([
    ('Male', '#3b82f6'),
    ('Female', '#ec4899'),
    ('Other (trans/non-binary)', '#10b981'),
])

# Shared palette: accent colour for single-series marks, page background,
# and section background.
ACCENT_COLOR, BG_COLOR, SECTION_BG = '#3b82f6', '#f8fafc', '#ffffff'

def create_text_section(title, body_lines, width=1100, title_size=18, body_size=13, bg_color='#f0f9ff'):
    """Create a styled text section for narrative content.

    The section is a layered Altair chart made of three parts: a rounded,
    semi-transparent background rectangle, a bold title near the top-left
    corner, and the body text below the title.

    Args:
        title: Heading text of the section.
        body_lines: Body text. A list (or tuple) of strings is handed to the
            text encoding as an array, one entry per rendered line; a plain
            string is treated as a single line.
        width: Width of the section in pixels.
        title_size: Font size of the title.
        body_size: Font size of the body text; the line height is
            ``body_size + 4``.
        bg_color: Fill colour of the background rectangle.

    Returns:
        The layered chart (background + title + body).

    The height is at least 110px (the previous fixed value, which fits three
    body lines at the default size) and grows when more lines or a larger
    body font would otherwise run past the background.
    """
    min_height = 110      # original fixed height; kept as the floor
    body_top = 50         # y offset of the first body line
    bottom_padding = 8    # space left under the last body line

    line_height = body_size + 4
    if isinstance(body_lines, (list, tuple)):
        # Normalise tuples to a list so the value is stored as an array.
        body_value = list(body_lines)
        line_count = len(body_value)
    else:
        body_value = body_lines
        line_count = 1
    total_height = max(min_height, body_top + line_count * line_height + bottom_padding)

    # Single dummy row: the rectangle is positioned purely by pixel values.
    data = pd.DataFrame([{'x': 0, 'y': 0}])

    bg = alt.Chart(data).mark_rect(
        color=bg_color, opacity=0.7, cornerRadius=8
    ).encode(
        x=alt.value(0), x2=alt.value(width),
        y=alt.value(0), y2=alt.value(total_height)
    ).properties(width=width, height=total_height)

    title_chart = alt.Chart(pd.DataFrame([{'text': title}])).mark_text(
        align='left', baseline='top', fontSize=title_size,
        fontWeight='bold', color='#1e293b'
    ).encode(
        x=alt.value(25), y=alt.value(20), text='text:N'
    ).properties(width=width, height=total_height)

    # One row whose 'text' cell holds the whole body (a list for multi-line).
    body_chart = alt.Chart(pd.DataFrame([{'text': body_value}])).mark_text(
        align='left', baseline='top', fontSize=body_size,
        color='#475569', lineHeight=line_height
    ).encode(
        x=alt.value(25), y=alt.value(body_top), text='text:N'
    ).properties(width=width, height=total_height)

    return (bg + title_chart + body_chart).properties(width=width, height=total_height)

# Point selection on the display gender label; charts filtered by it follow
# the segment clicked in the gender pie.
gender_selection = alt.selection_point(fields=['gender_group_display'])

# =========================================================
# KPI ROW (UPDATED)
# =========================================================
def _static_kpi_text(text, **mark_kwargs):
    """Return a text chart backed by a single-row DataFrame holding `text`.

    Static captions must not be built on the full dataset: a text mark with
    no aggregate draws one mark per data row, so a caption attached to
    `df_for_charts` would be rendered once for every biography.
    `mark_kwargs` are passed straight to `mark_text`.
    """
    return (
        alt.Chart(pd.DataFrame([{'text': text}]))
        .mark_text(**mark_kwargs)
        .encode(text='text:N')
    )


# Only the live count (KPI 1 value) needs the full, selection-filtered data.
kpi_base = alt.Chart(df_for_charts).transform_filter(gender_selection)

# KPI 1: Total Biographies (count reacts to the gender selection)
kpi1_label = _static_kpi_text(
    'Total Biographies',
    size=14, align='center', dy=-30, color='#64748b', fontWeight='normal'
)
kpi1_value = (
    kpi_base.mark_text(size=52, align='center', fontWeight='bold', dy=5, color='#3b82f6')
    .transform_aggregate(total='count()')
    .transform_calculate(formatted_total='format(datum.total, ",")')
    .encode(text='formatted_total:N')
)
total_biographies_kpi = alt.layer(kpi1_label, kpi1_value).properties(width=220, height=130)

# KPI 2: Intersectional Penalty (UPDATED)
# NOTE(review): the first row of odds_df is taken as the worst case — this
# assumes the CSV is already sorted that way and that odds_ratio is non-zero;
# confirm against the file that produces it.
worst_case = odds_df.iloc[0]
kpi2_label = _static_kpi_text(
    'Intersectional Penalty',
    size=14, align='center', dy=-30, color='#64748b', fontWeight='normal'
)
kpi2_value = _static_kpi_text(
    f'{worst_case["occupation_group"]}: {1/worst_case["odds_ratio"]:.1f}×',
    size=38, align='center', fontWeight='bold', dy=5, color='#ef4444'
)
kpi2_subtext = _static_kpi_text(
    'female disadvantage',
    size=12, align='center', dy=35, color='#64748b', fontStyle='italic'
)
gender_gap_kpi = alt.layer(kpi2_label, kpi2_value, kpi2_subtext).properties(width=300, height=130)

# KPI 3: Pipeline Problem (UPDATED)
# Gap (column 'gap_pp') of the 'Born 1990s-2000s' row in cohort_df; raises
# IndexError if that cohort label is absent.
youngest_gap = cohort_df[cohort_df['cohort'] == 'Born 1990s-2000s']['gap_pp'].values[0]
kpi3_label = _static_kpi_text(
    'Youngest Cohort Gap',
    size=14, align='center', dy=-30, color='#64748b', fontWeight='normal'
)
kpi3_value = _static_kpi_text(
    f'{youngest_gap:.0f}pp',
    size=44, align='center', fontWeight='bold', dy=5, color='#f59e0b'
)
kpi3_subtext = _static_kpi_text(
    '1990s-2000s cohort',
    size=11, align='center', dy=35, color='#64748b', fontStyle='italic'
)
metoo_progress_kpi = alt.layer(kpi3_label, kpi3_value, kpi3_subtext).properties(width=300, height=130)

kpi_row = alt.hconcat(total_biographies_kpi, gender_gap_kpi, metoo_progress_kpi, spacing=80)

# =========================================================
# TIMELINE
# =========================================================
# Base chart: ordinal year axis shared by the line and the event labels.
timeline_base = alt.Chart(timeline_data).encode(
    x=alt.X('year:O', axis=alt.Axis(grid=False, labelAngle=0, labelFontSize=13), title='Year')
)

# Female-share line with large filled points; the tooltip carries the event
# name and its description.
timeline_line = timeline_base.encode(
    y=alt.Y(
        'female_share:Q',
        title='Female Biography Share (%)',
        axis=alt.Axis(grid=True, gridOpacity=0.3),
        scale=alt.Scale(domain=[27, 35]),
    ),
    tooltip=[
        alt.Tooltip('year:O', title='Year'),
        alt.Tooltip('event:N', title='Event'),
        alt.Tooltip('female_share:Q', title='Female Share (%)', format='.1f'),
        alt.Tooltip('description:N', title='Context'),
    ],
).mark_line(
    color='#ec4899',
    strokeWidth=4,
    point=alt.OverlayMarkDef(size=150, filled=True, strokeWidth=3),
)

# Event name printed above each point.
timeline_events = timeline_base.encode(
    y=alt.Y('female_share:Q'),
    text='event:N',
).mark_text(
    color='#1e293b', fontWeight='bold', fontSize=11, dy=-15, baseline='bottom', align='center'
)

# Phase captions. They are placed at fixed pixel coordinates (alt.value), so
# the x/x2/y columns in their data are not used for positioning.
arrow_2017_2019 = alt.Chart(
    pd.DataFrame([{'x': 2017, 'x2': 2019, 'y': 34, 'label': '⬆ Progress'}])
).mark_text(
    color='#10b981', fontWeight='bold', fontSize=16
).encode(text='label:N', x=alt.value(350), y=alt.value(50))

arrow_2020_2024 = alt.Chart(
    pd.DataFrame([{'x': 2020, 'x2': 2024, 'y': 34, 'label': '➡ Stagnation'}])
).mark_text(
    color='#ef4444', fontWeight='bold', fontSize=16
).encode(text='label:N', x=alt.value(750), y=alt.value(50))

timeline_chart = (timeline_line + timeline_events + arrow_2017_2019 + arrow_2020_2024).properties(
    width=1100,
    height=250,
    title=alt.TitleParams(
        "Wikipedia's Gender Gaps Mirror America's Cultural Battles",
        subtitle="Female representation responded to feminist activism (2017-2019), then stalled during backlash (2020-2025)",
        fontSize=18, fontWeight='bold',
        subtitleFontSize=13, subtitleColor='#64748b'
    ),
)

# =========================================================
# NARRATIVES (WITH LISTS!)
# =========================================================
# Each section passes its body as a list of strings, one entry per line.
# NOTE(review): the figures quoted in these texts (1.1M, 715K, 10.5×, 47pp,
# 20×, ...) are written into the strings, not read from the loaded data —
# confirm they still match the datasets when those are refreshed.
intro_narrative = create_text_section(
    title="📊 Wikipedia's Gender Problem: Structural Bias is Measurable",
    body_lines=[
        "Analysis of 1.1M biographies reveals systematic under-representation. Female European military are 10.5× less likely than males to have biographies.",
        "People born 1990s-2000s show 47pp male bias—unchanged from 1970s-80s cohort, disproving the 'pipeline problem' hypothesis.",
        "Click gender segments to explore how representation evolved through #MeToo, elections, and backlash.",
    ],
    bg_color='#fee2e2',
)

gender_system_narrative = create_text_section(
    title="⚖️ The 2:1 Ratio: Structural Misogyny Masquerading as Objectivity",
    body_lines=[
        "Male biographies outnumber female biographies by more than 2:1—a ratio that has barely budged in 10 years. This isn't",
        "accidental. Wikipedia's 'notability' standards favor fields where women were historically excluded (military, sports, politics),",
        "then treat male dominance as proof of greater importance. This is structural misogyny disguised as neutral policy.",
    ],
    bg_color='#fef3c7',
)

yearly_context_narrative = create_text_section(
    title="📈 When Feminism Advances, Wikipedia Responds—Then Stalls",
    body_lines=[
        "Female representation improved fastest during peak #MeToo (2017-2019), gaining 4 percentage points. Progress then",
        "stagnated during the cultural backlash (2020-2025), gaining only 2pp in 6 years. Even Kamala Harris's historic VP win",
        "couldn't reverse the trend—symbolic victories without sustained momentum have limited impact on systemic representation.",
    ],
    bg_color='#dbeafe',
)

pipeline_narrative = create_text_section(
    title="❌ The 'Wait for Generational Change' Argument is Statistically False",
    body_lines=[
        "Analysis of 715K biographies by birth year destroys the 'pipeline problem' excuse. People born 1990s-2000s (came of age during #MeToo)",
        "show 47.4pp male bias—statistically unchanged from 1970s-80s cohort (47.2pp). Progress has plateaued for the youngest generation.",
        "Bias is ongoing and structural, not just historical legacy.",
    ],
    bg_color='#fef2f2',
)

occupation_gap_narrative = create_text_section(
    title="🎯 GAP #1: The 'Notability' Double Standard",
    body_lines=[
        "Military (95% male): Combat exclusion until 2015 created an all-male record. Female European military 10.5× less likely. Wikipedia treats this as 'notability,' not discrimination.",
        "Sports (90% male): No ESPN coverage = no 'reliable sources' = no article. Wikipedia launders media sexism as neutral fact.",
        "Politics (75% male): Record women ran (2018, 2020), yet gap barely moved. Women face higher bars—mirroring 'likability' penalties.",
    ],
    bg_color='#fef3c7',
)

geographic_intro = create_text_section(
    title="🌍 GAP #2: American Exceptionalism Exports American Sexism",
    body_lines=[
        "The US dominates coverage (19.6%), making American cultural biases—about whose lives matter—into global defaults. If the",
        "New York Times doesn't cover a female Indian scientist, she won't meet Wikipedia's notability bar, regardless of her impact in",
        "India. This is cultural imperialism compounding gender bias. Women from underrepresented regions face a 'double gap.'",
    ],
    bg_color='#dbeafe',
)

gap_narrative = create_text_section(
    title="📉 GAP #3: Intersectional Invisibility",
    body_lines=[
        "These geographic gaps compound gender bias. A female African politician needs 20× the 'notability' of a male European politician.",
        "More content hasn't meant more equitable content—because the problem isn't volume, it's values. Women from Asia and Africa face",
        "compounded marginalization: their regions are underrepresented, AND they're women where gender gaps are naturalized by Wikipedia.",
    ],
    bg_color='#fee2e2',
)

intersectional_narrative = create_text_section(
    title="🔗 The Double Bind: When Geography Meets Gender",
    body_lines=[
        "A male American athlete has a 20× better chance of Wikipedia coverage than a female African scientist, even if the scientist",
        "has greater real-world impact. This isn't about individual merit—it's about whose contributions American/Western culture deems",
        "'important enough' to document. Wikipedia doesn't just reflect history; it amplifies whose history gets to exist at all.",
    ],
    bg_color='#fef3c7',
)

conclusion_narrative = create_text_section(
    title="🎯 Challenging Wikipedia's 'Neutral' Misogyny",
    body_lines=[
        "1. Interrogate notability: Stop treating male-dominated history as neutral. Fields where women were barred shouldn't define what's 'notable.'",
        "2. Name the bias: Wikipedia amplifies America's unfinished reckoning with gender inequality and exports it globally.",
        "3. Demand accountability: Until Wikipedia names its complicity in perpetuating patriarchal hierarchies, representation will remain symbolic.",
    ],
    bg_color='#d1fae5',
)

# =========================================================
# GENDER PIE
# =========================================================
# One row per gender group with its count, its share of ALL rows (the
# denominator includes every group, also any that is filtered out of the
# chart below), and a capitalised display label.
gender_totals_df = (
    df_filtered.groupby('gender_group').size().reset_index(name='count')
    .assign(
        percentage=lambda d: (d['count'] / d['count'].sum()) * 100,
        gender_group_display=lambda d: d['gender_group'].str.capitalize(),
    )
)
# Two-element list per row (label, formatted percentage) used as a two-line
# text label on the arcs.
gender_totals_df['multi_line_label'] = gender_totals_df.apply(
    lambda row: [row['gender_group_display'], f"{row['percentage']:.1f}%"], axis=1
)

# Colour scale: domain entries map positionally onto GENDER_COLORS values.
domain = ['Male', 'Female', 'Other (trans/non-binary)']
range_ = [GENDER_COLORS[group] for group in domain]

# Shared encodings; rows labelled 'Unknown' are left out of the chart, and
# unselected segments are dimmed.
base_pie = alt.Chart(
    gender_totals_df.loc[lambda d: d['gender_group'] != 'Unknown']
).encode(
    theta=alt.Theta("count:Q", stack=True),
    color=alt.Color(
        "gender_group_display:N",
        legend=alt.Legend(title="Gender", orient='bottom', titleFontSize=14, labelFontSize=13),
        scale=alt.Scale(domain=domain, range=range_),
    ),
    opacity=alt.condition(gender_selection, alt.value(1), alt.value(0.3)),
)

# Donut arcs own the click selection; the labels sit outside the ring.
pie = base_pie.add_params(gender_selection).mark_arc(
    innerRadius=65, outerRadius=110, stroke='white', strokeWidth=3, cursor='pointer'
)
text_pie = base_pie.encode(text="multi_line_label:N").mark_text(radius=135, size=14, fontWeight='bold')

gender_pie_chart = (pie + text_pie).properties(
    width=500,
    height=450,
    title=alt.TitleParams("The 2:1 Gender Gap: Not a Bug, It's the System", fontSize=18, fontWeight='bold', anchor='middle'),
)

# Caption shown under the pie telling the reader the segments are clickable.
instruction_text = (
    alt.Chart(pd.DataFrame([{
        'text': '💡 Click segments to explore how representation evolved through #MeToo, elections, and backlash'
    }]))
    .mark_text(size=12, color='#64748b', align='center', fontStyle='italic', fontWeight='bold')
    .encode(text='text:N')
    .properties(width=500, height=40)
)

gender_chart_with_instruction = alt.vconcat(gender_pie_chart, instruction_text, spacing=10)

# =========================================================
# YEARLY TREND
# =========================================================
# Biographies per creation year, restricted to the gender picked in the pie.
yearly_base = (
    alt.Chart(df_for_charts)
    .transform_filter(gender_selection)
    .transform_aggregate(total_articles='count()', groupby=['creation_year'])
)

# Layer 1: translucent area with its outline; this layer defines the axes.
yearly_area = yearly_base.encode(
    x=alt.X('creation_year:O', title='Year', axis=alt.Axis(grid=False, labelAngle=0, labelFontSize=12)),
    y=alt.Y('total_articles:Q', title='Number of Biographies', axis=alt.Axis(grid=True, gridOpacity=0.3)),
).mark_area(color=ACCENT_COLOR, opacity=0.3, line=True)

# Layer 2: thick line with white-filled points and a tooltip.
yearly_line = yearly_base.encode(
    x=alt.X('creation_year:O'),
    y=alt.Y('total_articles:Q'),
    tooltip=[
        alt.Tooltip('creation_year:O', title='Year'),
        alt.Tooltip('total_articles:Q', title='Biographies', format=','),
    ],
).mark_line(
    color=ACCENT_COLOR,
    strokeWidth=4,
    point=alt.OverlayMarkDef(size=120, filled=True, fill='white', strokeWidth=2),
)

# Layer 3: count printed above each point.
yearly_text = yearly_base.encode(
    x=alt.X('creation_year:O'),
    y=alt.Y('total_articles:Q'),
    text=alt.Text('total_articles:Q', format=','),
).mark_text(color='#1e293b', fontWeight='bold', fontSize=12, dy=-12, baseline='bottom', align='center')

# Event captions; 'y_pos' is a hand-picked height on the count axis.
event_annotations = alt.Chart(pd.DataFrame({
    'year': [2016, 2017, 2020, 2022],
    'label': ['Clinton', '#MeToo', 'Harris VP', 'Dobbs'],
    'y_pos': [48000, 48000, 58000, 32000],
})).encode(
    x=alt.X('year:O'), y=alt.Y('y_pos:Q'), text='label:N'
).mark_text(dy=0, color='#ef4444', fontWeight='bold', fontSize=10)

# Dashed vertical rule at each annotated year.
event_rules = alt.Chart(
    pd.DataFrame({'year': [2016, 2017, 2020, 2022]})
).encode(
    x=alt.X('year:O')
).mark_rule(strokeWidth=2, opacity=0.5, color='#ef4444', strokeDash=[3, 3])

final_yearly_chart = alt.layer(yearly_area, yearly_line, yearly_text, event_rules, event_annotations).properties(
    width=550,
    height=400,
    title=alt.TitleParams("Timeline of Progress and Backlash: Biography Creation 2015-2025", fontSize=18, fontWeight='bold'),
)

# Rows of the top visual section: the timeline alone, then pie next to the
# yearly trend.
top_viz_section_row1 = timeline_chart
top_viz_section_row2 = alt.hconcat(gender_chart_with_instruction, final_yearly_chart, spacing=50)

# =========================================================
# BIRTH COHORT CHART (NEW)
# =========================================================
# Reshape cohort_df to long form: one row per (cohort, gender) with its
# percentage, plus a readable gender label derived from the column name.
cohort_long = (
    cohort_df
    .melt(
        id_vars=['cohort', 'n'],
        value_vars=['female_pct', 'male_pct'],
        var_name='gender',
        value_name='percentage',
    )
    .assign(gender_label=lambda d: d['gender'].map({'female_pct': 'Female', 'male_pct': 'Male'}))
)

# Grouped bars: one group per cohort, female and male bars side by side
# (xOffset), on a fixed 0-100 scale.
# NOTE(review): the subtitle figures (47.4pp / 47.2pp) are literal text, not
# taken from cohort_df — confirm they match the file.
birth_cohort_chart = alt.Chart(cohort_long).encode(
    x=alt.X('cohort:N', axis=alt.Axis(labelAngle=0), title=None),
    y=alt.Y('percentage:Q', scale=alt.Scale(domain=[0, 100]), title='% of Biographies'),
    xOffset='gender_label:N',
    color=alt.Color(
        'gender_label:N',
        scale=alt.Scale(domain=['Female', 'Male'], range=['#ec4899', '#3b82f6']),
        title='Gender',
    ),
    tooltip=[
        alt.Tooltip('cohort:N', title='Birth Cohort'),
        alt.Tooltip('gender_label:N', title='Gender'),
        alt.Tooltip('percentage:Q', title='Percentage', format='.1f'),
        alt.Tooltip('n:Q', title='Sample Size', format=','),
    ],
).mark_bar().properties(
    width=1100,
    height=300,
    title=alt.TitleParams(
        text="The 'Pipeline Problem' Myth: Gender Gap Persists Across Generations",
        subtitle="Gap for 1990s-2000s cohort (47.4pp) unchanged from 1970s-80s (47.2pp) — proving bias is ongoing, not historical",
        subtitleColor='#64748b', anchor='start', fontSize=16
    ),
)

# =========================================================
# SMALL MULTIPLES
# =========================================================
# Biography counts per (year, occupation group, gender), leaving out the
# 'Other' occupation group.
# NOTE(review): 'gender_group' is overwritten here with the capitalised
# 'gender' column (not the existing 'gender_group'); values outside the three
# colour-domain labels would get no colour from the scale — confirm intent.
occ_gender_df = (
    df_filtered.loc[lambda d: d['occupation_group'] != 'Other']
    .assign(gender_group=lambda d: d['gender'].str.capitalize())
    .groupby(['creation_year', 'occupation_group', 'gender_group'])
    .size()
    .reset_index(name='group_total')
)

# Facet order: occupation groups from most to fewest biographies.
sort_order = (
    df_filtered.loc[lambda d: d['occupation_group'] != 'Other', 'occupation_group']
    .value_counts()
    .index
    .tolist()
)

# One panel per occupation group (3 per row), each with its own y scale.
small_multiples_chart = (
    alt.Chart(occ_gender_df)
    .encode(
        x=alt.X(
            'creation_year:O',
            axis=alt.Axis(labels=True, ticks=True, grid=False, labelAngle=-45, labelFontSize=11),
            title=None,
        ),
        y=alt.Y(
            'group_total:Q',
            axis=alt.Axis(labels=True, ticks=True, grid=True, gridOpacity=0.2, labelFontSize=11),
            title=None,
        ),
        color=alt.Color(
            'gender_group:N',
            title="Gender",
            legend=alt.Legend(orient='bottom', titleFontSize=14, labelFontSize=13),
            scale=alt.Scale(
                domain=['Male', 'Female', 'Other (trans/non-binary)'],
                range=[GENDER_COLORS['Male'], GENDER_COLORS['Female'], GENDER_COLORS['Other (trans/non-binary)']],
            ),
        ),
        tooltip=[
            alt.Tooltip('creation_year:O', title='Year'),
            alt.Tooltip('occupation_group:N', title='Occupation'),
            alt.Tooltip('gender_group:N', title='Gender'),
            alt.Tooltip('group_total:Q', title='Biographies', format=','),
        ],
    )
    .mark_line(strokeWidth=3, point=alt.OverlayMarkDef(size=70, filled=True, strokeWidth=2))
    .properties(width=350, height=230)  # size of each individual panel
    .facet(
        facet=alt.Facet(
            'occupation_group:N',
            sort=sort_order,
            title=None,
            header=alt.Header(labelFontSize=15, labelFontWeight='bold'),
        ),
        columns=3,
    )
    .resolve_scale(y='independent')
    .properties(title=alt.TitleParams("Where Chauvinism Is Most Entrenched: Gender Gaps by Field", fontSize=18, fontWeight='bold'))
)

# =========================================================
# OCCUPATION & COUNTRY BARS
# =========================================================
# Biographies per occupation group ('Other' excluded), following the gender
# selection made in the pie.
occupation_base = (
    alt.Chart(df_for_charts.loc[lambda d: d['occupation_group'] != 'Other'])
    .transform_filter(gender_selection)
    .transform_aggregate(count='count()', groupby=['occupation_group'])
)

# Horizontal bars sorted by count; the x axis is hidden because each bar
# carries its own value label.
occupation_bars = occupation_base.encode(
    x=alt.X('count:Q', axis=None, title=None),
    y=alt.Y(
        'occupation_group:N',
        sort='-x',
        axis=alt.Axis(labelLimit=200, ticks=False, domain=False, labelFontSize=13),
        title=None,
    ),
    color=alt.Color('count:Q', legend=None, scale=alt.Scale(scheme='blues', reverse=False)),
    tooltip=[
        alt.Tooltip('occupation_group:N', title='Occupation Group'),
        alt.Tooltip('count:Q', title='Biographies', format=','),
    ],
).mark_bar(cornerRadius=5)

occupation_text = occupation_base.encode(
    x=alt.X('count:Q'),
    y=alt.Y('occupation_group:N', sort='-x'),
    text=alt.Text('count:Q', format=','),
).mark_text(fontSize=12, fontWeight='bold', color='#1e293b', dx=6, align='left')

occupation_chart = alt.layer(occupation_bars, occupation_text).properties(
    width=520,
    height=350,
    title=alt.TitleParams("Most Represented Occupations", fontSize=18, fontWeight='bold'),
)

# Top countries: drop missing/empty/'unknown' country values, count per
# country, rank by descending count and keep ranks up to 10.
country_base = (
    alt.Chart(df_for_charts)
    .transform_filter(gender_selection)
    .transform_filter("isValid(datum.country) && datum.country != null && datum.country != '' && lower(datum.country) != 'unknown'")
    .transform_aggregate(count='count()', groupby=['country'])
    .transform_window(rank='rank(count)', sort=[alt.SortField('count', order='descending')])
    .transform_filter(alt.datum.rank <= 10)
)

country_bars = country_base.encode(
    x=alt.X('count:Q', axis=None, title=None),
    y=alt.Y(
        'country:N',
        sort='-x',
        axis=alt.Axis(labelLimit=200, ticks=False, domain=False, labelFontSize=13),
        title=None,
    ),
    color=alt.Color('count:Q', legend=None, scale=alt.Scale(scheme='greens', reverse=False)),
    tooltip=[
        alt.Tooltip('country:N', title='Country'),
        alt.Tooltip('count:Q', title='Biographies', format=','),
    ],
).mark_bar(cornerRadius=5)

country_text = country_base.encode(
    x=alt.X('count:Q'),
    y=alt.Y('country:N', sort='-x'),
    text=alt.Text('count:Q', format=','),
).mark_text(fontSize=12, fontWeight='bold', color='#1e293b', dx=6, align='left')

country_chart = alt.layer(country_bars, country_text).properties(
    width=520,
    height=350,
    title=alt.TitleParams("Most Represented Countries", fontSize=18, fontWeight='bold'),
)

# Occupation and country charts side by side.
occ_country_section = alt.hconcat(occupation_chart, country_chart, spacing=50)

# =========================================================
# CONTINENTAL DISTRIBUTION
# =========================================================
# Rows with a known year, continent (other than 'Other') and country, reduced
# to those three columns under chart-friendly names.
df_con_chart = (
    df_filtered
    .query("creation_year.notnull() and continent.notnull() and continent != 'Other' and country.notnull()")
    [["creation_year", "continent", "country"]]
    .rename(columns={"creation_year": "year", "continent": "continent_name", "country": "country_name"})
)

# Biographies per (year, continent) and the continent's rank within its year
# (1 = most biographies; ties broken by order of appearance).
counts = df_con_chart.groupby(["year", "continent_name"]).size().reset_index(name="n")
counts["continent_rank"] = (
    counts.groupby("year")["n"]
    .rank(method="first", ascending=False)
    .astype(int)
)

# For each (year, continent): the three countries with the most biographies,
# formatted as "Country (count), Country (count), ..." for the tooltip.
top3 = (
    df_con_chart.groupby(["year", "continent_name", "country_name"]).size().reset_index(name="cn")
    .sort_values(["year", "continent_name", "cn"], ascending=[True, True, False])
    .groupby(["year", "continent_name"])
    .apply(
        lambda g: ", ".join(
            f"{name} ({int(total)})"
            for name, total in zip(g["country_name"].head(3), g["cn"].head(3))
        ),
        include_groups=False,
    )
    .reset_index(name="top3_countries")
)
viz_df = counts.merge(top3, on=["year", "continent_name"], how="left")
years_order = sorted(viz_df["year"].unique().tolist())

# Grouped bars per year; within a year the bars are ordered by the
# continent's rank (xOffset / order), not alphabetically.
con_chart = alt.Chart(viz_df).encode(
    x=alt.X("year:O", sort=years_order, title="Year", axis=alt.Axis(grid=False, labelAngle=0, labelFontSize=13)),
    y=alt.Y("n:Q", title="Number of Biographies", axis=alt.Axis(grid=True, gridOpacity=0.3, titleFontSize=14)),
    xOffset=alt.XOffset("continent_rank:O"),
    order=alt.Order("continent_rank:Q"),
    color=alt.Color(
        "continent_name:N",
        title="Continent",
        legend=alt.Legend(orient='bottom', titleFontSize=14, labelFontSize=13),
        scale=alt.Scale(scheme="tableau20", domain=["Africa","Asia","Europe","North America","Oceania","South America"]),
    ),
    tooltip=[
        alt.Tooltip("year:O", title="Year"),
        alt.Tooltip("continent_name:N", title="Continent"),
        alt.Tooltip("n:Q", title="Biographies", format=","),
        alt.Tooltip("top3_countries:N", title="Top 3 Countries"),
    ],
).mark_bar(cornerRadius=3).properties(
    width=1100,
    height=420,
    title=alt.TitleParams("The Geography of Whose Stories Matter", fontSize=18, fontWeight='bold'),
)

# =========================================================
# REPRESENTATION GAP
# =========================================================
# Fixed continent order and a matching colour for each (paired by position).
continent_order = ["Africa", "Asia", "Europe", "North America", "Oceania", "South America"]
continent_colors = ["#ef4444", "#f59e0b", "#3b82f6", "#8b5cf6", "#10b981", "#06b6d4"]
color_scale = alt.Scale(domain=continent_order, range=continent_colors)

# Dashed horizontal rule at gap = 0.
reference_line = alt.Chart(pd.DataFrame({"y": [0]})).encode(
    y="y:Q"
).mark_rule(strokeWidth=2, color="#64748b", strokeDash=[5, 5])

# Shaded band between -0.02 and +0.02 around the zero line.
band = alt.Chart(pd.DataFrame({"y": [-0.02], "y2": [0.02]})).encode(
    y="y:Q", y2="y2:Q"
).mark_rect(opacity=0.5, color="#e2e8f0")

# One line per continent showing the 'gap' column per year, formatted as a
# percentage on the axis and in the tooltip.
gap_line_chart = alt.Chart(bio_by_year_continent).encode(
    x=alt.X("creation_year:O", title="Year", axis=alt.Axis(grid=False, labelAngle=0, labelFontSize=13)),
    y=alt.Y(
        "gap:Q",
        title="Representation Gap (Biography Share − Population Share)",
        axis=alt.Axis(format=".0%", grid=True, gridOpacity=0.3, titleFontSize=14),
    ),
    color=alt.Color(
        "continent:N",
        title="Continent",
        scale=color_scale,
        sort=continent_order,
        legend=alt.Legend(orient='bottom', titleFontSize=14, labelFontSize=13),
    ),
    tooltip=[
        alt.Tooltip("creation_year:O", title="Year"),
        alt.Tooltip("continent:N", title="Continent"),
        alt.Tooltip("gap:Q", format=".1%", title="Representation Gap"),
    ],
).mark_line(strokeWidth=3.5, point=alt.OverlayMarkDef(size=90, filled=True, strokeWidth=2))

# Band at the back, then the zero rule, then the lines on top.
final_gap_chart = (band + reference_line + gap_line_chart).properties(
    width=1100,
    height=400,
    title=alt.TitleParams(
        "The Representation Gap: Biography Share vs. Population Share",
        subtitle="Asia and Africa remain invisible while Europe/North America export their cultural biases—including gender hierarchies—globally",
        fontSize=18, fontWeight='bold',
        subtitleFontSize=13, subtitleColor='#64748b'
    ),
)

# =========================================================
# GENDER TREND BY CONTINENT
# =========================================================
# Same chart as 'gender_region_chart', with the dashboard title attached and
# the size overridden to match the other full-width sections.
gender_trend_chart_polished = gender_region_chart.properties(
    width=1100,
    height=380,
    title=alt.TitleParams(
        "How Regional Underrepresentation Multiplies Gender Bias",
        subtitle="Select a continent to see how geographic and gender marginalization compound each other",
        fontSize=18, fontWeight='bold',
        subtitleFontSize=14, subtitleColor='#64748b'
    ),
)

# =========================================================
# FINAL ASSEMBLY
# =========================================================
# Sections in the order they are stacked, top to bottom.
dashboard_sections = [
    kpi_row,
    intro_narrative,
    top_viz_section_row1,
    gender_system_narrative,
    top_viz_section_row2,
    yearly_context_narrative,
    pipeline_narrative,
    birth_cohort_chart,  # NEW
    small_multiples_chart,
    occupation_gap_narrative,
    occ_country_section,
    geographic_intro,
    con_chart,
    final_gap_chart,
    gap_narrative,
    gender_trend_chart_polished,
    intersectional_narrative,
    conclusion_narrative,
]

# Page-level title with a multi-line subtitle (one list entry per line).
dashboard_title = alt.TitleParams(
    text="Wikipedia's Gender Problem: How American Misogyny Shapes Global Knowledge",
    subtitle=[
        "Analyzing how structural chauvinism perpetuates through 'neutral' policies (2015-2025)",
        " ",
        "This dashboard reveals how Wikipedia's representation gaps mirror America's cultural battles over women's rights,",
        "from Clinton's campaign through #MeToo to the anti-feminist backlash—and how these biases get exported globally."
    ],
    anchor='middle',
    offset=20,
    fontSize=28,
    fontWeight='bold',
    subtitleFontSize=14,
    subtitleColor='#64748b',
)

# Stack the sections, apply the global theme settings, and keep colour
# scales and legends independent per section.
dashboard_full = (
    alt.vconcat(*dashboard_sections, spacing=35)
    .properties(title=dashboard_title, padding=35, background=BG_COLOR)
    .configure_view(strokeWidth=0)
    .configure_axis(
        labelFontSize=12, titleFontSize=14,
        titleColor='#334155', labelColor='#475569',
        domainColor='#cbd5e1', gridColor='#e2e8f0',
    )
    .configure_title(fontSize=16, color='#1e293b')
    .configure_legend(
        titleFontSize=13, labelFontSize=12,
        symbolSize=120, symbolStrokeWidth=2,
    )
    .resolve_legend(color='independent')
    .resolve_scale(color='independent')
)

# Export to a standalone HTML file and print a short summary.
dashboard_full.save(str(html_save_path))
for summary_line in (
    f"✅ Successfully saved HTML to: {html_save_path}",
    "📊 Dashboard includes:",
    "  ✓ All original visualizations",
    "  ✓ NEW: Updated KPIs (Intersectional Penalty, Pipeline Problem)",
    "  ✓ NEW: Birth Cohort Chart",
    "  ✓ UPDATED: All narrative text with new findings",
    "\n🌐 Open the HTML file in your browser!",
):
    print(summary_line)

Building enhanced dashboard...
✅ Successfully saved HTML to: C:\Users\drrahman\wiki-gaps-project\wikipedia_representation_dashboard_enhanced.html
📊 Dashboard includes:
  ✓ All original visualizations
  ✓ NEW: Updated KPIs (Intersectional Penalty, Pipeline Problem)
  ✓ NEW: Birth Cohort Chart
  ✓ UPDATED: All narrative text with new findings

🌐 Open the HTML file in your browser!
