In [1]:
# Cell 1: Setup and Load Aggregated Data

import pandas as pd
import altair as alt
alt.data_transformers.enable("vegafusion")
from pathlib import Path

# --- Path Setup ---
ROOT = Path.cwd()
if ROOT.name == "notebooks":
    ROOT = ROOT.parent

DATA_PATH = ROOT / "data" / "processed" / "yearly_aggregates.csv"

# --- Load the Data ---
try:
    agg_df = pd.read_csv(DATA_PATH)
    
    # --- Verification ---
    print(f"✅ Successfully loaded the aggregated dataset from: {DATA_PATH.name}")
    print(f"Total rows: {len(agg_df):,}")
    
    print("\nDataFrame Info:")
    agg_df.info()
    
    print("\nSample of the data:")
    display(agg_df.head())

except FileNotFoundError:
    print(f"❌ Error: The aggregated data file was not found at '{DATA_PATH}'.")
    print("Please ensure the '03_aggregate_and_qc.ipynb' notebook has been run successfully.")

✅ Successfully loaded the aggregated dataset from: yearly_aggregates.csv
Total rows: 49,406

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49406 entries, 0 to 49405
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   creation_year     49406 non-null  float64
 1   gender            49406 non-null  object 
 2   country           49406 non-null  object 
 3   occupation_group  49406 non-null  object 
 4   count             49406 non-null  int64  
dtypes: float64(1), int64(1), object(3)
memory usage: 1.9+ MB

Sample of the data:


Unnamed: 0,creation_year,gender,country,occupation_group,count
0,2015.0,female,Afghanistan,Arts & Culture,6
1,2015.0,female,Afghanistan,Aviation,1
2,2015.0,female,Afghanistan,Politics & Law,6
3,2015.0,female,Afghanistan,STEM & Academia,1
4,2015.0,female,Afghanistan,Sports,1


In [2]:
# Cell 2: Calculate Yearly Shares

print("Calculating yearly totals to determine shares...")

# 1. Calculate the total number of articles created each year.
# We group by year, sum the 'count' column, and create a mapping Series.
yearly_totals = agg_df.groupby('creation_year')['count'].sum()

# 2. Map these yearly totals back to the main DataFrame.
# Now, each row will have a 'yearly_total' column.
agg_df['yearly_total'] = agg_df['creation_year'].map(yearly_totals)

# 3. Calculate the share (percentage) for each group.
agg_df['share'] = (agg_df['count'] / agg_df['yearly_total']) * 100

# --- Verification ---
print("\n✅ Share calculation complete.")
print("New 'yearly_total' and 'share' columns have been added.")

print("\nSample of the data with shares calculated:")
display(agg_df.head())

# Optional check: Sum of shares for one year should be close to 100
print("\nVerifying shares for the year 2020 (should be ~100%):")
share_2020 = agg_df[agg_df['creation_year'] == 2020]['share'].sum()
print(f"Sum of shares for 2020: {share_2020:.2f}%")

Calculating yearly totals to determine shares...

✅ Share calculation complete.
New 'yearly_total' and 'share' columns have been added.

Sample of the data with shares calculated:


Unnamed: 0,creation_year,gender,country,occupation_group,count,yearly_total,share
0,2015.0,female,Afghanistan,Arts & Culture,6,51419,0.011669
1,2015.0,female,Afghanistan,Aviation,1,51419,0.001945
2,2015.0,female,Afghanistan,Politics & Law,6,51419,0.011669
3,2015.0,female,Afghanistan,STEM & Academia,1,51419,0.001945
4,2015.0,female,Afghanistan,Sports,1,51419,0.001945



Verifying shares for the year 2020 (should be ~100%):
Sum of shares for 2020: 100.00%


# Who is Represented on Wikipedia? An Analysis of Biographies

Wikipedia reflects our collective knowledge, but who does that knowledge include? This dashboard analyzes biographies created since 2015 to explore representation gaps and track how the shares of different genders, nationalities, and professions are changing over time.

In [3]:
# Cell to Correctly Load and Prepare the Detailed DataFrame

# This cell correctly loads all the necessary data and includes the final
# version of the occupation bucketing logic.

print("Loading and preparing the complete detailed dataset...")

# --- 1. Load the raw detailed data ---
NORMALIZED_DIR = ROOT / "data" / "processed" / "tmp_normalized"
all_files = sorted(NORMALIZED_DIR.glob("normalized_chunk_*.csv"))
df_list = [pd.read_csv(f) for f in all_files]
df_detailed = pd.concat(df_list, ignore_index=True)

# --- 2. Load and merge the timestamps ---
seed_path = sorted((ROOT / "data" / "raw").glob("seed_enwiki_*.csv"))[-1]
seed_df = pd.read_csv(seed_path)
df_detailed = pd.merge(df_detailed, seed_df[['qid', 'first_edit_ts']], on='qid', how='left')
df_detailed['first_edit_ts'] = pd.to_datetime(df_detailed['first_edit_ts'])
df_detailed['creation_year'] = df_detailed['first_edit_ts'].dt.year

# --- 3. Filter by year to create the final 'df_filtered' ---
df_filtered = df_detailed[df_detailed['creation_year'] >= 2015].copy()

# --- 4. Add the 'gender_group' column ---
def bucket_gender(gender):
    if gender in ['non-binary', 'trans woman', 'trans man']: return 'Other (Trans/Non-binary)'
    elif gender in ['male', 'female']: return gender
    else: return 'Unknown'
df_filtered['gender_group'] = df_filtered['gender'].apply(bucket_gender)

# --- 5. Add the 'occupation_group' column ---
print("Applying occupation bucketing...")
# This is the final, complete dictionary of occupation buckets.
OCCUPATION_BUCKETS = {
    "Sports": ["association football player", "american football player", "basketball player", "cricketer", "athletics competitor", "ice hockey player", "baseball player", "rugby union player", "sport cyclist", "swimmer", "racing automobile driver", "coach", "boxer", "athlete", "tennis player", "rower", "australian rules football player", "rugby league player", "handball player", "volleyball player", "judoka", "racing driver", "golfer", "chess player", "badminton player", "sprinter", "figure skater", "sport shooter", "weightlifter", "fencer", "artistic gymnast", "curler", "mixed martial arts fighter", "professional wrestler", "water polo player", "association football manager", "basketball coach", "amateur wrestler", "field hockey player", "canoeist", "alpine skier", "sailor", "canadian football player", "cross-country skier", "motorcycle racer", "biathlete", "table tennis player", "speed skater", "hurler", "rhythmic gymnast", "gaelic football player", "archer", "taekwondo athlete", "competitive diver", "long-distance runner", "equestrian", "ski jumper", "squash player", "head coach", "association football referee", "marathon runner", "freestyle skier", "bobsledder", "snowboarder", "gymnast", "luger", "triathlete", "bowls player", "poker player", "middle-distance runner", "kayaker", "darts player", "karateka", "sports commentator", "ice dancer", "softball player", "snooker player", "jockey", "kickboxer", "orienteer", "modern pentathlete", "speedway rider", "short-track speed skater", "lacrosse player", "synchronized swimmer", "netballer", "rikishi", "track cyclist", "thai boxer", "professional gamer", "american football coach", "rally driver", "beach volleyball player", "mountaineer", "sports executive", "professional baseball player", "nordic combined skier", "javelin thrower", "surfer", "skateboarder", "hurdler", "para swimmer", "coxswain", "powerlifter", "para athletics competitor", "dressage rider", "skeleton racer", "skipper", "horse trainer", "futsal player", "pole vaulter", "bodybuilder", "rugby sevens player", "bridge player", "trampoline gymnast", "pool player", "martial artist", "racewalker", "bowler", "high jumper", "show jumper", "ice hockey coach", "wheelchair curler", "motocross rider", "windsurfer", "go professional", "long jumper", "rock climber", "ski mountaineer", "paralympic athlete", "handball coach", "cyclo-cross cyclist", "hammer thrower", "acrobatic gymnast", "para badminton player", "para table tennis player", "shot putter", "wheelchair tennis player", "formula one driver", "referee", "rugby union coach", "baseball umpire", "ultramarathon runner", "kabaddi player", "discus thrower", "wrestler", "event rider", "nascar team owner", "bandy player", "skier", "runner", "triple jumper", "softball coach", "cricket umpire", "sitting volleyball player", "steeplechase runner", "tennis coach", "professional golfer"],
    "Politics & Law": ["politician", "lawyer", "judge", "diplomat", "civil servant", "activist", "human rights activist", "jurist", "police officer", "trade unionist", "legal scholar", "lgbtq rights activist", "official", "barrister", "political activist", "women's rights activist", "lobbyist", "aristocrat", "justice of the peace", "member of the state duma", "political adviser", "magistrate", "peace activist", "social activist", "statesperson", "spy", "climate activist"],
    "Arts & Culture": ["actor", "writer", "singer", "journalist", "film director", "musician", "artist", "photographer", "painter", "poet", "rapper", "composer", "screenwriter", "record producer", "model", "comedian", "television presenter", "singer-songwriter", "songwriter", "film producer", "television actor", "opera singer", "jazz musician", "pianist", "sculptor", "guitarist", "conductor", "stage actor", "radio personality", "disc jockey", "fashion designer", "comics artist", "dancer", "seiyū", "drummer", "voice actor", "television producer", "designer", "visual artist", "chef", "beauty pageant contestant", "playwright", "choreographer", "illustrator", "cinematographer", "cartoonist", "theatrical director", "editor", "mangaka", "violinist", "television director", "film editor", "curator", "filmmaker", "ballet dancer", "youtuber", "audio engineer", "pornographic actor", "graphic designer", "columnist", "drag queen", "animator", "literary critic", "sports journalist", "director", "presenter", "documentary filmmaker", "publisher", "children's writer", "science fiction writer", "make-up artist", "non-fiction writer", "saxophonist", "costume designer", "contemporary artist", "blogger", "restaurateur", "organist", "cellist", "bassist", "news presenter", "installation artist", "magician", "performance artist", "motivational speaker", "video artist", "essayist", "announcer", "cook", "biographer", "film critic", "trumpeter", "game designer", "stand-up comedian", "interior designer", "art collector", "art dealer", "child actor", "exhibition curator", "clarinetist", "lyricist", "art critic", "printmaker", "television personality", "entertainer", "percussionist", "keyboardist", "newspaper editor", "photojournalist", "japanese idol", "vlogger", "podcaster", "comics writer", "socialite", "fiddler", "penciller", "art director", "production designer", "puppeteer", "club dj", "autobiographer", "classical guitarist", "fashion model", "bandleader", "reality television participant", "multimedia artist", "music video director", "vocalist", "circus performer", "flautist", "video game developer", "classical pianist", "jewelry designer", "textile artist", "caricaturist", "glass artist", "banjoist", "lighting designer", "bass guitarist", "street artist", "weather presenter", "talent agent", "owarai tarento", "opinion journalist", "board game designer", "potter", "music critic", "film score composer", "scenographer", "radio producer", "influencer", "musical instrument maker"],
    "STEM & Academia": ["physician", "scientist", "engineer", "academic", "computer scientist", "mathematician", "historian", "economist", "researcher", "physicist", "university teacher", "psychologist", "architect", "chemist", "biologist", "philosopher", "political scientist", "linguist", "sociologist", "anthropologist", "teacher", "theologian", "translator", "astronomer", "art historian", "professor", "neuroscientist", "biochemist", "archaeologist", "statistician", "botanist", "psychiatrist", "musicologist", "environmentalist", "geneticist", "geologist", "electrical engineer", "epidemiologist", "astrophysicist", "geographer", "ecologist", "civil engineer", "inventor", "librarian", "nurse", "social worker", "social scientist", "explorer", "programmer", "zoologist", "paleontologist", "astronaut", "educator", "immunologist", "mechanical engineer", "microbiologist", "meteorologist", "music educator", "literary scholar", "academic administrator", "oncologist", "molecular biologist", "neurologist", "chemical engineer", "pedagogue", "philologist", "pediatrician", "cardiologist", "ceramicist", "landscape architect", "lecturer", "ophthalmologist", "virologist", "military historian", "classical scholar", "historian of modern age", "entomologist", "criminologist", "oceanographer", "climatologist", "veterinarian", "dentist", "materials scientist", "pharmacist", "psychotherapist", "biophysicist", "gynecologist", "cryptographer", "pathologist", "geophysicist", "classical philologist", "archivist", "neurosurgeon", "artificial intelligence researcher", "medical researcher", "biostatistician", "literary historian", "religious studies scholar", "software developer", "conservationist", "islamicist", "ornithologist", "biblical scholar", "pharmacologist", "physiologist", "marine biologist", "theoretical physicist", "bioinformatician", "medievalist", "nutritionist", "herpetologist", "draftsperson", "evolutionary biologist", "sinologist", "egyptologist"],
    "Business": ["businessperson", "entrepreneur", "business executive", "banker", "chief executive officer", "manager", "accountant", "music executive", "financier", "business theorist", "philanthropist", "consultant", "manufacturer", "executive", "investment banker", "investor", "executive producer"],
    "Military": ["military personnel", "military officer", "military leader", "naval officer", "military flight engineer", "soldier", "army officer", "air force officer"],
    "Religion": ["catholic priest", "anglican priest", "rabbi", "priest", "pastor", "missionary", "christian minister", "eastern orthodox priest", "ʿālim", "imam"],
    "Criminal": ["serial killer", "drug trafficker", "criminal", "terrorist"],
    "Aviation": ["aircraft pilot"],
    "Agriculture": ["farmer", "agronomist", "horticulturist", "winegrower"]
}
occupation_to_bucket = {occ: bucket for bucket, occs in OCCUPATION_BUCKETS.items() for occ in occs}
def bucket_occupation(occupation):
    clean_occupation = str(occupation).strip()
    return occupation_to_bucket.get(clean_occupation, 'Other')
df_filtered['occupation_group'] = df_filtered['occupation'].apply(bucket_occupation)

print("\n✅ 'df_filtered' has been correctly created.")
print("It now contains the following columns:")
print(df_filtered.columns)

Loading and preparing the complete detailed dataset...
Applying occupation bucketing...

✅ 'df_filtered' has been correctly created.
It now contains the following columns:
Index(['qid', 'title', 'gender', 'country', 'occupation', 'first_edit_ts',
       'creation_year', 'gender_group', 'occupation_group'],
      dtype='object')


In [4]:
# ================================
# Safe Continent Mapping (Full Cell — latest 50 + Timor-Leste & Kosovo fixes)
# ================================
# Requires: pip install pycountry-convert pycountry

import math
import pandas as pd
import pycountry_convert as pc

print("Mapping countries to continents (safe mode) ...")

# ------------------------------------------------------------
# 0) Replace placeholder strings with nulls (e.g., "unknown")
# ------------------------------------------------------------
_PLACEHOLDER_NULLS = {"unknown", "Unknown", "UNKNOWN", "N/A", "None", "none"}
df_filtered["country"] = (
    df_filtered["country"]
    .astype(str)
    .map(lambda s: None if s.strip() in _PLACEHOLDER_NULLS else s.strip())
)

# ------------------------------------------------------------
# 1) Alias dictionary: unify messy names/legacy entities/cities -> ISO country names
#    (Includes everything from your previous lists + the latest 50)
# ------------------------------------------------------------
_ALIAS = {
    # --- Common alternates / ISO oddities ---
    "USA": "United States",
    "U.S.": "United States",
    "United States of America": "United States",
    "UK": "United Kingdom",
    "South Korea": "Korea, Republic of",
    "North Korea": "Korea, Democratic People's Republic of",
    "Russia": "Russian Federation",
    "Czech Republic": "Czechia",
    "Vatican City": "Holy See (Vatican City State)",
    "Iran": "Iran, Islamic Republic of",
    "Syria": "Syrian Arab Republic",
    "Bolivia": "Bolivia, Plurinational State of",
    "Tanzania": "Tanzania, United Republic of",
    "Moldova": "Moldova, Republic of",
    "Venezuela": "Venezuela, Bolivarian Republic of",
    "Laos": "Lao People's Democratic Republic",
    "Palestine": "Palestine, State of",
    "Ivory Coast": "Côte d'Ivoire",
    "Cape Verde": "Cabo Verde",
    "Micronesia": "Micronesia, Federated States of",
    "Swaziland": "Eswatini",
    "East Timor": "Timor-Leste",  # unify to Timor-Leste spelling

    # --- Prior batches (cities/legacy states -> countries) ---
    "Soviet Union": "Russian Federation",
    "Czechoslovakia": "Czechia",
    "London": "United Kingdom",
    "British Hong Kong": "Hong Kong",
    "State of Palestine": "Palestine, State of",
    "England": "United Kingdom",
    "Sydney": "Australia",
    "The Gambia": "Gambia",
    "Dublin": "Ireland",
    "Toronto": "Canada",
    "Socialist Federal Republic of Yugoslavia": "Serbia",
    "Belgrade": "Serbia",
    "German Democratic Republic": "Germany",
    "Athens": "Greece",
    "Kosovo": "Kosovo",  # alpha-2/continent override below
    "Moscow": "Russian Federation",
    "Johannesburg": "South Africa",
    "French protectorate of Tunisia": "Tunisia",
    "The Bahamas": "Bahamas",
    "Yugoslavia": "Serbia",
    "Tehran": "Iran, Islamic Republic of",
    "Cape Town": "South Africa",
    "Karachi": "Pakistan",
    "Melbourne": "Australia",
    "Buenos Aires": "Argentina",
    "Timor-Leste": "Timor-Leste",  # explicit override also below
    "Glasgow": "United Kingdom",
    "Scotland": "United Kingdom",
    "Trinidad": "Trinidad and Tobago",
    "Montreal": "Canada",
    "Saint Petersburg": "Russian Federation",
    "Bucharest": "Romania",
    "Mumbai": "India",
    "Berlin": "Germany",
    "Lahore": "Pakistan",
    "Sofia": "Bulgaria",
    "Thessaloniki": "Greece",
    "Montevideo": "Uruguay",
    "Adelaide": "Australia",
    "Paris": "France",
    "Lagos": "Nigeria",
    "Birmingham": "United Kingdom",
    "Brisbane": "Australia",
    "New York City": "United States",
    "Mexico City": "Mexico",
    "Chennai": "India",
    "Nairobi": "Kenya",
    "Manchester": "United Kingdom",
    "Kingston": "Jamaica",
    "Kingdom of Italy": "Italy",
    "Zagreb": "Croatia",
    "Sarajevo": "Bosnia and Herzegovina",
    "Kyiv": "Ukraine",
    "Accra": "Ghana",
    "Vancouver": "Canada",
    "Edinburgh": "United Kingdom",
    "Tbilisi": "Georgia",
    "Barcelona": "Spain",
    "Durban": "South Africa",
    "Belfast": "United Kingdom",
    "Bangkok": "Thailand",
    "Manila": "Philippines",
    "Pretoria": "South Africa",
    "Stockholm": "Sweden",
    "Seoul": "Korea, Republic of",
    "Kolkata": "India",
    "Prague": "Czechia",
    "Calgary": "Canada",
    "Liverpool": "United Kingdom",
    "Colombo": "Sri Lanka",
    "Caracas": "Venezuela, Bolivarian Republic of",
    "Madrid": "Spain",
    "Gqeberha": "South Africa",
    "Winnipeg": "Canada",
    "Tokyo": "Japan",
    "East London": "South Africa",
    "Skopje": "North Macedonia",
    "Bratislava": "Slovakia",
    "Munich": "Germany",
    "Wales": "United Kingdom",
    "Hokkaido": "Japan",
    "Leeds": "United Kingdom",
    "Harare": "Zimbabwe",
    "Rome": "Italy",
    "Ottawa": "Canada",
    "Beirut": "Lebanon",
    "Edmonton": "Canada",

    # --- Your latest 50 (this round) ---
    "Tashkent": "Uzbekistan",
    "Vienna": "Austria",
    "Stuttgart": "Germany",
    "Portsmouth": "United Kingdom",
    "Larissa": "Greece",
    "British Raj": "India",
    "Bradford": "United Kingdom",
    "Malacca": "Malaysia",
    "Beijing": "China",
    "Rosario": "Argentina",
    "Victoria": "Australia",  # heuristic: state of Victoria (AU)
    "Newcastle upon Tyne": "United Kingdom",
    "Bamako": "Mali",
    "Milan": "Italy",
    "Serbia and Montenegro": "Serbia",
    "Damascus": "Syrian Arab Republic",
    "Manipur": "India",
    "Boston": "United States",
    "Gothenburg": "Sweden",
    "Kingston upon Hull": "United Kingdom",
    "Surrey": "United Kingdom",  # heuristic (could be CA too)
    "Prishtina": "Kosovo",
    "Detroit": "United States",
    "San Jose": "United States",  # heuristic (could be CR)
    "Pasadena": "United States",
    "Selangor": "Malaysia",
    "Tirana": "Albania",
    "Santa Monica": "United States",
    "Windhoek": "Namibia",
    "Wigan": "United Kingdom",
    "Cologne": "Germany",
    "Bengaluru": "India",
    "Penang": "Malaysia",
    "Kampala": "Uganda",
    "Jerusalem": "Israel",
    "Alexandria": "Egypt",
    "Bandung": "Indonesia",
    "Rawalpindi": "Pakistan",
    "Johor": "Malaysia",
    "Santo Domingo": "Dominican Republic",
    "West Germany": "Germany",
    "Hamilton": "Canada",
    "Almaty": "Kazakhstan",
    "Hamburg": "Germany",
    "Georgetown": "Guyana",  # heuristic
    "Santiago": "Chile",
    "Havana": "Cuba",
    "Chicago": "United States",
    "Lusaka": "Zambia",
    "Tel Aviv": "Israel",
    "Baku": "Azerbaijan",
    "Nottingham": "United Kingdom",
    "Leicester": "United Kingdom",
    "Halifax": "Canada",
    "Perth": "Australia",
    "Split": "Croatia",
    "Kerala": "India",
    "Los Angeles": "United States",
    "New Delhi": "India",
    "Jacksonville": "United States",
    "Jakarta": "Indonesia",
    "Yangon": "Myanmar",
    "Amman": "Jordan",
    "Cork": "Ireland",
    "Novi Sad": "Serbia",
    "Rio de Janeiro": "Brazil",
    "Brooklyn": "United States",
    "Minsk": "Belarus",
    "Bristol": "United Kingdom",
    "Warsaw": "Poland",
    "São Paulo": "Brazil",
    "Delhi": "India",
    "Casablanca": "Morocco",
    "Yerevan": "Armenia",
    "Oxford": "United Kingdom",
    "Frankfurt": "Germany",
    "Cairo": "Egypt",
    "Philadelphia": "United States",
    "Malé": "Maldives",
    "Gdańsk": "Poland",
    "Lviv": "Ukraine",
    "Bogotá": "Colombia",
    "Cardiff": "United Kingdom",
    "Kuala Lumpur": "Malaysia",
    "Kharkiv": "Ukraine",
    "Monrovia": "Liberia",
    "Taipei": "Taiwan",
}

# ------------------------------------------------------------
# 2) Special-case overrides (alpha-2 or continent)
#    - Kosovo uses "XK" which some libs don't map to a continent; force Europe.
#    - Timor-Leste can be finicky in some environments; force alpha-2 "TL".
# ------------------------------------------------------------
_ALPHA2_OVERRIDES = {
    "Kosovo": "XK",
    "Timor-Leste": "TL",  # <-- ensures consistent resolution
}
_CONTINENT_OVERRIDES_BY_ALPHA2 = {
    "XK": "Europe",       # Kosovo
    # "TL" resolves normally to Asia; no continent override needed
}

# ------------------------------------------------------------
# 3) Helper functions
# ------------------------------------------------------------
def _normalize_country(name):
    if name is None:
        return None
    if isinstance(name, float) and math.isnan(name):
        return None
    s = str(name).strip()
    if s == "" or s.lower() == "other":
        return None
    return _ALIAS.get(s, s)

def _alpha2_from_name(name):
    # explicit alpha-2 overrides first
    if name in _ALPHA2_OVERRIDES:
        return _ALPHA2_OVERRIDES[name]
    try:
        return pc.country_name_to_country_alpha2(name)
    except Exception:
        try:
            import pycountry
            return pycountry.countries.lookup(name).alpha_2
        except Exception:
            return None

def _continent_from_alpha2(a2):
    # explicit continent override
    if a2 in _CONTINENT_OVERRIDES_BY_ALPHA2:
        return _CONTINENT_OVERRIDES_BY_ALPHA2[a2]
    code = pc.country_alpha2_to_continent_code(a2)
    return pc.convert_continent_code_to_continent_name(code)

def country_to_continent_safe(country_name):
    n = _normalize_country(country_name)
    if n is None:
        return "Other"
    a2 = _alpha2_from_name(n)
    if not a2:
        return "Other"
    try:
        return _continent_from_alpha2(a2)
    except Exception:
        return "Other"

# ------------------------------------------------------------
# 4) Apply mapping
# ------------------------------------------------------------
df_filtered["continent"] = df_filtered["country"].apply(country_to_continent_safe)

# ------------------------------------------------------------
# 5) Verification & diagnostics
# ------------------------------------------------------------
print("\n✅ Continent mapping complete.")

print("\nTop 10 continents:")
print(df_filtered["continent"].value_counts().head(10))

print("\nMost frequent remaining 'Other' country values (top 40):")
unmapped_sample = (
    df_filtered.loc[df_filtered["continent"] == "Other", "country"]
    .dropna()
    .value_counts()
    .head(40)
)
print(unmapped_sample)


Mapping countries to continents (safe mode) ...

✅ Continent mapping complete.

Top 10 continents:
continent
Europe           148563
Other            145895
North America     88019
Asia              83891
Africa            29757
South America     23500
Oceania           17284
Name: count, dtype: int64

Most frequent remaining 'Other' country values (top 40):
country
Timor-Leste    128
Richmond        29
Hyderabad       29
Tamil Nadu      29
Brno            29
Poznań          28
Shanghai        28
West Bengal     28
Sheffield       28
Abidjan         28
Yaoundé         28
Dakar           28
Plovdiv         27
Mansfield       27
Wakefield       26
Bridgetown      26
Mississauga     26
Ipswich         26
Hollywood       26
Cambridge       26
Orange          26
York            25
Goa             25
Chișinău        25
Hanover         25
Šibenik         25
Sialkot         25
Dnipro          25
Niš             25
Saint Kitts     25
Brampton        25
Windsor         25
Columbus        25
Gene

In [5]:
# --- Hard fix for Timor-Leste: force country + continent to Asia ---

import re, unicodedata

def _is_timor_leste(s: str) -> bool:
    if s is None:
        return False
    # Normalize unicode, collapse funky hyphens to a simple '-'
    t = unicodedata.normalize("NFKC", str(s)).strip().lower()
    t = re.sub(r"[\u2010-\u2015\u2212\u2043\-]+", "-", t)  # any hyphen-like -> '-'
    # Remove common prefixes and normalize variants
    t = t.replace("democratic republic of ", "")
    t = t.replace("timor leste", "timor-leste")
    t = t.replace("east-timor", "east timor")
    # Final checks
    return t in {"timor-leste", "east timor", "tl"}

mask_tl = df_filtered["country"].apply(_is_timor_leste)

# Standardize country name
df_filtered.loc[mask_tl, "country"] = "Timor-Leste"
# Force continent
df_filtered.loc[mask_tl, "continent"] = "Asia"

print(f"✅ Timor-Leste rows fixed: {int(mask_tl.sum())}")
print(df_filtered.loc[mask_tl, ["country","continent"]].head())


✅ Timor-Leste rows fixed: 128
            country continent
580162  Timor-Leste      Asia
581748  Timor-Leste      Asia
588886  Timor-Leste      Asia
593145  Timor-Leste      Asia
593147  Timor-Leste      Asia


In [6]:
import pandas as pd
import altair as alt

print("Building: Continental Biography Distribution by Year ...")

# --- 1) Prep base data (rename to avoid vega name clashes) ---
df_con_chart = (
    df_filtered
    .query("creation_year.notnull() and continent.notnull() and continent != 'Other' and country.notnull()")
    .loc[:, ["creation_year", "continent", "country"]]
    .rename(columns={
        "creation_year": "year",
        "continent": "continent_name",
        "country": "country_name"
    })
)

# --- 2) Counts per (year, continent) ---
counts = (
    df_con_chart
    .groupby(["year", "continent_name"])
    .size()
    .reset_index(name="n")
)

# --- 3) Rank continents within each year (for left→right ordering) ---
counts = counts.sort_values(["year", "n"], ascending=[True, False])
counts["continent_rank"] = counts.groupby("year")["n"].rank(
    method="first", ascending=False
).astype(int)

# --- 4) Build "Top 3 countries" strings per (year, continent) for the tooltip ---
top3_countries = (
    df_con_chart
    .groupby(["year", "continent_name", "country_name"])
    .size()
    .reset_index(name="cn")
    .sort_values(["year", "continent_name", "cn"], ascending=[True, True, False])
    .groupby(["year", "continent_name"])
    .apply(
        lambda g: ", ".join(
            f"{row.country_name} ({int(row.cn)})" for _, row in g.head(3).iterrows()
        ),
        include_groups=False   # ✅ Future-proof change
    )
    .reset_index(name="top3_countries")
)

# --- 5) Merge tooltip info onto counts ---
viz_df = counts.merge(top3_countries, on=["year", "continent_name"], how="left")

# --- 6) Build chart ---
years_order = sorted(viz_df["year"].unique().tolist())
chart_width = max(1200, 40 * len(years_order))  # dynamic width

con_chart = (
    alt.Chart(viz_df)
    .mark_bar()
    .encode(
        x=alt.X(
            "year:O",
            title="",
            sort=years_order,
            axis=alt.Axis(
                grid=False,
                labelAngle=0
            )
        ),
        y=alt.Y(
            "n:Q",
            title="Number of biographies",
            axis=alt.Axis(grid=False)
        ),
        xOffset=alt.XOffset("continent_rank:O"),
        color=alt.Color(
            "continent_name:N",
            title="Continent",
            sort=["Africa", "Asia", "Europe", "North America", "Oceania", "South America"]
        ),
        tooltip=[
            alt.Tooltip("year:O", title="Year"),
            alt.Tooltip("continent_name:N", title="Continent"),
            alt.Tooltip("n:Q", title="Biographies", format=","),
            alt.Tooltip("top3_countries:N", title="Top 3 countries")
        ],
        order=alt.Order("continent_rank:Q")
    )
    .properties(
        title="Continental Biography Distribution by Year",
        width=chart_width,
        height=400
    )
)

con_chart


Building: Continental Biography Distribution by Year ...


In [7]:
import altair as alt
import pandas as pd

print("Creating the gender representation trend chart with region filter (final polished version)...")

# --- 1. Prepare data ---
def bucket_gender_for_trend(g):
    g = (g or "").strip().lower()
    if g in ["non-binary", "nonbinary", "trans woman", "trans man", "transgender", "genderqueer", "agender"]:
        return "Other (trans/non-binary)"
    elif g == "male":
        return "Male"
    elif g == "female":
        return "Female"
    else:
        return "Unknown"

trend_df = (
    df_filtered
    .loc[df_filtered["continent"].notnull() & (df_filtered["continent"] != "Other")]
    .assign(gender_group=lambda d: d["gender"].apply(bucket_gender_for_trend))
)

# --- 2. Aggregate by year × continent × gender ---
agg_region_df = (
    trend_df
    .groupby(["creation_year", "continent", "gender_group"], as_index=False)
    .size()
    .rename(columns={"size": "count"})
)

agg_region_df["yearly_total"] = (
    agg_region_df.groupby(["creation_year", "continent"])["count"].transform("sum")
)
agg_region_df["share"] = agg_region_df["count"] / agg_region_df["yearly_total"] * 100
agg_region_df = agg_region_df[agg_region_df["gender_group"] != "Unknown"]

# --- 3. Add global "All" (aggregated across continents) ---
global_df = (
    agg_region_df
    .groupby(["creation_year", "gender_group"], as_index=False)["count"].sum()
)
global_df["continent"] = "All"
global_df["yearly_total"] = (
    global_df.groupby(["creation_year"])["count"].transform("sum")
)
global_df["share"] = global_df["count"] / global_df["yearly_total"] * 100

combined_df = pd.concat([agg_region_df, global_df], ignore_index=True)

# --- 4. Dropdown for continent selection ---
continent_dropdown = alt.binding_select(
    options=sorted(agg_region_df["continent"].unique().tolist()) + ["All"],
    name="🌍 Continent: "
)
continent_param = alt.param("continent_select", bind=continent_dropdown, value="All")

# --- 5. Build chart ---
domain_gender = ["Male", "Female", "Other (trans/non-binary)"]
range_gender  = ["#1f77b4", "#e377c2", "#2ca02c"]

base = (
    alt.Chart(combined_df)
    .transform_filter("datum.continent == continent_select")
    .encode(
        x=alt.X(
            "creation_year:O",
            title=None,
            axis=alt.Axis(
                labelAngle=0,
                grid=False,
                domain=False,
                ticks=True
            )
        ),
        y=alt.Y(
            "share:Q",
            title=None,
            axis=alt.Axis(labels=False, ticks=False, grid=False, domain=False)
        ),
        color=alt.Color(
            "gender_group:N",
            title="Gender Group",
            scale=alt.Scale(domain=domain_gender, range=range_gender)
        ),
        tooltip=[
            alt.Tooltip("creation_year:O", title="Year"),
            alt.Tooltip("continent:N", title="Continent"),
            alt.Tooltip("gender_group:N", title="Gender"),
            alt.Tooltip("share:Q", title="% Share", format=".1f")
        ]
    )
    .add_params(continent_param)
)

# --- 6. Line + Labels ---
line = base.mark_line(point=alt.OverlayMarkDef(size=80), strokeWidth=3)
labels = base.mark_text(
    align="center",
    baseline="bottom",
    dy=-8,
    size=11
).encode(
    text=alt.Text("share:Q", format=".1f")
)

gender_region_chart = (
    (line + labels)
    .properties(
        title="Gender Representation Over Time (Filterable by Continent)",
        width=900,
        height=350
    )
)

gender_region_chart


Creating the gender representation trend chart with region filter (final polished version)...


In [8]:
# Cell for the Final Polished Yearly Trend Chart

# This cell creates the final, customized version of the yearly trend line chart.

print("Creating the final polished yearly trend chart...")

# --- 1. Data Preparation ---
yearly_counts_df = df_filtered.groupby('creation_year').size().reset_index(name='total_articles')

# --- 2. Chart Creation ---
# Create a base chart that both layers can inherit from
base = alt.Chart(yearly_counts_df).encode(
    # MODIFICATION: Customize the x-axis to show labels and ticks
    x=alt.X('creation_year:O', 
            title=None, 
            axis=alt.Axis(labels=True, ticks=True, domain=False, grid=False, labelAngle=0)),
    
    # Y-axis remains hidden
    y=alt.Y('total_articles:Q', axis=None),
    
    tooltip=[
        alt.Tooltip('creation_year', title='Year:'),
        alt.Tooltip('total_articles', title='Biographies:', format=',')
    ]
)

# Layer 1: The line with points
line = base.mark_line(
    point=alt.OverlayMarkDef(size=80),
    strokeWidth=3,
    color='#1f77b4'
)

# Layer 2: The text labels
text = base.mark_text(
    align='center',
    baseline='bottom',
    dy=-10
).encode(
    text=alt.Text('total_articles:Q', format=',')
)

# Layer the two charts together and apply final properties
final_yearly_chart = alt.layer(line, text).properties(
    title='New Biographies Created per Year',
    width=700,
    height=300
)

# Display the chart
final_yearly_chart

Creating the final polished yearly trend chart...


In [9]:
import pandas as pd
import altair as alt

print("Creating the static gender-split Small Multiples chart for occupations...")

# =========================================================
# 1️⃣ Data prep: aggregate by year × occupation × gender
# =========================================================
df_gendered = df_filtered.copy()
df_gendered["gender_group_display"] = df_gendered["gender_group"].str.capitalize()

group_trends_df = (
    df_gendered[df_gendered["occupation_group"] != "Other"]
    .groupby(["creation_year", "occupation_group", "gender_group_display"])
    .size()
    .reset_index(name="group_total")
)

# Top 3 occupations for tooltips
top_occupations_tooltip = (
    df_gendered[df_gendered["occupation"] != "unknown"]
    .groupby(["creation_year", "occupation_group", "occupation"])
    .size()
    .reset_index(name="count")
    .sort_values("count", ascending=False)
    .groupby(["creation_year", "occupation_group"])
    .head(3)
)

tooltip_strings = (
    top_occupations_tooltip
    .groupby(["creation_year", "occupation_group"])
    .apply(
        lambda g: ", ".join(f"{row['occupation']} ({int(row['count'])})"
                            for _, row in g.iterrows()),
        include_groups=False
    )
    .reset_index(name="top_3_tooltip")
)

final_plot_df = (
    pd.merge(
        group_trends_df,
        tooltip_strings,
        on=["creation_year", "occupation_group"],
        how="left"
    )
    .fillna({"top_3_tooltip": "N/A"})
)

# =========================================================
# 2️⃣ Build the static chart
# =========================================================
domain_gender = ["Male", "Female", "Other (trans/non-binary)"]
range_gender  = ["#1f77b4", "#e377c2", "#2ca02c"]  # same as your pie/trend palette

sort_order = (
    df_gendered[df_gendered["occupation_group"] != "Other"]["occupation_group"]
    .value_counts()
    .index
    .tolist()
)

small_multiples_gender_chart = (
    alt.Chart(final_plot_df)
    .mark_line(point=True, strokeWidth=2)
    .encode(
        x=alt.X(
            "creation_year:O",
            title=None,
            axis=alt.Axis(labels=True, ticks=True, grid=False, labelAngle=-90)
        ),
        y=alt.Y("group_total:Q",
                title="Number of Biographies",
                axis=alt.Axis(grid=False)),
        color=alt.Color(
            "gender_group_display:N",
            title="Gender",
            scale=alt.Scale(domain=domain_gender, range=range_gender)
        ),
        tooltip=[
            alt.Tooltip("creation_year", title="Year"),
            alt.Tooltip("occupation_group", title="Occupation Group"),
            alt.Tooltip("gender_group_display", title="Gender"),
            alt.Tooltip("group_total:Q", title="Total Biographies", format=","),
            alt.Tooltip("top_3_tooltip:N", title="Top 3 Occupations"),
        ]
    )
    .properties(width=250, height=200)
    .facet(
        facet=alt.Facet(
            "occupation_group:N",
            title=None,
            header=alt.Header(labelFontSize=14),
            sort=sort_order
        ),
        columns=3
    )
    .resolve_scale(y="independent")
    .resolve_axis(x="independent")
    .properties(
        title="Yearly Trends for Each Occupation Group, by Gender"
    )
)

small_multiples_gender_chart


Creating the static gender-split Small Multiples chart for occupations...


In [10]:
# Cell for the Final Polished Gender Pie Chart

# This final version capitalizes the labels and removes the tooltips,
# without adding a background color.

print("Creating the final polished Gender Distribution pie chart...")

# --- 1. Data Preparation ---
gender_totals_df = df_filtered.groupby('gender_group').size().reset_index(name='count')
gender_totals_df['percentage'] = (gender_totals_df['count'] / gender_totals_df['count'].sum()) * 100

# Capitalize the first letter of the gender groups for display
gender_totals_df['gender_group_display'] = gender_totals_df['gender_group'].str.capitalize()

# Create a column with a list of strings for multi-line labels
gender_totals_df['multi_line_label'] = gender_totals_df.apply(
    lambda row: [row['gender_group_display'], f"{row['percentage']:.1f}%"],
    axis=1
)

# Define the custom color scheme
# The domain must be updated to match the capitalized values
domain = ['Male', 'Female', 'Other (trans/non-binary)']
range_ = ['#1f77b4', '#e377c2', '#2ca02c'] # Blue, Pink, Green

# --- 2. Chart Creation ---
# Create a base chart with the core encodings
base = alt.Chart(
    gender_totals_df[gender_totals_df['gender_group'] != 'Unknown']
).encode(
    theta=alt.Theta("count:Q", stack=True),
    color=alt.Color("gender_group_display:N", scale=alt.Scale(domain=domain, range=range_), legend=None)
    # The 'tooltip' parameter has been removed.
)

# Create the pie slices layer
pie = base.mark_arc(outerRadius=90, innerRadius=50)

# Create the text labels layer, positioned outside the pie
text = base.mark_text(
    radius=115,
    size=12,
    align='center'
).encode(
    # Use the new multi-line label column for the text
    text="multi_line_label:N"
)

# Layer the slices and labels together
pie_chart = (pie + text).properties(
    title="Gender Distribution"
)
# MODIFICATION: The .configure_view() call has been removed.

# Display the chart

pie_chart

Creating the final polished Gender Distribution pie chart...


In [11]:
# Cell for the Total Biographies KPI

# This cell creates a simple KPI visualization to show the total
# number of biographies in our analysis dataset (since 2015).

print("Creating the KPI for Total Biographies...")

# --- 1. Data Preparation ---
total_bios_count = len(df_filtered)

# Create a small DataFrame to hold our KPI data
kpi_df = pd.DataFrame([
    {'kpi': 'Total Biographies:', 'value': f"{total_bios_count:,}"}
])

# --- 2. Chart Creation ---
kpi_chart = alt.Chart(kpi_df).mark_text(
    size=24, # Set a larger font size for the KPI
    align='center'
).encode(
    text='kpi:N', # Display the "Total Biographies:" text
).properties(
    width=200,
    )

kpi_value = alt.Chart(kpi_df).mark_text(
    size=36, # Make the number even larger
    align='center',
    fontWeight='bold' # Make the number bold
).encode(
    text='value:N' # Display the formatted number
).properties(
    width=200,
    height=1
)

# Vertically stack the label and the value
total_biographies_kpi = alt.vconcat(
    kpi_chart,
    kpi_value
)

# Display the KPI
total_biographies_kpi

Creating the KPI for Total Biographies...


In [12]:
# Cell for the Standalone Occupation Bar Chart (No Background)

# This version keeps the color gradient for the bars but removes the
# background color from the chart properties.

print("Creating the Occupation Group bar chart...")

# --- 1. Data Preparation ---
occupation_totals_df = df_filtered[
    df_filtered['occupation_group'] != 'Other'
].groupby('occupation_group').size().reset_index(name='count')


# --- 2. Chart Creation ---
# Create a base chart that both layers can inherit from
base = alt.Chart(occupation_totals_df).encode(
    x=alt.X('count:Q', axis=None),
    y=alt.Y('occupation_group:N', sort='-x', title=None, axis=alt.Axis(ticks=False, domain=False)),
    tooltip=[
        alt.Tooltip('occupation_group:N', title='Occupation Group:'),
        alt.Tooltip('count:Q', title='Biographies:', format=',')
    ]
)

# Layer 1: The bars
bars = base.mark_bar().encode(
    color=alt.Color('count:Q', scale=alt.Scale(scheme='tealblues'), legend=None)
)

# Layer 2: The text labels with conditional positioning
# Define the threshold for switching styles
threshold = 25000

# Text for LONG bars (white, inside)
text_long_bars = base.mark_text(
    align='right',
    dx=-7,
    color='white'
).encode(
    text=alt.Text('count:Q', format=',')
).transform_filter(
    alt.datum.count > threshold
)

# Text for SHORT bars (black, outside)
text_short_bars = base.mark_text(
    align='left',
    dx=7,
    color='black'
).encode(
    text=alt.Text('count:Q', format=',')
).transform_filter(
    alt.datum.count <= threshold
)


# Combine all three layers and apply top-level properties
occupation_chart = alt.layer(
    bars, text_long_bars, text_short_bars
).properties(
    title="Which Occupation Groups have the most Biographies?",
    width=600
    # MODIFICATION: The 'background' property has been removed.
)

# Display the chart
occupation_chart

Creating the Occupation Group bar chart...


In [13]:
# Cell for the Standalone Country Bar Chart

# This cell creates the standalone bar chart for the Top 10 countries,
# styled to match the occupation chart.

print("Creating the Top 10 Countries bar chart...")

# --- 1. Data Preparation in pandas ---
# Calculate the total counts for each country
country_totals_df = df_filtered[
    df_filtered['country'] != 'unknown'
].groupby('country').size().reset_index(name='count')

# Calculate the percentage relative to the total of biographies with a known country
total_known_country_bios = country_totals_df['count'].sum()
country_totals_df['percent_of_known_total'] = (country_totals_df['count'] / total_known_country_bios) * 100

# Get the top 10 countries
top_10_countries_df = country_totals_df.nlargest(10, 'count')


# --- 2. Chart Creation ---
# The base chart defines the data source and shared encodings
base = alt.Chart(top_10_countries_df).encode(
    x=alt.X('count:Q', axis=None),
    y=alt.Y('country:N', sort='-x', title=None, axis=alt.Axis(ticks=False, domain=False)),
    tooltip=[
        alt.Tooltip('country:N', title='Country:'),
        alt.Tooltip('count:Q', title='Biographies:', format=','),
        alt.Tooltip('percent_of_known_total:Q', title='% of Known Total:', format='.1f')
    ]
)

# Layer 1: The bars
bars = base.mark_bar().encode(
    color=alt.Color('count:Q', scale=alt.Scale(scheme='tealblues'), legend=None)
)

# Layer 2: The text labels
text = base.mark_text(
    align='right',
    dx=-7,
    color='white'
).encode(
    text=alt.Text('count:Q', format=',')
)

# Layer the two charts together and apply top-level properties
country_chart = alt.layer(bars, text).properties(
    title="What are the Top 10 Countries with the most Biographies?",
    width=600
)

# Display the chart
country_chart

Creating the Top 10 Countries bar chart...


In [14]:
import pandas as pd

pop_path = r"C:\Users\drrahman\wiki-gaps-project\data\baselines\world_population_by_continent.csv"
pop_df = pd.read_csv(pop_path)

print(pop_df.head(10))
print(pop_df.columns)


       continent  population  year                                    source
0           Asia  4835320060  2025  Worldometer (Population by Region, 2025)
1         Africa  1549867579  2025  Worldometer (Population by Region, 2025)
2         Europe   744398832  2025  Worldometer (Population by Region, 2025)
3  North America   604000000  2025  Worldometer (Population by Region, 2025)
4  South America   438000000  2025  Worldometer (Population by Region, 2025)
5        Oceania    43000000  2025  Worldometer (Population by Region, 2025)
Index(['continent', 'population', 'year', 'source'], dtype='object')


In [15]:
# ===============================================================
# 🧮 Build bio_by_year_continent using population baseline (2025 constant)
# ===============================================================

import pandas as pd

# Load population baseline
pop_df = pd.read_csv(r"C:\Users\drrahman\wiki-gaps-project\data\baselines\world_population_by_continent.csv")

# Clean column names
pop_df.columns = pop_df.columns.str.strip().str.lower()

# Standardize continent names
continent_name_map = {
    "Northern America": "North America",
    "Australia/Oceania": "Oceania",
    "Latin America": "South America"
}
pop_df["continent"] = pop_df["continent"].replace(continent_name_map)
pop_df["continent"] = pop_df["continent"].str.strip()

# Ensure correct data types
pop_df["year"] = pop_df["year"].astype(int)
pop_df["population"] = pop_df["population"].astype(float)

# --- Base biography data ---
bio_df = df_filtered.copy()
bio_df = bio_df.query("continent.notnull() and continent != 'Unknown'")
bio_df["creation_year"] = bio_df["creation_year"].astype(int)

# --- Extend population data across all years in biography dataset ---
year_range = sorted(bio_df["creation_year"].unique().tolist())
pop_extended = []
for yr in year_range:
    temp = pop_df.copy()
    temp["year"] = yr
    pop_extended.append(temp)
pop_df = pd.concat(pop_extended, ignore_index=True)

# --- Biography counts per year × continent ---
bio_counts = (
    bio_df.groupby(["creation_year", "continent"])
    .size()
    .reset_index(name="bio_count")
)

# --- Total biographies per year ---
year_totals = bio_counts.groupby("creation_year")["bio_count"].sum().reset_index(name="year_total")

# --- Merge totals and calculate share ---
bio_by_year_continent = bio_counts.merge(year_totals, on="creation_year", how="left")
bio_by_year_continent["bio_share"] = bio_by_year_continent["bio_count"] / bio_by_year_continent["year_total"]

# --- Merge with population baseline ---
bio_by_year_continent = bio_by_year_continent.merge(
    pop_df[["continent", "population", "year"]],
    left_on=["continent", "creation_year"],
    right_on=["continent", "year"],
    how="left"
)

# --- Compute population share per year ---
pop_totals = pop_df.groupby("year")["population"].sum().reset_index(name="world_population")
bio_by_year_continent = bio_by_year_continent.merge(pop_totals, on="year", how="left")
bio_by_year_continent["pop_share"] = bio_by_year_continent["population"] / bio_by_year_continent["world_population"]

# --- Compute representation gap ---
bio_by_year_continent["gap"] = bio_by_year_continent["bio_share"] - bio_by_year_continent["pop_share"]

# --- Clean final columns ---
bio_by_year_continent = bio_by_year_continent[
    ["creation_year", "continent", "bio_count", "bio_share", "pop_share", "gap"]
].sort_values(["creation_year", "continent"])

print("✅ bio_by_year_continent successfully built with constant population baseline (2025 values)")
bio_by_year_continent.head(10)


✅ bio_by_year_continent successfully built with constant population baseline (2025 values)


Unnamed: 0,creation_year,continent,bio_count,bio_share,pop_share,gap
0,2015,Africa,2706,0.052626,0.188673,-0.136046
1,2015,Asia,9169,0.178319,0.588626,-0.410307
2,2015,Europe,16233,0.3157,0.090619,0.225081
3,2015,North America,11898,0.231393,0.073528,0.157865
4,2015,Oceania,1909,0.037126,0.005235,0.031892
5,2015,Other,7394,0.143799,,
6,2015,South America,2110,0.041035,0.05332,-0.012284
7,2016,Africa,3354,0.059271,0.188673,-0.129402
8,2016,Asia,11537,0.203877,0.588626,-0.384749
9,2016,Europe,18248,0.322471,0.090619,0.231852


In [16]:
# ===============================================================
# 📈 Representation Gap by Continent (color-accurate)
# ===============================================================

import altair as alt
import pandas as pd

# Remove Unknown
bio_by_year_continent = bio_by_year_continent.query("continent != 'Unknown'")

# Match same order as the bar chart legend
continent_order = ["Africa", "Asia", "Europe", "North America", "Oceania", "South America"]

# Exact hex codes from your chart legend
continent_colors = [
    "#1f77b4",  # Africa → blue
    "#ff7f0e",  # Asia → orange
    "#d62728",  # Europe → red
    "#17becf",  # North America → light blue / cyan
    "#2ca02c",  # Oceania → green
    "#bcbd22",  # South America → yellow-green
]

color_scale = alt.Scale(domain=continent_order, range=continent_colors)

# Reference line + band
reference_line = alt.Chart(pd.DataFrame({"y": [0]})).mark_rule(
    strokeDash=[4, 4], color="gray"
).encode(y="y:Q")

band = alt.Chart(pd.DataFrame({"y": [-0.02], "y2": [0.02]})).mark_rect(
    color="lightgray", opacity=0.2
).encode(y="y:Q", y2="y2:Q")

# Main line chart
gap_line_chart = (
    alt.Chart(bio_by_year_continent)
    .mark_line(point=True, strokeWidth=2)
    .encode(
        x=alt.X("creation_year:O", title="Year", axis=alt.Axis(labelAngle=0)),
        y=alt.Y(
            "gap:Q",
            title="Representation Gap (Bio share − Pop share)",
            axis=alt.Axis(format=".0%"),
        ),
        color=alt.Color(
            "continent:N",
            title="Continent",
            sort=continent_order,
            scale=color_scale,
        ),
        tooltip=[
            alt.Tooltip("creation_year:O", title="Year"),
            alt.Tooltip("continent:N", title="Continent"),
            alt.Tooltip("gap:Q", format=".1%", title="Gap"),
        ],
    )
    .properties(title="Where Wikipedia Representation Falls Short: Continent-Level Gaps (2015–2025)", width=800, height=400)
)

final_gap_chart = (band + reference_line + gap_line_chart).configure_axis(
    labelFontSize=11, titleFontSize=13
).configure_title(fontSize=16, anchor="start")

final_gap_chart


In [17]:
# =========================================================
# 💾 CELL TO SAVE PROCESSED DATA FOR THE DASHBOARD
# =========================================================
# This cell saves the two essential, processed DataFrames
# so the dashboard notebook can load them instantly.
!pip install pyarrow

import pandas as pd
from pathlib import Path



# --- 1. Define Save Paths ---
ROOT = Path.cwd()
if ROOT.name == "notebooks":
    ROOT = ROOT.parent

SAVE_PATH = ROOT / "data" / "processed"
SAVE_PATH.mkdir(exist_ok=True)

# Define file paths
main_data_path = SAVE_PATH / "dashboard_main_data.parquet"
gap_data_path = SAVE_PATH / "dashboard_rep_gap_data.csv"
gender_trend_data_path = SAVE_PATH / "dashboard_gender_trend_data.csv"

print(f"Saving processed data to: {SAVE_PATH}")

# --- 2. Save df_filtered (The main dataset) ---
try:
    # We need to save the version from Cell 5, *after* continent
    # mapping and the Timor-Leste fix.
    
    # Convert 'first_edit_ts' to a compatible format if it exists
    if 'first_edit_ts' in df_filtered.columns:
        df_to_save = df_filtered.drop(columns=['first_edit_ts'])
    else:
        df_to_save = df_filtered.copy()

    df_to_save.to_parquet(main_data_path, index=False, engine='pyarrow')
    print(f"✅ Successfully saved 'df_filtered' to: {main_data_path.name}")
except NameError:
    print("❌ Error: 'df_filtered' not found. Please run Cell 3, 4, and 5 first.")
except Exception as e:
    print(f"❌ An error occurred while saving df_filtered: {e}")


# --- 3. Save bio_by_year_continent (For the Gap Chart) ---
try:
    bio_by_year_continent.to_csv(gap_data_path, index=False)
    print(f"✅ Successfully saved 'bio_by_year_continent' to: {gap_data_path.name}")
except NameError:
    print("❌ Error: 'bio_by_year_continent' not found. Please run Cell 15 first.")
except Exception as e:
    print(f"❌ An error occurred while saving bio_by_year_continent: {e}")

# --- 4. Save combined_df (For the Gender Trend Chart) ---
# This is the data used to build 'gender_region_chart'
try:
    combined_df.to_csv(gender_trend_data_path, index=False)
    print(f"✅ Successfully saved 'combined_df' to: {gender_trend_data_path.name}")
except NameError:
    print("❌ Error: 'combined_df' not found. Please run Cell 7 first.")
except Exception as e:
    print(f"❌ An error occurred while saving combined_df: {e}")

print("\nAll necessary data has been saved.")

Saving processed data to: C:\Users\drrahman\wiki-gaps-project\data\processed
✅ Successfully saved 'df_filtered' to: dashboard_main_data.parquet
✅ Successfully saved 'bio_by_year_continent' to: dashboard_rep_gap_data.csv
✅ Successfully saved 'combined_df' to: dashboard_gender_trend_data.csv

All necessary data has been saved.
