In [None]:
import pandas as pd
from pathlib import Path

# Input: raw bestseller entries. Output: yearly women-share table.
RAW_PATH = Path("../data/raw/international_bestsellers.csv")
OUT_PATH = Path("../data/derived/women_share_rank1_vs_top10_yearly.csv")

# Only rows whose "country" value is in this list are kept.
countries = ["France", "Germany", "Italy", "Spain", "United States"]

df = pd.read_csv(RAW_PATH)

# Basic cleanup
# Unparseable dates become NaT instead of raising.
df["date"] = pd.to_datetime(df["date"], errors="coerce")

# Keep the selected countries and drop rows whose date could not be parsed.
# A NaT date has no year: leaving it in would turn the whole "year" column
# into floats (written to CSV as e.g. "2013.0"), and such rows would be
# discarded by any groupby on "year" anyway.
keep_rows = df["country"].isin(countries) & df["date"].notna()
df = df[keep_rows].copy()

# With NaT rows gone, the year can safely be stored as a plain integer.
df["year"] = df["date"].dt.year.astype(int)

# --- Gender normalization ---
# gender examples: "w", "m", "m; w", "w; w", "m; m", sometimes missing
def gender_bucket(g):
    """Collapse a raw gender annotation into one coarse label.

    The raw value is read as a ';'-separated list of codes (for example
    "w", "m", "m; w"). Codes are compared case-insensitively with
    surrounding whitespace removed, and empty pieces are ignored.

    Returns one of:
        "female_only" -- the only code present is "w"
        "male_only"   -- the only code present is "m"
        "mixed"       -- both "w" and "m" are present (and no "n")
        "unknown"     -- the value is missing, contains "n", or matches
                         none of the cases above
    """
    if pd.isna(g):
        return "unknown"

    # Gather the distinct, normalized codes found in the annotation.
    codes = set()
    for piece in str(g).split(";"):
        code = piece.strip().lower()
        if code:
            codes.add(code)

    # An "n" code marks the entry as unknown, whatever else is listed.
    if "n" in codes:
        return "unknown"
    if codes == {"w"}:
        return "female_only"
    if codes == {"m"}:
        return "male_only"
    if {"w", "m"} <= codes:
        return "mixed"
    return "unknown"

# Label every row with its coarse gender bucket.
df["gender_bucket"] = df["gender"].apply(gender_bucket)

# "Women share" is computed only on entries where gender is unambiguous
# (female_only vs male_only); mixed and unknown rows are excluded from both
# the numerator and the denominator.
df_valid = df[df["gender_bucket"].isin(["female_only", "male_only"])].copy()
df_valid["is_woman"] = (df_valid["gender_bucket"] == "female_only").astype(int)

# Metric 1: women share at rank #1
# The mean of the 0/1 flag is the share; the size is the number of valid rows.
rank1 = df_valid[df_valid["rank"] == 1].groupby(["country", "year"]).agg(
    women_share_rank1=("is_woman", "mean"),
    n_rank1_valid=("is_woman", "size")
).reset_index()

# Metric 2: women share across ranks #1-10 (both bounds inclusive)
top10 = df_valid[df_valid["rank"].between(1, 10)].groupby(["country", "year"]).agg(
    women_share_top10=("is_woman", "mean"),
    n_top10_valid=("is_woman", "size")
).reset_index()

# Merge
# Outer join so a (country, year) present in only one table is still kept.
out = pd.merge(rank1, top10, on=["country", "year"], how="outer")

# A (country, year) missing from one side gets NaN in that side's columns,
# which would silently turn the integer counts into floats. A missing count
# means "no valid rows", so store 0 and keep the columns integer-typed.
# The corresponding share is left as NaN, since it is undefined.
count_cols = ["n_rank1_valid", "n_top10_valid"]
out[count_cols] = out[count_cols].fillna(0).astype(int)

# Reorder columns to match the CSV schema
out = out[["year", "country", "women_share_rank1", "women_share_top10", "n_rank1_valid", "n_top10_valid"]]
out = out.sort_values(["country", "year"])

# Write
OUT_PATH.parent.mkdir(parents=True, exist_ok=True)
out.to_csv(OUT_PATH, index=False)

# Notebook display: first and last rows plus the overall shape.
out.head(10), out.tail(10), out.shape


(   year country  women_share_rank1  women_share_top10  n_rank1_valid  \
 0  2013  France           0.333333           0.400000              6   
 1  2014  France           0.416667           0.313559             12   
 2  2015  France           0.500000           0.425000             12   
 3  2016  France           0.500000           0.403361             12   
 4  2017  France           0.500000           0.521008             12   
 5  2018  France           0.416667           0.449153             12   
 6  2019  France           0.166667           0.457627             12   
 7  2020  France           0.090909           0.305085             11   
 8  2021  France           0.250000           0.474576             12   
 9  2022  France           0.181818           0.373832             11   
 
    n_top10_valid  
 0             60  
 1            118  
 2            120  
 3            119  
 4            119  
 5            118  
 6            118  
 7            118  
 8            1