In [2]:
import pandas as pd
from pathlib import Path

# Paths: raw input CSV and the derived-data folder (created on demand)
raw_path = Path("../data/raw/international_bestsellers.csv")
out_dir = Path("../data/derived")
out_dir.mkdir(parents=True, exist_ok=True)
out_path = out_dir / "domestic_share_top10_marketgroup_yearly_2013_2022.csv"

# Load
df = pd.read_csv(raw_path)

# Keep only the markets used in this analysis
keep_countries = ["Germany", "France", "Italy", "Spain", "United States"]
df = df[df["country"].isin(keep_countries)].copy()

# Parse year (date is like YYYY-MM-DD); unparseable dates become NaT
df["date"] = pd.to_datetime(df["date"], errors="coerce")
df["year"] = df["date"].dt.year

# Time range, both ends inclusive. Rows whose date failed to parse have a
# NaN year, compare as False, and are dropped here.
df = df[df["year"].between(2013, 2022)].copy()

# A single NaT upstream turns the whole year column into float, which would
# leak into the group keys and the written CSV as "2013.0". No NaN is left
# after the range filter, so casting back to int is safe.
df["year"] = df["year"].astype(int)

# Top-10 only
df = df[df["rank"].between(1, 10)].copy()

# Domestic flag:
# nationality can contain multiple values separated by ';'
def is_domestic(nationality, market_country):
    """Tell whether one of the listed nationalities equals the market country.

    ``nationality`` is treated as a ';'-separated list; each entry is
    stripped of surrounding whitespace and compared to ``market_country``
    exactly (case-sensitive).

    Returns ``pd.NA`` when either argument is missing, otherwise ``True``
    or ``False``.
    """
    # Missing information on either side means the row cannot be classified.
    if any(pd.isna(value) for value in (nationality, market_country)):
        return pd.NA

    entries = str(nationality).split(";")
    return any(entry.strip() == market_country for entry in entries)

# Flag every row as domestic (True), foreign (False) or unknown (pd.NA).
# Iterating over the two columns gives the same values as
# DataFrame.apply(axis=1) without building a Series per row, and it still
# yields a (zero-length) column when df is empty, where apply(axis=1) would
# hand back an empty DataFrame that cannot be assigned to a single column.
df["domestic"] = [
    is_domestic(nationality, country)
    for nationality, country in zip(df["nationality"], df["country"])
]

# Valid rows = domestic is True/False (not missing)
df_valid = df[df["domestic"].notna()].copy()

# Country-year domestic share (not pooled across countries yet)
country_year = (
    df_valid
    .groupby(["year", "country"], as_index=False)
    .agg(
        # number of classifiable top-10 rows in this country and year
        n_top10_valid=("domestic", "size"),
        # only True/False values remain here, so a bool cast followed by a
        # sum counts the domestic rows
        n_domestic=("domestic", lambda flags: int(flags.astype(bool).sum())),
    )
)
country_year["domestic_share_top10_pct"] = (
    100 * country_year["n_domestic"] / country_year["n_top10_valid"]
)

# Germany and the United States are reported as their own market groups,
# labelled with the country name.
germany = country_year[country_year["country"] == "Germany"].assign(
    market_group="Germany"
)
us = country_year[country_year["country"] == "United States"].assign(
    market_group="United States"
)

# EU peers (avg): the share is the unweighted mean of the country-level shares
# of France, Italy and Spain (whichever of them have rows in a given year),
# not a pooled ratio. The two count columns, in contrast, are plain sums, so
# for this group the share is generally not 100 * n_domestic / n_top10_valid.
eu_peers = country_year[
    country_year["country"].isin(["France", "Italy", "Spain"])
].copy()

eu_avg = eu_peers.groupby("year", as_index=False).agg(
    domestic_share_top10_pct=("domestic_share_top10_pct", "mean"),
    n_top10_valid=("n_top10_valid", "sum"),
    n_domestic=("n_domestic", "sum"),
)
eu_avg["market_group"] = "EU peers (avg)"

# Final table: stack the three market groups with an identical column layout,
# then order the rows by year and, within a year, by group label.
final_columns = [
    "year",
    "market_group",
    "domestic_share_top10_pct",
    "n_top10_valid",
    "n_domestic",
]
stacked = pd.concat(
    [germany[final_columns], eu_avg[final_columns], us[final_columns]],
    ignore_index=True,
)
final = stacked.sort_values(["year", "market_group"])

# Persist without the index column, then show a short preview.
final.to_csv(out_path, index=False)
print("Wrote:", out_path)
# NOTE(review): display is not imported in this cell; presumably the
# IPython/Jupyter built-in. Confirm before running as a plain script.
display(final.head(10))

Wrote: ../data/derived/domestic_share_top10_marketgroup_yearly_2013_2022.csv


Unnamed: 0,year,market_group,domestic_share_top10_pct,n_top10_valid,n_domestic
10,2013,EU peers (avg),59.920635,200,122
0,2013,Germany,28.571429,70,20
20,2013,United States,80.0,70,56
11,2014,EU peers (avg),67.777778,360,244
1,2014,Germany,32.5,120,39
21,2014,United States,85.833333,120,103
12,2015,EU peers (avg),70.277778,360,253
2,2015,Germany,33.333333,120,40
22,2015,United States,83.333333,120,100
13,2016,EU peers (avg),71.267507,359,256
