In [None]:
import re
import unicodedata
import pandas as pd


In [None]:
# Load the author–institution table; missing `share_pct` values are filled with 100.
authors_institution_df = (
    pd.read_csv('data_A_Z.csv')
        .assign(share_pct=lambda frame: frame['share_pct'].fillna(100))
)

# Ten most frequent values of the `institution` column.
authors_institution_df['institution'].value_counts().head(10)

In [None]:
# Read the five RePEc extracts (same read order as before), one frame per file.
A_B_df, E_Z_df, D_df, S_Y_df, J_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'RePEc_Full_A_B.csv',
        'RePEc_Full_E_Z.csv',
        'RePEc-D-not-full.csv',
        'RePEc-from-S-to-Y-full.csv',
        'RePEc_J.csv',
    )
)

In [None]:
# Stack the five extracts row-wise with a fresh index, then drop exact duplicate rows.
JEL_df = pd.concat(
    objs=[A_B_df, D_df, E_Z_df, S_Y_df, J_df],
    axis="index",
    ignore_index=True,
).drop_duplicates()

In [None]:
# Preview the first five author names of the institution table.
authors_institution_df["author_name"].head(5)

In [None]:
# Preview the first 100 raw "Author(s)" values of the JEL table.
JEL_df["Author(s)"].iloc[:100]

In [None]:
# Keep only the rows whose "Author(s)" value is not missing.
JEL_df = JEL_df.loc[JEL_df["Author(s)"].notna()]


In [None]:
# One row per individual author: split "Author(s)" on ";", explode the list,
# trim each name, drop empty names, then put the result back under the
# original "Author(s)" column name (it ends up as the last column).
JEL_df = (
    JEL_df
    .assign(Author=lambda frame: frame["Author(s)"].str.split(";"))
    .explode("Author")
    .assign(Author=lambda frame: frame["Author"].str.strip())
    .loc[lambda frame: frame["Author"].ne("")]
    .drop(columns=["Author(s)"])
    .rename(columns={"Author": "Author(s)"})
    .reset_index(drop=True)
)


In [None]:
# Check the exploded author column: first 20 individual names.
JEL_df["Author(s)"].iloc[:20]


In [None]:
def strip_accents(s: str) -> str:
    """Return `s` NFKD-decomposed with every Unicode combining mark removed."""
    decomposed = unicodedata.normalize("NFKD", s)
    kept = [char for char in decomposed if unicodedata.combining(char) == 0]
    return "".join(kept)

def normalize_whitespace(s: str) -> str:
    """Collapse each run of whitespace in `s` to one space and trim both ends."""
    collapsed = re.sub(r"\s+", " ", s)
    return collapsed.strip()

def standardize_name(raw):
    """Return a normalised, comparison-friendly form of an author name.

    Steps, in order: missing or blank input yields ""; accents are stripped
    and the text is lower-cased; a "Last, First Middle" form (split on the
    first comma) is turned into "First Middle Last"; every character that is
    neither a word character nor whitespace becomes a space; whitespace runs
    are collapsed and the result is trimmed.
    """
    # Missing values (None or anything pd.isna flags) have no name.
    if raw is None or pd.isna(raw):
        return ""

    text = str(raw).strip()
    if not text:
        return ""

    # Lower-case, accent-free text.
    text = strip_accents(text).lower()

    # "Last, First Middle" -> "First Middle Last" (only the first comma counts).
    if "," in text:
        family, _, given = text.partition(",")
        text = given.strip() + " " + family.strip()

    # Replace punctuation with spaces (word characters and whitespace are kept).
    text = re.sub(r"[^\w\s]", " ", text)

    # Clean up spacing.
    return normalize_whitespace(text)


In [None]:
# Standardised author name for every JEL row.
# The raw values are passed straight to `standardize_name`: casting the column
# with `.astype(str)` first would turn a missing value into the literal string
# "nan", bypassing the function's own missing-value guard (which returns "").
JEL_df["author_std"] = JEL_df["Author(s)"].apply(standardize_name)


In [None]:
# Standardised author name for every institution row.
# The raw values are passed straight to `standardize_name`: casting the column
# with `.astype(str)` first would turn a missing name into the literal string
# "nan" (standardised to "nan"), instead of the "" that the function returns
# for missing input.
authors_institution_df["author_std"] = (
    authors_institution_df["author_name"].apply(standardize_name)
)


In [None]:
# Raw vs. standardised author names, first 15 JEL rows.
JEL_df.loc[:, ["Author(s)", "author_std"]].head(15)


In [None]:
# Raw vs. standardised author names, first 15 institution rows.
authors_institution_df.loc[:, ["author_name", "author_std"]].head(15)


In [None]:
def author_key_unordered(raw):
    """Build an order-insensitive matching key for an author name.

    The name is standardised with `standardize_name`, split on whitespace,
    and its tokens are sorted and joined with "|". An empty standardised
    name yields "".
    """
    name = standardize_name(raw)
    if not name:
        return ""

    # Sorting the tokens makes "first last" and "last first" share one key.
    return "|".join(sorted(name.split()))


In [None]:
# Order-insensitive matching key on both tables.
# The raw values are passed straight to `author_key_unordered`: casting with
# `.astype(str)` first would turn a missing name into the literal string "nan"
# and give it the key "nan" instead of the empty key.
JEL_df["author_key_unordered"] = JEL_df["Author(s)"].apply(author_key_unordered)

authors_institution_df["author_key_unordered"] = (
    authors_institution_df["author_name"].apply(author_key_unordered)
)


In [None]:
# Raw name, standardised name and unordered key, first 15 institution rows.
authors_institution_df.loc[
    :, ["author_name", "author_std", "author_key_unordered"]
].head(15)

In [None]:
# Raw name, standardised name and unordered key, first 15 JEL rows.
JEL_df.loc[:, ["Author(s)", "author_std", "author_key_unordered"]].head(15)

In [None]:
JEL_matched[["Author(s)", "author_name", "author_key_unordered"]].head(15)

In [None]:
JEL_matched[JEL_matched["author_name"]=="Blanchard, Olivier J"]


In [None]:
authors_institution_df[authors_institution_df["author_name"]=="Blanchard, Olivier J"]

In [None]:
JEL_df[JEL_df["Author(s)"]=="Blanchard Olivier"]

In [None]:
JEL_matched = (
    JEL_df
        .merge(
            authors_institution_df,
            on="author_key_unordered",
            how="inner",
            suffixes=("_jel", "_inst")
        )
)


On regarde maintenant plus en détail les auteurs non matchés.


In [None]:
def author_key_unordered_no_initial(raw):
    """Build an order-insensitive key that ignores single-character tokens.

    The name is standardised with `standardize_name`; tokens of length 1 are
    dropped; the remaining tokens are sorted and joined with "|". Returns ""
    when the standardised name is empty or only has one-character tokens.
    """
    name = standardize_name(raw)
    if not name:
        return ""

    # Drop one-letter tokens, then sort so that token order does not matter.
    kept = sorted(token for token in name.split() if len(token) > 1)
    return "|".join(kept)


In [None]:
# Second matching key (one-character tokens ignored) on both tables.
JEL_df["author_key_unordered2"] = JEL_df["Author(s)"].map(
    author_key_unordered_no_initial
)

authors_institution_df["author_key_unordered2"] = authors_institution_df[
    "author_name"
].map(author_key_unordered_no_initial)


In [None]:
# Inner join on the initial-free key.
# JEL rows with an empty key are excluded first: a name made only of
# one-character tokens (or of punctuation) yields "", and so does a missing
# institution-side name, so empty keys would otherwise join unrelated rows.
JEL_matched_v2 = (
    JEL_df
        .loc[lambda frame: frame["author_key_unordered2"].ne("")]
        .merge(
            authors_institution_df,
            on="author_key_unordered2",
            how="inner",
            suffixes=("_jel", "_inst")
        )
)


In [None]:
# Number of distinct (non-missing) institution-side author names that matched.
len(JEL_matched_v2["author_name"].dropna().unique())


In [None]:
# Keys that found at least one institution row in the second merge.
matched_keys = set(JEL_matched_v2["author_key_unordered2"].unique())

# Independent copy of the JEL rows whose key is not among the matched keys.
JEL_unmatched = JEL_df.loc[
    lambda frame: ~frame["author_key_unordered2"].isin(matched_keys)
].copy()


In [None]:
# Distribution of the number of tokens per unmatched standardised name.
(
    JEL_unmatched["author_std"]
    .str.split()
    .str.len()
    .value_counts()
    .sort_index()
)


In [None]:
# Institution rows whose name contains both "isabelle" and "salle" (case-insensitive).
authors_institution_df.loc[
    lambda frame: frame["author_name"].str.contains("isabelle", case=False, na=False)
    & frame["author_name"].str.contains("salle", case=False, na=False)
]


In [None]:
# Reproducible sample of 30 unmatched rows (seed 42) for eyeballing.
JEL_unmatched.sample(n=30, random_state=42).loc[
    :, ["Author(s)", "author_std", "author_key_unordered2"]
]


In [None]:
def author_tokens(raw):
    """Return the set of tokens longer than one character in the standardised name.

    An empty standardised name yields an empty set.
    """
    name = standardize_name(raw)
    if not name:
        return set()

    tokens = set()
    for token in name.split():
        # One-character tokens are skipped.
        if len(token) > 1:
            tokens.add(token)
    return tokens


In [None]:
# Token sets for the unmatched JEL rows and for every institution row.
JEL_unmatched["tokens"] = JEL_unmatched["Author(s)"].map(author_tokens)
authors_institution_df["tokens"] = authors_institution_df["author_name"].map(
    author_tokens
)


In [None]:
def lastname_from_tokens(tokens):
    """Return the longest token, used as a proxy for the family name.

    Parameters
    ----------
    tokens : collection of str
        Name tokens (typically a set). May be empty.

    Returns
    -------
    str
        "" when `tokens` is empty; otherwise the longest token. When several
        tokens share the maximum length, the lexicographically greatest one
        is returned.
    """
    if not tokens:
        return ""
    # Ties on length are broken on the token text itself. With key=len alone,
    # the winner among equal-length tokens depends on iteration order, which
    # for a set of strings is not stable across sets or interpreter runs.
    return max(tokens, key=lambda token: (len(token), token))


In [None]:
# Family-name proxy (longest token) for every institution row.
authors_institution_df["lastname"] = authors_institution_df["tokens"].apply(lastname_from_tokens)

# Index: lastname -> list of (token set, original author_name) candidates.
# Rows without a last name carry "" (not NaN), so they are filtered on "" —
# a `dropna` would keep them. The groups are iterated directly instead of
# going through `groupby(...).apply(...)` on the whole frame.
inst_by_lastname = {
    lastname: list(zip(group["tokens"], group["author_name"]))
    for lastname, group in (
        authors_institution_df
            .loc[authors_institution_df["lastname"] != ""]
            .groupby("lastname")
    )
}


In [None]:
def subset_match_author(row):
    """Return the first institution author whose tokens contain, or are contained in, the row's.

    `row` must expose a "tokens" set. Candidates are the `inst_by_lastname`
    entries filed under the row's last-name proxy (`lastname_from_tokens`).
    Returns the candidate's original name, or None when the row has no
    tokens, the last name is unknown, or no candidate satisfies the
    inclusion rule.
    """
    jel_tokens = row["tokens"]
    if not jel_tokens:
        return None

    candidates = inst_by_lastname.get(lastname_from_tokens(jel_tokens), ())

    # Key rule: one token set must be included in the other; first hit wins.
    return next(
        (
            candidate_name
            for candidate_tokens, candidate_name in candidates
            if jel_tokens <= candidate_tokens or candidate_tokens <= jel_tokens
        ),
        None,
    )


In [None]:
# Row-wise subset matching: institution author name found for each unmatched row (None if none).
JEL_unmatched["author_name_subset"] = JEL_unmatched.apply(
    lambda row: subset_match_author(row),
    axis=1,
)


In [None]:
# Spot check on names containing "Salle" (case-insensitive): tokens and subset-match result.
JEL_unmatched.loc[
    JEL_unmatched["Author(s)"].str.contains("Salle", case=False, na=False),
    ["Author(s)", "tokens", "author_name_subset"],
]


In [None]:
# Number of distinct institution authors recovered by the subset rule.
# `nunique()` already ignores missing values; calling it on `.notna()` would
# only count the distinct booleans (at most 2), not the matched authors.
JEL_unmatched["author_name_subset"].nunique()


In [None]:
JEL_unmatched.col