In [33]:
import pandas as pd
from rapidfuzz import process, fuzz
import re
import unicodedata

In [None]:
# Load the two datasets:
#   df_Authors  <- "data_A_Z.csv"
#   df_A_B_Code <- "RePEc_Full_A_B.csv"
# Both paths are relative to the notebook's working directory.
df_Authors = pd.read_csv("data_A_Z.csv")
df_A_B_Code = pd.read_csv("RePEc_Full_A_B.csv")

In [None]:
# Identify rows with a missing "Author(s)" value and drop them.
# The count is printed explicitly: a bare expression in the middle of a cell
# is evaluated but never displayed, so the original check was silently lost.
n_missing_authors = int(df_A_B_Code["Author(s)"].isna().sum())
print(f"Rows with missing Author(s): {n_missing_authors}")
df_A_B_Code = df_A_B_Code.dropna(subset=["Author(s)"])

In [None]:
# Split the "Author(s)" field on ";" and duplicate the paper's row once per
# author, so every row ends up holding exactly one (whitespace-trimmed) name.
df_A_B_Code = (
    df_A_B_Code
        .assign(**{
            "Author(s)": lambda x: x["Author(s)"].str.split(";")
        })
        .explode("Author(s)")
        .assign(**{
            "Author(s)": lambda x: x["Author(s)"].str.strip()
        })
        # A trailing or doubled ";" (e.g. "A; B;") yields an empty piece after
        # splitting; drop those so no row carries a blank author name.
        # Missing values compare unequal to "" and are therefore left alone.
        .loc[lambda x: x["Author(s)"].ne("")]
        .reset_index(drop=True)
)

In [1]:
def strip_accents(s: str) -> str:
    """Return ``s`` with Unicode combining marks removed.

    The text is first decomposed with NFKD normalization, which separates
    base characters from their combining marks; every character whose
    combining class is non-zero is then dropped.
    """
    decomposed = unicodedata.normalize("NFKD", s)
    base_chars = [c for c in decomposed if unicodedata.combining(c) == 0]
    return "".join(base_chars)

def normalize_whitespace(s: str) -> str:
    """Collapse each run of whitespace in ``s`` to one space and trim both ends."""
    collapsed = re.sub(r"\s+", " ", s)
    return collapsed.strip()

def standardize_name(raw):
    """
    Return a cleaned full-name string with consistent formatting.

    Steps, in order:
      - missing values (per ``pd.isna``) and blank strings -> ""
      - accents removed (``strip_accents``)
      - case-folded with ``str.casefold`` (so e.g. "ß" becomes "ss",
        which plain ``lower()`` would leave untouched)
      - "Last, First Middle" -> "First Middle Last" (split on the first
        comma only, and only when both sides are non-empty)
      - punctuation, including underscores, replaced by spaces
      - runs of whitespace collapsed (``normalize_whitespace``)

    Parameters
    ----------
    raw : any scalar
        The raw name; non-string values are converted with ``str``.

    Returns
    -------
    str
        The standardized name, or "" when the input is missing or blank.
    """
    if pd.isna(raw):
        return ""

    s = str(raw).strip()
    if not s:
        return ""

    s = strip_accents(s).casefold()

    # Convert "Last, First Middle" -> "First Middle Last".
    # "," is known to be present, so split(",", 1) always yields two pieces.
    if "," in s:
        last, first = (p.strip() for p in s.split(",", 1))
        if last and first:
            s = f"{first} {last}"

    # Replace punctuation with spaces. "\w" also matches "_", so the
    # underscore is listed separately to leave only letters and digits.
    s = re.sub(r"[^\w\s]|_", " ", s)

    # Collapse whitespace left behind by the replacements above.
    return normalize_whitespace(s)

def author_key(raw):
    """
    Robust matching key: "<lastname> <firstinitials>"

    Examples:
      "John Ronald Reuel Tolkien"       -> "tolkien jrr"
      "Prados de la Escosura, Leandro"  -> "prados de la escosura l"

    When the raw value is written as "Last, First", everything before the
    first comma is taken as the surname. This keeps compound surnames intact
    instead of treating only the final token as the last name. Without a
    comma, the final token of the standardized name is used as the surname.

    Returns "" for missing/blank input, and the single token itself when the
    standardized name has only one token.
    """
    s = standardize_name(raw)
    if not s:
        return ""

    text = str(raw)
    if "," in text:
        surname_raw, given_raw = text.split(",", 1)
        surname = standardize_name(surname_raw)
        # Any further commas in the given-name part (e.g. suffixes) are turned
        # into spaces so standardize_name does not reorder that part again.
        given = standardize_name(given_raw.replace(",", " "))
        if surname and given:
            initials = "".join(tok[0] for tok in given.split())
            return f"{surname} {initials}"
        # One side was empty after cleaning: fall through to the generic path.

    parts = s.split()
    if len(parts) == 1:
        return parts[0]

    last = parts[-1]
    # str.split() never yields empty tokens, so every part has a first char.
    initials = "".join(p[0] for p in parts[:-1])
    return f"{last} {initials}"


In [2]:
# Build a one-column frame ("author_std") holding the standardized form of
# every df_Authors["author_name"] value.
# NOTE(review): the recorded output of this cell is a NameError for `pd`;
# the import cell has to be executed in the same kernel session first.
author_std_values = df_Authors["author_name"].apply(standardize_name)
df_Authors_std = pd.DataFrame({"author_std": author_std_values})

NameError: name 'pd' is not defined

In [38]:
# Work on a copy so this cell does not add columns to df_Authors itself.
df_Authors_std = df_Authors.copy()

# standardize_name already maps missing values to "" and converts other
# values with str(). Casting the column to str beforehand would turn NaN into
# the literal text "nan", which would then be treated as a real author name
# and fuzzy-matched downstream, so the values are passed through unchanged.
df_Authors_std["author_std"] = (
    df_Authors_std["author_name"]
        .apply(standardize_name)
)


In [42]:
# Reference list to match against (standardized author names).
# The filtered Series is kept alongside the plain list so that a position in
# `choices` can be mapped back to the matching row label: once dropna()
# removes a row, positions in `choices` no longer line up with positions in
# df_A_B_Code_std.index.
# NOTE(review): df_A_B_Code_std is not created in the cells visible here —
# confirm it is defined (with an "author_std" column) before this cell runs.
choice_series = (
    df_A_B_Code_std["author_std"]
        .dropna()
        .astype(str)
)
choices = choice_series.tolist()

_MATCH_COLS = ["matched_author_std", "match_score", "matched_pos"]

def match_std_name(name):
    """
    Find the best fuzzy match for one standardized name in `choices`.

    Returns a pd.Series indexed by "matched_author_std", "match_score" and
    "matched_pos" (the position of the match inside `choices`). Missing or
    blank names, and the case where no match is returned, give
    (None, 0, None).
    """
    if pd.isna(name) or str(name).strip() == "":
        return pd.Series([None, 0, None], index=_MATCH_COLS)
    m = process.extractOne(str(name), choices, scorer=fuzz.WRatio)
    if m is None:
        # extractOne yields None when there is nothing to match against.
        return pd.Series([None, 0, None], index=_MATCH_COLS)
    return pd.Series([m[0], m[1], m[2]], index=_MATCH_COLS)

df_Authors_std[_MATCH_COLS] = (
    df_Authors_std["author_std"].apply(match_std_name)
)

# Translate the list position into the row label of the matched record.
# int() is needed because the column may hold floats once it contains
# missing values, and Index objects only accept integer positions.
df_Authors_std["matched_df_index"] = df_Authors_std["matched_pos"].map(
    lambda p: choice_series.index[int(p)] if pd.notna(p) else None
)


In [47]:
# Order the standardized matches from best to worst score, then display the
# rows whose score is at least 95.
df_Authors_std_sorted = df_Authors_std.sort_values(
    by="match_score",
    ascending=False,
)
df_Authors_std_sorted.loc[df_Authors_std_sorted["match_score"] >= 95]

Unnamed: 0,author_name,short_id,institution,share_pct,location,repec_institution_id,author_url,matched_author,match_score,matched_index,author_std,matched_author_std,matched_pos,matched_df_index
1150,"Andrei, Tudorel",pan198,"Facultatea de Cibernetica, Statistica şi Infor...",,"Bucureşti, Romania",edi:feasero,https://ideas.repec.org/e/pan198.html,Andrei Tudorel,100.0,13960,tudorel andrei,tudorel andrei,40969,40969
25037,"Schlepper, Marcel",psc950,ifo Institut - Leibniz-Institut für Wirtschaft...,90%,"München, Germany",edi:ifooode,https://ideas.repec.org/f/psc950.html,Marcel Schlepper,100.0,32690,marcel schlepper,marcel schlepper,40340,40340
24698,"Streb, Jochen",pst382,Abteilung für Volkswirtschaftslehre Universitä...,,"Mannheim, Germany",edi:fvmande,https://ideas.repec.org/f/pst382.html,Jochen Streb,100.0,42900,jochen streb,jochen streb,59284,59284
26261,"Tillmann, Peter",pti99,Fachbereich Wirtschaftswissenschaften Justus-L...,,"Gießen, Germany",edi:fwgiede,https://ideas.repec.org/e/pti99.html,Peter Tillmann,100.0,16311,peter tillmann,peter tillmann,19811,19811
4458,"Condrea, Elena",pco736,Facultatea de Ştiinţe Economice Universitatea ...,,"Constanta, Romania",edi:feoviro,https://ideas.repec.org/f/pco736.html,Elena Condrea,100.0,10674,elena condrea,elena condrea,12536,12536
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
380,"Argandona, Antonio",par213,IESE Business School Universidad de Navarra,,"Barcelona, Spain",edi:ienaves,https://ideas.repec.org/f/par213.html,Argandona Antonio,95.0,4147,antonio argandona,argandona antonio,4733,4733
1048,"Ausloos, Marcel",pau47,Academia de Studii Economice din Bucureşti,50%,"Bucureşti, Romania",edi:aseeero,https://ideas.repec.org/e/pau47.html,Ausloos Marcel,95.0,2870,marcel ausloos,ausloos marcel,3322,3322
1677,"Bagus, Philipp",pba545,Departamento de Economía Aplicada I Universida...,,"Madrid, Spain",edi:darjces,https://ideas.repec.org/f/pba545.html,Bagus Philipp,95.0,34878,philipp bagus,bagus philipp,44870,44870
1148,"Andrei, Liviu Catalin",pan318,Facultatea de Administraţie Publică Şcoala Naţ...,,"Bucureşti, Romania",edi:fasnsro,https://ideas.repec.org/f/pan318.html,Andrei Liviu Catalin,95.0,21664,liviu catalin andrei,andrei liviu catalin,26566,26566


In [22]:
# Fuzzy-match every author in df_Authors against the distinct raw author
# strings of df_A_B_Code and record the best candidate with its score.
choices = df_A_B_Code["Author(s)"].dropna().astype(str).unique().tolist()

def fuzzy_match_one(name):
    """
    Best WRatio match for `name` among `choices`.

    Returns a pd.Series with "matched_author", "match_score" and
    "matched_index" (the position of the match inside `choices`);
    missing or blank names give (None, 0, None).
    """
    labels = ["matched_author", "match_score", "matched_index"]

    is_blank = pd.isna(name) or str(name).strip() == ""
    if is_blank:
        return pd.Series([None, 0, None], index=labels)

    best = process.extractOne(str(name), choices, scorer=fuzz.WRatio)
    match, score, idx = best
    return pd.Series([match, score, idx], index=labels)

df_Authors[["matched_author", "match_score", "matched_index"]] = df_Authors[
    "author_name"
].apply(fuzzy_match_one)

In [26]:
# Best matches first: order df_Authors by descending match_score.
df_Authors_sorted = df_Authors.sort_values("match_score", ascending=False)

In [32]:
# Display the raw-name matches whose score is at least 87.
df_Authors_sorted.loc[df_Authors_sorted["match_score"] >= 87]


Unnamed: 0,author_name,short_id,institution,share_pct,location,repec_institution_id,author_url,matched_author,match_score,matched_index
28881,"Zabala Iturriagagoitia, Jon Mikel",pza45,Facultad de Ciencias Económicas y Empresariale...,,"Bilbao/San Sebastián, Spain",edi:fsdeues,https://ideas.repec.org/e/pza45.html,Zabala Iturriagagoitia Jon Mikel,98.461538,43110
4541,"Constantinescu, Lucretia Mariana",pco601,"Departamentul Management, Marketing Facultatea...",,"Targoviste, Romania",edi:dmvalro,https://ideas.repec.org/f/pco601.html,Constantinescu Lucretia Mariana,98.412698,5786
16701,"Montero Ledezma de Hagerf, Paola",pmo1568,Konkurrensverket Government of Sweden,,"Stockholm, Sweden",edi:kkvgvse,https://ideas.repec.org/f/pmo1568.html,Montero Ledezma de Hagerf Paola,98.412698,43411
18088,"Milczarek-Andrzejewska, Dominika",pmi438,Wydział Nauk Ekonomicznych Uniwersytet Warszawski,,"Warszawa, Poland",edi:fesuwpl,https://ideas.repec.org/f/pmi438.html,Milczarek-Andrzejewska Dominika,98.412698,33036
20864,"Prados de la Escosura, Leandro",ppr58,Instituto Figuerola de Historia y Ciencias Soc...,50%,"Madrid, Spain",edi:ilfhees,https://ideas.repec.org/e/ppr58.html,Prados de la Escosura Leandro,98.305085,34363
...,...,...,...,...,...,...,...,...,...,...
6159,"D'Alessio, Giovanni",pda166,Banca d'Italia,,"Roma, Italy",edi:bdigvit,https://ideas.repec.org/e/pda166.html,DâAlessio Giovanni,87.179487,35507
23110,"Soares, Ana Cristina",pso359,Banco de Portugal,,"Lisboa, Portugal",edi:bdpgvpt,https://ideas.repec.org/f/pso359.html,Torres Ana Cristina,87.179487,27702
21115,"Pattarin, Francesco",ppa329,Centro Studi di Banca e Finanza (CEFIN) Dipart...,,"Modena, Italy",edi:cbmodit,https://ideas.repec.org/f/ppa329.html,Cattabrini Francesco,87.179487,13201
11270,"Holtemoeller, Oliver",pho140,Leibniz-Institut für Wirtschaftsforschung Hall...,50%,"Halle, Germany",edi:iwhhhde,https://ideas.repec.org/e/pho140.html,HoltemÃ¶ller Oliver,87.179487,43397


In [None]:
# Look up the row(s) of df2 with this exact title.
# NOTE(review): df2 is not created in the cells visible here — confirm where
# it comes from before relying on this cell.
df2.loc[df2["Title"] == "Mass Reproducibility and Replicability: A New Hope"]