In [1]:
import pandas as pd 
from rapidfuzz import process, fuzz
import re
import unicodedata

In [2]:
authors_institution_df = pd.read_csv('data_A_Z.csv')

authors_institution_df['share_pct'] = authors_institution_df['share_pct'].fillna(100)
authors_institution_df['institution'].value_counts().head(10)

institution
Institute of Labor Economics (IZA)                                                          952
Joint Research Centre European Commission                                                   403
Banca d'Italia                                                                              375
CESifo                                                                                      341
European Central Bank                                                                       276
Paris School of Economics                                                                   214
Banque de France                                                                            199
ifo Institut - Leibniz-Institut für Wirtschaftsforschung an der Universität München e.V.    190
Leibniz-Zentrum für Europäische Wirtschaftsforschung (ZEW)                                  190
Deutsche Bundesbank                                                                         178
Name: count, dtype: int64

isaure

In [3]:
top10_institutions = (
    authors_institution_df['institution']
    .value_counts()
    .head(10)
    .index
)

top10_institutions_df = authors_institution_df[
    authors_institution_df['institution'].isin(top10_institutions)
]

In [4]:
top10_institutions_df

Unnamed: 0,author_name,short_id,institution,share_pct,location,repec_institution_id,author_url
12,"Antipa, Pamfili",pan377,Banque de France,80%,"Paris, France",edi:bdfgvfr,https://ideas.repec.org/f/pan377.html
16,"Antman, Francisca M.",pan425,Institute of Labor Economics (IZA),5%,"Bonn, Germany",edi:izaaade,https://ideas.repec.org/f/pan425.html
58,"Antonioli, Federico",pan678,Joint Research Centre European Commission,100,"Sevilla, Spain",edi:ipjrces,https://ideas.repec.org/f/pan678.html
59,"Albanese, Giuseppe",pal552,Banca d'Italia,100,"Roma, Italy",edi:bdigvit,https://ideas.repec.org/f/pal552.html
94,"Albertazzi, Ugo",pal705,European Central Bank,50%,"Frankfurt am Main, Germany",edi:emieude,https://ideas.repec.org/f/pal705.html
...,...,...,...,...,...,...,...
29263,"Zweimuller, Josef",pzw2,Institute of Labor Economics (IZA),1%,"Bonn, Germany",edi:izaaade,https://ideas.repec.org/e/pzw2.html
29266,"Zwick, Thomas",pzw6,Leibniz-Zentrum für Europäische Wirtschaftsfor...,42%,"Mannheim, Germany",edi:zemande,https://ideas.repec.org/e/pzw6.html
29271,"Zerpa, Mariana",pze64,Institute of Labor Economics (IZA),10%,"Bonn, Germany",edi:izaaade,https://ideas.repec.org/e/pze64.html
29286,"Zylberberg, Andre",pzy2,Paris School of Economics,50%,"Paris, France",edi:eeparfr,https://ideas.repec.org/e/pzy2.html


In [5]:
A_B_df = pd.read_csv('RePEc_Full_A_B.csv')
E_Z_df = pd.read_csv('RePEc_Full_E_Z.csv')
D_df = pd.read_csv('RePEc-D-not-full.csv')
S_Y_df = pd.read_csv('RePEc-from-S-to-Y-full.csv')
J_df = pd.read_csv('RePEc_J.csv')

In [6]:
JEL_df = pd.concat(
    [A_B_df, D_df, E_Z_df, S_Y_df, J_df],
    axis=0,
    ignore_index=True
)

JEL_df = JEL_df.drop_duplicates()

In [7]:
JEL_df

Unnamed: 0,JEL Subject,Title,Author(s),Journal,Year,Type,Affiliations,URL
0,A,Working Paper Series,,,,Working Paper,,https://econpapers.repec.org/paper/
1,A,Journals,,,,Journal Article,,https://econpapers.repec.org/article/
2,A,Preparing students for careers using business ...,Nielsen Erland Hejn; Nielsen Steen,,2020.0,Working Paper,Erland Hejn Nielsen: Department of Economics a...,https://econpapers.repec.org/paper/aahaarhec/2...
3,A,"Measuring Democracy - Eight indices: Polity, F...",Paldam Martin,,2021.0,Working Paper,Martin Paldam: Department of Economics and Bus...,https://econpapers.repec.org/paper/aahaarhec/2...
4,A,Oeconstudiet og den ÃÂ¸konomiske faggruppe ve...,Hylleberg Svend,,2023.0,Working Paper,Svend Hylleberg: Department of Economics and B...,https://econpapers.repec.org/paper/aahaarhec/2...
...,...,...,...,...,...,...,...,...
183358,K,Firmâs Credit Risk in the Presence of Market...,Xing Haipeng; Yu Yang,Risks,2018.0,Journal Article,Haipeng Xing: Department of Applied Mathematic...,https://econpapers.repec.org/article/gamjrisks...
183359,K,Overdispersed-Poisson Model in Claims Reservin...,Strascia Stefano Cavastracci; Tripodi Agostino,Risks,2018.0,Journal Article,"Stefano Cavastracci Strascia: IVASS, Prudentia...",https://econpapers.repec.org/article/gamjrisks...
183360,K,A General Framework for Portfolio Theory. Part...,Maier-Paape Stanislaus; Zhu Qiji Jim,Risks,2018.0,Journal Article,Stanislaus Maier-Paape: Institut fÃ¼r Mathemat...,https://econpapers.repec.org/article/gamjrisks...
183361,K,Calendar Spread Exchange Options Pricing with ...,Hainaut Donatien,Risks,2018.0,Journal Article,"Donatien Hainaut: Institute of Statistics, Bio...",https://econpapers.repec.org/article/gamjrisks...


In [8]:
sum(JEL_df["Author(s)"].isna())
JEL_df = JEL_df.dropna(subset=["Author(s)"])

JEL_df = (
    JEL_df
        .assign(**{
            "Author(s)": JEL_df["Author(s)"].str.split(";")
        })
        .explode("Author(s)")
        .assign(**{
            "Author(s)": lambda x: x["Author(s)"].str.strip()
        })
        .reset_index(drop=True)
)

In [12]:
def strip_accents(s: str) -> str:
    s = unicodedata.normalize("NFKD", s)
    return "".join(ch for ch in s if not unicodedata.combining(ch))

def normalize_whitespace(s: str) -> str:
    return re.sub(r"\s+", " ", s).strip()

def standardize_name(raw):
    """
    Returns a cleaned full-name string with consistent formatting.
    Handles:
      - casefold/lowercase
      - accents removed
      - punctuation removed
      - 'Last, First' -> 'First Last'
      - extra spaces collapsed
    """
    if pd.isna(raw):
        return ""

    s = str(raw).strip()
    if s == "":
        return ""

    s = strip_accents(s).lower()

    # Convert "Last, First Middle" -> "First Middle Last"
    if "," in s:
        parts = [p.strip() for p in s.split(",", 1)]
        if len(parts) == 2 and parts[0] and parts[1]:
            s = f"{parts[1]} {parts[0]}"

    # Replace punctuation with spaces (keeps letters/numbers)
    s = re.sub(r"[^\w\s]", " ", s)

    # Collapse whitespace
    s = normalize_whitespace(s)

    return s

def author_key(raw):
    """
    Robust matching key: "<lastname> <firstinitials>"
    Example: "John Ronald Reuel Tolkien" -> "tolkien jrr"
    """
    s = standardize_name(raw)
    if not s:
        return ""

    parts = s.split()
    if len(parts) == 1:
        return parts[0]

    last = parts[-1]
    initials = "".join(p[0] for p in parts[:-1] if p)
    return f"{last} {initials}"


In [13]:
JEL_df_std = pd.DataFrame({
    "author_std": JEL_df["Author(s)"].apply(standardize_name)
})

JEL_df_std = JEL_df.copy()

JEL_df_std["author_std"] = (
    JEL_df_std["Author(s)"]
        .astype(str)
        .apply(standardize_name)
)

In [14]:
JEL_df_std

Unnamed: 0,JEL Subject,Title,Author(s),Journal,Year,Type,Affiliations,URL,author_std
0,A,Preparing students for careers using business ...,Nielsen Erland Hejn,,2020.0,Working Paper,Erland Hejn Nielsen: Department of Economics a...,https://econpapers.repec.org/paper/aahaarhec/2...,nielsen erland hejn
1,A,Preparing students for careers using business ...,Nielsen Steen,,2020.0,Working Paper,Erland Hejn Nielsen: Department of Economics a...,https://econpapers.repec.org/paper/aahaarhec/2...,nielsen steen
2,A,"Measuring Democracy - Eight indices: Polity, F...",Paldam Martin,,2021.0,Working Paper,Martin Paldam: Department of Economics and Bus...,https://econpapers.repec.org/paper/aahaarhec/2...,paldam martin
3,A,Oeconstudiet og den ÃÂ¸konomiske faggruppe ve...,Hylleberg Svend,,2023.0,Working Paper,Svend Hylleberg: Department of Economics and B...,https://econpapers.repec.org/paper/aahaarhec/2...,hylleberg svend
4,A,Digital Tools in the Educational Environment E...,Andra Diaconescu,Research & Education,2024.0,Journal Article,Diaconescu Andra: Politehnica University of Ti...,https://econpapers.repec.org/article/aaijournl...,andra diaconescu
...,...,...,...,...,...,...,...,...,...
355589,K,A General Framework for Portfolio Theory. Part...,Zhu Qiji Jim,Risks,2018.0,Journal Article,Stanislaus Maier-Paape: Institut fÃ¼r Mathemat...,https://econpapers.repec.org/article/gamjrisks...,zhu qiji jim
355590,K,Calendar Spread Exchange Options Pricing with ...,Hainaut Donatien,Risks,2018.0,Journal Article,"Donatien Hainaut: Institute of Statistics, Bio...",https://econpapers.repec.org/article/gamjrisks...,hainaut donatien
355591,K,On Fund Mapping Regressions Applied to Segrega...,Trottier Denis-Alexandre,Risks,2018.0,Journal Article,Denis-Alexandre Trottier: FacultÃ© des Science...,https://econpapers.repec.org/article/gamjrisks...,trottier denis alexandre
355592,K,On Fund Mapping Regressions Applied to Segrega...,Godin FrÃ©dÃ©ric,Risks,2018.0,Journal Article,Denis-Alexandre Trottier: FacultÃ© des Science...,https://econpapers.repec.org/article/gamjrisks...,godin fra da ric


In [39]:
#Do fuzzy matching for the authors in df_Authors to find the match in df_A_B_Code
#Obtain matching scores 
choices = (
    top10_institutions_df["author_name"]
        .astype(str)
        .unique()
        .tolist()
)

def fuzzy_match_one(name):
    if (
        pd.isna(name)
        or str(name).strip() == ""
        or len(str(name).strip()) < 4
    ):
        return pd.Series(
            [None, 0, None],
            index=["matched_author", "match_score", "matched_index"]
        )

    match, score, idx = process.extractOne(
        str(name),
        choices,
        scorer=fuzz.WRatio
    )

    return pd.Series(
        [match, score, idx],
        index=["matched_author", "match_score", "matched_index"]
    )

JEL_df_std[["matched_author", "match_score", "matched_index"]] = (
    JEL_df_std["author_std"].apply(fuzzy_match_one)
)

In [40]:
JEL_df_std["author_name"] = JEL_df_std["matched_index"].apply(
    lambda i: choices[int(i)] if pd.notna(i) else None
)


In [None]:
JEL_df_std["matched_author"].dropna()

KeyError: 'matched_author'

In [52]:
JEL_df_std.columns

AttributeError: 'Series' object has no attribute 'columns'

In [55]:
JEL_df_std[["author_std", "author_name","Author(s)", "matched_author", "match_score"]].head(10)


KeyError: "None of [Index(['author_std', 'author_name', 'Author(s)', 'matched_author',\n       'match_score'],\n      dtype='object')] are in the [index]"

In [34]:
JEL_df_std[JEL_df_std["Author(s)"]=="E."].head()

Unnamed: 0,JEL Subject,Title,Author(s),Journal,Year,Type,Affiliations,URL,author_std,matched_author,match_score,matched_index,author_name
288703,Z,Testing for Peer Effects Using Genetic Data,E.,,2018.0,Working Paper,,https://econpapers.repec.org/paper/yorhectdg/1...,e,"Chen, Xi",90.0,545.0,"Chen, Xi"
91861,D,Irrational Behavior of Youth When Taking Finan...,E.,International Journal of Economics & Business ...,2019.0,Journal Article,,https://econpapers.repec.org/article/ersijebaa...,e,"Chen, Xi",90.0,545.0,"Chen, Xi"


In [31]:
JEL_df_std[JEL_df_std["Author(s)"]=="Andi"].head()

Unnamed: 0,JEL Subject,Title,Author(s),Journal,Year,Type,Affiliations,URL,author_std,matched_author,match_score,matched_index,author_name
204001,E,41 Decisiones inaplazables para BogotÃ¡. La ru...,Andi,,2019.0,Working Paper,,https://econpapers.repec.org/paper/col000124/0...,andi,"Gandil, Mikkel",90.0,1104.0,"Gandil, Mikkel"
167505,E,41 Decisiones inaplazables para BogotÃ¡. La ru...,Andi,Coyuntura EconÃ³mica,2018.0,Journal Article,,https://econpapers.repec.org/article/col000438...,andi,"Gandil, Mikkel",90.0,1104.0,"Gandil, Mikkel"


In [21]:
JEL_df_std = JEL_df_std.sort_values(
    by="match_score",
    ascending=False
)

JEL_df_std
JEL_df_std[JEL_df_std["match_score"] >= 77]


Unnamed: 0,JEL Subject,Title,Author(s),Journal,Year,Type,Affiliations,URL,author_std,matched_author,match_score,matched_index,author_name
316378,J,"Informal Markets, Domestic Production and Dema...",Gardes Francois,Economics Bulletin,2017.0,Journal Article,Armagan Aktuna-gunes: Paris School of Economic...,https://econpapers.repec.org/article/eblecbull...,gardes francois,"Gardes, francois",90.322581,1154.0,"Gardes, francois"
95460,D,"Informal Markets, Domestic Production and Dema...",Gardes Francois,Economics Bulletin,2017.0,Journal Article,Armagan Aktuna-gunes: Paris School of Economic...,https://econpapers.repec.org/article/eblecbull...,gardes francois,"Gardes, francois",90.322581,1154.0,"Gardes, francois"
312742,J,SecciÃ³n de opiniÃ³n: Confebask y los sindicat...,Lab,EKONOMIAZ. Revista vasca de EconomÃ­a,1995.0,Journal Article,,https://econpapers.repec.org/article/ekzekonoz...,lab,"Colabella, Andrea",90.000000,482.0,"Colabella, Andrea"
312740,J,SecciÃ³n de opiniÃ³n: Confebask y los sindicat...,Ela,EKONOMIAZ. Revista vasca de EconomÃ­a,1995.0,Journal Article,,https://econpapers.repec.org/article/ekzekonoz...,ela,"Arlia, Daniela",90.000000,42.0,"Arlia, Daniela"
288703,Z,Testing for Peer Effects Using Genetic Data,E.,,2018.0,Working Paper,,https://econpapers.repec.org/paper/yorhectdg/1...,e,"Chen, Xi",90.000000,545.0,"Chen, Xi"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
214812,E,"Moderates Wachstum, Steigende Arbeitslosigkeit",Sandqvist Anna,KOF Analysen,2015.0,Journal Article,Yngve Abrahamsen: KOF Swiss Economic Institute...,https://econpapers.repec.org/article/kofanskof...,sandqvist anna,"Sandqvist, Anna Pauliina",77.142857,2570.0,"Sandqvist, Anna Pauliina"
289008,Z,Early-life Circumstances Predict Measures of T...,Winter Joachim,,2016.0,Working Paper,,https://econpapers.repec.org/paper/cesceswps/_...,winter joachim,"Winter, Joachim Klaus",77.142857,3013.0,"Winter, Joachim Klaus"
89403,D,The nexus of electricity infrastructure invest...,Tian Qi,Utilities Policy,2024.0,Journal Article,,https://econpapers.repec.org/article/eeejuipol...,tian qi,"Braun, Sebastian Till",77.142857,269.0,"Braun, Sebastian Till"
290138,Z,Experimental Effects of an Absent Crowd on Per...,Singleton Carl,,2020.0,Working Paper,Peter Dolton: University of Sussex,https://econpapers.repec.org/paper/izaizadps/d...,singleton carl,"Singleton, Carl Andrew",77.142857,2717.0,"Singleton, Carl Andrew"
