In [107]:
import pandas as pd 
from rapidfuzz import process, fuzz
import re
import unicodedata

In [108]:
authors_institution_df = pd.read_csv('data_A_Z.csv')

authors_institution_df['share_pct'] = authors_institution_df['share_pct'].fillna(100)
authors_institution_df['institution'].value_counts().head(10)

institution
Institute of Labor Economics (IZA)                                                          952
Joint Research Centre European Commission                                                   403
Banca d'Italia                                                                              375
CESifo                                                                                      341
European Central Bank                                                                       276
Paris School of Economics                                                                   214
Banque de France                                                                            199
Leibniz-Zentrum für Europäische Wirtschaftsforschung (ZEW)                                  190
ifo Institut - Leibniz-Institut für Wirtschaftsforschung an der Universität München e.V.    190
Deutsche Bundesbank                                                                         178
Name: count, dtype: int64

isaure

In [109]:
top10_institutions = (
    authors_institution_df['institution']
    .value_counts()
    .head(10)
    .index
)

top10_institutions_df = authors_institution_df[
    authors_institution_df['institution'].isin(top10_institutions)
]

In [110]:
top10_institutions_df

Unnamed: 0,author_name,short_id,institution,share_pct,location,repec_institution_id,author_url
12,"Antipa, Pamfili",pan377,Banque de France,80%,"Paris, France",edi:bdfgvfr,https://ideas.repec.org/f/pan377.html
16,"Antman, Francisca M.",pan425,Institute of Labor Economics (IZA),5%,"Bonn, Germany",edi:izaaade,https://ideas.repec.org/f/pan425.html
58,"Antonioli, Federico",pan678,Joint Research Centre European Commission,100,"Sevilla, Spain",edi:ipjrces,https://ideas.repec.org/f/pan678.html
59,"Albanese, Giuseppe",pal552,Banca d'Italia,100,"Roma, Italy",edi:bdigvit,https://ideas.repec.org/f/pal552.html
94,"Albertazzi, Ugo",pal705,European Central Bank,50%,"Frankfurt am Main, Germany",edi:emieude,https://ideas.repec.org/f/pal705.html
...,...,...,...,...,...,...,...
29263,"Zweimuller, Josef",pzw2,Institute of Labor Economics (IZA),1%,"Bonn, Germany",edi:izaaade,https://ideas.repec.org/e/pzw2.html
29266,"Zwick, Thomas",pzw6,Leibniz-Zentrum für Europäische Wirtschaftsfor...,42%,"Mannheim, Germany",edi:zemande,https://ideas.repec.org/e/pzw6.html
29271,"Zerpa, Mariana",pze64,Institute of Labor Economics (IZA),10%,"Bonn, Germany",edi:izaaade,https://ideas.repec.org/e/pze64.html
29286,"Zylberberg, Andre",pzy2,Paris School of Economics,50%,"Paris, France",edi:eeparfr,https://ideas.repec.org/e/pzy2.html


In [111]:
A_B_df = pd.read_csv('RePEc_Full_A_B.csv')
E_Z_df = pd.read_csv('RePEc_Full_E_Z.csv')
D_df = pd.read_csv('RePEc-D-not-full.csv')
S_Y_df = pd.read_csv('RePEc-from-S-to-Y-full.csv')
J_df = pd.read_csv('RePEc_J.csv')

In [112]:
JEL_df = pd.concat(
    [A_B_df, D_df, E_Z_df, S_Y_df, J_df],
    axis=0,
    ignore_index=True
)

JEL_df = JEL_df.drop_duplicates()

In [113]:
JEL_df

Unnamed: 0,JEL Subject,Title,Author(s),Journal,Year,Type,Affiliations,URL
0,A,Working Paper Series,,,,Working Paper,,https://econpapers.repec.org/paper/
1,A,Journals,,,,Journal Article,,https://econpapers.repec.org/article/
2,A,Preparing students for careers using business ...,Nielsen Erland Hejn; Nielsen Steen,,2020.0,Working Paper,Erland Hejn Nielsen: Department of Economics a...,https://econpapers.repec.org/paper/aahaarhec/2...
3,A,"Measuring Democracy - Eight indices: Polity, F...",Paldam Martin,,2021.0,Working Paper,Martin Paldam: Department of Economics and Bus...,https://econpapers.repec.org/paper/aahaarhec/2...
4,A,Oeconstudiet og den ÃÂ¸konomiske faggruppe ve...,Hylleberg Svend,,2023.0,Working Paper,Svend Hylleberg: Department of Economics and B...,https://econpapers.repec.org/paper/aahaarhec/2...
...,...,...,...,...,...,...,...,...
183358,K,Firmâs Credit Risk in the Presence of Market...,Xing Haipeng; Yu Yang,Risks,2018.0,Journal Article,Haipeng Xing: Department of Applied Mathematic...,https://econpapers.repec.org/article/gamjrisks...
183359,K,Overdispersed-Poisson Model in Claims Reservin...,Strascia Stefano Cavastracci; Tripodi Agostino,Risks,2018.0,Journal Article,"Stefano Cavastracci Strascia: IVASS, Prudentia...",https://econpapers.repec.org/article/gamjrisks...
183360,K,A General Framework for Portfolio Theory. Part...,Maier-Paape Stanislaus; Zhu Qiji Jim,Risks,2018.0,Journal Article,Stanislaus Maier-Paape: Institut fÃ¼r Mathemat...,https://econpapers.repec.org/article/gamjrisks...
183361,K,Calendar Spread Exchange Options Pricing with ...,Hainaut Donatien,Risks,2018.0,Journal Article,"Donatien Hainaut: Institute of Statistics, Bio...",https://econpapers.repec.org/article/gamjrisks...


In [114]:
sum(JEL_df["Author(s)"].isna())
JEL_df = JEL_df.dropna(subset=["Author(s)"])

JEL_df = (
    JEL_df
        .assign(**{
            "Author(s)": JEL_df["Author(s)"].str.split(";")
        })
        .explode("Author(s)")
        .assign(**{
            "Author(s)": lambda x: x["Author(s)"].str.strip()
        })
        .reset_index(drop=True)
)

In [115]:
def strip_accents(s: str) -> str:
    s = unicodedata.normalize("NFKD", s)
    return "".join(ch for ch in s if not unicodedata.combining(ch))

def normalize_whitespace(s: str) -> str:
    return re.sub(r"\s+", " ", s).strip()

def standardize_name(raw):
    """
    Returns a cleaned full-name string with consistent formatting.
    Handles:
      - casefold/lowercase
      - accents removed
      - punctuation removed
      - 'Last, First' -> 'First Last'
      - extra spaces collapsed
    """
    if pd.isna(raw):
        return ""

    s = str(raw).strip()
    if s == "":
        return ""

    s = strip_accents(s).lower()

    # Convert "Last, First Middle" -> "First Middle Last"
    if "," in s:
        parts = [p.strip() for p in s.split(",", 1)]
        if len(parts) == 2 and parts[0] and parts[1]:
            s = f"{parts[1]} {parts[0]}"

    # Replace punctuation with spaces (keeps letters/numbers)
    s = re.sub(r"[^\w\s]", " ", s)

    # Collapse whitespace
    s = normalize_whitespace(s)

    return s




In [116]:
JEL_df_std = pd.DataFrame({
    "author_std": JEL_df["Author(s)"].apply(standardize_name)
})

JEL_df_std = JEL_df.copy()

JEL_df_std["author_std"] = (
    JEL_df_std["Author(s)"]
        .astype(str)
        .apply(standardize_name)
)

JEL_df_std = JEL_df_std.drop(columns="Author(s)")

In [117]:
#Do fuzzy matching for the authors in df_Authors to find the match in df_A_B_Code
#Obtain matching scores 
choices = JEL_df_std["author_std"].astype(str).unique().tolist()

def fuzzy_match_one(name):
    if pd.isna(name) or str(name).strip() == "":
        return pd.Series([None, 0, None],
                         index=["matched_author", "match_score", "matched_index"])

    match, score, idx = process.extractOne(str(name), choices, scorer=fuzz.WRatio)
    return pd.Series([match, score, idx],
                     index=["matched_author", "match_score", "matched_index"])

top10_institutions_df[["matched_author", "match_score", "matched_index"]] = (
    top10_institutions_df["author_name"].apply(fuzzy_match_one)
)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top10_institutions_df[["matched_author", "match_score", "matched_index"]] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top10_institutions_df[["matched_author", "match_score", "matched_index"]] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top10_institutions_df[["matched_author", "match_s

In [120]:
top10_institutions_df = top10_institutions_df.sort_values(
    by="match_score",
    ascending=False
)


#top10_institutions_df[top10_institutions_df["match_score"] >= 75]

In [122]:
top10_institutions_df.to_csv('top10_institutions_df.csv')