In [1]:
import re
import unicodedata
import pandas as pd
import pathlib


In [2]:

from pathlib import Path

# -----------------------
# BASE PATH
# -----------------------
BASE_PATH = Path("JEL-code-Data")

# -----------------------
# Institutions
# -----------------------
authors_institution_df = pd.read_csv("data_A_Z.csv")
authors_institution_df["share_pct"] = authors_institution_df["share_pct"].fillna(100)

# -----------------------
# RePEc / JEL
# -----------------------
A_B_df = pd.read_csv(BASE_PATH / "RePEc_A_B_Full.csv")
C_df   = pd.read_csv(BASE_PATH / "RePEc_C_Full.csv")
D_df   = pd.read_csv(BASE_PATH / "RePEc_D_Full.csv")
E_Z_df = pd.read_csv(BASE_PATH / "RePEc_E_Z_Full.csv")
F_df   = pd.read_csv(BASE_PATH / "RePEc_F_Full.csv")
G_df   = pd.read_csv(BASE_PATH / "RePEc_G_Full.csv")
H_df   = pd.read_csv(BASE_PATH / "RePec_H_Full.csv")
I_df   = pd.read_csv(BASE_PATH / "RePEc_I_Full.csv")
J_df   = pd.read_csv(BASE_PATH / "RePec_J_Full.csv")
L_df   = pd.read_csv(BASE_PATH / "RePEc_L_Full.csv")
M_df   = pd.read_csv(BASE_PATH / "RePEc_M_Full.csv")
N_df   = pd.read_csv(BASE_PATH / "RePEc_N_Full.csv")
O_df   = pd.read_csv(BASE_PATH / "RePec_O_Not_Full.csv")
P_df   = pd.read_csv(BASE_PATH / "RePec_P_Full.csv")
Q1_df   = pd.read_csv(BASE_PATH / "RePec_Q_1.csv")    
Q2_df   = pd.read_csv(BASE_PATH / "RePec_Q_2.csv")      
Q3_df   = pd.read_csv(BASE_PATH / "RePec_Q_3.csv")  
R_df   = pd.read_csv(BASE_PATH / "RePEc_R_Full.csv")
Y_df   = pd.read_csv(BASE_PATH / "RePEc_Y_Full.csv")

# -----------------------
# CONCAT FINAL
# -----------------------
JEL_df = (
    pd.concat(
        [
            A_B_df, C_df, D_df, E_Z_df, F_df, G_df, H_df, I_df,
            J_df, L_df, M_df, N_df, O_df, P_df, Q1_df,Q2_df, Q3_df, R_df, Y_df
        ],
        ignore_index=True
    )
    .drop_duplicates()
)


# explosion des co-auteurs

In [3]:
JEL_df = (
    JEL_df
        .dropna(subset=["Author(s)"])
        .assign(Author=lambda x: x["Author(s)"].str.split(";"))
        .explode("Author")
        .assign(Author=lambda x: x["Author"].str.strip())
        .query("Author != ''")
        .drop(columns=["Author(s)"])
        .rename(columns={"Author": "Author(s)"})
        .reset_index(drop=True)
)


# nettoyage des noms

In [4]:
def strip_accents(s: str) -> str:
    s = unicodedata.normalize("NFKD", s)
    return "".join(ch for ch in s if not unicodedata.combining(ch))

def normalize_whitespace(s: str) -> str:
    return re.sub(r"\s+", " ", s).strip()

def standardize_name(raw):
    if raw is None or pd.isna(raw):
        return ""
    s = strip_accents(str(raw).lower())
    if "," in s:
        last, rest = s.split(",", 1)
        s = rest.strip() + " " + last.strip()
    s = re.sub(r"[^\w\s]", " ", s)
    return normalize_whitespace(s)


# standardisation des 2 côtés

In [5]:
JEL_df["author_std"] = JEL_df["Author(s)"].apply(standardize_name)
authors_institution_df["author_std"] = authors_institution_df["author_name"].apply(standardize_name)


# clé auteur principal (ordre libre)

In [6]:
def author_key_unordered_no_initial(raw):
    s = standardize_name(raw)
    if not s:
        return ""
    tokens = [t for t in s.split() if len(t) > 1]
    return "|".join(sorted(tokens))


In [7]:
JEL_df["author_key"] = JEL_df["Author(s)"].apply(author_key_unordered_no_initial)
authors_institution_df["author_key"] = authors_institution_df["author_name"].apply(author_key_unordered_no_initial)


In [8]:
JEL_matched = (
    JEL_df
        .merge(
            authors_institution_df,
            on="author_key",
            how="inner",
            suffixes=("_jel", "_inst")
        )
)

JEL_matched["match_type"] = "exact"


In [9]:
JEL_matched["author_name"].nunique()

16184

In [10]:
JEL_matched

Unnamed: 0.2,JEL Subject,Title,Journal,Year,Type,Affiliations,URL,Unnamed: 0.1,Unnamed: 0,lettre,...,author_key,author_name,short_id,institution,share_pct,location,repec_institution_id,author_url,author_std_inst,match_type
0,A,Preparing students for careers using business ...,,2020.0,Working Paper,Erland Hejn Nielsen: Department of Economics a...,https://econpapers.repec.org/paper/aahaarhec/2...,,,,...,nielsen|steen,"Nielsen, Steen",pni71,Institut for Økonomi Aarhus Universitet,100,"Aarhus, Denmark",edi:ifoaudk,https://ideas.repec.org/e/pni71.html,steen nielsen,exact
1,A,"Measuring Democracy - Eight indices: Polity, F...",,2021.0,Working Paper,Martin Paldam: Department of Economics and Bus...,https://econpapers.repec.org/paper/aahaarhec/2...,,,,...,martin|paldam,"Paldam, Martin",ppa574,School of Economics and Management Institut fo...,100,"Aarhus, Denmark",edi:anaaudk,https://ideas.repec.org/f/ppa574.html,martin paldam,exact
2,A,Oeconstudiet og den ÃÂ¸konomiske faggruppe ve...,,2023.0,Working Paper,Svend Hylleberg: Department of Economics and B...,https://econpapers.repec.org/paper/aahaarhec/2...,,,,...,hylleberg|svend,"Hylleberg, Svend",phy1,Center for Research in Econometric Analysis of...,100,"Aarhus, Denmark",edi:creaudk,https://ideas.repec.org/e/phy1.html,svend hylleberg,exact
3,A,How Do Truckers Perceive and Respond to the Ri...,AEI Economic Perspectives,2024.0,Journal Article,Michael Strain: American Enterprise Institute,https://econpapers.repec.org/article/aeijournl...,,,,...,michael|strain,"Strain, Michael R.",pst593,Institute of Labor Economics (IZA),3%,"Bonn, Germany",edi:izaaade,https://ideas.repec.org/f/pst593.html,michael r strain,exact
4,A,Architecture to Transform Classic Academic Cou...,The AMFITEATRU ECONOMIC journal,2024.0,Journal Article,Andrei Bobocea: Bucharest University of Econom...,https://econpapers.repec.org/article/aesamfeco...,,,,...,batagan|lorena,"Batagan, Lorena",pba893,Academia de Studii Economice din Bucureşti,100,"Bucureşti, Romania",edi:aseeero,https://ideas.repec.org/f/pba893.html,lorena batagan,exact
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
541865,Y,Gross Job Flows for the U.S. Manufacturing Sec...,,2006.0,Working Paper,,https://econpapers.repec.org/paper/cenwpaper/0...,,,Y,...,haltiwanger|john,"Haltiwanger, John",pha231,Institute of Labor Economics (IZA),1%,"Bonn, Germany",edi:izaaade,https://ideas.repec.org/f/pha231.html,john haltiwanger,exact
541866,Y,Herausforderungen bei der Messung von Wohlfahrt,Zeitschrift fÃ¼r Wirtschaftspolitik,2018.0,Journal Article,"Raffer Christian: Hertie School of Governance,...",https://econpapers.repec.org/article/luszwipol...,,,Y,...,beate|jochimsen,"Jochimsen, Beate",pjo236,Fachbereich Wirtschaftswissenschaften Hochschu...,100,"Berlin, Germany",edi:fhwbede,https://ideas.repec.org/f/pjo236.html,beate jochimsen,exact
541867,Y,Modelling perceived value as a driver of touri...,,2020.0,Working Paper,,https://econpapers.repec.org/paper/pramprapa/1...,,,Y,...,andrea|guizzardi,"Guizzardi, Andrea",pgu865,"Dipartimento di Scienze Statistiche ""Paolo For...",100,"Bologna, Italy",edi:dsbolit,https://ideas.repec.org/f/pgu865.html,andrea guizzardi,exact
541868,Y,Modelling perceived value as a driver of touri...,,2020.0,Working Paper,,https://econpapers.repec.org/paper/pramprapa/1...,,,Y,...,annalisa|stacchini,"Stacchini, Annalisa",pst909,"Dipartimento di Scienze Statistiche ""Paolo For...",100,"Bologna, Italy",edi:dsbolit,https://ideas.repec.org/f/pst909.html,annalisa stacchini,exact


# travail sur les non matchés à ce stade

In [11]:
matched_keys = set(JEL_matched["author_key"])

JEL_unmatched = (
    JEL_df[
        ~JEL_df["author_key"].isin(matched_keys)
    ]
    .copy()
)


In [12]:
def author_tokens(raw):
    s = standardize_name(raw)
    if not s:
        return set()
    return {t for t in s.split() if len(t) > 1}


In [13]:
JEL_unmatched["tokens"] = JEL_unmatched["Author(s)"].apply(author_tokens)
authors_institution_df["tokens"] = authors_institution_df["author_name"].apply(author_tokens)



In [14]:
inst_tokens_list = list(
    zip(
        authors_institution_df["tokens"],
        authors_institution_df["author_name"]
    )
)


In [15]:
from collections import defaultdict

token_index = defaultdict(list)

for _, row in authors_institution_df.iterrows():
    inst_tokens = row["tokens"]
    if len(inst_tokens) < 2:
        continue

    for tok in inst_tokens:
        if len(tok) > 3:
            token_index[tok].append((inst_tokens, row["author_name"]))



In [16]:
def subset_match_simple_fast(row):
    jel_tokens = row["tokens"]
    if len(jel_tokens) < 2:
        return None

    # tokens longs côté JEL
    long_tokens = [t for t in jel_tokens if len(t) > 3]
    if len(long_tokens) < 2:
        return None

    # candidats = institutions partageant ≥1 token long
    candidates = []
    for tok in long_tokens:
        candidates.extend(token_index.get(tok, []))

    for inst_tokens, inst_name in candidates:
        # mots communs longs
        common = {
            t for t in jel_tokens & inst_tokens
            if len(t) > 3
        }

        if len(common) < 2:
            continue

        # inclusion stricte
        if jel_tokens.issubset(inst_tokens) or inst_tokens.issubset(jel_tokens):
            return inst_name

    return None




In [17]:
JEL_unmatched["author_name_subset_strict"] = (
    JEL_unmatched.apply(subset_match_simple_fast, axis=1)
)



In [18]:
subset_simple = JEL_unmatched[
    JEL_unmatched["author_name_subset_strict"].notna()
]


In [19]:
subset_simple[
    ["Author(s)", "author_name_subset_strict"]
].drop_duplicates().sample(50, random_state=0)


Unnamed: 0,Author(s),author_name_subset_strict
775906,de Jesus Ratinho Lopes Arranhado Ramalho Esmer...,"Ramalho, Esmeralda A."
598763,Nguyen Hoang Phong,"Nguyen, Hoang"
1992300,Manitiu Dorel,"Manitiu, Dorel Nicolae"
275183,Osuntuyi Ayokunle,"Osuntuyi, Ayokunle Anthony"
1101116,Rendon Jose Maria Ruiz,"Maria, Jose R."
787773,Cardenas Luis,"Cardenas del Rey, Luis"
1622743,MuÃ±oz-Torres Francisco,"Torres, Francisco"
2441923,Anwar Muhammad Mushahid,"Anwar, Muhammad"
2564680,Tantau Adrian,"Tantau, Adrian Dumitru"
2614586,Torres-Ruiz Francisco JosÃ©,"Torres, Francisco"


In [20]:
subset_simple = subset_simple.copy()

subset_simple["author_name"] = subset_simple["author_name_subset_strict"]


In [21]:
subset_simple = subset_simple.drop(columns=["author_name_subset_strict"])


In [22]:
subset_simple.head()


Unnamed: 0.2,JEL Subject,Title,Journal,Year,Type,Affiliations,URL,Unnamed: 0.1,Unnamed: 0,lettre,Author(s),author_std,author_key,tokens,author_name
206,A,Influencing ESG Perception in SMEs through CSR...,The AMFITEATRU ECONOMIC journal,2024.0,Journal Article,Jaroslav Belas: Alexander Dubcek University in...,https://econpapers.repec.org/article/aesamfeco...,,,,Balcerzak Adam P.,balcerzak adam p,adam|balcerzak,"{adam, balcerzak}","Balcerzak, Adam P. Sr."
220,A,Size of Government and Economic Growth: A Conv...,The AMFITEATRU ECONOMIC journal,2024.0,Journal Article,Juan David Garcia Gonzalez: University of Alme...,https://econpapers.repec.org/article/aesamfeco...,,,,Manso Jose Ramos Pires,manso jose ramos pires,jose|manso|pires|ramos,"{pires, jose, manso, ramos}","Ramos, Jose"
258,A,Online Teaching Practices and the Effectivenes...,The AMFITEATRU ECONOMIC journal,2020.0,Journal Article,Cristina Venera Tartavulea: The Bucharest Univ...,https://econpapers.repec.org/article/aesamfeco...,,,,Albu Catalin Nicolae,albu catalin nicolae,albu|catalin|nicolae,"{catalin, nicolae, albu}","Albu, Nicolae"
319,A,ESG rating of capital's effect on firms' finan...,Access Journal,2024.0,Journal Article,Sholpan Shalbayeva: Almaty Management Universi...,https://econpapers.repec.org/article/aipaccess...,,,,Ismailov Taner,ismailov taner,ismailov|taner,"{ismailov, taner}","Ismailov, Taner Mustafov"
403,A,A NEW CHALLENGE â INTELLECTUAL CAPITAL EVALU...,Revista Tinerilor Economisti (The Young Econom...,2014.0,Journal Article,Ph. D Student Maria-Luminita Gogan: âPoliteh...,https://econpapers.repec.org/article/aiorteyej...,,,,Gogan Ph. D Student Maria-Luminita,gogan ph d student maria luminita,gogan|luminita|maria|ph|student,"{gogan, student, ph, maria, luminita}","Gogan, Luminita Maria"


In [23]:
subset_simple_merged = (
    subset_simple
        .merge(
            authors_institution_df,
            on="author_name",
            how="inner",
            suffixes=("_jel", "_inst")
        )
)




In [24]:
subset_simple_merged.shape



(72610, 24)

In [25]:
subset_simple_merged.head(50)


Unnamed: 0.2,JEL Subject,Title,Journal,Year,Type,Affiliations,URL,Unnamed: 0.1,Unnamed: 0,lettre,...,author_name,short_id,institution,share_pct,location,repec_institution_id,author_url,author_std_inst,author_key_inst,tokens_inst
0,A,Influencing ESG Perception in SMEs through CSR...,The AMFITEATRU ECONOMIC journal,2024.0,Journal Article,Jaroslav Belas: Alexander Dubcek University in...,https://econpapers.repec.org/article/aesamfeco...,,,,...,"Balcerzak, Adam P. Sr.",pba1247,Wydział Nauk Ekonomicznych Universytet Warmińs...,100,"Olsztyn, Poland",edi:wnuwmpl,https://ideas.repec.org/f/pba1247.html,adam p sr balcerzak,adam|balcerzak|sr,"{adam, balcerzak, sr}"
1,A,Size of Government and Economic Growth: A Conv...,The AMFITEATRU ECONOMIC journal,2024.0,Journal Article,Juan David Garcia Gonzalez: University of Alme...,https://econpapers.repec.org/article/aesamfeco...,,,,...,"Ramos, Jose",pra393,Instituto Valenciano de Investigaciones Económ...,100,"Valencia, Spain",edi:ievages,https://ideas.repec.org/f/pra393.html,jose ramos,jose|ramos,"{jose, ramos}"
2,A,Online Teaching Practices and the Effectivenes...,The AMFITEATRU ECONOMIC journal,2020.0,Journal Article,Cristina Venera Tartavulea: The Bucharest Univ...,https://econpapers.repec.org/article/aesamfeco...,,,,...,"Albu, Nicolae",pal605,Facultatea de Ştiinţe Economice Universitatea ...,100,"Brasov, Romania",edi:fetbvro,https://ideas.repec.org/f/pal605.html,nicolae albu,albu|nicolae,"{nicolae, albu}"
3,A,ESG rating of capital's effect on firms' finan...,Access Journal,2024.0,Journal Article,Sholpan Shalbayeva: Almaty Management Universi...,https://econpapers.repec.org/article/aipaccess...,,,,...,"Ismailov, Taner Mustafov",pis198,D. A. Tsenov Academy of Economics,100,"Svishtov, Bulgaria",edi:tsenobg,https://ideas.repec.org/e/pis198.html,taner mustafov ismailov,ismailov|mustafov|taner,"{ismailov, mustafov, taner}"
4,A,A NEW CHALLENGE â INTELLECTUAL CAPITAL EVALU...,Revista Tinerilor Economisti (The Young Econom...,2014.0,Journal Article,Ph. D Student Maria-Luminita Gogan: âPoliteh...,https://econpapers.repec.org/article/aiorteyej...,,,,...,"Gogan, Luminita Maria",pgo683,Facultatea de Management în Producţie şi Trans...,100,"Timişoara, Romania",edi:ptuptro,https://ideas.repec.org/f/pgo683.html,luminita maria gogan,gogan|luminita|maria,"{gogan, maria, luminita}"
5,A,Sustainability Reporting and the Impact on Acc...,CECCAR Business Review,2024.0,Journal Article,Robert-Aurelian Èova: Bucharest University of...,https://econpapers.repec.org/article/ahdjournl...,,,,...,"Popa, Florina",ppo505,Institutul de Economie Nationala Institutul Na...,100,"Bucureşti, Romania",edi:inacaro,https://ideas.repec.org/f/ppo505.html,florina popa,florina|popa,"{popa, florina}"
6,A,"The Accounting Education, Between Digitalisati...",CECCAR Business Review,2020.0,Journal Article,Robert-Aurelian Èova: Bucharest University of...,https://econpapers.repec.org/article/ahdjournl...,,,,...,"Popa, Florina",ppo505,Institutul de Economie Nationala Institutul Na...,100,"Bucureşti, Romania",edi:inacaro,https://ideas.repec.org/f/ppo505.html,florina popa,florina|popa,"{popa, florina}"
7,A,Accounting Education â Between Digitalisatio...,CECCAR Business Review,2020.0,Journal Article,Robert-Aurelian Èova: Bucharest University of...,https://econpapers.repec.org/article/ahdjournl...,,,,...,"Popa, Florina",ppo505,Institutul de Economie Nationala Institutul Na...,100,"Bucureşti, Romania",edi:inacaro,https://ideas.repec.org/f/ppo505.html,florina popa,florina|popa,"{popa, florina}"
8,A,ECONOMICS OF INTEGRATION. A DEBATE OUTLINE,Internal Auditing and Risk Management,2015.0,Journal Article,,https://econpapers.repec.org/article/athjournl...,,,,...,"Andrei, Liviu Catalin",pan318,Facultatea de Administraţie Publică Şcoala Naţ...,100,"Bucureşti, Romania",edi:fasnsro,https://ideas.repec.org/f/pan318.html,liviu catalin andrei,andrei|catalin|liviu,"{liviu, catalin, andrei}"
9,A,"THE HEALTH OF ACADEMIC ORGANIZATION, A PROJECT...",Management Strategies Journal,2017.0,Journal Article,Vasile Miltiade Stanciu: Spiru Haret Universit...,https://econpapers.repec.org/article/brcjournl...,,,,...,"Stanciu, Miltiade",pst278,Academia de Studii Economice din Bucureşti,100,"Bucureşti, Romania",edi:aseeero,https://ideas.repec.org/f/pst278.html,miltiade stanciu,miltiade|stanciu,"{stanciu, miltiade}"


In [26]:
subset_simple_merged["match_type"] = "subset"



 # concaténation des déjà matchés et des non matchés à ce stade pour obtenir la base finale matchée

In [28]:
JEL_matched_final = pd.concat(
    [JEL_matched, subset_simple_merged],
    axis=0,
    ignore_index=True
)


In [30]:
JEL_matched_final["author_name"].nunique()

18093

# cleaning

In [31]:
JEL_matched_final

Unnamed: 0.2,JEL Subject,Title,Journal,Year,Type,Affiliations,URL,Unnamed: 0.1,Unnamed: 0,lettre,...,share_pct,location,repec_institution_id,author_url,author_std_inst,match_type,author_key_jel,tokens_jel,author_key_inst,tokens_inst
0,A,Preparing students for careers using business ...,,2020.0,Working Paper,Erland Hejn Nielsen: Department of Economics a...,https://econpapers.repec.org/paper/aahaarhec/2...,,,,...,100,"Aarhus, Denmark",edi:ifoaudk,https://ideas.repec.org/e/pni71.html,steen nielsen,exact,,,,
1,A,"Measuring Democracy - Eight indices: Polity, F...",,2021.0,Working Paper,Martin Paldam: Department of Economics and Bus...,https://econpapers.repec.org/paper/aahaarhec/2...,,,,...,100,"Aarhus, Denmark",edi:anaaudk,https://ideas.repec.org/f/ppa574.html,martin paldam,exact,,,,
2,A,Oeconstudiet og den ÃÂ¸konomiske faggruppe ve...,,2023.0,Working Paper,Svend Hylleberg: Department of Economics and B...,https://econpapers.repec.org/paper/aahaarhec/2...,,,,...,100,"Aarhus, Denmark",edi:creaudk,https://ideas.repec.org/e/phy1.html,svend hylleberg,exact,,,,
3,A,How Do Truckers Perceive and Respond to the Ri...,AEI Economic Perspectives,2024.0,Journal Article,Michael Strain: American Enterprise Institute,https://econpapers.repec.org/article/aeijournl...,,,,...,3%,"Bonn, Germany",edi:izaaade,https://ideas.repec.org/f/pst593.html,michael r strain,exact,,,,
4,A,Architecture to Transform Classic Academic Cou...,The AMFITEATRU ECONOMIC journal,2024.0,Journal Article,Andrei Bobocea: Bucharest University of Econom...,https://econpapers.repec.org/article/aesamfeco...,,,,...,100,"Bucureşti, Romania",edi:aseeero,https://ideas.repec.org/f/pba893.html,lorena batagan,exact,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
614475,Y,Modelling Profitability of Private Equity: A F...,,2022.0,Working Paper,,https://econpapers.repec.org/paper/cesceswps/_...,,,Y,...,50%,"Pamplona, Spain",edi:fcnaves,https://ideas.repec.org/e/pgi173.html,luis alberiko gil alana,subset,alana|gil|luis,"{alana, gil, luis}",alana|alberiko|gil|luis,"{alberiko, alana, gil, luis}"
614476,Y,Modelling Profitability of Private Equity: A F...,,2022.0,Working Paper,,https://econpapers.repec.org/paper/cesceswps/_...,,,Y,...,50%,"Pamplona, Spain",edi:cdnaves,https://ideas.repec.org/e/pgi173.html,luis alberiko gil alana,subset,alana|gil|luis,"{alana, gil, luis}",alana|alberiko|gil|luis,"{alberiko, alana, gil, luis}"
614477,Y,The Paradigms of Industry 4.0 and Circular Eco...,Social Sciences,2018.0,Journal Article,Fernando E. Garcia-MuiÃ±a: Department of Busin...,https://econpapers.repec.org/article/gamjscscx...,,,Y,...,100,"Valencia, Spain",edi:deupves,https://ideas.repec.org/f/pga1304.html,fernando garcia garcia,subset,fernando|garcia|muia,"{fernando, muia, garcia}",fernando|garcia|garcia,"{fernando, garcia}"
614478,Y,Identifying the Equilibrium Point between Sust...,Social Sciences,2019.0,Journal Article,Fernando E. Garcia-MuiÃ±a: Department of Busin...,https://econpapers.repec.org/article/gamjscscx...,,,Y,...,100,"Valencia, Spain",edi:deupves,https://ideas.repec.org/f/pga1304.html,fernando garcia garcia,subset,fernando|garcia|muia,"{fernando, muia, garcia}",fernando|garcia|garcia,"{fernando, garcia}"


# on enlève les colonnes inutiles

In [39]:
JEL_machted_final_clean = JEL_matched_final.drop(columns=["Affiliations","Unnamed: 0.1", "Unnamed: 0","lettre", "author_key_jel","author_key","tokens_jel","author_key_inst","author_std_jel", "tokens_inst",])

In [40]:
JEL_machted_final_clean

Unnamed: 0,JEL Subject,Title,Journal,Year,Type,URL,Author(s),author_name,short_id,institution,share_pct,location,repec_institution_id,author_url,author_std_inst,match_type
0,A,Preparing students for careers using business ...,,2020.0,Working Paper,https://econpapers.repec.org/paper/aahaarhec/2...,Nielsen Steen,"Nielsen, Steen",pni71,Institut for Økonomi Aarhus Universitet,100,"Aarhus, Denmark",edi:ifoaudk,https://ideas.repec.org/e/pni71.html,steen nielsen,exact
1,A,"Measuring Democracy - Eight indices: Polity, F...",,2021.0,Working Paper,https://econpapers.repec.org/paper/aahaarhec/2...,Paldam Martin,"Paldam, Martin",ppa574,School of Economics and Management Institut fo...,100,"Aarhus, Denmark",edi:anaaudk,https://ideas.repec.org/f/ppa574.html,martin paldam,exact
2,A,Oeconstudiet og den ÃÂ¸konomiske faggruppe ve...,,2023.0,Working Paper,https://econpapers.repec.org/paper/aahaarhec/2...,Hylleberg Svend,"Hylleberg, Svend",phy1,Center for Research in Econometric Analysis of...,100,"Aarhus, Denmark",edi:creaudk,https://ideas.repec.org/e/phy1.html,svend hylleberg,exact
3,A,How Do Truckers Perceive and Respond to the Ri...,AEI Economic Perspectives,2024.0,Journal Article,https://econpapers.repec.org/article/aeijournl...,Strain Michael,"Strain, Michael R.",pst593,Institute of Labor Economics (IZA),3%,"Bonn, Germany",edi:izaaade,https://ideas.repec.org/f/pst593.html,michael r strain,exact
4,A,Architecture to Transform Classic Academic Cou...,The AMFITEATRU ECONOMIC journal,2024.0,Journal Article,https://econpapers.repec.org/article/aesamfeco...,Batagan Lorena,"Batagan, Lorena",pba893,Academia de Studii Economice din Bucureşti,100,"Bucureşti, Romania",edi:aseeero,https://ideas.repec.org/f/pba893.html,lorena batagan,exact
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
614475,Y,Modelling Profitability of Private Equity: A F...,,2022.0,Working Paper,https://econpapers.repec.org/paper/cesceswps/_...,Gil-Alana Luis,"Gil-Alana, Luis Alberiko",pgi173,Facultad de Ciencias Económicas y Empresariale...,50%,"Pamplona, Spain",edi:fcnaves,https://ideas.repec.org/e/pgi173.html,luis alberiko gil alana,subset
614476,Y,Modelling Profitability of Private Equity: A F...,,2022.0,Working Paper,https://econpapers.repec.org/paper/cesceswps/_...,Gil-Alana Luis,"Gil-Alana, Luis Alberiko",pgi173,Navarra Center for International Development U...,50%,"Pamplona, Spain",edi:cdnaves,https://ideas.repec.org/e/pgi173.html,luis alberiko gil alana,subset
614477,Y,The Paradigms of Industry 4.0 and Circular Eco...,Social Sciences,2018.0,Journal Article,https://econpapers.repec.org/article/gamjscscx...,Garcia-MuiÃ±a Fernando E.,"Garcia Garcia, Fernando",pga1304,Departamento de Economía y Ciencias Sociales U...,100,"Valencia, Spain",edi:deupves,https://ideas.repec.org/f/pga1304.html,fernando garcia garcia,subset
614478,Y,Identifying the Equilibrium Point between Sust...,Social Sciences,2019.0,Journal Article,https://econpapers.repec.org/article/gamjscscx...,Garcia-MuiÃ±a Fernando E.,"Garcia Garcia, Fernando",pga1304,Departamento de Economía y Ciencias Sociales U...,100,"Valencia, Spain",edi:deupves,https://ideas.repec.org/f/pga1304.html,fernando garcia garcia,subset


# exporter la base en csv


In [32]:
JEL_machted_final_clean.to_csv("final1.csv", index=False)


In [33]:
len(JEL_machted_final_clean)

311234

# code pour vérifier si on cherche qqn dans le dataset

In [42]:
mask = (
    JEL_machted_final_clean["author_name"].str.contains("boyer", case=False, na=False)
    & JEL_machted_final_clean["author_name"].str.contains("pierre", case=False, na=False)
)

JEL_machted_final_clean[mask]


Unnamed: 0,JEL Subject,Title,Journal,Year,Type,URL,Author(s),author_name,short_id,institution,share_pct,location,repec_institution_id,author_url,author_std_inst,match_type
28531,C,Regulatory arbitrage and the efficiency of ban...,Journal of Financial Intermediation,2020.0,Journal Article,https://econpapers.repec.org/article/eeejfinin...,Boyer Pierre,"Boyer, Pierre C.",pbo355,CESifo,1%,"München, Germany",edi:cesifde,https://ideas.repec.org/f/pbo355.html,pierre c boyer,exact
28532,C,Regulatory arbitrage and the efficiency of ban...,Journal of Financial Intermediation,2020.0,Journal Article,https://econpapers.repec.org/article/eeejfinin...,Boyer Pierre,"Boyer, Pierre C.",pbo355,Centre de Recherche en Économie et Statistique...,96%,"Palaiseau, France",edi:crestfr,https://ideas.repec.org/f/pbo355.html,pierre c boyer,exact
28533,C,Regulatory arbitrage and the efficiency of ban...,Journal of Financial Intermediation,2020.0,Journal Article,https://econpapers.repec.org/article/eeejfinin...,Boyer Pierre,"Boyer, Pierre C.",pbo355,Département d'Économie École Polytechnique,2%,"Palaiseau, France",edi:depolfr,https://ideas.repec.org/f/pbo355.html,pierre c boyer,exact
28746,C,Regulatory arbitrage and the efficiency of ban...,,2016.0,Working Paper,https://econpapers.repec.org/paper/bafcbafwp/c...,Boyer Pierre,"Boyer, Pierre C.",pbo355,CESifo,1%,"München, Germany",edi:cesifde,https://ideas.repec.org/f/pbo355.html,pierre c boyer,exact
28747,C,Regulatory arbitrage and the efficiency of ban...,,2016.0,Working Paper,https://econpapers.repec.org/paper/bafcbafwp/c...,Boyer Pierre,"Boyer, Pierre C.",pbo355,Centre de Recherche en Économie et Statistique...,96%,"Palaiseau, France",edi:crestfr,https://ideas.repec.org/f/pbo355.html,pierre c boyer,exact
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
445055,L,The Lifecycle of Protests in the Digital Age,,2024.0,Working Paper,https://econpapers.repec.org/paper/cprceprdp/1...,Boyer Pierre,"Boyer, Pierre C.",pbo355,Centre de Recherche en Économie et Statistique...,96%,"Palaiseau, France",edi:crestfr,https://ideas.repec.org/f/pbo355.html,pierre c boyer,exact
445056,L,The Lifecycle of Protests in the Digital Age,,2024.0,Working Paper,https://econpapers.repec.org/paper/cprceprdp/1...,Boyer Pierre,"Boyer, Pierre C.",pbo355,Département d'Économie École Polytechnique,2%,"Palaiseau, France",edi:depolfr,https://ideas.repec.org/f/pbo355.html,pierre c boyer,exact
504788,Z,The Lifecycle of Protests in the Digital Age,,2024.0,Working Paper,https://econpapers.repec.org/paper/cprceprdp/1...,Boyer Pierre,"Boyer, Pierre C.",pbo355,CESifo,1%,"München, Germany",edi:cesifde,https://ideas.repec.org/f/pbo355.html,pierre c boyer,exact
504789,Z,The Lifecycle of Protests in the Digital Age,,2024.0,Working Paper,https://econpapers.repec.org/paper/cprceprdp/1...,Boyer Pierre,"Boyer, Pierre C.",pbo355,Centre de Recherche en Économie et Statistique...,96%,"Palaiseau, France",edi:crestfr,https://ideas.repec.org/f/pbo355.html,pierre c boyer,exact


# Stat desc sur cette nouvelle base matchée

In [46]:
df = JEL_machted_final_clean.copy()
df["Year"] = pd.to_numeric(df["Year"], errors="coerce")



In [47]:
df["weight"] = (
    df["share_pct"]
    .astype(str)
    .str.replace("%", "", regex=False)
    .astype(float)
    / 100
)



In [48]:
df["Title"].nunique()        # 36731


137550

In [49]:
df["author_name"].nunique() #11 679

18093

In [50]:
df["institution"].nunique() # 2009

2392

# voir les homonymes

In [51]:
df.groupby("author_name")["Author(s)"].nunique().sort_values(ascending=False)



author_name
Silva, Maria C.A.       84
Nguyen, Hoang           83
Maria, Jose R.          58
Ruiz, Juan M.           38
Silva, Pedro M.         38
                        ..
Hager, Theresa           1
Hagiu, Alina Mariana     1
Hahn, Julia              1
Hahn, Nadine             1
Haeussler, Carolin       1
Name: Author(s), Length: 18093, dtype: int64

In [52]:
df[df["author_name"] == "Nguyen, Hoang"]["Author(s)"].value_counts().head(15)


Author(s)
Nguyen Hoang                     71
Nguyen Long Hoang                54
Nguyen Viet Hoang                40
Nguyen Quyen Le Hoang Thuy To    24
Nguyen Bao Hoang                 18
Nguyen Minh-Hoang                17
Nguyen Hoang Viet                14
Nguyen Hoang Chung               10
Thi Hoang Ha Nguyen              10
Hoang Nguyen Kim                  7
Nguyen Hoang Huy                  6
Hoang-Tung Nguyen                 6
Nguyen Hoang Giang                6
Nguyen Hoang Linh                 6
Nguyen Linh Hoang                 5
Name: count, dtype: int64

# Analyse top journaux

In [53]:
top_journals = [
    "American Economic Review",
    "The Quarterly Journal of Economics",
    "Journal of Political Economy",
    "The Review of Economic Studies",
    "Econometrica"
]
#Econometrica et #Journal of Political Economy absent

In [56]:
pd.set_option("display.max_rows", None)

In [57]:
df["Journal"].value_counts()


Journal
Sustainability                                                                                                                                                                                          9991
Journal of Economic Behavior & Organization                                                                                                                                                             6437
IJERPH                                                                                                                                                                                                  6417
DIW Wochenbericht                                                                                                                                                                                       5495
ifo Schnelldienst                                                                                                                                                           

In [86]:
df["is_top5_journal"] = df["Journal"].isin(top_journals)


In [87]:
inst_top5 = (
    df[df["is_top5_journal"]]
    .groupby("institution")["weight"]
    .sum()
    .sort_values(ascending=False)
)


In [88]:
inst_top5

institution
Paris School of Economics                                                                                                                                                       49.96
Department of Economics Sciences économiques Sciences Po                                                                                                                        38.35
Departament d'Economia i Empresa Universitat Pompeu Fabra Barcelona School of Economics (BSE)                                                                                   28.36
Innocenzo Gasparini Institute for Economic Research (IGIER) Università Commerciale Luigi Bocconi                                                                                26.22
Toulouse School of Economics (TSE)                                                                                                                                              25.75
Institute of Labor Economics (IZA)                                            

In [90]:
top_authors_top5 = (
    df[df["is_top5_journal"]]
    .groupby("author_name")["Title"]
    .nunique()
    .sort_values(ascending=False)
    .reset_index(name="n_top5_articles")
)
#test



In [91]:
top_authors_top5

Unnamed: 0,author_name,n_top5_articles
0,"van Reenen, John Michael",7
1,"Brunnermeier, Markus K.",7
2,"Gorodnichenko, Yuriy",6
3,"Haltiwanger, John",5
4,"Autor, David",5
5,"Jayachandran, Seema",5
6,"Card, David E.",5
7,"Pinotti, Paolo",4
8,"Bloom, Nicholas",4
9,"Notowidigdo, Matthew J.",4
