#### Fetching data

In [2]:
pip install pandas

Collecting pandas
  Downloading pandas-2.3.3-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (91 kB)
Collecting numpy>=1.26.0 (from pandas)
  Downloading numpy-2.4.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (6.6 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2025.3-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.3.3-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (12.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.3/12.3 MB[0m [31m9.6 MB/s[0m  [33m0:00:01[0m6m0:00:01[0m00:01[0m
[?25hDownloading numpy-2.4.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (16.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.4/16.4 MB[0m [31m8.2 MB/s[0m  [33m0:00:01[0mm0:00:01[0m:00:01[0m
[?25hDownloading pytz-2025.2-py2.py3-none-any.whl (509 k

In [5]:
pip install BeautifulSoup4

Collecting BeautifulSoup4
  Downloading beautifulsoup4-4.14.3-py3-none-any.whl.metadata (3.8 kB)
Collecting soupsieve>=1.6.1 (from BeautifulSoup4)
  Downloading soupsieve-2.8.1-py3-none-any.whl.metadata (4.6 kB)
Downloading beautifulsoup4-4.14.3-py3-none-any.whl (107 kB)
Downloading soupsieve-2.8.1-py3-none-any.whl (36 kB)
Installing collected packages: soupsieve, BeautifulSoup4
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [BeautifulSoup4]m [BeautifulSoup4]
[1A[2KSuccessfully installed BeautifulSoup4-4.14.3 soupsieve-2.8.1
Note: you may need to restart the kernel to use updated packages.


In [7]:
pip install selenium

Collecting selenium
  Downloading selenium-4.39.0-py3-none-any.whl.metadata (7.5 kB)
Collecting trio<1.0,>=0.31.0 (from selenium)
  Downloading trio-0.32.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket<1.0,>=0.12.2 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting typing_extensions<5.0,>=4.15.0 (from selenium)
  Downloading typing_extensions-4.15.0-py3-none-any.whl.metadata (3.3 kB)
Collecting websocket-client<2.0,>=1.8.0 (from selenium)
  Downloading websocket_client-1.9.0-py3-none-any.whl.metadata (8.3 kB)
Collecting attrs>=23.2.0 (from trio<1.0,>=0.31.0->selenium)
  Downloading attrs-25.4.0-py3-none-any.whl.metadata (10 kB)
Collecting sortedcontainers (from trio<1.0,>=0.31.0->selenium)
  Downloading sortedcontainers-2.4.0-py2.py3-none-any.whl.metadata (10 kB)
Collecting outcome (from trio<1.0,>=0.31.0->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting sniffio>=1.3.0 (from trio<

In [35]:
import re
import os
import pandas as pd
import requests
import string
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# --- SELENIUM IMPORTS ---
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

# ---------------------------------------------------------
# CONFIGURATION
# ---------------------------------------------------------
# Site root; hrefs found in the search results are joined onto it.
BASE_URL = "https://econpapers.repec.org"

# JEL letters to crawl (reduced list used for testing).
JEL_LETTERS = list("ABC")

# Value selected in the `inpage1` dropdown of the search page.
LIMIT_PER_CATEGORY = 20

# HTTP headers sent with every paper-page request.
HEADERS = {"User-Agent": "Mozilla/5.0", "From": "researcher@university.edu"}

# ---------------------------------------------------------
# SELENIUM — LIENS
# ---------------------------------------------------------
def get_links_with_selenium(jel_letter):
    """Collect paper/article URLs from the EconPapers search page for one JEL letter.

    Starts a Chrome session, loads the search URL built from ``jel_letter``,
    clicks the submit input, selects ``LIMIT_PER_CATEGORY`` in the ``inpage1``
    dropdown, then parses the page source for links whose href contains
    ``/paper/`` or ``/article/``.

    Args:
        jel_letter: JEL prefix inserted into the ``jel=`` query parameter
            (a ``*`` is appended to it in the URL).

    Returns:
        list[str]: URLs joined onto ``BASE_URL``, de-duplicated, in the order
        they appear in the page.

    Raises:
        Selenium exceptions (for example when one of the 10-second waits
        times out) propagate to the caller; the browser is closed first.
    """
    options = webdriver.ChromeOptions()
    options.add_argument("--disable-gpu")

    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=options
    )

    url = (
        "https://econpapers.repec.org/scripts/search.pf"
        f"?jel={jel_letter}*&ni=10%20years&inpage=1000"
    )

    # try/finally guarantees the Chrome process is shut down even when a wait
    # times out or any other Selenium call raises.
    try:
        driver.get(url)
        wait = WebDriverWait(driver, 10)

        wait.until(EC.element_to_be_clickable(
            (By.XPATH, "//input[@type='SUBMIT']")
        )).click()

        Select(wait.until(
            EC.presence_of_element_located((By.ID, "inpage1"))
        )).select_by_value(str(LIMIT_PER_CATEGORY))

        # NOTE(review): the source is read immediately after the selection,
        # with no wait for the page to refresh — confirm this is sufficient.
        page_html = driver.page_source
    finally:
        driver.quit()

    soup = BeautifulSoup(page_html, "html.parser")

    links = []
    for a in soup.find_all("a", href=re.compile(r"/paper/|/article/")):
        href = a["href"]
        # NOTE(review): "pers" is also a substring of "econpapers", so this
        # filter would reject every absolute href; it relies on the hrefs
        # being site-relative — confirm.
        if "scripts" in href or "pers" in href:
            continue
        links.append(urljoin(BASE_URL, href))

    # dict.fromkeys removes duplicates while keeping first-seen order.
    return list(dict.fromkeys(links))

# ---------------------------------------------------------
# SCRAPING MÉTADONNÉES
# ---------------------------------------------------------
def get_paper_details(url, jel_cat):
    """Scrape the metadata of one EconPapers paper or article page.

    Args:
        url: URL of the page to download.
        jel_cat: value stored unchanged under the "JEL Subject" key.

    Returns:
        dict | None: a record with the keys "JEL Subject", "Title",
        "Author(s)", "Journal", "Year", "Type", "Affiliations" and "URL".
        ``None`` is returned when the page cannot be downloaded (network
        error, timeout, HTTP error status) or has no ``<h1 class="colored">``
        title, so the caller can skip it.
    """
    try:
        response = requests.get(url, headers=HEADERS, timeout=10)
        response.raise_for_status()
    except requests.RequestException:
        # Best-effort scraping: an unreachable page is skipped, not fatal.
        return None

    # Parse the raw bytes rather than response.text: .text is decoded with the
    # charset taken from the HTTP headers, which is the likely cause of the
    # garbled accented characters in scraped titles/authors. Given bytes,
    # BeautifulSoup detects the encoding from the document itself.
    soup = BeautifulSoup(response.content, "html.parser")

    # --- TITLE ---
    title_tag = soup.find("h1", class_="colored")
    if title_tag is None:
        return None
    title = title_tag.get_text(strip=True)

    # --- YEAR ---
    year = None
    date_label = soup.find("b", string=re.compile("Date:"))
    if date_label is not None:
        sibling = date_label.next_sibling
        # next_sibling can be None or a Tag; only a text node is searchable.
        if isinstance(sibling, str):
            year_match = re.search(r"\d{4}", sibling)
            if year_match:
                year = int(year_match.group())

    # --- AUTHORS (taken from the citation_author meta tags) ---
    authors = [
        tag["content"].replace(",", "")
        for tag in soup.find_all("meta", {"name": "citation_author"})
        if tag.get("content")
    ]
    if not authors:
        # Placeholder kept as-is; downstream cells filter on this exact value.
        authors = ["Voir texte"]

    # --- JOURNAL ---
    journal = None
    journal_meta = soup.find("meta", {"name": "citation_journal_title"})
    if journal_meta:
        journal = journal_meta.get("content", None)

    # --- AFFILIATIONS ---
    affil = soup.find("span", id="contact")
    affiliations = affil.get_text(" ", strip=True) if affil else None

    # --- TYPE (derived from the URL path) ---
    pub_type = "Journal Article" if "/article/" in url else "Working Paper"

    return {
        "JEL Subject": jel_cat,
        "Title": title,
        "Author(s)": "; ".join(authors),
        "Journal": journal,
        "Year": year,
        "Type": pub_type,
        "Affiliations": affiliations,
        "URL": url
    }

# ---------------------------------------------------------
# MAIN
# ---------------------------------------------------------
def main(csv_filename):
    """Scrape every JEL letter in ``JEL_LETTERS`` and write the records to a CSV.

    For each letter, the links returned by ``get_links_with_selenium`` are
    passed to ``get_paper_details``; falsy results are skipped. Rows sharing
    the same "URL" are de-duplicated before writing.

    Args:
        csv_filename: path of the CSV file to write (written without index).
    """
    # Explicit column list, in the same order as the keys of the record dict:
    # with no scraped records the frame still has a "URL" column, so
    # drop_duplicates does not raise KeyError and the CSV still gets a header.
    columns = [
        "JEL Subject", "Title", "Author(s)", "Journal",
        "Year", "Type", "Affiliations", "URL",
    ]

    records = []
    for jel in JEL_LETTERS:
        for link in get_links_with_selenium(jel):
            details = get_paper_details(link, jel)
            if details:
                records.append(details)

    df = pd.DataFrame(records, columns=columns).drop_duplicates("URL")
    df.to_csv(csv_filename, index=False)

# ---------------------------------------------------------
# RUN
# ---------------------------------------------------------
if __name__ == "__main__":
    output_csv = "RePEc_Final_Dataset_Corrected.csv"

    # Remove the output of a previous run, if any, before scraping again.
    if os.path.exists(output_csv):
        os.remove(output_csv)

    main(output_csv)

#### Data cleaning

In [36]:
import pandas as pd
import numpy as np

# Load the CSV written by the scraping step. The imports come first so this
# cell does not depend on `pd` having been imported by an earlier cell.
df = pd.read_csv("RePEc_Final_Dataset_Corrected.csv")

def clean_affiliation(text):
    """Strip an optional "Author:" prefix from an affiliation string.

    Missing values come back as NaN. When the text contains a colon, only
    the part after the first colon is kept; in every case the result is
    stripped of surrounding whitespace.
    """
    if pd.isna(text):
        return np.nan

    # partition() splits on the first ":" only; `separator` is empty when
    # the text has no colon, in which case the whole text is kept.
    _, separator, after_colon = text.partition(":")
    kept = after_colon if separator else text
    return kept.strip()

# Clean the affiliation column, then drop the first two rows and renumber
# the index from 0.
# NOTE(review): the reason for discarding rows 0-1 is not stated — confirm.
df = (
    df.assign(Affiliations=df["Affiliations"].apply(clean_affiliation))
      .iloc[2:]
      .reset_index(drop=True)
)
df.head(20)

Unnamed: 0,JEL Subject,Title,Author(s),Journal,Year,Type,Affiliations,URL
0,A,Preparing students for careers using business ...,Nielsen Erland Hejn; Nielsen Steen,,2020.0,Working Paper,Department of Economics and Business Economics...,https://econpapers.repec.org/paper/aahaarhec/2...
1,A,"Measuring Democracy - Eight indices: Polity, F...",Paldam Martin,,2021.0,Working Paper,Department of Economics and Business Economics...,https://econpapers.repec.org/paper/aahaarhec/2...
2,A,Oeconstudiet og den ÃÂ¸konomiske faggruppe ve...,Hylleberg Svend,,2023.0,Working Paper,Department of Economics and Business Economics...,https://econpapers.repec.org/paper/aahaarhec/2...
3,A,Digital Tools in the Educational Environment E...,Andra Diaconescu,Research & Education,2024.0,Journal Article,"Politehnica University of Timisoara, Faculty o...",https://econpapers.repec.org/article/aaijournl...
4,A,On the Gender Diversity of Research Teams in E...,Biermann Marcus,AEA Papers and Proceedings,2023.0,Journal Article,,https://econpapers.repec.org/article/aeaapandp...
5,A,Messages That Foster a Sense of Belonging Impr...,Forcada Sara Avila,AEA Papers and Proceedings,2023.0,Journal Article,,https://econpapers.repec.org/article/aeaapandp...
6,A,Parenthood and Academic Career Trajectories,Lassen Anne Sophie; IvandiÄ Ria,AEA Papers and Proceedings,2024.0,Journal Article,,https://econpapers.repec.org/article/aeaapandp...
7,A,Impact versus Inclusion in the Economics Profe...,Bansak Cynthia; Dunn Wendy; Meade Ellen; Starr...,AEA Papers and Proceedings,2024.0,Journal Article,,https://econpapers.repec.org/article/aeaapandp...
8,A,Teaching-Track Economists: A Canadian Perspective,Murdock Jennifer; Cohen Avi,AEA Papers and Proceedings,2024.0,Journal Article,,https://econpapers.repec.org/article/aeaapandp...
9,A,"Male Is a Gender, Too: A Review of Why Gender ...",Nelson Julie,Journal of Economic Literature,2016.0,Journal Article,,https://econpapers.repec.org/article/aeajeclit...


In [31]:
# Rows whose author field holds the "Voir texte" placeholder, i.e. pages
# where no citation_author meta tag was found during scraping.
df_voir_texte = df.loc[df["Author(s)"].eq("Voir texte")]
len(df_voir_texte)


35

In [32]:
# First three URLs of the placeholder-author rows, for manual inspection.
df_voir_texte.loc[:, "URL"].head(3).to_list()


['https://econpapers.repec.org/article/aeaapandp/v_3a113_3ay_3a2023_3ap_3a473-76.htm',
 'https://econpapers.repec.org/article/aeaapandp/v_3a113_3ay_3a2023_3ap_3a514-18.htm',
 'https://econpapers.repec.org/article/aeaapandp/v_3a114_3ay_3a2024_3ap_3a238-42.htm']