#### Fetching data

In [None]:
pip install pandas
pip install selenium
pip install BeautifulSoup4

Collecting pandas
  Downloading pandas-2.3.3-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (91 kB)
Collecting numpy>=1.26.0 (from pandas)
  Downloading numpy-2.4.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (6.6 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2025.3-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.3.3-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (12.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.3/12.3 MB[0m [31m9.6 MB/s[0m  [33m0:00:01[0m6m0:00:01[0m00:01[0m
[?25hDownloading numpy-2.4.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (16.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.4/16.4 MB[0m [31m8.2 MB/s[0m  [33m0:00:01[0mm0:00:01[0m:00:01[0m
[?25hDownloading pytz-2025.2-py2.py3-none-any.whl (509 k

#### Scrapping auteurs-articles

1er test sur A, B et C par 20

In [None]:
import re
import os
import pandas as pd
import requests
import string
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# --- SELENIUM IMPORTS ---
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

# ---------------------------------------------------------
# CONFIGURATION
# ---------------------------------------------------------
BASE_URL = "https://econpapers.repec.org"
JEL_LETTERS = ["A", "B", "C"]  # test
LIMIT_PER_CATEGORY = 20

HEADERS = {
    "User-Agent": "Mozilla/5.0",
    "From": "researcher@university.edu"
}

# ---------------------------------------------------------
# SELENIUM — LIENS
# ---------------------------------------------------------
def get_links_with_selenium(jel_letter):

    options = webdriver.ChromeOptions()
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")

    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=options
    )

    url = (
        "https://econpapers.repec.org/scripts/search.pf"
        f"?jel={jel_letter}*&ni=10%20years&inpage=1000"
    )

    driver.get(url)
    wait = WebDriverWait(driver, 10)

    # Cliquer sur "Search"
    wait.until(
        EC.element_to_be_clickable((By.XPATH, "//input[@type='SUBMIT']"))
    ).click()

    # Sélection du nombre de résultats
    Select(
        wait.until(EC.presence_of_element_located((By.ID, "inpage1")))
    ).select_by_value(str(LIMIT_PER_CATEGORY))

    soup = BeautifulSoup(driver.page_source, "html.parser")
    driver.quit()

    links = []
    for a in soup.find_all("a", href=re.compile(r"/paper/|/article/")):
        href = a["href"]
        if "scripts" in href or "pers" in href:
            continue
        links.append(urljoin(BASE_URL, href))

    # supprimer les doublons en gardant l'ordre
    return list(dict.fromkeys(links))


# ---------------------------------------------------------
# SCRAPING MÉTADONNÉES
# ---------------------------------------------------------
def get_paper_details(url, jel_cat):

    try:
        soup = BeautifulSoup(
            requests.get(url, headers=HEADERS, timeout=10).text,
            "html.parser"
        )

        # --- TITRE ---
        title_tag = soup.find("h1", class_="colored")
        title = title_tag.get_text(strip=True) if title_tag else None

        # --- ANNÉE ---
        year = None
        date = soup.find("b", string=re.compile("Date:"))
        if date:
            m = re.search(r"\d{4}", date.next_sibling or "")
            if m:
                year = int(m.group())

        # --- AUTEURS ---
        authors = [
            m["content"].replace(",", "")
            for m in soup.find_all("meta", {"name": "citation_author"})
        ]
        if not authors:
            authors = ["Voir texte"]

        # --- JOURNAL ---
        journal = None
        journal_meta = soup.find("meta", {"name": "citation_journal_title"})
        if journal_meta:
            journal = journal_meta.get("content")

        # --- AFFILIATIONS ---
        affil = soup.find("span", id="contact")
        affiliations = affil.get_text(" ", strip=True) if affil else None

        # --- TYPE ---
        pub_type = "Journal Article" if "/article/" in url else "Working Paper"

        return {
            "JEL Subject": jel_cat,
            "Title": title,
            "Author(s)": "; ".join(authors),
            "Journal": journal,
            "Year": year,
            "Type": pub_type,
            "Affiliations": affiliations,
            "URL": url
        }

    except Exception:
        return None


# ---------------------------------------------------------
# MAIN
# ---------------------------------------------------------
def main(csv_filename):

    data = []

    for jel in JEL_LETTERS:
        print(f"Scraping JEL {jel}...")
        links = get_links_with_selenium(jel)

        for link in links:
            d = get_paper_details(link, jel)
            if d:
                data.append(d)

    df = pd.DataFrame(data).drop_duplicates("URL")
    df.to_csv(csv_filename, index=False)


# ---------------------------------------------------------
# RUN
# ---------------------------------------------------------
if __name__ == "__main__":

    out = "RePEc_Final_Dataset_Corrected.csv"
    if os.path.exists(out):
        os.remove(out)

    main(out)


In [3]:
df = pd.read_csv("RePEc_Final_Dataset_Corrected.csv")
import pandas as pd
import numpy as np

def clean_affiliation(text):
    if pd.isna(text):
        return np.nan
    
    # Si le format est "Auteur: Affiliation"
    if ":" in text:
        return text.split(":", 1)[1].strip()
    
    return text.strip()

df["Affiliations"] = df["Affiliations"].apply(clean_affiliation)
df = df.iloc[2:].reset_index(drop=True)
df.head(20)

Unnamed: 0,JEL Subject,Title,Author(s),Journal,Year,Type,Affiliations,URL
0,A,Preparing students for careers using business ...,Nielsen Erland Hejn; Nielsen Steen,,2020.0,Working Paper,Department of Economics and Business Economics...,https://econpapers.repec.org/paper/aahaarhec/2...
1,A,"Measuring Democracy - Eight indices: Polity, F...",Paldam Martin,,2021.0,Working Paper,Department of Economics and Business Economics...,https://econpapers.repec.org/paper/aahaarhec/2...
2,A,Oeconstudiet og den ÃÂ¸konomiske faggruppe ve...,Hylleberg Svend,,2023.0,Working Paper,Department of Economics and Business Economics...,https://econpapers.repec.org/paper/aahaarhec/2...
3,A,Digital Tools in the Educational Environment E...,Andra Diaconescu,Research & Education,2024.0,Journal Article,"Politehnica University of Timisoara, Faculty o...",https://econpapers.repec.org/article/aaijournl...
4,A,On the Gender Diversity of Research Teams in E...,Biermann Marcus,AEA Papers and Proceedings,2023.0,Journal Article,,https://econpapers.repec.org/article/aeaapandp...
5,A,Messages That Foster a Sense of Belonging Impr...,Forcada Sara Avila,AEA Papers and Proceedings,2023.0,Journal Article,,https://econpapers.repec.org/article/aeaapandp...
6,A,Parenthood and Academic Career Trajectories,Lassen Anne Sophie; IvandiÄ Ria,AEA Papers and Proceedings,2024.0,Journal Article,,https://econpapers.repec.org/article/aeaapandp...
7,A,Impact versus Inclusion in the Economics Profe...,Bansak Cynthia; Dunn Wendy; Meade Ellen; Starr...,AEA Papers and Proceedings,2024.0,Journal Article,,https://econpapers.repec.org/article/aeaapandp...
8,A,Teaching-Track Economists: A Canadian Perspective,Murdock Jennifer; Cohen Avi,AEA Papers and Proceedings,2024.0,Journal Article,,https://econpapers.repec.org/article/aeaapandp...
9,A,"Male Is a Gender, Too: A Review of Why Gender ...",Nelson Julie,Journal of Economic Literature,2016.0,Journal Article,,https://econpapers.repec.org/article/aeajeclit...


### Scrapping affiliations auteurs

In [None]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import time
import string
import os

BASE_URL = "https://ideas.repec.org"
INDEX_URL = "https://ideas.repec.org/i/eall.html"

# =========================
# RUN sur tout l'alphabet
# =========================
LETTERS = list(string.ascii_uppercase[string.ascii_uppercase.index("A"):])

HEADERS = {"User-Agent": "Mozilla/5.0"}


OUTPUT_FILE = "data_A_Z.csv"
SAVE_EVERY = 50

UE_COUNTRY_CODES = [
    "at","be","bg","hr","cy","cz","dk","ee","fi","fr","de","gr","hu",
    "ie","it","lv","lt","lu","mt","nl","pl","pt","ro","sk","si","es","se"
]

def is_ue_edi(repec_id):
    if not repec_id:
        return False
    return repec_id.lower()[-2:] in UE_COUNTRY_CODES

# -------------------------------------------------
# Scrape index (A–Z)
# -------------------------------------------------
def scrape_author_index():
    resp = requests.get(INDEX_URL, headers=HEADERS)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    authors = []

    for letter in LETTERS:
        anchor = soup.find("a", {"name": letter})
        if not anchor:
            continue

        table = anchor.find_next("table")
        if not table:
            continue

        for a in table.find_all("a", href=True):
            href = a["href"]
            if href.endswith(".html"):
                authors.append({
                    "author_name": a.get_text(strip=True),
                    "short_id": href.split("/")[-1].replace(".html", ""),
                    "author_url": urljoin(BASE_URL, href),
                    "letter": letter
                })

    return authors

# -------------------------------------------------
# Scrape affiliations
# -------------------------------------------------
def scrape_author_affiliations(author_url):
    try:
        r = requests.get(author_url, headers=HEADERS, timeout=10)
        r.raise_for_status()
    except Exception:
        return []

    soup = BeautifulSoup(r.text, "html.parser")
    aff_div = soup.find("div", id="affiliation")
    if not aff_div:
        return []

    affils = []

    for h3 in aff_div.find_all("h3"):
        institution = " ".join(h3.stripped_strings)

        share_pct = None
        if institution.startswith("("):
            share_pct = institution.split(")")[0].replace("(", "")
            institution = institution.split(")", 1)[1].strip()

        location = None
        repec_id = None

        for sib in h3.find_next_siblings():
            if sib.name == "h3":
                break
            if sib.name == "span" and "locationlabel" in sib.get("class", []):
                location = sib.get_text(strip=True)
            if sib.name == "span" and "handlelabel" in sib.get("class", []):
                repec_id = sib.get_text(strip=True).replace("RePEc:", "")

        affils.append({
            "institution": institution,
            "share_pct": share_pct,
            "location": location,
            "repec_institution_id": repec_id
        })

    return affils

# -------------------------------------------------
# MAIN
# -------------------------------------------------
if __name__ == "__main__":

    authors = scrape_author_index()
    total_authors = len(authors)

    # reprise si relancé
    if os.path.exists(OUTPUT_FILE):
        df_existing = pd.read_csv(OUTPUT_FILE)
        processed_ids = set(df_existing["short_id"].unique())
        rows = df_existing.to_dict("records")
        print(f"Resuming M–Z – {len(processed_ids)} authors already processed")
    else:
        processed_ids = set()
        rows = []

    ue_authors_seen = set(processed_ids)

    for i, author in enumerate(authors):
        if author["short_id"] in processed_ids:
            continue

        print(
            f"[{author['letter']}] "
            f"Author {i+1}/{total_authors} | "
            f"UE authors: {len(ue_authors_seen)}"
        )

        affils = scrape_author_affiliations(author["author_url"])
        affils_ue = [a for a in affils if is_ue_edi(a["repec_institution_id"])]

        if not affils_ue:
            time.sleep(0.5)
            continue

        for aff in affils_ue:
            rows.append({
                "author_name": author["author_name"],
                "short_id": author["short_id"],
                "institution": aff["institution"],
                "share_pct": aff["share_pct"],
                "location": aff["location"],
                "repec_institution_id": aff["repec_institution_id"],
                "author_url": author["author_url"]
            })

        ue_authors_seen.add(author["short_id"])
        processed_ids.add(author["short_id"])

        if len(rows) % SAVE_EVERY == 0:
            pd.DataFrame(rows).to_csv(OUTPUT_FILE, index=False)
            print(f"Saved {len(rows)} rows")

        time.sleep(0.5)

    pd.DataFrame(rows).to_csv(OUTPUT_FILE, index=False)
    print(" DONE – A–Z final save complete")