#### Fetching data

In [None]:
pip install pandas
pip install selenium
pip install BeautifulSoup4

Collecting pandas
  Downloading pandas-2.3.3-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (91 kB)
Collecting numpy>=1.26.0 (from pandas)
  Downloading numpy-2.4.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (6.6 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2025.3-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.3.3-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (12.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.3/12.3 MB[0m [31m9.6 MB/s[0m  [33m0:00:01[0m6m0:00:01[0m00:01[0m
[?25hDownloading numpy-2.4.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (16.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.4/16.4 MB[0m [31m8.2 MB/s[0m  [33m0:00:01[0mm0:00:01[0m:00:01[0m
[?25hDownloading pytz-2025.2-py2.py3-none-any.whl (509 k

In [2]:
import re
import os
import pandas as pd
import requests
import string
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# --- SELENIUM IMPORTS ---
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

# ---------------------------------------------------------
# CONFIGURATION
# ---------------------------------------------------------
BASE_URL = "https://econpapers.repec.org"
JEL_LETTERS = ["A", "B", "C"]   # test
LIMIT_PER_CATEGORY = 20

HEADERS = {
    "User-Agent": "Mozilla/5.0",
    "From": "researcher@university.edu"
}

# ---------------------------------------------------------
# SELENIUM — LIENS
# ---------------------------------------------------------
def get_links_with_selenium(jel_letter):

    options = webdriver.ChromeOptions()
    options.add_argument("--disable-gpu")

    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=options
    )

    paper_links = []

    url = (
        "https://econpapers.repec.org/scripts/search.pf"
        f"?jel={jel_letter}*&ni=10%20years&inpage=1000"
    )

    driver.get(url)
    wait = WebDriverWait(driver, 10)

    wait.until(EC.element_to_be_clickable(
        (By.XPATH, "//input[@type='SUBMIT']")
    )).click()

    Select(wait.until(
        EC.presence_of_element_located((By.ID, "inpage1"))
    )).select_by_value(str(LIMIT_PER_CATEGORY))

    soup = BeautifulSoup(driver.page_source, "html.parser")
    driver.quit()

    links = []
    for a in soup.find_all("a", href=re.compile(r"/paper/|/article/")):
        href = a["href"]
        if "scripts" in href or "pers" in href:
            continue
        links.append(urljoin(BASE_URL, href))

    return list(dict.fromkeys(links))

# ---------------------------------------------------------
# SCRAPING MÉTADONNÉES
# ---------------------------------------------------------
def get_paper_details(url, jel_cat):

    try:
        soup = BeautifulSoup(
            requests.get(url, headers=HEADERS, timeout=10).text,
            "html.parser"
        )

        # --- TITRE ---
        title = soup.find("h1", class_="colored").get_text(strip=True)

        # --- ANNÉE ---
        year = None
        date = soup.find("b", string=re.compile("Date:"))
        if date:
            m = re.search(r"\d{4}", date.next_sibling or "")
            if m:
                year = int(m.group())

        # --- AUTEURS (SOURCE FIABLE) ---
        authors = [
            m["content"].replace(",", "")
            for m in soup.find_all("meta", {"name": "citation_author"})
        ]

        if not authors:
            authors = ["Voir texte"]
        # --- JOURNAL ---
        journal = None
        journal_meta = soup.find("meta", {"name": "citation_journal_title"})
        if journal_meta:
            journal = journal_meta.get("content", None)

        # --- AFFILIATIONS ---
        affil = soup.find("span", id="contact")
        affiliations = affil.get_text(" ", strip=True) if affil else None

        # --- TYPE ---
        pub_type = "Journal Article" if "/article/" in url else "Working Paper"

        return {
            "JEL Subject": jel_cat,
            "Title": title,
            "Author(s)": "; ".join(authors),
            "Journal": journal, 
            "Year": year,
            "Type": pub_type,
            "Affiliations": affiliations,
            "URL": url
        }

    except Exception:
        return None

# ---------------------------------------------------------
# MAIN
# ---------------------------------------------------------
def main(csv_filename):

    data = []

    for jel in JEL_LETTERS:
        links = get_links_with_selenium(jel)
        for link in links:
            d = get_paper_details(link, jel)
            if d:
                data.append(d)

    df = pd.DataFrame(data).drop_duplicates("URL")
    df.to_csv(csv_filename, index=False)

# ---------------------------------------------------------
# RUN
# ---------------------------------------------------------
if __name__ == "__main__":

    out = "RePEc_Final_Dataset_Corrected.csv"
    if os.path.exists(out):
        os.remove(out)

    main(out)

#### Data cleaning

In [3]:
df = pd.read_csv("RePEc_Final_Dataset_Corrected.csv")
import pandas as pd
import numpy as np

def clean_affiliation(text):
    if pd.isna(text):
        return np.nan
    
    # Si le format est "Auteur: Affiliation"
    if ":" in text:
        return text.split(":", 1)[1].strip()
    
    return text.strip()

df["Affiliations"] = df["Affiliations"].apply(clean_affiliation)
df = df.iloc[2:].reset_index(drop=True)
df.head(20)

Unnamed: 0,JEL Subject,Title,Author(s),Journal,Year,Type,Affiliations,URL
0,A,Preparing students for careers using business ...,Nielsen Erland Hejn; Nielsen Steen,,2020.0,Working Paper,Department of Economics and Business Economics...,https://econpapers.repec.org/paper/aahaarhec/2...
1,A,"Measuring Democracy - Eight indices: Polity, F...",Paldam Martin,,2021.0,Working Paper,Department of Economics and Business Economics...,https://econpapers.repec.org/paper/aahaarhec/2...
2,A,Oeconstudiet og den ÃÂ¸konomiske faggruppe ve...,Hylleberg Svend,,2023.0,Working Paper,Department of Economics and Business Economics...,https://econpapers.repec.org/paper/aahaarhec/2...
3,A,Digital Tools in the Educational Environment E...,Andra Diaconescu,Research & Education,2024.0,Journal Article,"Politehnica University of Timisoara, Faculty o...",https://econpapers.repec.org/article/aaijournl...
4,A,On the Gender Diversity of Research Teams in E...,Biermann Marcus,AEA Papers and Proceedings,2023.0,Journal Article,,https://econpapers.repec.org/article/aeaapandp...
5,A,Messages That Foster a Sense of Belonging Impr...,Forcada Sara Avila,AEA Papers and Proceedings,2023.0,Journal Article,,https://econpapers.repec.org/article/aeaapandp...
6,A,Parenthood and Academic Career Trajectories,Lassen Anne Sophie; IvandiÄ Ria,AEA Papers and Proceedings,2024.0,Journal Article,,https://econpapers.repec.org/article/aeaapandp...
7,A,Impact versus Inclusion in the Economics Profe...,Bansak Cynthia; Dunn Wendy; Meade Ellen; Starr...,AEA Papers and Proceedings,2024.0,Journal Article,,https://econpapers.repec.org/article/aeaapandp...
8,A,Teaching-Track Economists: A Canadian Perspective,Murdock Jennifer; Cohen Avi,AEA Papers and Proceedings,2024.0,Journal Article,,https://econpapers.repec.org/article/aeaapandp...
9,A,"Male Is a Gender, Too: A Review of Why Gender ...",Nelson Julie,Journal of Economic Literature,2016.0,Journal Article,,https://econpapers.repec.org/article/aeajeclit...


In [4]:
df.iloc[14,7]

'https://econpapers.repec.org/article/aeajecper/v_3a38_3ay_3a2024_3ai_3a3_3ap_3a191-208.htm'

In [10]:
import requests

url = 'http://ideas.repec.org/e/pal184.html'
html = requests.get(url).text

with open("author_page_debug.html", "w", encoding="utf-8") as f:
    f.write(html)

print("HTML auteur sauvegardé")


HTML auteur sauvegardé


In [21]:
if __name__ == "__main__":
    df_authors = scrape_author_index(max_authors=20)
    print("Authors found:", len(df_authors))

    df_affil = build_author_affiliation_table(df_authors)
    df_affil.to_csv("RePEc_Author_Affiliations_TEST20.csv", index=False)

    print("Saved author affiliations (TEST 20)")


Authors found: 20
Saved author affiliations (TEST 20)


In [4]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import time

BASE_URL = "https://ideas.repec.org"
INDEX_URL = "https://ideas.repec.org/i/eall.html"
LETTERS = ["A", "B", "C"]  # test A-C
HEADERS = {"User-Agent": "Mozilla/5.0"}


# -------------------------------------------------
# 1. Scrape author index
# -------------------------------------------------
def scrape_author_index(max_authors=None):
    resp = requests.get(INDEX_URL, headers=HEADERS)
    resp.raise_for_status()

    soup = BeautifulSoup(resp.text, "html.parser")
    authors = []

    for letter in LETTERS:
        anchor = soup.find("a", {"name": letter})
        if not anchor:
            continue

        table = anchor.find_next("table")
        if not table:
            continue

        for a in table.find_all("a", href=True):
            href = a["href"]

            if not href.endswith(".html"):
                continue

            author_name = a.get_text(strip=True)
            author_url = urljoin(BASE_URL, href)
            short_id = href.split("/")[-1].replace(".html", "")

            authors.append({
                "author_name": author_name,
                "short_id": short_id,
                "author_url": author_url
            })

            if max_authors and len(authors) >= max_authors:
                return pd.DataFrame(authors)

    return pd.DataFrame(authors)


# -------------------------------------------------
# 2. Scrape affiliation from author page
# -------------------------------------------------
def scrape_author_affiliation(author_url):
    try:
        resp = requests.get(author_url, headers=HEADERS, timeout=10)
        resp.raise_for_status()
    except Exception:
        return {
            "institution": None,
            "location": None,
            "repec_institution_id": None
        }

    soup = BeautifulSoup(resp.text, "html.parser")

    aff_div = soup.find("div", id="affiliation")
    if not aff_div:
        return {
            "institution": None,
            "location": None,
            "repec_institution_id": None
        }

    institution = None
    location = None
    repec_id = None

    h3 = aff_div.find("h3")
    if h3:
        institution = h3.get_text(strip=True)

    loc = aff_div.find("span", class_="locationlabel")
    if loc:
        location = loc.get_text(strip=True)

    handle = aff_div.find("span", class_="handlelabel")
    if handle:
        repec_id = handle.get_text(strip=True).replace("RePEc:", "")

    return {
        "institution": institution,
        "location": location,
        "repec_institution_id": repec_id
    }


# -------------------------------------------------
# 3. Build affiliation table
# -------------------------------------------------
def build_author_affiliation_table(df_authors, sleep_time=1):
    rows = []

    for i, row in df_authors.iterrows():
        print(f"[{i+1}/{len(df_authors)}] Scraping {row['author_name']}")

        aff = scrape_author_affiliation(row["author_url"])

        rows.append({
            "author_name": row["author_name"],
            "short_id": row["short_id"],
            "institution": aff["institution"],
            "location": aff["location"],
            "repec_institution_id": aff["repec_institution_id"],
            "author_url": row["author_url"]
        })

        time.sleep(sleep_time)  # IMPORTANT pour RePEc

    return pd.DataFrame(rows)


# -------------------------------------------------
# 4. Main
# -------------------------------------------------
if __name__ == "__main__":
    df_authors = scrape_author_index(max_authors=20)
    print("Authors found:", len(df_authors))

    df_affil = build_author_affiliation_table(df_authors)
    df_affil.to_csv("RePEc_Author_Affiliations_TEST20.csv", index=False)

    print("Saved author affiliations (TEST 20)")

Authors found: 20
[1/20] Scraping A, Arun Kumar
[2/20] Scraping Alamri, Yosef A.
[3/20] Scraping Antelius, Jesper
[4/20] Scraping A, Selvarasu
[5/20] Scraping Alan, Sule
[6/20] Scraping Antell, Jan Wilhelm
[7/20] Scraping Aaberge, Rolf
[8/20] Scraping Alananga, Samwel Sanga
[9/20] Scraping Antelo, Manel
[10/20] Scraping Aad, Samar S
[11/20] Scraping Alani, Ezekiel Ayinde
[12/20] Scraping Antenord, Jean-Baptiste
[13/20] Scraping Aadland, David
[14/20] Scraping Alao, Abdul-Azeez Adeniyi
[15/20] Scraping Anthoff, David
[16/20] Scraping Aakvik, Arild
[17/20] Scraping Alaoui, Larbi
[18/20] Scraping Anthony, Johnson Ukwumonu
[19/20] Scraping Aalbers, Rob
[20/20] Scraping Alarcon, David
Saved author affiliations (TEST 20)


In [11]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import time

BASE_URL = "https://ideas.repec.org"
INDEX_URL = "https://ideas.repec.org/i/eall.html"
LETTERS = ["A", "B", "C"]  # test A-C
HEADERS = {"User-Agent": "Mozilla/5.0"}

# -------------------------------------------------
# 1. Scrape author index
# -------------------------------------------------
def scrape_author_index(max_authors=None):
    resp = requests.get(INDEX_URL, headers=HEADERS)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    authors = []

    for letter in LETTERS:
        anchor = soup.find("a", {"name": letter})
        if not anchor:
            continue

        table = anchor.find_next("table")
        if not table:
            continue

        for a in table.find_all("a", href=True):
            href = a["href"]
            if not href.endswith(".html"):
                continue

            author_name = a.get_text(strip=True)
            author_url = urljoin(BASE_URL, href)
            short_id = href.split("/")[-1].replace(".html", "")

            authors.append({
                "author_name": author_name,
                "short_id": short_id,
                "author_url": author_url
            })

            if max_authors and len(authors) >= max_authors:
                return pd.DataFrame(authors)

    return pd.DataFrame(authors)

# -------------------------------------------------
# 2. Scrape ALL affiliations from author page
# -------------------------------------------------
def scrape_author_affiliations(author_url):
    try:
        resp = requests.get(author_url, headers=HEADERS, timeout=10)
        resp.raise_for_status()
    except Exception:
        return []

    soup = BeautifulSoup(resp.text, "html.parser")
    aff_div = soup.find("div", id="affiliation")

    if not aff_div:
        return []

    affiliations = []
    h3_blocks = aff_div.find_all("h3")

    for h3 in h3_blocks:
        institution_text = " ".join(h3.stripped_strings)

        # extract percentage if present
        share_pct = None
        if institution_text.startswith("("):
            share_pct = institution_text.split(")")[0].replace("(", "").strip()
            institution_text = institution_text.split(")", 1)[1].strip()

        location = None
        repec_id = None

        for sib in h3.find_next_siblings():
            if sib.name == "h3":
                break
            if sib.name == "span" and "locationlabel" in sib.get("class", []):
                location = sib.get_text(strip=True)
            if sib.name == "span" and "handlelabel" in sib.get("class", []):
                repec_id = sib.get_text(strip=True).replace("RePEc:", "")

        affiliations.append({
            "institution": institution_text,
            "share_pct": share_pct,
            "location": location,
            "repec_institution_id": repec_id
        })

    return affiliations

# -------------------------------------------------
# 3. Build affiliation table (LONG FORMAT)
# -------------------------------------------------
def build_author_affiliation_table(df_authors, sleep_time=1):
    rows = []

    for i, row in df_authors.iterrows():
        print(f"[{i+1}/{len(df_authors)}] Scraping {row['author_name']}")

        affils = scrape_author_affiliations(row["author_url"])

        # if no affiliation found, still keep author
        if not affils:
            rows.append({
                "author_name": row["author_name"],
                "short_id": row["short_id"],
                "institution": None,
                "share_pct": None,
                "location": None,
                "repec_institution_id": None,
                "author_url": row["author_url"]
            })
        else:
            for aff in affils:
                rows.append({
                    "author_name": row["author_name"],
                    "short_id": row["short_id"],
                    "institution": aff["institution"],
                    "share_pct": aff["share_pct"],
                    "location": aff["location"],
                    "repec_institution_id": aff["repec_institution_id"],
                    "author_url": row["author_url"]
                })

        time.sleep(sleep_time)  # IMPORTANT pour RePEc

    return pd.DataFrame(rows)

# -------------------------------------------------
# 4. Main
# -------------------------------------------------
if __name__ == "__main__":
    df_authors = scrape_author_index(max_authors=20)
    print("Authors found:", len(df_authors))

    df_affil = build_author_affiliation_table(df_authors)
    df_affil.to_csv("RePEc_Author_Affiliations_TEST20.csv", index=False)

    print("Saved author affiliations (TEST 20)")

Authors found: 20
[1/20] Scraping A, Arun Kumar
[2/20] Scraping Alamri, Yosef A.
[3/20] Scraping Antelius, Jesper
[4/20] Scraping A, Selvarasu
[5/20] Scraping Alan, Sule
[6/20] Scraping Antell, Jan Wilhelm
[7/20] Scraping Aaberge, Rolf
[8/20] Scraping Alananga, Samwel Sanga
[9/20] Scraping Antelo, Manel
[10/20] Scraping Aad, Samar S
[11/20] Scraping Alani, Ezekiel Ayinde
[12/20] Scraping Antenord, Jean-Baptiste
[13/20] Scraping Aadland, David
[14/20] Scraping Alao, Abdul-Azeez Adeniyi
[15/20] Scraping Anthoff, David
[16/20] Scraping Aakvik, Arild
[17/20] Scraping Alaoui, Larbi
[18/20] Scraping Anthony, Johnson Ukwumonu
[19/20] Scraping Aalbers, Rob
[20/20] Scraping Alarcon, David
Saved author affiliations (TEST 20)


In [12]:
df_affil

Unnamed: 0,author_name,short_id,institution,share_pct,location,repec_institution_id,author_url
0,"A, Arun Kumar",paa30,The ICFAI Foundation for Higher Education,50%,Hyderabad,,https://ideas.repec.org/e/paa30.html
1,"Alamri, Yosef A.",pal932,Department of Agricultural Economics Universit...,,"Lexington, Kentucky (United States)",edi:daukyus,https://ideas.repec.org/f/pal932.html
2,"Antelius, Jesper",pan241,Riksrevisionen Government of Sweden,,"Stockholm, Sweden",edi:srrgvse,https://ideas.repec.org/f/pan241.html
3,"A, Selvarasu",pmu263,Department of Business Administration Annamala...,,"Annamalai Nagar, India",edi:dbannin,https://ideas.repec.org/f/pmu263.html
4,"Alan, Sule",pal184,Department of Economics European University In...,73%,"Firenze, Italy",edi:deiueit,https://ideas.repec.org/e/pal184.html
5,"Alan, Sule",pal184,İktisat Bölümü Bilkent Üniversitesi,23%,"Ankara, Turkey",edi:debiltr,https://ideas.repec.org/e/pal184.html
6,"Alan, Sule",pal184,Abdul Latif Jameel Poverty Action Lab (J-PAL) ...,4%,"Cambridge, Massachusetts (United States)",edi:jpmitus,https://ideas.repec.org/e/pal184.html
7,"Antell, Jan Wilhelm",pan150,Hanken Svenska Handelshögskolan,,"Helsinki, Finland",edi:shhhhfi,https://ideas.repec.org/e/pan150.html
8,"Aaberge, Rolf",paa6,Statistisk Sentralbyrå Government of Norway,20%,"Oslo, Norway",edi:ssbgvno,https://ideas.repec.org/e/paa6.html
9,"Aaberge, Rolf",paa6,Økonomisk institutt Universitetet i Oslo,80%,"Oslo, Norway",edi:souiono,https://ideas.repec.org/e/paa6.html


In [None]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import time

BASE_URL = "https://ideas.repec.org"
INDEX_URL = "https://ideas.repec.org/i/eall.html"
LETTERS = ["A", "B", "C"]  # test A–C
HEADERS = {"User-Agent": "Mozilla/5.0"}

# -------------------------------------------------
# UE country codes in EDIRC (suffix)
# -------------------------------------------------
UE_COUNTRY_CODES = [
    "at", "be", "bg", "hr", "cy", "cz", "dk", "ee", "fi", "fr",
    "de", "gr", "hu", "ie", "it", "lv", "lt", "lu", "mt", "nl",
    "pl", "pt", "ro", "sk", "si", "es", "se"
]

def is_ue_edi(repec_id):
    """
    EDIRC ids look like edi:deiueit, edi:debiltr, etc.
    We check the last 2 letters = country code
    """
    if not repec_id:
        return False
    code = repec_id.lower()[-2:]
    return code in UE_COUNTRY_CODES

# -------------------------------------------------
# 1. Scrape author index
# -------------------------------------------------
def scrape_author_index():
    resp = requests.get(INDEX_URL, headers=HEADERS)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    authors = []

    for letter in LETTERS:
        anchor = soup.find("a", {"name": letter})
        if not anchor:
            continue

        table = anchor.find_next("table")
        if not table:
            continue

        for a in table.find_all("a", href=True):
            href = a["href"]
            if not href.endswith(".html"):
                continue

            authors.append({
                "author_name": a.get_text(strip=True),
                "short_id": href.split("/")[-1].replace(".html", ""),
                "author_url": urljoin(BASE_URL, href)
            })

    return authors

# -------------------------------------------------
# 2. Scrape ALL affiliations
# -------------------------------------------------
def scrape_author_affiliations(author_url):
    try:
        resp = requests.get(author_url, headers=HEADERS, timeout=10)
        resp.raise_for_status()
    except Exception:
        return []

    soup = BeautifulSoup(resp.text, "html.parser")
    aff_div = soup.find("div", id="affiliation")

    if not aff_div:
        return []

    affiliations = []

    for h3 in aff_div.find_all("h3"):
        institution = " ".join(h3.stripped_strings)

        share_pct = None
        if institution.startswith("("):
            share_pct = institution.split(")")[0].replace("(", "").strip()
            institution = institution.split(")", 1)[1].strip()

        location = None
        repec_id = None

        for sib in h3.find_next_siblings():
            if sib.name == "h3":
                break
            if sib.name == "span" and "locationlabel" in sib.get("class", []):
                location = sib.get_text(strip=True)
            if sib.name == "span" and "handlelabel" in sib.get("class", []):
                repec_id = sib.get_text(strip=True).replace("RePEc:", "")

        affiliations.append({
            "institution": institution,
            "share_pct": share_pct,
            "location": location,
            "repec_institution_id": repec_id
        })

    return affiliations

# -------------------------------------------------
# 3. MAIN – EDIRC-FIRST TEST (20 UE AUTHORS)
# -------------------------------------------------
if __name__ == "__main__":

    authors = scrape_author_index()

    rows = []
    ue_authors_seen = set()

    for i, author in enumerate(authors):
        print(f"[{i+1}] Checking {author['author_name']}")

        affils = scrape_author_affiliations(author["author_url"])

        # keep only UE affiliations (EDIRC-based)
        affils_ue = [
            a for a in affils if is_ue_edi(a["repec_institution_id"])
        ]

        if not affils_ue:
            time.sleep(0.5)
            continue

        # add ALL UE affiliations for this author
        for aff in affils_ue:
            rows.append({
                "author_name": author["author_name"],
                "short_id": author["short_id"],
                "institution": aff["institution"],
                "share_pct": aff["share_pct"],
                "location": aff["location"],
                "repec_institution_id": aff["repec_institution_id"],
                "author_url": author["author_url"]
            })

        ue_authors_seen.add(author["short_id"])

        if len(ue_authors_seen) >= 20:
            print("Reached 20 UE authors – STOP")
            break

        time.sleep(0.5)

    df_test = pd.DataFrame(rows)
    df_test.to_csv("RePEc_UE_EDIRC_TEST20.csv", index=False)

    print("Saved RePEc_UE_EDIRC_TEST20.csv")

[1] Checking A, Arun Kumar
[2] Checking Alamri, Yosef A.
[3] Checking Antelius, Jesper
[4] Checking A, Selvarasu
[5] Checking Alan, Sule
[6] Checking Antell, Jan Wilhelm
[7] Checking Aaberge, Rolf
[8] Checking Alananga, Samwel Sanga
[9] Checking Antelo, Manel
[10] Checking Aad, Samar S
[11] Checking Alani, Ezekiel Ayinde
[12] Checking Antenord, Jean-Baptiste
[13] Checking Aadland, David
[14] Checking Alao, Abdul-Azeez Adeniyi
[15] Checking Anthoff, David
[16] Checking Aakvik, Arild
[17] Checking Alaoui, Larbi
[18] Checking Anthony, Johnson Ukwumonu
[19] Checking Aalbers, Rob
[20] Checking Alarcon, David
[21] Checking Anthony-Orji, Onyinye Imelda
[22] Checking Aalto, Aino-Maija
[23] Checking Alarcon, Jorge Victor
[24] Checking Anthropelos, Michail
[25] Checking Aamir, Suhaib
[26] Checking Alarcon, Pedro
[27] Checking Antigo, Mariangela Furlan
[28] Checking Aamir Khan, Muhammad
[29] Checking Alarcon Gambarte, Samuel
[30] Checking Antimiani, Alessandro
[31] Checking Aanderud, Philip
[32] 

In [14]:
df_test

Unnamed: 0,author_name,short_id,institution,share_pct,location,repec_institution_id,author_url
0,"Antelius, Jesper",pan241,Riksrevisionen Government of Sweden,,"Stockholm, Sweden",edi:srrgvse,https://ideas.repec.org/f/pan241.html
1,"Alan, Sule",pal184,Department of Economics European University In...,73%,"Firenze, Italy",edi:deiueit,https://ideas.repec.org/e/pal184.html
2,"Antell, Jan Wilhelm",pan150,Hanken Svenska Handelshögskolan,,"Helsinki, Finland",edi:shhhhfi,https://ideas.repec.org/e/pan150.html
3,"Antelo, Manel",pan291,Departamento de Fundamentos da Análise Económi...,,"Santiago de Compostela, Spain",edi:dfusces,https://ideas.repec.org/f/pan291.html
4,"Antenord, Jean-Baptiste",pan582,Lille Économie et Management (LEM),,"Lille, France",edi:laborfr,https://ideas.repec.org/f/pan582.html
5,"Alaoui, Larbi",pal299,Departament d'Economia i Empresa Universitat P...,,"Barcelona, Spain",edi:deupfes,https://ideas.repec.org/f/pal299.html
6,"Aalbers, Rob",paa8,Centraal Planbureau (CPB) Government of the Ne...,,"Den Haag, Netherlands",edi:cpbgvnl,https://ideas.repec.org/e/paa8.html
7,"Aalto, Aino-Maija",paa28,Nationalekonomi Fakulteten för Samhällsvetensk...,50%,"Turku, Finland",edi:niabofi,https://ideas.repec.org/e/paa28.html
8,"Aalto, Aino-Maija",paa28,Institutet för Social Forskning (SOFI) Stockho...,50%,"Stockholm, Sweden",edi:sofsuse,https://ideas.repec.org/e/paa28.html
9,"Anthropelos, Michail",pan319,Department of Banking and Financial Management...,,"Piraeus, Greece",edi:dfpirgr,https://ideas.repec.org/f/pan319.html
