#### Fetching data

In [None]:
import time
import re
import os
import pandas as pd
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# --- SELENIUM IMPORTS ---
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import Select

# --- CONFIGURATION ---
BASE_URL = "https://econpapers.repec.org"
#JEL_LETTERS = [chr(i) for i in range(ord('A'), ord('Z')+1)] # A √† Z
JEL_LETTERS = ['A', 'B', 'C'] # Pour test rapide, tu peux limiter √† quelques lettres
LIMIT_PER_CATEGORY = 20 # Ta strat√©gie : 20, 30, 50, 100, 500, 1000

# Headers pour la partie Requests (Deep scraping)
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'From': 'researcher@university.edu'
}

# ---------------------------------------------------------
# PARTIE 1 : SELENIUM (R√©cup√©rer les liens via le bouton Search)
# ---------------------------------------------------------
def get_links_with_selenium(jel_letter):
    """
    Ouvre le navigateur, remplit l'URL avec les params, clique sur Search
    et r√©cup√®re les liens des r√©sultats.
    """
    options = webdriver.ChromeOptions()
    # options.add_argument("--headless") # D√©commente pour ne pas voir la fen√™tre (plus rapide)
    options.add_argument("--disable-gpu")
    
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    paper_links = []
    
    # URL optimis√©e avec tes param√®tres
    # jel={letter}* -> Cat√©gorie
    # ni=10+years -> 10 ans
    # inpage=1000 -> Pour √™tre s√ªr d'en avoir 500 sur une page
    target_url = f"https://econpapers.repec.org/scripts/search.pf?jel={jel_letter}*&ni=10%20years&inpage=1000"
    
    print(f"   [Selenium] Navigation vers : {target_url}")
    
    try:
        driver.get(target_url)
        
        # 1. Attendre et Cliquer sur le bouton "Search"
        # Sur EconPapers, le bouton est souvent un input type="submit" value="Search"
        wait = WebDriverWait(driver, 10)
        
        # On cherche le bouton submit
        search_btn = wait.until(EC.element_to_be_clickable((By.XPATH, "//input[@type='SUBMIT' and @value='Search!']")))
        search_btn.click()
        
        print(f"   [Selenium] Bouton cliqu√©. Attente des r√©sultats...")
        
        dropdown = wait.until(EC.presence_of_element_located((By.ID, "inpage1")))
        select = Select(dropdown)
        select.select_by_value(f"{LIMIT_PER_CATEGORY}")
        print(f"   [Selenium] 2. Menu d√©roulant r√©gl√© sur {LIMIT_PER_CATEGORY}.")
        
        # 2. Attendre que les r√©sultats chargent
        page_source = driver.page_source
        driver.quit()
        
        # --- C. PARSING BEAUTIFULSOUP (ADAPT√â √Ä TON SNIPPET) ---
        soup = BeautifulSoup(page_source, 'html.parser')
        
        # ICI C'EST LE CHANGEMENT IMPORTANT
        # On cherche les liens qui contiennent '/paper/' OU '/article/' OU 'RePEc:'
        # Ton snippet montre: href="/paper/..."
        raw_links = soup.find_all('a', href=re.compile(r'/paper/|/article/|RePEc:'))
        
        seen = set()
        for link in raw_links:
            href = link['href']
            # Filtres de s√©curit√©
            if 'pers' not in href and 'scripts' not in href and 'inst' not in href and 'ras' not in href:
                full_url = urljoin(BASE_URL, href)
                if full_url not in seen:
                    paper_links.append(full_url)
                    seen.add(full_url)
        
        print(f"   -> {len(paper_links)} liens trouv√©s pour JEL {jel_letter}.")

    except Exception as e:
        print(f"   [Erreur Selenium] : {e}")
        if driver: driver.quit()
        
    return paper_links

# ---------------------------------------------------------
# PARTIE 2 : REQUESTS + BS4 (Scraping des d√©tails)
# ---------------------------------------------------------
def get_paper_details(url, jel_cat):
    """
    Scrape les m√©tadonn√©es en ciblant pr√©cis√©ment les balises HTML fournies :
    - Date : via <b>Date:</b>
    - Auteurs : via <i>
    - Affiliations : via <span id="contact">
    """
    try:
        resp = requests.get(url, headers=HEADERS, timeout=10)
        if resp.status_code != 200: return None
        
        soup = BeautifulSoup(resp.content, 'html.parser')
        
        # 1. TITRE
        title_tag = soup.find('h1', class_='colored')
        title = title_tag.get_text(strip=True) if title_tag else "N/A"

        # 2. DATE / ANN√âE (Ciblage pr√©cis selon ton HTML)
        year = "N/A"
        # On cherche la balise <b> qui contient exactement "Date:"
        date_label = soup.find('b', string=re.compile(r"Date:"))
        
        if date_label:
            # On prend le texte juste apr√®s la balise <b> (le "next_sibling")
            date_text = date_label.next_sibling
            if date_text:
                # Ex: "2020-08-06" -> on nettoie et on garde l'ann√©e
                full_date = date_text.strip() 
                # On extrait les 4 premiers chiffres (L'ann√©e)
                match = re.search(r'\d{4}', full_date)
                if match:
                    year = int(match.group(0))
        
        # Fallback : Si la balise <b>Date:</b> n'existe pas, on cherche une ann√©e dans tout le texte
        if year == "N/A":
            match_fallback = re.search(r'(201[5-9]|202[0-5])', soup.get_text())
            if match_fallback:
                year = int(match_fallback.group(0))

        # 3. AUTEURS (Via les balises <i>)
        authors = []
        balise_italique = soup.find('i')

        # On cherche la balise <a> √† l'int√©rieur de celle qu'on vient de trouver
        balise_lien = balise_italique.find('a')
        if balise_lien:
            auteur_principal = balise_lien.get_text(strip=True)
            authors.append(auteur_principal)
        
        content_div = soup.find('div', id='body')
        if content_div:
            # On cherche les auteurs en italique, en √©vitant "Department..."
            candidates = content_div.find_all('i', limit=15)
            for tag in candidates:
                text = tag.get_text(strip=True)
                # Filtres pour ne garder que les noms de personnes probables
                if text and "Department" not in text and "University" not in text and len(text) < 50:
                    if text not in authors:
                        authors.append(text)
        # enlever les doublons
        authors = list(dict.fromkeys(authors))
                            

        # 4. AFFILIATIONS (Via le span cach√© id="contact")
        affiliations = "Voir texte"
        contact_span = soup.find('span', id='contact')
        if contact_span:
            # R√©cup√®re le texte cach√© nettoy√©
            affiliations = contact_span.get_text(separator=" ", strip=True)

        # 5. JEL CODES (Via <b>JEL-codes:</b>)
        jel_codes = "N/A"
        jel_label = soup.find('b', string=re.compile(r"JEL-codes:"))
        if jel_label:
            # On prend le texte parent qui contient les codes et les liens
            jel_codes = jel_label.parent.get_text(strip=True).replace("JEL-codes:", "").strip()

        # 6. TYPE DE PUBLICATION & S√âRIE
        pub_type = "Journal Article" if "/article/" in url else "Working Paper"
        
        # Pour la s√©rie, on cherche le lien de l'institution souvent au d√©but
        series_name = "N/A"
        # Cherche un lien contenant 'Department' ou 'University' qui n'est pas un auteur
        series_link = soup.find('a', string=re.compile(r'(Department|University|School|Institute)'))
        if series_link:
            series_name = series_link.get_text(strip=True)

        return {
            "JEL Subject": jel_cat,
            "Title": title,
            "Author(s)": "; ".join(authors),
            "Year": year,
            "Type": pub_type,
            "Series/Journal": series_name,
            "Affiliations": affiliations,
            "URL": url
        }

    except Exception as e:
        # print(f"Erreur extraction {url}: {e}")
        return None

# ---------------------------------------------------------
# EX√âCUTION PRINCIPALE
# ---------------------------------------------------------
def main(csv_filename):
    all_data = []
    
    print(f"--- D√âMARRAGE HYBRIDE (Selenium + Requests) ---")
    
    for letter in JEL_LETTERS:
        print(f"\n>>> TRAITEMENT CAT√âGORIE : {letter}")
        
        # 1. Obtenir les liens via Selenium (C'est l√† qu'on clique sur le bouton)
        links = get_links_with_selenium(letter)
        print(f"   -> {len(links)} liens r√©cup√©r√©s. Passage √† l'extraction des donn√©es...")
        
        # 2. Scraper les d√©tails via Requests (Beaucoup plus rapide)
        count = 0
        for link in links:
            # Petite pause politesse
            # time.sleep(0.1) 
            
            data = get_paper_details(link, letter)
            if data:
                all_data.append(data)
                count += 1
                if count % 50 == 0:
                    print(f"      [{count}/{len(links)}] {data['Title'][:40]}...")

    # Sauvegarde
    df = pd.DataFrame(all_data)
    df.to_csv(csv_filename, index=False)
    print("\nTermin√© ! Donn√©es sauvegard√©es.")

if __name__ == "__main__":
    csv_filename = "RePEc_Final_Dataset_Corrected.csv"

    # V√©rifie si le fichier existe et le supprime
    if os.path.exists(csv_filename):
        os.remove(csv_filename)
        print(f"‚ôªÔ∏è Ancien fichier '{csv_filename}' supprim√©. On repart de z√©ro !")
    else:
        print(f"üÜï Aucun ancien fichier '{csv_filename}' trouv√©. Cr√©ation d'un nouveau.")
    main(csv_filename)

‚ôªÔ∏è Ancien fichier 'RePEc_Final_Dataset_Corrected.csv' supprim√©. On repart de z√©ro !
--- D√âMARRAGE HYBRIDE (Selenium + Requests) ---

>>> TRAITEMENT CAT√âGORIE : A
   [Selenium] Navigation vers : https://econpapers.repec.org/scripts/search.pf?jel=A*&ni=10%20years&inpage=1000
   [Selenium] Bouton cliqu√©. Attente des r√©sultats...
   [Selenium] 2. Menu d√©roulant r√©gl√© sur 20.
   -> 22 liens trouv√©s pour JEL A.
   -> 22 liens r√©cup√©r√©s. Passage √† l'extraction des donn√©es...

>>> TRAITEMENT CAT√âGORIE : B
   [Selenium] Navigation vers : https://econpapers.repec.org/scripts/search.pf?jel=B*&ni=10%20years&inpage=1000
   [Selenium] Bouton cliqu√©. Attente des r√©sultats...
   [Selenium] 2. Menu d√©roulant r√©gl√© sur 20.
   -> 22 liens trouv√©s pour JEL B.
   -> 22 liens r√©cup√©r√©s. Passage √† l'extraction des donn√©es...

>>> TRAITEMENT CAT√âGORIE : C
   [Selenium] Navigation vers : https://econpapers.repec.org/scripts/search.pf?jel=C*&ni=10%20years&inpage=1000
   [Selenium

#### Data cleaning

In [16]:
#csv to pandas dataframe
df = pd.read_csv("RePEc_Final_Dataset_Corrected.csv")

# remove irrelevant lines
df = df.loc[(df['Title'] != "Journals") & (df['Title'] != "Working Paper Series")]
df['Author(s)'] = df['Affiliations'].str.split(r' in | from |:', n=1).str[0]
df['Affiliations'] = df['Affiliations'].str.split(r' in | from |:', n=1).str[1]
df.to_csv("RePEc_dataset.csv", index=False)

df.head(20)




Unnamed: 0,JEL Subject,Title,Author(s),Year,Type,Series/Journal,Affiliations,URL
0,A,Preparing students for careers using business ...,Erland Hejn Nielsen,2020,Working Paper,Department of Economics and Business Economics...,Department of Economics and Business Economic...,https://econpapers.repec.org/paper/aahaarhec/2...
1,A,"Measuring Democracy - Eight indices: Polity, F...",Martin Paldam,2021,Working Paper,Department of Economics and Business Economics...,Department of Economics and Business Economic...,https://econpapers.repec.org/paper/aahaarhec/2...
2,A,Oeconstudiet og den √É¬∏konomiske faggruppe ved ...,Svend Hylleberg,2023,Working Paper,Department of Economics and Business Economics...,Department of Economics and Business Economic...,https://econpapers.repec.org/paper/aahaarhec/2...
3,A,Digital Tools in the Educational Environment E...,Diaconescu Andra,2024,Journal Article,School of Business,"Politehnica University of Timisoara, Faculty ...",https://econpapers.repec.org/article/aaijournl...
4,A,On the Gender Diversity of Research Teams in E...,Voir texte,2023,Journal Article,School of Business,,https://econpapers.repec.org/article/aeaapandp...
5,A,Messages That Foster a Sense of Belonging Impr...,Voir texte,2023,Journal Article,School of Business,,https://econpapers.repec.org/article/aeaapandp...
6,A,Parenthood and Academic Career Trajectories,Voir texte,2024,Journal Article,School of Business,,https://econpapers.repec.org/article/aeaapandp...
7,A,Impact versus Inclusion in the Economics Profe...,Voir texte,2024,Journal Article,School of Business,,https://econpapers.repec.org/article/aeaapandp...
8,A,Teaching-Track Economists: A Canadian Perspective,Voir texte,2024,Journal Article,School of Business,,https://econpapers.repec.org/article/aeaapandp...
9,A,"Male Is a Gender, Too: A Review of Why Gender ...",Voir texte,2016,Journal Article,School of Business,,https://econpapers.repec.org/article/aeajeclit...
