# In [None]:
import re
import os
import pandas as pd
import requests
import string
import time
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# --- SELENIUM IMPORTS ---
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

# ---------------------------------------------------------
# CONFIGURATION
# ---------------------------------------------------------
# Root of the site; relative hrefs and search URLs are resolved against it.
BASE_URL = "https://econpapers.repec.org"
# One entry per uppercase ASCII letter, "A" through "Z", in alphabetical order.
JEL_LETTERS = list(string.ascii_uppercase)
# Output file that receives one row per scraped record.
CSV_FILENAME = "RePEc_Full_Database.csv"
# HTTP headers sent with every `requests` call.
HEADERS = {"User-Agent": "Mozilla/5.0 (Research Scraping)"}

# ---------------------------------------------------------
# FUNCTION: COLLECT THE LINKS OF A PAGE
# ---------------------------------------------------------
def get_links_from_current_page(driver):
    """Return the unique paper/article URLs found on the driver's current page.

    Args:
        driver: Object exposing a ``page_source`` attribute holding the HTML
            of the page currently loaded.

    Returns:
        A list of absolute URLs (resolved against ``BASE_URL``), in order of
        first appearance, with duplicates removed.
    """
    parsed = BeautifulSoup(driver.page_source, "html.parser")
    wanted_href = re.compile(r"/paper/|/article/")

    # A dict keeps insertion order, so it doubles as an ordered set.
    unique_urls = {}
    for anchor in parsed.find_all("a", href=wanted_href):
        target = anchor["href"]
        # NOTE(review): the "pers" test also rejects any href that contains
        # "papers" (e.g. inside a series handle) -- confirm this is intended.
        if "scripts" not in target and "pers" not in target:
            unique_urls.setdefault(urljoin(BASE_URL, target), None)
    return list(unique_urls)

# ---------------------------------------------------------
# FUNCTION: METADATA
# ---------------------------------------------------------
def get_paper_details(url, jel_cat):
    """Download one paper/article page and extract its metadata.

    Args:
        url: Absolute URL of the page to fetch.
        jel_cat: JEL letter, stored unchanged in the "JEL Subject" field.

    Returns:
        A dict with the keys "JEL Subject", "Title", "Author(s)", "Journal",
        "Year", "Type", "Affiliations" and "URL", or ``None`` when the HTTP
        status is not 200 or when fetching/parsing raises (the error is
        printed and the record is skipped).
    """
    try:
        response = requests.get(url, headers=HEADERS, timeout=15)
        if response.status_code != 200:
            return None

        soup = BeautifulSoup(response.text, "html.parser")
        title_tag = soup.find("h1", class_="colored")
        title = title_tag.get_text(strip=True) if title_tag else "N/A"

        year = None
        date_label = soup.find("b", string=re.compile("Date:"))
        if date_label is not None:
            # The node after "<b>Date:</b>" may be text, another tag, or
            # missing. Only a text node is searched; handing a tag to
            # re.search would raise TypeError and discard the whole record.
            sibling = date_label.next_sibling
            date_text = sibling if isinstance(sibling, str) else ""
            year_match = re.search(r"\d{4}", date_text)
            if year_match:
                year = int(year_match.group())

        # Commas are stripped from each name; names are later joined with
        # "; ". Meta tags without a usable "content" attribute are skipped
        # instead of raising KeyError.
        authors = [
            meta["content"].replace(",", "")
            for meta in soup.find_all("meta", {"name": "citation_author"})
            if meta.get("content")
        ]
        journal_meta = soup.find("meta", {"name": "citation_journal_title"})
        journal = journal_meta.get("content", None) if journal_meta else None

        affil = soup.find("span", id="contact")
        affiliations = affil.get_text(" ", strip=True) if affil else None
        pub_type = "Journal Article" if "/article/" in url else "Working Paper"

        return {
            "JEL Subject": jel_cat,
            "Title": title,
            "Author(s)": "; ".join(authors) if authors else "N/A",
            "Journal": journal,
            "Year": year,
            "Type": pub_type,
            "Affiliations": affiliations,
            "URL": url,
        }
    except Exception as e:
        # Deliberate best-effort boundary: one bad page must not stop the run.
        print(f"Erreur sur {url}: {e}")
        return None

# ---------------------------------------------------------
# MAIN: NAVIGATION AND DIRECT WRITING
# ---------------------------------------------------------
# --- RESUME SETTINGS ---
START_JEL = "A"        # JEL letter at which scraping starts
START_PAGE = 1     # Result page at which scraping resumes for that starting letter
# -----------------------
# Column order shared by the CSV header and every appended batch.
_CSV_COLUMNS = [
    "JEL Subject", "Title", "Author(s)", "Journal",
    "Year", "Type", "Affiliations", "URL",
]
_NEXT_PAGE_XPATH = "//a[img[@class='rightarrow']]"


def _click_next_page(driver):
    """Click the "next page" arrow if the page has one; return whether it did."""
    # find_elements returns an empty list instead of raising, so a missing
    # arrow (last page) is told apart from a genuine click failure.
    arrows = driver.find_elements(By.XPATH, _NEXT_PAGE_XPATH)
    if not arrows:
        return False
    arrows[0].click()
    return True


def _scrape_jel_letter(driver, jel):
    """Walk every result page of one JEL letter, appending rows to the CSV."""
    search_url = f"{BASE_URL}/scripts/search.pf?jel={jel}*&ni=10%20years&inpage=1000"
    driver.get(search_url)
    wait = WebDriverWait(driver, 15)

    submit_btn = wait.until(
        EC.element_to_be_clickable((By.XPATH, "//input[@type='SUBMIT']"))
    )
    submit_btn.click()
    time.sleep(2)

    page_num = 1

    # Resume support: only for the starting letter, click through the result
    # pages without scraping until START_PAGE (or the last page) is reached.
    if jel == START_JEL and START_PAGE > 1:
        print(f"Saut rapide vers la page {START_PAGE}...")
        while page_num < START_PAGE and _click_next_page(driver):
            page_num += 1
            if page_num % 10 == 0:
                print(f"Passage de la page {page_num}...")
                time.sleep(1)

    while True:
        print(f"Extraction Page {page_num} pour JEL {jel}...")
        links = get_links_from_current_page(driver)

        fetched = (get_paper_details(link, jel) for link in links)
        rows = [details for details in fetched if details]

        # Each page is written immediately so an interruption loses at most
        # the page in progress. `columns=` pins the order to the header row.
        if rows:
            pd.DataFrame(rows, columns=_CSV_COLUMNS).to_csv(
                CSV_FILENAME, mode="a", header=False, index=False
            )

        if not _click_next_page(driver):
            print(f"Plus de pages pour la lettre {jel}.")
            break
        page_num += 1
        time.sleep(3)


def main():
    """Scrape every JEL letter from START_JEL onward into CSV_FILENAME.

    Creates the CSV with a header row when the file does not exist, starts a
    headless Chrome driver, then processes the letters one by one. An error
    while processing a letter is printed and the run continues with the next
    letter. The driver is always shut down, even when the run is interrupted.

    Raises:
        ValueError: If START_JEL is not one of JEL_LETTERS.
    """
    if not os.path.exists(CSV_FILENAME):
        pd.DataFrame(columns=_CSV_COLUMNS).to_csv(CSV_FILENAME, index=False)

    # Start the driver with its options.
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--window-size=1920,1080")

    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()), options=options
    )

    try:
        # Slice the letter list so that processing begins at START_JEL.
        for jel in JEL_LETTERS[JEL_LETTERS.index(START_JEL):]:
            print(f"\n--- DÉBUT CATÉGORIE JEL: {jel} ---")
            try:
                _scrape_jel_letter(driver, jel)
            except Exception as e:
                # Per-letter boundary: report and move on to the next letter.
                print(f"Erreur lors de la navigation pour {jel}: {e}")
    finally:
        # Without this, any uncaught error or Ctrl+C leaves Chrome running.
        driver.quit()

# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()