In [None]:
import os
import random
import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from google.colab import drive  # Google Drive integration

# Mount Google Drive at /content/drive so the CSV can be written under it.
# This runs as soon as the cell/module executes.
drive.mount('/content/drive')

# Destination CSV inside the mounted Drive.
# NOTE(review): the file name hard-codes 2021 while the year range below is
# configurable -- keep the two in sync when changing START_YEAR / END_YEAR.
SAVE_PATH = "/content/drive/My Drive/Minor/papers_2021.csv"

# ACL Anthology event page URL; "{year}" is filled in via str.format(year=...).
EVENT_URL_TEMPLATE = "https://aclanthology.org/events/acl-{year}/"

# Inclusive range of years to scrape (END_YEAR itself is included).
START_YEAR = 2021
END_YEAR = 2021

# Function to clean extracted BibTeX values
def clean_text(text):
    r"""Return ``text`` with common BibTeX/LaTeX markup removed.

    The following rewrites are applied, in this order:

    * ``\ensuremath{>}`` becomes ``>``
    * ``{--}`` becomes an en dash
    * ``\url`` is dropped (the braces around its argument are removed below)
    * every remaining ``{`` and ``}`` is deleted
    * leading and trailing whitespace is stripped

    Args:
        text: Raw field value extracted from a BibTeX entry.

    Returns:
        The cleaned string.
    """
    # The multi-character LaTeX constructs contain braces themselves, so they
    # must be rewritten BEFORE the generic brace removal; otherwise the
    # patterns below can never match.
    text = text.replace("\\ensuremath{>}", ">")  # Fix LaTeX math symbols
    text = text.replace("{--}", "–")  # Convert LaTeX dashes
    text = text.replace("\\url", "")  # Remove \url references
    text = text.replace("{", "").replace("}", "")
    return text.strip()

# Function to parse BibTeX for a single paper
def _read_bibtex_value(text, pos):
    """Return the delimited value starting at ``text[pos]``, or None.

    ``text[pos]`` must be the opening ``{`` or ``"`` of a field value. Nested
    braces are balanced, so a value such as ``"{BERT}: a model"`` is returned
    whole instead of being cut at the first closing brace. Returns None when
    there is no opening delimiter at ``pos`` or the value is never closed.
    """
    if pos >= len(text) or text[pos] not in '{"':
        return None
    opener = text[pos]
    depth = 0  # nesting level of braces INSIDE the value
    for index in range(pos + 1, len(text)):
        char = text[index]
        if char == "{":
            depth += 1
        elif char == "}":
            if depth:
                depth -= 1
            elif opener == "{":
                return text[pos + 1:index]
            # A stray "}" inside a quoted value is kept as literal text.
        elif char == '"' and opener == '"' and depth == 0:
            return text[pos + 1:index]
    return None


def _iter_bibtex_values(bibtex_text, field):
    """Yield the raw value of every ``field = ...`` assignment, in order."""
    # The lookbehind keeps "title" from matching inside "booktitle".
    pattern = rf'(?<![A-Za-z]){re.escape(field)}\s*=\s*'
    for match in re.finditer(pattern, bibtex_text):
        value = _read_bibtex_value(bibtex_text, match.end())
        if value is not None:
            yield value


def parse_bibtex(bibtex_text, paper_id):
    """Extract volume name, title, authors and year from one BibTeX entry.

    Field values may be delimited by double quotes or braces and may span
    several lines. Title and volume name are passed through ``clean_text``.
    The paper and PDF links are not read from the BibTeX text; they are built
    from ``paper_id``.

    Args:
        bibtex_text: Text of a single BibTeX entry.
        paper_id: Identifier appended to ``https://aclanthology.org/`` to
            form the paper URL and the PDF URL.

    Returns:
        A dict with the keys ``volume_name``, ``title``, ``authors``,
        ``year``, ``pdf_link`` and ``url``. A field that is missing from the
        entry is reported as a ``"... Not Found"`` placeholder string.
    """
    raw_title = next(_iter_bibtex_values(bibtex_text, "title"), None)
    raw_booktitle = next(_iter_bibtex_values(bibtex_text, "booktitle"), None)

    title = clean_text(raw_title) if raw_title else "Title Not Found"
    volume_name = (
        clean_text(raw_booktitle) if raw_booktitle else "Volume Name Not Found"
    )

    # Authors are separated by the word "and" surrounded by any whitespace
    # (including the line breaks used in multi-line author lists).
    names = []
    for raw_authors in _iter_bibtex_values(bibtex_text, "author"):
        names.extend(
            name.strip() for name in re.split(r"\s+and\s+", raw_authors)
        )
    names = [name for name in names if name]
    authors = ", ".join(names) if names else "Authors Not Found"

    # Only an all-digit value counts as a year.
    year = next(
        (
            value.strip()
            for value in _iter_bibtex_values(bibtex_text, "year")
            if value.strip().isdigit()
        ),
        "Year Not Found",
    )

    paper_url = f"https://aclanthology.org/{paper_id}"
    pdf_link = f"https://aclanthology.org/{paper_id}.pdf"  # Direct PDF link

    return {
        "volume_name": volume_name,
        "title": title,
        "authors": authors,
        "year": year,
        "pdf_link": pdf_link,
        "url": paper_url
    }

# Function to scrape a single paper's BibTeX page
def scrape_paper(paper_id, timeout=30):
    """Download and parse the BibTeX record of a single paper.

    Args:
        paper_id: Identifier inserted into
            ``https://aclanthology.org/{paper_id}.bib``.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the whole scrape.

    Returns:
        The dict produced by ``parse_bibtex``, or None when the request
        fails, the server answers with a status other than 200, or the
        response body is empty.
    """
    paper_bib_url = f"https://aclanthology.org/{paper_id}.bib"
    try:
        response = requests.get(paper_bib_url, timeout=timeout)
    except requests.RequestException as exc:
        # A single unreachable paper must not abort the entire run.
        print(f"Request failed for {paper_bib_url}: {exc}")
        return None

    if response.status_code != 200:
        return None  # Skip failed requests

    bibtex_text = response.text.strip()  # Get the BibTeX content
    if not bibtex_text:
        return None  # Nothing to parse; avoid a row of placeholders only

    return parse_bibtex(bibtex_text, paper_id)  # Extract useful details

# Function to scrape all papers from a given year
def scrape_year(year):
    """Scrape every paper linked from the ACL event page of ``year``.

    The event page is fetched from ``EVENT_URL_TEMPLATE``; every link whose
    href starts with ``/{year}.`` and does not end in ``.bib`` is treated as
    a paper identifier and handed to ``scrape_paper``.

    Args:
        year: Year substituted into the event URL and used as href prefix.

    Returns:
        A list of the dicts returned by ``scrape_paper`` (papers that could
        not be retrieved are left out). The list is empty when the event
        page cannot be fetched or contains no matching links.
    """
    event_url = EVENT_URL_TEMPLATE.format(year=year)
    try:
        response = requests.get(event_url, timeout=30)
    except requests.RequestException:
        response = None

    if response is None or response.status_code != 200:
        print(f"Failed to retrieve ACL event page for {year}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract proper paper identifiers (without slashes). dict.fromkeys
    # removes repeated links while keeping page order, so a paper that is
    # linked more than once is downloaded and stored only once.
    paper_ids = list(dict.fromkeys(
        a['href'].strip("/")
        for a in soup.find_all('a', href=True)
        if a['href'].startswith(f"/{year}.") and not a['href'].endswith(".bib")
    ))

    if not paper_ids:
        print(f" Found 0 papers for {year}")
        return []

    print(f" Found {len(paper_ids)} papers for {year}")

    papers_data = []
    for paper_id in paper_ids:
        try:
            paper_data = scrape_paper(paper_id)
        except requests.RequestException as exc:
            # Keep going: one network error should not discard the papers
            # collected so far.
            print(f" Skipping {paper_id}: {exc}")
            paper_data = None

        if paper_data:
            papers_data.append(paper_data)

        # Add random delay to prevent rate limiting
        time.sleep(random.uniform(1, 3))

    return papers_data

# Function to scrape multiple years and save to CSV
def scrape_acl_papers():
    """Scrape all years from START_YEAR to END_YEAR and save them as CSV.

    Calls ``scrape_year`` for every year in the inclusive range, collects
    the results into one pandas DataFrame and writes it to ``SAVE_PATH``
    without the index column. Progress is reported with ``print``; nothing
    is returned.
    """
    # Create the destination folder before scraping: a missing directory
    # would otherwise only surface at to_csv(), after the whole (slow)
    # scrape has finished, and the collected data would be lost.
    save_dir = os.path.dirname(SAVE_PATH)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    all_papers = []

    for year in range(START_YEAR, END_YEAR + 1):
        print(f"\n Scraping papers for year {year}...")
        papers = scrape_year(year)
        all_papers.extend(papers)
        print(f" {len(papers)} papers scraped for {year}.")

    # Convert data to DataFrame
    df = pd.DataFrame(all_papers)

    # Save to Google Drive without displaying contents
    df.to_csv(SAVE_PATH, index=False)
    print(f"\n Scraping complete! Data saved to {SAVE_PATH}")

# Run the scraper. This executes as soon as the cell/module runs: it issues
# the HTTP requests for every year in the configured range and writes the
# resulting CSV to SAVE_PATH.
scrape_acl_papers()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

 Scraping papers for year 2021...
 Found 1849 papers for 2021
