# In [None]:
# %% [markdown]
# # Scraper for ICLR Papers Across Multiple Years via OpenReview

# %%
import os
import re
import time
import logging
import csv
import requests
from concurrent.futures import ThreadPoolExecutor
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.common.exceptions import (
    TimeoutException,
    NoSuchElementException,
    ElementNotInteractableException,
)
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup
from markdownify import markdownify as md
from pdfminer.high_level import extract_text
from webdriver_manager.chrome import ChromeDriverManager
import glob

# %%
# Setting up

# Configure Logging
# Root logger: append INFO-and-above records to "scraper.log" in the current
# working directory, one timestamped line per record.
logging.basicConfig(
    filename="scraper.log",
    filemode="a",
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Also log to console
# A second handler on the root logger mirrors the same records (same format,
# same INFO threshold) to the console stream.
console = logging.StreamHandler()
console.setLevel(logging.INFO)
formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
console.setFormatter(formatter)
logging.getLogger().addHandler(console)

# Constants
# BASE_URL is prepended to relative PDF links found on a paper page.
BASE_URL = "https://openreview.net"
# HTTP headers sent with the PDF download requests.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; DataScraper/1.0; +https://yourdomain.com/)"
}
BASE_DOWNLOAD_DIR = "data/iclr"  # Base directory for all years

# Define metrics per year
# Maps a conference year to the review field names that the CSV aggregation
# looks for in that year's reviews. The keys also determine which
# "iclr_<year>" directories are aggregated.
METRICS_BY_YEAR = {
    "2024": ["Soundness", "Presentation", "Contribution", "Rating", "Confidence"],
    "2023": ["Correctness", "Technical Novelty And Significance", "Empirical Novelty And Significance", "Recommendation", "Confidence"],
    "2022": ["Correctness", "Technical Novelty And Significance", "Empirical Novelty And Significance", "Recommendation", "Confidence"],
    "2021": ["Rating", "Confidence"]
}

# Ensure base directory exists
# NOTE: this runs at import time, so importing the module creates the
# directory as a side effect.
os.makedirs(BASE_DOWNLOAD_DIR, exist_ok=True)

def setup_selenium():
    """Create and return a headless Chrome WebDriver.

    The driver service is built from the path returned by
    ``ChromeDriverManager().install()``. The caller owns the returned driver
    and is responsible for calling ``quit()`` on it.
    """
    options = Options()
    for flag in (
        "--headless",  # no visible browser window
        "--disable-gpu",
        "--no-sandbox",
        "--window-size=1920,1080",
        "--disable-dev-shm-usage",
    ):
        options.add_argument(flag)

    chromedriver_service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=chromedriver_service, options=options)

def fetch_html(driver, url, timeout=30):
    """Load *url* in *driver* and return the rendered page source.

    Waits up to *timeout* seconds for an element with class ``note`` to be
    present, scrolls to the bottom of the page and pauses two seconds so that
    lazily loaded content has a chance to appear.

    Returns the HTML string, or ``None`` if the wait timed out or any other
    error occurred (the error is logged, not raised).
    """
    try:
        logging.info(f"Fetching URL: {url}")
        driver.get(url)

        # Block until at least one note element has been rendered.
        note_is_present = EC.presence_of_element_located((By.CLASS_NAME, "note"))
        WebDriverWait(driver, timeout).until(note_is_present)

        # Trigger lazy loading, then give it a moment to finish.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)

        page_source = driver.page_source
    except TimeoutException:
        logging.error(f"Timeout while loading {url}")
        return None
    except Exception as exc:
        logging.error(f"Error fetching {url}: {exc}")
        return None
    else:
        logging.info(f"Successfully fetched URL: {url}")
        return page_source

def parse_paper_info(soup):
    """Extract paper metadata from a parsed forum page.

    Returns a dict with the keys ``title`` (str, ``"N/A"`` if missing),
    ``authors`` (list of str, possibly empty), ``publication_date`` (str,
    ``"N/A"`` if missing) and ``pdf_url`` (absolute URL or ``None``).
    """
    # Title
    heading = soup.find("h2", class_="citation_title")
    title = heading.text.strip() if heading else "N/A"

    # Authors: one entry per link inside the authors block
    author_block = soup.find("div", class_="forum-authors")
    if author_block:
        authors = [anchor.text.strip() for anchor in author_block.find_all("a")]
    else:
        authors = []

    # Publication date: text after "Published:" up to the first comma
    publication_date = "N/A"
    calendar_icon = soup.find("span", class_="glyphicon-calendar")
    if calendar_icon and calendar_icon.parent:
        surrounding_text = calendar_icon.parent.text.strip()
        found = re.search(r"Published:\s*(.*?)(?:,|$)", surrounding_text)
        if found:
            publication_date = found.group(1)

    # PDF link: relative links are resolved against the site root
    pdf_url = None
    pdf_anchor = soup.find("a", class_="citation_pdf_url")
    if pdf_anchor and "href" in pdf_anchor.attrs:
        pdf_url = pdf_anchor["href"]
        if not pdf_url.startswith("http"):
            pdf_url = BASE_URL + pdf_url

    return {
        "title": title,
        "authors": authors,
        "publication_date": publication_date,
        "pdf_url": pdf_url,
    }

def download_pdf(pdf_url, save_path, timeout=60):
    """Download the PDF at *pdf_url* and store it at *save_path*.

    Args:
        pdf_url: Absolute URL of the PDF.
        save_path: Destination file path; its directory must already exist.
        timeout: Seconds to wait for the server to connect/send data before
            giving up. Without a timeout a stalled connection would block
            the calling worker indefinitely.

    Returns:
        ``True`` when the file was written completely, ``False`` on any
        network or file-system error (the error is logged, not raised).
    """
    # Download into a sibling temp file so that a failed transfer never
    # leaves a truncated PDF at save_path for later parsing to trip over.
    partial_path = f"{save_path}.part"
    try:
        logging.info(f"Downloading PDF: {pdf_url}")
        with requests.get(
            pdf_url, headers=HEADERS, timeout=timeout, stream=True
        ) as response:
            response.raise_for_status()
            with open(partial_path, "wb") as f:
                # Stream in chunks instead of holding the whole PDF in memory.
                for chunk in response.iter_content(chunk_size=65536):
                    f.write(chunk)
        os.replace(partial_path, save_path)
        logging.info(f"Downloaded PDF to {save_path}")
        return True
    except (requests.RequestException, OSError) as e:
        logging.error(f"Error downloading PDF from {pdf_url}: {e}")
        try:
            os.remove(partial_path)
        except OSError:
            # Nothing was written (or it is already gone); nothing to clean up.
            pass
        return False

def extract_sections_from_pdf(pdf_path):
    """Pull the abstract and introduction text out of a PDF.

    The PDF text is flattened to single-spaced text and searched with two
    case-insensitive regexes: the abstract runs from the first "abstract" to
    the next known section heading, the introduction from the first
    "introduction" heading to the next known section heading.

    Returns a ``(abstract, introduction)`` tuple of strings. A section that
    cannot be located is ``"N/A"``; if text extraction fails altogether both
    are ``"N/A"`` and the error is logged.
    """
    abstract_pattern = r"(?is)abstract\s*(.*?)\s*(?:(introduction|1\.\s*Introduction|2\.\s*Methods|methods|conclusion|related work|acknowledgments|references|$))"
    introduction_pattern = r"(?is)(introduction|1\.\s*Introduction)\s*(.*?)\s*(?:(conclusion|related work|methods|acknowledgments|references|2\.\s*Methods|$))"

    try:
        logging.info(f"Extracting sections from PDF: {pdf_path}")
        # Collapse every whitespace run (including line breaks) to one space.
        flattened = re.sub(r"\s+", " ", extract_text(pdf_path))

        abstract_hit = re.search(abstract_pattern, flattened)
        introduction_hit = re.search(introduction_pattern, flattened)

        abstract = "N/A"
        if abstract_hit:
            abstract = abstract_hit.group(1).strip()

        introduction = "N/A"
        if introduction_hit:
            # Group 1 is the heading itself; group 2 is the section body.
            introduction = introduction_hit.group(2).strip()

        logging.info(f"Extracted Abstract and Introduction from {pdf_path}")
        return abstract, introduction
    except Exception as exc:
        logging.error(f"Error extracting text from PDF {pdf_path}: {exc}")
        return "N/A", "N/A"

def convert_to_markdown(text, header):
    """Return *text* as a Markdown section under a level-2 *header*.

    The result is the heading line, a blank line, the text and a trailing
    newline. The placeholder ``"N/A"`` is rendered like any other text.
    """
    heading_line = f"## {header}"
    body_line = f"{text}"
    return "\n".join([heading_line, "", body_line, ""])

def save_markdown(content, filename):
    """Write *content* to *filename* as UTF-8 text, replacing any existing file.

    Failures are logged and swallowed; the function always returns ``None``.
    """
    try:
        target = os.path.join(filename)
        with open(target, "w", encoding="utf-8") as handle:
            handle.write(content)
    except Exception as exc:
        logging.error(f"Error saving Markdown file {filename}: {exc}")
    else:
        logging.info(f"Saved Markdown to {target}")

def _note_author(note):
    """Return the signature text of a note, or "N/A" when it has none."""
    signatures_span = note.find("span", class_="signatures")
    if not signatures_span:
        return "N/A"
    inner_spans = signatures_span.find_all("span")
    # When the signature is split over several spans the last one is used.
    source = inner_spans[-1] if inner_spans else signatures_span
    return source.text.strip()


def _note_fields(content_div):
    """Return {field name: Markdown value} for a note's content block."""
    fields = {}
    if not content_div:
        return fields
    for field in content_div.find_all("div", recursive=False):
        field_name_tag = field.find("strong", class_="note-content-field")
        if not field_name_tag:
            continue
        # Trim whitespace first so a label such as "Rating: " loses its colon.
        field_name = field_name_tag.text.strip().strip(":").strip()
        value_tag = field.find("div", class_="note-content-value")
        if not value_tag:
            value_tag = field.find("span", class_="note-content-value")
        fields[field_name] = md(str(value_tag)).strip() if value_tag else ""
    return fields


def extract_reviewer_responses(soup):
    """Extract all notes (reviews, meta-reviews, decisions, comments) in order.

    Every ``div.note`` element carrying a ``data-id`` attribute is visited in
    document order; notes without an invitation label are skipped. The search
    is recursive, so replies nested inside another note are already part of
    that sequence and each note is reported exactly once.

    Returns:
        A list of dicts with the keys ``type`` (the invitation label, or
        ``"Official Review"`` when the note has a "Soundness" field),
        ``author`` and ``content`` (field name -> Markdown text).
    """
    responses = []

    for note in soup.find_all("div", class_="note", attrs={"data-id": True}):
        invitation = note.find("span", class_="invitation")
        if not invitation:
            continue

        content = _note_fields(note.find("div", class_="note-content"))

        # A "Soundness" field marks the note as an official review regardless
        # of how the invitation is labelled.
        if "Soundness" in content:
            note_type = "Official Review"
        else:
            note_type = invitation.text.strip()

        responses.append(
            {"type": note_type, "author": _note_author(note), "content": content}
        )

    return responses

def save_reviewer_responses(responses, filename):
    """Render *responses* as Markdown and write them to *filename*.

    Each response becomes a ``### <type> <n>`` section (numbered from 1 in
    list order) followed by its author and one bold-labelled paragraph per
    content field. Errors are logged and swallowed.
    """
    try:
        pieces = ["## Reviewer Responses\n\n"]
        for position, response in enumerate(responses, start=1):
            pieces.append(f"### {response['type']} {position}\n")
            pieces.append(f"**Author:** {response['author']}\n\n")
            pieces.extend(
                f"**{field_name}:**\n{field_value}\n\n"
                for field_name, field_value in response["content"].items()
            )
            pieces.append("\n")
        save_markdown("".join(pieces), filename)
        logging.info(f"Saved reviewer responses to {filename}")
    except Exception as exc:
        logging.error(f"Error saving reviewer responses to {filename}: {exc}")

def save_paper_metadata(paper_info, filename):
    """Write the paper's title, authors and publication date to *filename*.

    The title becomes a level-1 Markdown heading (which the CSV aggregation
    later reads back); authors are joined with ", ". Errors are logged and
    swallowed.
    """
    try:
        author_list = ', '.join(paper_info['authors'])
        pieces = [
            f"# {paper_info['title']}\n\n",
            f"**Authors:** {author_list}\n\n",
            f"**Publication Date:** {paper_info['publication_date']}\n\n",
        ]
        save_markdown("".join(pieces), filename)
        logging.info(f"Saved metadata to {filename}")
    except Exception as exc:
        logging.error(f"Error saving metadata to {filename}: {exc}")

def scrape_paper(driver, year, paper_url):
    """Fetch one paper's forum page and PDF and store both on disk.

    The rendered HTML goes to ``<base>/iclr_<year>/HTML/<paper_id>.html`` and
    the PDF (when the page links one) to ``.../PDF/<paper_id>.pdf``. The paper
    ID is everything after ``id=`` in *paper_url*, or ``"unknown"``. The
    per-year ``Markdown`` and ``Image`` directories are created as well.
    Nothing is returned; problems are logged.
    """
    logging.info(f"Starting scraping for paper: {paper_url} (Year: {year})")
    html = fetch_html(driver, paper_url)
    if not html:
        logging.warning(f"Failed to retrieve HTML for {paper_url}. Skipping.")
        return

    id_match = re.search(r"id=(.+)", paper_url)
    paper_id = "unknown" if id_match is None else id_match.group(1)

    # Per-year directory layout, created on demand.
    year_root = os.path.join(BASE_DOWNLOAD_DIR, f"iclr_{year}")
    subdirs = {
        name: os.path.join(year_root, name)
        for name in ("HTML", "PDF", "Markdown", "Image")
    }
    os.makedirs(year_root, exist_ok=True)
    for subdir in subdirs.values():
        os.makedirs(subdir, exist_ok=True)

    # Persist the rendered page; a failure here does not stop the PDF download.
    html_filename = os.path.join(subdirs["HTML"], f"{paper_id}.html")
    try:
        with open(html_filename, "w", encoding="utf-8") as handle:
            handle.write(html)
        logging.info(f"Saved HTML to {html_filename}")
    except Exception as exc:
        logging.error(f"Error saving HTML for {paper_url}: {exc}")

    # The PDF link is read from the page that was just fetched.
    pdf_url = parse_paper_info(BeautifulSoup(html, "html.parser"))["pdf_url"]
    if not pdf_url:
        logging.warning(f"No PDF URL found for {paper_url}.")
        return

    pdf_path = os.path.join(subdirs["PDF"], f"{paper_id}.pdf")
    if download_pdf(pdf_url, pdf_path):
        logging.info(f"Successfully scraped paper: {paper_id} (Year: {year})")

def parse_paper(paper_id, year):
    """Turn a previously scraped paper into Markdown files.

    Reads ``HTML/<paper_id>.html`` and, if present, ``PDF/<paper_id>.pdf``
    from the ``iclr_<year>`` directory and writes up to three files into its
    ``Markdown`` directory: ``<paper_id>_metadata.md``,
    ``<paper_id>_sections.md`` (abstract + introduction) and
    ``<paper_id>_responses.md``. Returns ``None``; an unreadable HTML file
    aborts the paper after logging the error.
    """
    year_root = os.path.join(BASE_DOWNLOAD_DIR, f"iclr_{year}")
    html_filename = os.path.join(year_root, "HTML", f"{paper_id}.html")
    pdf_path = os.path.join(year_root, "PDF", f"{paper_id}.pdf")
    markdown_dir = os.path.join(year_root, "Markdown")

    def markdown_file(suffix):
        return os.path.join(markdown_dir, f"{paper_id}_{suffix}.md")

    try:
        with open(html_filename, "r", encoding="utf-8") as handle:
            soup = BeautifulSoup(handle.read(), "html.parser")
    except Exception as exc:
        logging.error(f"Error reading HTML file {html_filename}: {exc}")
        return

    # Metadata
    save_paper_metadata(parse_paper_info(soup), markdown_file("metadata"))

    # Abstract and introduction, only when the PDF was downloaded
    if not os.path.exists(pdf_path):
        logging.warning(f"PDF not found for paper ID {paper_id}. Skipping section extraction.")
    else:
        abstract, introduction = extract_sections_from_pdf(pdf_path)
        sections = "\n".join(
            [
                convert_to_markdown(abstract, "Abstract"),
                convert_to_markdown(introduction, "Introduction"),
            ]
        )
        save_markdown(sections, markdown_file("sections"))

    # Reviews, decisions and comments
    responses = extract_reviewer_responses(soup)
    if not responses:
        logging.info(f"No reviewer responses found for paper ID {paper_id}.")
    else:
        save_reviewer_responses(responses, markdown_file("responses"))

    logging.info(f"Completed parsing for paper ID: {paper_id} (Year: {year})")

def _metric_column(metric):
    """Return the CSV column name for a review field's display name."""
    # "Rating" is stored in the "review_rating" column; every other field
    # maps to its lower-cased name with spaces replaced by underscores
    # (e.g. "Technical Novelty And Significance" ->
    # "technical_novelty_and_significance").
    if metric == "Rating":
        return "review_rating"
    return metric.lower().replace(" ", "_")


def _read_paper_title(metadata_file, paper_id):
    """Return the title from a metadata Markdown file, or "N/A"."""
    try:
        with open(metadata_file, "r", encoding="utf-8") as f:
            metadata = f.read()
    except Exception as e:
        logging.error(f"Error reading metadata for {paper_id}: {e}")
        return "N/A"
    title_match = re.search(r"# (.+)", metadata)
    return title_match.group(1).strip() if title_match else "N/A"


def _extract_decision_and_scores(responses_md, review_metrics):
    """Return (decision, {column: [score, ...]}) parsed from a responses file.

    Each "Official Review" section contributes one entry per metric: the
    leading integer of that field, or "N/A" when the field is absent. The
    decision is taken from the (last) "Decision" section.
    """
    decision = "N/A"
    scores = {_metric_column(metric): [] for metric in review_metrics}

    # Sections start at "### <type> <n>" headers.
    for section in re.split(r"^### ", responses_md, flags=re.MULTILINE):
        if not section.strip():
            continue
        header_match = re.match(r"(\w+.*?)\n", section)
        if not header_match:
            # E.g. the "## Reviewer Responses" preamble before the first header.
            continue
        header = header_match.group(1).strip()
        content = section[header_match.end():]

        if header.startswith("Decision"):
            decision_match = re.search(
                r"\*\*Decision:\*\*\s*\n*(.+?)(?:\n\n|\Z)", content, re.DOTALL
            )
            decision = decision_match.group(1).strip() if decision_match else "N/A"
        elif header.startswith("Official Review"):
            for metric in review_metrics:
                pattern = rf"\*\*{re.escape(metric)}:\*\*\s*(\d+)"
                match = re.search(pattern, content, re.IGNORECASE)
                scores[_metric_column(metric)].append(
                    match.group(1) if match else "N/A"
                )
        # Other note types (meta reviews, comments, ...) are not part of the CSV.

    return decision, scores


def aggregate_csv(csv_filename="decisions_and_scores.csv"):
    """Aggregate decisions and scores from all *_responses.md files into a CSV.

    For every year in ``METRICS_BY_YEAR`` each
    ``iclr_<year>/Markdown/<paper_id>_responses.md`` file yields one row with
    the paper ID, title (from the matching ``_metadata.md``), year, decision
    and, per metric column, the ``str()`` of the list of per-review scores.
    Columns with no scores hold ``"N/A"``.

    Args:
        csv_filename: Name of the CSV file, created inside
            ``BASE_DOWNLOAD_DIR`` (an existing file is overwritten).

    Errors for a single paper are logged and that paper is written with
    whatever could be read; an error creating the CSV itself is logged.
    """
    csv_path = os.path.join(BASE_DOWNLOAD_DIR, csv_filename)
    fieldnames = [
        "paperid", "title", "year", "decision",
        "soundness", "presentation", "contribution",
        "correctness", "technical_novelty_and_significance",
        "empirical_novelty_and_significance",
        "review_rating", "recommendation",
        "confidence"
    ]
    # Everything after the four identifying columns is a score column.
    metric_columns = fieldnames[4:]

    try:
        with open(csv_path, "w", newline='', encoding="utf-8") as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()

            for year, review_metrics in METRICS_BY_YEAR.items():
                markdown_dir = os.path.join(
                    BASE_DOWNLOAD_DIR, f"iclr_{year}", "Markdown"
                )
                # Sorted so that the row order is reproducible between runs.
                responses_files = sorted(
                    glob.glob(os.path.join(markdown_dir, "*_responses.md"))
                )
                for resp_file in responses_files:
                    paper_id = os.path.basename(resp_file).replace("_responses.md", "")
                    title = _read_paper_title(
                        os.path.join(markdown_dir, f"{paper_id}_metadata.md"),
                        paper_id,
                    )

                    decision, scores = "N/A", {}
                    try:
                        with open(resp_file, "r", encoding="utf-8") as f:
                            responses_md = f.read()
                        decision, scores = _extract_decision_and_scores(
                            responses_md, review_metrics
                        )
                    except Exception as e:
                        logging.error(f"Error reading responses for {paper_id}: {e}")

                    row = {
                        "paperid": paper_id,
                        "title": title,
                        "year": year,
                        "decision": decision,
                    }
                    for column in metric_columns:
                        values = scores.get(column)
                        row[column] = str(values) if values else "N/A"

                    writer.writerow(row)

        logging.info(f"Aggregated CSV saved to {csv_path}")
    except Exception as e:
        logging.error(f"Error creating CSV file {csv_path}: {e}")

def _scrape_with_own_driver(year, url):
    """Scrape one paper with a browser that lives only for this task."""
    driver = setup_selenium()
    try:
        scrape_paper(driver, year, url)
    finally:
        # Always release the browser, even when scraping raised.
        driver.quit()


def process_papers_parallel_scrape(papers, max_workers=4):
    """Scrape multiple papers in parallel.

    Args:
        papers: Iterable of ``(year, url)`` pairs.
        max_workers: Size of the thread pool, and therefore the maximum
            number of browsers open at the same time.

    Each task starts its own browser inside the worker thread and closes it
    when done. Starting the browsers while submitting the tasks would open
    one per paper before any of them is closed. A failure in one paper is
    logged and does not affect the others.
    """
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(_scrape_with_own_driver, year, url)
            for year, url in papers
        ]

        for future in futures:
            try:
                future.result()
            except Exception as e:
                logging.error(f"Error scraping a paper: {e}")

def parse_paper_wrapper(papers, max_workers=4):
    """Parse all scraped papers in parallel.

    Args:
        papers: Iterable of ``(year, url)`` pairs, the same ones that were
            scraped. The paper ID is everything after ``id=`` in the URL
            (``"unknown"`` when absent), which is how the scraped files
            were named.
        max_workers: Size of the thread pool.

    A failure while parsing one paper is logged and does not stop the rest.
    """
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = []
        for year, url in papers:
            paper_id_match = re.search(r"id=(.+)", url)
            paper_id = paper_id_match.group(1) if paper_id_match else "unknown"
            futures.append(executor.submit(parse_paper, paper_id, year))

        for future in futures:
            try:
                future.result()
            except Exception as e:
                logging.error(f"Error parsing a paper: {e}")

def get_paper_urls_from_page(driver, page_url):
    """Load *page_url* and return the forum links found on it.

    The page is (re)loaded in *driver*, given three seconds to render, and
    every ``href`` containing ``forum?id=`` but not ``&noteId=`` is returned
    as found in the page, in document order. The list may contain
    duplicates; callers that need unique URLs must de-duplicate.
    """
    driver.get(page_url)
    time.sleep(3)  # crude wait for the page to render

    document = BeautifulSoup(driver.page_source, "html.parser")
    hrefs = (anchor["href"] for anchor in document.find_all("a", href=True))

    # Keep links to a paper forum, drop links to an individual note in it.
    return [
        href
        for href in hrefs
        if "forum?id=" in href and "&noteId=" not in href
    ]

def switch_to_tab_with_js(driver, tab_id):
    """Activate the tab whose link targets ``#<tab_id>``.

    The click is issued through JavaScript rather than a WebDriver click so
    that it cannot be intercepted by another element. Waits five seconds
    afterwards for the page to update. Failures are logged, not raised.
    """
    try:
        logging.info(f"Switching to {tab_id} tab using JavaScript...")
        click_script = f"document.querySelector('a[href=\"#{tab_id}\"]').click();"
        driver.execute_script(click_script)
        time.sleep(5)
    except Exception as exc:
        logging.error(f"Failed to switch to {tab_id} tab: {exc}")

def _forum_links_in_html(html):
    """Return the set of forum hrefs (excluding per-note links) in *html*."""
    soup = BeautifulSoup(html, "html.parser")
    return {
        link["href"]
        for link in soup.find_all("a", href=True)
        if "forum?id=" in link["href"] and "&noteId=" not in link["href"]
    }


def scrape_all_pages(driver, year, base_url, tab_id):
    """Collect the paper links from every page of one tab.

    Args:
        driver: WebDriver, normally already showing *base_url*.
        year: Conference year (used for logging only).
        base_url: URL of the conference page that hosts the tabs.
        tab_id: Fragment identifier of the tab to scrape.

    Returns:
        Set of forum hrefs as they appear in the page.

    The links are read from the page the browser is currently showing.
    Reloading *base_url* for every page would discard both the selected tab
    and the pagination position, so the loop would keep seeing the first
    page of the default tab.
    """
    # Only navigate when the browser is not already on the conference page.
    if not driver.current_url.startswith(base_url):
        driver.get(base_url)
        time.sleep(5)

    switch_to_tab_with_js(driver, tab_id)

    all_paper_urls = set()
    page_count = 1

    while True:
        logging.info(f"Scraping page {page_count} of {tab_id} tab for year {year}...")
        new_urls = _forum_links_in_html(driver.page_source) - all_paper_urls
        all_paper_urls |= new_urls

        # A page that adds nothing means the "next" control no longer
        # advances (or the tab is empty); stop rather than loop forever.
        if not new_urls:
            logging.info(f"No new papers on page {page_count} of {tab_id} tab; stopping.")
            break

        if not go_to_next_page(driver):  # Stop if there are no more pages
            break

        page_count += 1

    return all_paper_urls

def go_to_next_page(driver):
    """Click the pagination "next" arrow if there is one.

    Waits up to ten seconds for ``li.right-arrow > a`` to become clickable,
    clicks it and pauses three seconds for the next page to load.

    Returns ``True`` after a successful click, ``False`` when the arrow could
    not be found or clicked (treated as "no more pages").
    """
    try:
        next_arrow = (By.CSS_SELECTOR, 'li.right-arrow > a')
        WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable(next_arrow)
        ).click()
        time.sleep(3)
    except Exception:
        logging.info("No more pages to navigate.")
        return False
    return True

def scrape_multiple_tabs(year, base_url, tabs):
    """Collect the paper URLs listed under several tabs of one conference.

    Args:
        year: Conference year, attached to every returned URL.
        base_url: URL of the conference page that hosts the tabs.
        tabs: Iterable of tab fragment identifiers to visit.

    Returns:
        A list of ``(year, absolute_url)`` tuples without duplicates, sorted
        by URL.

    The browser is closed even when scraping raises.
    """
    from urllib.parse import urljoin

    driver = setup_selenium()
    combined_paper_urls = set()
    try:
        driver.get(base_url)
        time.sleep(5)  # Give the page some time to load

        for tab_id in tabs:
            logging.info(f"Scraping {tab_id} tab for year {year}...")
            combined_paper_urls.update(
                scrape_all_pages(driver, year, base_url, tab_id)
            )
    finally:
        driver.quit()

    # Resolve against the page URL: site-relative hrefs gain the host, while
    # hrefs that are already absolute are left alone instead of being
    # prefixed a second time.
    absolute_urls = sorted({urljoin(base_url, href) for href in combined_paper_urls})
    return [(year, url) for url in absolute_urls]

def save_urls_to_file(filename, urls):
    """Write *urls* to *filename*, one per line, replacing any existing file.

    Errors are logged and swallowed; the function always returns ``None``.
    """
    try:
        with open(filename, 'w') as out:
            out.writelines(f"{url}\n" for url in urls)
        logging.info(f"Saved {len(urls)} URLs to {filename}")
    except Exception as exc:
        logging.error(f"Error saving URLs to file {filename}: {exc}")

def main():
    """Run the whole pipeline: collect URLs, scrape, parse, aggregate.

    For each configured conference year the paper URLs are gathered from the
    listed tabs and written to ``<year>_paper_urls.txt``. All papers are then
    scraped and parsed with eight workers each, and finally the decisions
    and scores are aggregated into the CSV file.
    """
    # (year, conference page, tab ids to visit)
    conferences = [
        (
            "2024",
            "https://openreview.net/group?id=ICLR.cc/2024/Conference",
            ["accept-oral", "accept-spotlight", "accept-poster", "reject"],
        ),
        (
            "2023",
            "https://openreview.net/group?id=ICLR.cc/2023/Conference",
            ["notable-top-25-", "poster", "submitted"],
        ),
        (
            "2022",
            "https://openreview.net/group?id=ICLR.cc/2022/Conference",
            ["spotlight-submissions", "poster-submissions", "submitted-submissions"],
        ),
        (
            "2021",
            "https://openreview.net/group?id=ICLR.cc/2021/Conference",
            ["submitted-submissions", "spotlight-presentations", "poster-presentations"],
        ),
    ]

    all_papers = []

    # Step 1: gather the paper URLs, year by year
    for year, base_url, tabs in conferences:
        logging.info(f"Starting URL scraping for year {year}...")
        papers = scrape_multiple_tabs(year, base_url, tabs)
        all_papers.extend(papers)
        save_urls_to_file(
            f"{year}_paper_urls.txt", [paper_url for _, paper_url in papers]
        )
        logging.info(f"Completed URL scraping for year {year}.")

    logging.info(f"Total papers to scrape: {len(all_papers)}")

    # Step 2: download HTML and PDFs
    logging.info("Starting paper scraping...")
    process_papers_parallel_scrape(all_papers, max_workers=8)
    logging.info("Completed paper scraping.")

    # Step 3: convert the downloads to Markdown
    logging.info("Starting paper parsing...")
    parse_paper_wrapper(all_papers, max_workers=8)
    logging.info("Completed paper parsing.")

    # Step 4: build the summary CSV
    logging.info("Starting CSV aggregation...")
    aggregate_csv()
    logging.info("Completed CSV aggregation.")


if __name__ == "__main__":
    main()
