In [5]:
# %% [markdown]
# # Scraper for ICLR Papers through OpenReview

# %%
import os
import re
import time
import logging
import csv
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.common.exceptions import (
    TimeoutException,
    NoSuchElementException,
    ElementNotInteractableException,
)
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup
from markdownify import markdownify as md
from pdfminer.high_level import extract_text
from webdriver_manager.chrome import ChromeDriverManager
import glob

# %%
# Setting up

# Configure logging: append records at INFO level and above to scraper.log.
logging.basicConfig(
    filename="scraper.log",
    filemode="a",
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Also log to console by attaching a second handler to the root logger.
# NOTE(review): addHandler runs unconditionally, so re-executing this cell
# presumably stacks another console handler each time -- confirm and guard.
console = logging.StreamHandler()
console.setLevel(logging.INFO)
formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
console.setFormatter(formatter)
logging.getLogger().addHandler(console)

# Constants
# BASE_URL is prefixed to site-relative links; HEADERS is sent with PDF requests.
BASE_URL = "https://openreview.net"
HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; DataScraper/1.0; +https://yourdomain.com/)"
}
# Output layout: one sub-directory per artifact type under DOWNLOAD_DIR.
DOWNLOAD_DIR = "data/iclr_2024"
HTML_DIR = os.path.join(DOWNLOAD_DIR, "HTML")
PDF_DIR = os.path.join(DOWNLOAD_DIR, "PDF")
MARKDOWN_DIR = os.path.join(DOWNLOAD_DIR, "Markdown")
IMAGES_DIR = os.path.join(DOWNLOAD_DIR, "Image")

# Create directories if they don't exist (exist_ok makes this safe to re-run).
for directory in [DOWNLOAD_DIR, HTML_DIR, PDF_DIR, MARKDOWN_DIR, IMAGES_DIR]:
    os.makedirs(directory, exist_ok=True)

def setup_selenium():
    """Create a headless Chrome WebDriver.

    The ChromeDriver binary is located (and installed if needed) through
    ``ChromeDriverManager``; the resulting driver is returned to the caller,
    who is responsible for calling ``quit()`` on it.
    """
    opts = Options()
    for flag in (
        "--headless",  # Run in headless mode
        "--disable-gpu",
        "--no-sandbox",
        "--window-size=1920,1080",
        "--disable-dev-shm-usage",
    ):
        opts.add_argument(flag)

    # Wrap the managed ChromeDriver binary in a Service for the WebDriver.
    chromedriver_service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=chromedriver_service, options=opts)

def fetch_html(driver, url, timeout=20):
    """Load *url* in the given Selenium driver and return the rendered HTML.

    Waits up to *timeout* seconds for an element with class ``note`` to be
    present, scrolls to the bottom of the page, pauses two seconds and then
    returns ``driver.page_source``. Returns ``None`` (after logging an error)
    when the wait times out or any other exception is raised.
    """
    try:
        logging.info(f"Fetching URL: {url}")
        driver.get(url)

        # Block until the main content marker shows up in the DOM.
        note_present = EC.presence_of_element_located((By.CLASS_NAME, "note"))
        waiter = WebDriverWait(driver, timeout)
        waiter.until(note_present)

        # Scroll down so lazily loaded content gets a chance to render.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)

        rendered = driver.page_source
        logging.info(f"Successfully fetched URL: {url}")
        return rendered
    except TimeoutException:
        logging.error(f"Timeout while loading {url}")
    except Exception as err:
        logging.error(f"Error fetching {url}: {err}")
    return None

def parse_paper_info(soup):
    """Pull title, authors, publication date and PDF URL out of a forum page.

    Returns a dict with keys ``title``, ``authors``, ``publication_date`` and
    ``pdf_url``. Missing values fall back to ``"N/A"`` (title, date), an empty
    list (authors) or ``None`` (PDF URL).
    """
    # Title
    title_node = soup.find("h2", class_="citation_title")
    title = title_node.text.strip() if title_node else "N/A"

    # Authors: one entry per link inside the authors container.
    authors = []
    authors_node = soup.find("div", class_="forum-authors")
    if authors_node:
        authors = [link.text.strip() for link in authors_node.find_all("a")]

    # Publication date: read from the text of the calendar icon's parent.
    publication_date = "N/A"
    calendar_icon = soup.find("span", class_="glyphicon-calendar")
    if calendar_icon and calendar_icon.parent:
        found = re.search(
            r"Published:\s*(.*?)(?:,|$)", calendar_icon.parent.text.strip()
        )
        if found:
            publication_date = found.group(1)

    # PDF URL: site-relative links are made absolute with BASE_URL.
    pdf_url = None
    pdf_anchor = soup.find("a", class_="citation_pdf_url")
    if pdf_anchor and "href" in pdf_anchor.attrs:
        pdf_url = pdf_anchor["href"]
        if not pdf_url.startswith("http"):
            pdf_url = BASE_URL + pdf_url

    return {
        "title": title,
        "authors": authors,
        "publication_date": publication_date,
        "pdf_url": pdf_url,
    }

def download_pdf(pdf_url, save_path, timeout=30):
    """Download the PDF at *pdf_url* and write it to *save_path*.

    Args:
        pdf_url: Absolute URL of the PDF.
        save_path: Destination file path; it is opened in binary write mode.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument, so a stalled server cannot block the caller forever.

    Returns:
        ``True`` when the file was written, ``False`` when the request or the
        file write failed (the error is logged).
    """
    try:
        logging.info(f"Downloading PDF: {pdf_url}")
        response = requests.get(pdf_url, headers=HEADERS, timeout=timeout)
        response.raise_for_status()
        with open(save_path, "wb") as f:
            f.write(response.content)
        logging.info(f"Downloaded PDF to {save_path}")
        return True
    except requests.RequestException as e:
        # Must precede OSError: RequestException is itself an IOError subclass.
        logging.error(f"Error downloading PDF from {pdf_url}: {e}")
        return False
    except OSError as e:
        logging.error(f"Error writing PDF to {save_path}: {e}")
        return False

def extract_sections_from_pdf(pdf_path):
    """Return ``(abstract, introduction)`` text pulled from a PDF.

    The PDF text is flattened to single-spaced text and searched with two
    case-insensitive, non-greedy patterns: the abstract is whatever follows
    the first "abstract" up to the next listed section keyword, and the
    introduction is whatever follows the first "introduction" up to the next
    listed keyword. Either value is ``"N/A"`` when its pattern does not match;
    both are ``"N/A"`` when any exception is raised (the error is logged).
    """
    try:
        logging.info(f"Extracting sections from PDF: {pdf_path}")
        # Collapse every whitespace run so the patterns can ignore line breaks.
        flattened = re.sub(r"\s+", " ", extract_text(pdf_path))

        abstract_hit = re.search(
            r"(?is)abstract\s*(.*?)\s*(?:(introduction|1\.\s*Introduction|2\.\s*Methods|methods|conclusion|related work|acknowledgments|references|$))",
            flattened,
        )
        intro_hit = re.search(
            r"(?is)(introduction|1\.\s*Introduction)\s*(.*?)\s*(?:(conclusion|related work|methods|acknowledgments|references|2\.\s*Methods|$))",
            flattened,
        )

        abstract = "N/A"
        if abstract_hit:
            abstract = abstract_hit.group(1).strip()

        introduction = "N/A"
        if intro_hit:
            introduction = intro_hit.group(2).strip()

        logging.info(f"Extracted Abstract and Introduction from {pdf_path}")
        return abstract, introduction
    except Exception as err:
        logging.error(f"Error extracting text from PDF {pdf_path}: {err}")
        return "N/A", "N/A"

def convert_to_markdown(text, header):
    """Return *text* as a Markdown section under a level-2 *header*.

    The placeholder ``"N/A"`` is rendered like any other text, so no special
    case is needed for it.
    """
    return f"## {header}\n\n{text}\n"

def save_markdown(content, filename):
    """Write *content* to ``MARKDOWN_DIR/filename`` as UTF-8 text.

    Any failure is logged and swallowed; nothing is returned.
    """
    try:
        destination = os.path.join(MARKDOWN_DIR, filename)
        with open(destination, mode="w", encoding="utf-8") as handle:
            handle.write(content)
        logging.info(f"Saved Markdown to {destination}")
    except Exception as err:
        logging.error(f"Error saving Markdown file {filename}: {err}")

def _note_author(note):
    """Return the last signature label of *note*, or "N/A" without signatures."""
    signatures = note.find("span", class_="signatures")
    if not signatures:
        return "N/A"
    labels = signatures.find_all("span")
    # Prefer the innermost/last label; fall back to the container's own text.
    return (labels[-1] if labels else signatures).text.strip()


def _note_fields(note):
    """Return the content fields of *note* as {field name: Markdown value}."""
    fields = {}
    body = note.find("div", class_="note-content")
    if not body:
        return fields
    # Only direct children are field rows; deeper divs belong to field values.
    for row in body.find_all("div", recursive=False):
        label = row.find("strong", class_="note-content-field")
        if not label:
            continue
        # Trim whitespace first so a trailing "Rating: " still loses its colon.
        name = label.text.strip().strip(":").strip()
        value_tag = row.find("div", class_="note-content-value") or row.find(
            "span", class_="note-content-value"
        )
        fields[name] = md(str(value_tag)).strip() if value_tag else ""
    return fields


def extract_reviewer_responses(soup):
    """Extract all notes (reviews, meta-reviews, decisions, comments) in order.

    Every ``div.note`` carrying a ``data-id`` attribute is visited exactly
    once, in document order. ``find_all`` already descends into nested notes,
    so replies need no second pass; running one would list them again.

    Args:
        soup: Parsed forum page (BeautifulSoup object).

    Returns:
        A list of dicts with keys ``type``, ``author`` and ``content``. Notes
        without an ``invitation`` span are skipped. A note whose content has a
        "Soundness" field is typed "Official Review"; otherwise the invitation
        text is used as the type.
    """
    responses = []
    note_divs = soup.find_all("div", class_="note", attrs={"data-id": re.compile(".*")})

    for note in note_divs:
        invitation = note.find("span", class_="invitation")
        if not invitation:
            continue

        content_dict = _note_fields(note)
        if "Soundness" in content_dict:
            note_type = "Official Review"
        else:
            note_type = invitation.text.strip()

        responses.append(
            {"type": note_type, "author": _note_author(note), "content": content_dict}
        )

    return responses


def save_reviewer_responses(responses, filename):
    """Render *responses* to Markdown, in the given order, and save the file.

    Each response becomes a ``### <type> <index>`` section (index starting at
    1) with an author line followed by one bold-labelled entry per content
    field. Failures are logged and swallowed.
    """
    try:
        chunks = ["## Reviewer Responses\n\n"]
        for position, entry in enumerate(responses, 1):
            chunks.append(f"### {entry['type']} {position}\n")
            chunks.append(f"**Author:** {entry['author']}\n\n")
            chunks.extend(
                f"**{label}:**\n{value}\n\n"
                for label, value in entry["content"].items()
            )
            chunks.append("\n")
        save_markdown("".join(chunks), filename)
        logging.info(f"Saved reviewer responses to {filename}")
    except Exception as err:
        logging.error(f"Error saving reviewer responses to {filename}: {err}")


def save_paper_metadata(paper_info, filename):
    """Write title, authors and publication date of a paper to Markdown.

    Reads the ``title``, ``authors`` and ``publication_date`` keys of
    *paper_info*. Failures (including missing keys) are logged and swallowed.
    """
    try:
        pieces = (
            f"# {paper_info['title']}\n\n",
            f"**Authors:** {', '.join(paper_info['authors'])}\n\n",
            f"**Publication Date:** {paper_info['publication_date']}\n\n",
        )
        save_markdown("".join(pieces), filename)
        logging.info(f"Saved metadata to {filename}")
    except Exception as err:
        logging.error(f"Error saving metadata to {filename}: {err}")

def scrape_paper(driver, paper_url):
    """Scrape a single paper: save its rendered HTML and download its PDF.

    Args:
        driver: Selenium WebDriver used to render the forum page.
        paper_url: Forum URL of the paper (``...forum?id=<paper id>``).

    The paper id is the value of the ``id`` query parameter only. Any further
    parameters or a fragment are excluded so they cannot leak into the file
    names. When no id is found the files are named ``unknown``.
    Returns ``None``; problems are logged rather than raised.
    """
    logging.info(f"Starting scraping for paper: {paper_url}")
    html = fetch_html(driver, paper_url)
    if not html:
        logging.warning(f"Failed to retrieve HTML for {paper_url}. Skipping.")
        return

    # Stop at '&' or '#': a greedy match would swallow e.g. "&referrer=...".
    paper_id_match = re.search(r"[?&]id=([^&#]+)", paper_url)
    paper_id = paper_id_match.group(1) if paper_id_match else "unknown"

    # Save HTML
    html_filename = os.path.join(HTML_DIR, f"{paper_id}.html")
    try:
        with open(html_filename, "w", encoding="utf-8") as f:
            f.write(html)
        logging.info(f"Saved HTML to {html_filename}")
    except Exception as e:
        logging.error(f"Error saving HTML for {paper_url}: {e}")

    # Parse paper info to get PDF URL
    soup = BeautifulSoup(html, "html.parser")
    paper_info = parse_paper_info(soup)

    # Download PDF
    if paper_info["pdf_url"]:
        pdf_filename = f"{paper_id}.pdf"
        pdf_path = os.path.join(PDF_DIR, pdf_filename)
        success = download_pdf(paper_info["pdf_url"], pdf_path)
        if success:
            logging.info(f"Successfully scraped paper: {paper_id}")
    else:
        logging.warning(f"No PDF URL found for {paper_url}.")

def parse_paper(paper_id):
    """Turn the saved HTML and PDF of one paper into Markdown files.

    Writes ``<paper_id>_metadata.md`` always, ``<paper_id>_sections.md`` when
    the PDF exists, and ``<paper_id>_responses.md`` when any reviewer
    responses were found. Returns early (after logging) if the HTML file
    cannot be read.
    """
    html_path = os.path.join(HTML_DIR, f"{paper_id}.html")
    pdf_path = os.path.join(PDF_DIR, f"{paper_id}.pdf")

    # Load and parse the saved forum page.
    try:
        with open(html_path, "r", encoding="utf-8") as handle:
            page_source = handle.read()
        soup = BeautifulSoup(page_source, "html.parser")
    except Exception as err:
        logging.error(f"Error reading HTML file {html_path}: {err}")
        return

    # Metadata
    save_paper_metadata(parse_paper_info(soup), f"{paper_id}_metadata.md")

    # Abstract / introduction from the PDF, if it was downloaded.
    if not os.path.exists(pdf_path):
        logging.warning(f"PDF not found for paper ID {paper_id}. Skipping section extraction.")
    else:
        abstract, introduction = extract_sections_from_pdf(pdf_path)
        sections_md = (
            convert_to_markdown(abstract, "Abstract")
            + "\n"
            + convert_to_markdown(introduction, "Introduction")
        )
        save_markdown(sections_md, f"{paper_id}_sections.md")

    # Reviewer responses
    responses = extract_reviewer_responses(soup)
    if not responses:
        logging.info(f"No reviewer responses found for paper ID {paper_id}.")
    else:
        save_reviewer_responses(responses, f"{paper_id}_responses.md")

    logging.info(f"Completed parsing for paper ID: {paper_id}")

# (CSV column, Markdown field label) pairs read from each "Official Review".
_REVIEW_SCORE_FIELDS = (
    ("soundness", "Soundness"),
    ("presentation", "Presentation"),
    ("contribution", "Contribution"),
    ("review_rating", "Rating"),
    ("confidence", "Confidence"),
)


def _read_paper_title(metadata_file, paper_id):
    """Return the title from a ``*_metadata.md`` file, or "N/A" if unreadable."""
    try:
        with open(metadata_file, "r", encoding="utf-8") as f:
            metadata = f.read()
    except Exception as e:
        logging.error(f"Error reading metadata for {paper_id}: {e}")
        return "N/A"
    title_match = re.search(r"# (.+)", metadata)
    return title_match.group(1).strip() if title_match else "N/A"


def _parse_responses_markdown(responses_md):
    """Return ``(decision, scores)`` parsed from a ``*_responses.md`` document.

    ``scores`` maps each CSV score column to a list with one entry per
    "Official Review" section ("N/A" where the field has no leading digits).
    """
    decision = "N/A"
    scores = {column: [] for column, _ in _REVIEW_SCORE_FIELDS}

    # Split only on note headers, i.e. "### ..." lines directly followed by
    # the "**Author:**" line. A Markdown heading inside a review body must
    # not end the section, or the scores that follow it would be lost.
    sections = re.split(
        r"^### (?=[^\n]*\n\*\*Author:\*\*)", responses_md, flags=re.MULTILINE
    )
    for section in sections:
        if not section.strip():
            continue
        header_match = re.match(r"(\w+.*?)\n", section)
        if not header_match:
            continue
        header = header_match.group(1).strip()
        content = section[header_match.end():]
        if header.startswith("Decision"):
            decision_match = re.search(
                r"\*\*Decision:\*\*\s*\n*(.+?)(?:\n\n|\Z)", content, re.DOTALL
            )
            decision = decision_match.group(1).strip() if decision_match else "N/A"
        elif header.startswith("Official Review"):
            for column, label in _REVIEW_SCORE_FIELDS:
                score = re.search(rf"\*\*{label}:\*\*\s*(\d+)", content, re.IGNORECASE)
                scores[column].append(score.group(1) if score else "N/A")
        # Any other note type carries nothing for the CSV.
    return decision, scores


def aggregate_csv(csv_filename="decisions_and_scores.csv"):
    """Aggregate decisions and scores from all *_responses.md files into a CSV.

    Args:
        csv_filename: Name of the CSV file created inside ``DOWNLOAD_DIR``.

    One row is written per ``*_responses.md`` file in ``MARKDOWN_DIR``, in
    sorted file-name order so repeated runs give the same row order. The
    title comes from the matching ``*_metadata.md`` file. Score columns hold
    the string form of a list with one entry per official review. Errors are
    logged; a paper whose files cannot be read still gets a row of defaults.
    """
    csv_path = os.path.join(DOWNLOAD_DIR, csv_filename)
    fieldnames = ["paperid", "title", "decision", "soundness", "presentation", "contribution", "review_rating", "confidence"]

    try:
        with open(csv_path, "w", newline='', encoding="utf-8") as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()

            responses_files = sorted(glob.glob(os.path.join(MARKDOWN_DIR, "*_responses.md")))
            for resp_file in responses_files:
                paper_id = os.path.basename(resp_file).replace("_responses.md", "")
                metadata_file = os.path.join(MARKDOWN_DIR, f"{paper_id}_metadata.md")
                title = _read_paper_title(metadata_file, paper_id)

                # Defaults used when the responses file cannot be read.
                decision = "N/A"
                scores = {column: [] for column, _ in _REVIEW_SCORE_FIELDS}
                try:
                    with open(resp_file, "r", encoding="utf-8") as f:
                        responses_md = f.read()
                    decision, scores = _parse_responses_markdown(responses_md)
                except Exception as e:
                    logging.error(f"Error reading responses for {paper_id}: {e}")

                row = {"paperid": paper_id, "title": title, "decision": decision}
                row.update({column: str(values) for column, values in scores.items()})
                writer.writerow(row)

        logging.info(f"Aggregated CSV saved to {csv_path}")
    except Exception as e:
        logging.error(f"Error creating CSV file {csv_path}: {e}")


def _scrape_with_own_driver(url):
    """Scrape one paper with a browser that lives only for this task."""
    driver = setup_selenium()
    try:
        scrape_paper(driver, url)
    finally:
        driver.quit()


def process_papers_parallel_scrape(paper_urls, max_workers=4):
    """Scrape multiple papers in parallel.

    Args:
        paper_urls: Iterable of forum URLs to scrape.
        max_workers: Size of the thread pool, and therefore the maximum
            number of browsers open at the same time.

    Each task starts its own driver inside the worker thread and always
    quits it. Starting every driver up front in the calling thread would
    open one browser per URL regardless of ``max_workers``. It would also
    leak the already started ones if a later start-up failed. Errors from
    individual papers are logged and do not stop the remaining tasks.
    """
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(_scrape_with_own_driver, url) for url in paper_urls]
        for future in futures:
            try:
                future.result()
            except Exception as e:
                logging.error(f"Error scraping a paper: {e}")

def process_all_papers_parsing():
    """Run ``parse_paper`` for every ``*.html`` file found in ``HTML_DIR``."""
    pattern = os.path.join(HTML_DIR, "*.html")
    for saved_page in glob.glob(pattern):
        # The paper id is the file name without its ".html" part.
        parse_paper(os.path.basename(saved_page).replace(".html", ""))

def run_aggregation() -> None:
    """Run the CSV aggregation step.

    Thin wrapper that calls ``aggregate_csv()`` with its default file name.
    """
    aggregate_csv()

def get_paper_urls_from_page(driver, page_url):
    """Return the forum links found on *page_url*.

    Loads the page, waits three seconds, and collects the ``href`` of every
    anchor that contains ``forum?id=`` and does not contain ``&noteId=``.
    Hrefs are returned as they appear in the page; duplicates are kept.
    """
    driver.get(page_url)
    time.sleep(3)  # Fixed pause to let the page render (adjust as necessary)
    page = BeautifulSoup(driver.page_source, "html.parser")

    hrefs = (anchor["href"] for anchor in page.find_all("a", href=True))
    # Keep paper forum links only; links to individual notes are dropped.
    return [
        href for href in hrefs if "forum?id=" in href and "&noteId=" not in href
    ]

def url_getter(limit=15):
    """Collect absolute forum URLs from the ICLR 2024 OpenReview group page.

    Args:
        limit: Maximum number of URLs to return (prototyping cap).

    Returns:
        Up to *limit* unique absolute URLs, in the order they were first seen
        on the page, so repeated runs select the same papers.
    """
    base_url = "https://openreview.net/group?id=ICLR.cc/2024/Conference"  # The URL of the OpenReview ICLR page
    driver = setup_selenium()

    all_paper_urls = []
    try:
        # Modify the range as needed to scrape multiple pages
        for page_number in range(1):
            page_url = f"{base_url}&page={page_number}"
            paper_urls = get_paper_urls_from_page(driver, page_url)
            all_paper_urls.extend(paper_urls)
    finally:
        # Always release the browser, even if a page fails to load.
        driver.quit()

    # De-duplicate while keeping first-seen order; a set would pick an
    # arbitrary subset once the list is truncated.
    unique_urls = list(dict.fromkeys(all_paper_urls))

    # Prefix the host only on site-relative hrefs; absolute ones pass through.
    return [
        url if url.startswith("http") else f"https://openreview.net{url}"
        for url in unique_urls[:limit]
    ]

# %%
# Experimenting with the scraper

# Hard-coded sample of paper URLs; uncomment to use it instead of url_getter().
# paper_urls = [
#     "https://openreview.net/forum?id=KS8mIvetg2",
#     "https://openreview.net/forum?id=7Ttk3RzDeu",
#     "https://openreview.net/forum?id=ANvmVS2Yr0",
#     "https://openreview.net/forum?id=ekeyCgeRfC",
# ]

# Get paper URLs from the OpenReview page, list them, then scrape them with
# a pool of 8 worker threads.
paper_urls = url_getter()
print('\n'.join(paper_urls))
process_papers_parallel_scrape(paper_urls, max_workers=8)

# %%
# After scraping, parse every saved HTML page and aggregate the results to CSV.
process_all_papers_parsing()
aggregate_csv()


2024-10-27 01:46:36,089 - INFO - Get LATEST chromedriver version for google-chrome
2024-10-27 01:46:36,174 - INFO - Get LATEST chromedriver version for google-chrome
2024-10-27 01:46:36,243 - INFO - Driver [/Users/yd211/.wdm/drivers/chromedriver/mac64/130.0.6723.69/chromedriver-mac-arm64/chromedriver] found in cache
2024-10-27 01:46:41,236 - INFO - Get LATEST chromedriver version for google-chrome


https://openreview.net/forum?id=IdibrApfps
https://openreview.net/forum?id=3SJE1WLB4M
https://openreview.net/forum?id=lyoOWX0e0O
https://openreview.net/forum?id=aN4Jf6Cx69
https://openreview.net/forum?id=BXYZvcgVUv
https://openreview.net/forum?id=kIZcruKmBg
https://openreview.net/forum?id=GzNaCp6Vcg
https://openreview.net/forum?id=RzNlECeoOB
https://openreview.net/forum?id=2DJMtdfgfH
https://openreview.net/forum?id=buC4E91xZE
https://openreview.net/forum?id=OpWg0ldkcB
https://openreview.net/forum?id=ApjY32f3Xr
https://openreview.net/forum?id=hcXfzlmg7Y
https://openreview.net/forum?id=OkHHJcMroY
https://openreview.net/forum?id=uNrFpDPMyo


2024-10-27 01:46:41,401 - INFO - Get LATEST chromedriver version for google-chrome
2024-10-27 01:46:41,474 - INFO - Driver [/Users/yd211/.wdm/drivers/chromedriver/mac64/130.0.6723.69/chromedriver-mac-arm64/chromedriver] found in cache
2024-10-27 01:46:41,977 - INFO - Starting scraping for paper: https://openreview.net/forum?id=IdibrApfps
2024-10-27 01:46:41,985 - INFO - Fetching URL: https://openreview.net/forum?id=IdibrApfps
2024-10-27 01:46:42,076 - INFO - Get LATEST chromedriver version for google-chrome
2024-10-27 01:46:42,158 - INFO - Get LATEST chromedriver version for google-chrome
2024-10-27 01:46:42,234 - INFO - Driver [/Users/yd211/.wdm/drivers/chromedriver/mac64/130.0.6723.69/chromedriver-mac-arm64/chromedriver] found in cache
2024-10-27 01:46:42,759 - INFO - Starting scraping for paper: https://openreview.net/forum?id=3SJE1WLB4M
2024-10-27 01:46:42,770 - INFO - Fetching URL: https://openreview.net/forum?id=3SJE1WLB4M
2024-10-27 01:46:42,880 - INFO - Get LATEST chromedriver 

In [15]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

def setup_selenium():
    """Return a headless Chrome WebDriver; the caller must ``quit()`` it."""
    chrome_flags = (
        "--headless",  # Run in headless mode for faster scraping
        "--no-sandbox",
        "--disable-dev-shm-usage",
    )
    opts = Options()
    for flag in chrome_flags:
        opts.add_argument(flag)
    return webdriver.Chrome(options=opts)

def switch_to_tab_with_js(driver, tab_id):
    """Activate a tab by clicking its ``a[href="#<tab_id>"]`` link from JavaScript.

    Clicking through JavaScript avoids click interception. Waits five seconds
    afterwards; any failure is printed and swallowed.
    """
    try:
        print(f"Switching to {tab_id} tab using JavaScript...")
        click_script = f"document.querySelector('a[href=\"#{tab_id}\"]').click();"
        driver.execute_script(click_script)
        time.sleep(5)  # Give the page time to update after the click
    except Exception as err:
        print(f"Failed to switch to {tab_id} tab: {err}")

def get_paper_urls_from_page(driver):
    """Return the forum URLs of the paper title links currently in the DOM.

    Waits up to 20 seconds for ``div.note h4 a`` elements, then keeps every
    ``href`` containing ``forum?id=``. Elements without an ``href`` are
    skipped, so one bad element cannot discard the whole page. Returns an
    empty list (after printing the error) if the wait or the lookup fails.
    """
    try:
        # NOTE(review): the selector is not scoped to the active tab; the
        # recorded run reports the same count on successive pages, so it may
        # also match notes from other tabs -- confirm against the live DOM.
        paper_elements = WebDriverWait(driver, 20).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div.note h4 a'))
        )
        print(f"Found {len(paper_elements)} papers on the page.")

        hrefs = (element.get_attribute("href") for element in paper_elements)

        # Filter only forum URLs (exclude PDF links); get_attribute may give
        # None, which must not reach the substring test.
        return [href for href in hrefs if href and "forum?id=" in href]
    except Exception as e:
        print(f"Error: {e}")
        return []

def go_to_next_page(driver):
    """Click the pagination "next" link and report whether that worked.

    Waits up to 10 seconds for ``li.right-arrow > a`` to become clickable,
    clicks it and pauses three seconds. Returns ``True`` on success; on any
    exception prints a message and returns ``False``.
    """
    try:
        arrow_is_clickable = EC.element_to_be_clickable(
            (By.CSS_SELECTOR, 'li.right-arrow > a')
        )
        WebDriverWait(driver, 10).until(arrow_is_clickable).click()
        time.sleep(3)  # Let the next page load
    except Exception:
        print("No more pages to navigate.")
        return False
    return True

def scrape_all_pages(driver, tab_id):
    """Collect the forum URLs from every page of one tab.

    Switches to the tab, then alternates between reading the current page
    and advancing, until ``go_to_next_page`` reports failure. Returns the
    URLs as a set.
    """
    switch_to_tab_with_js(driver, tab_id)

    collected = set()
    page_count = 0
    has_next = True
    while has_next:
        page_count += 1
        print(f"Scraping page {page_count} of {tab_id} tab...")
        collected.update(get_paper_urls_from_page(driver))
        # A False result means there is no further page to visit.
        has_next = go_to_next_page(driver)

    return collected

def scrape_multiple_tabs(base_url):
    """Scrape the Oral, Spotlight and Poster tabs of a conference page.

    Args:
        base_url: URL of the OpenReview group page to open.

    Returns:
        The set of forum URLs collected across all three tabs.

    The browser is always quit, including when loading or scraping raises.
    """
    # Tab ids for "oral", "spotlight" and "poster", with display names.
    tabs = {
        "accept-oral": "Oral",
        "accept-spotlight": "Spotlight",
        "accept-poster": "Poster"
    }

    combined_paper_urls = set()
    driver = setup_selenium()
    try:
        driver.get(base_url)
        time.sleep(5)  # Give the page some time to load

        for tab_id, tab_name in tabs.items():
            print(f"Scraping {tab_name} tab...")
            paper_urls = scrape_all_pages(driver, tab_id)
            combined_paper_urls.update(paper_urls)
    finally:
        driver.quit()

    return combined_paper_urls

# The base URL
base_url = "https://openreview.net/group?id=ICLR.cc/2024/Conference"
all_unique_paper_urls = scrape_multiple_tabs(base_url)

print(f"Total unique forum URLs scraped across all tabs: {len(all_unique_paper_urls)}")
print(all_unique_paper_urls)

# Save them to a file, one URL per line. The set is sorted so the file
# content does not depend on set iteration order.
with open('2024_paper_urls.txt', 'w', encoding="utf-8") as f:
    f.writelines(f"{item}\n" for item in sorted(all_unique_paper_urls))


Scraping Oral tab...
Switching to accept-oral tab using JavaScript...
Scraping page 1 of accept-oral tab...
Found 286 papers on the page.
Scraping page 2 of accept-oral tab...
Found 286 papers on the page.
Scraping page 3 of accept-oral tab...
Found 286 papers on the page.
Scraping page 4 of accept-oral tab...
Found 258 papers on the page.
No more pages to navigate.
Scraping Spotlight tab...
Switching to accept-spotlight tab using JavaScript...
Scraping page 1 of accept-spotlight tab...
Found 258 papers on the page.
Scraping page 2 of accept-spotlight tab...
Found 258 papers on the page.
Scraping page 3 of accept-spotlight tab...
Found 258 papers on the page.
Scraping page 4 of accept-spotlight tab...
Found 258 papers on the page.
Scraping page 5 of accept-spotlight tab...
Found 258 papers on the page.
Scraping page 6 of accept-spotlight tab...
Found 258 papers on the page.
Scraping page 7 of accept-spotlight tab...
Found 258 papers on the page.
Scraping page 8 of accept-spotlight tab.

In [17]:
print(all_unique_paper_urls)

# save them to a file 

with open('2024_paper_urls.txt', 'w') as f:
    for item in all_unique_paper_urls:
        f.write("%s\n" % item)

{'https://openreview.net/forum?id=Glcsog6zOe', 'https://openreview.net/forum?id=m3xVPaZp6Z', 'https://openreview.net/forum?id=jKhNBulNMh', 'https://openreview.net/forum?id=Tzh6xAJSll', 'https://openreview.net/forum?id=DCDT918ZkI', 'https://openreview.net/forum?id=KjegfPGRde', 'https://openreview.net/forum?id=sn7CYWyavh', 'https://openreview.net/forum?id=WNzy9bRDvG', 'https://openreview.net/forum?id=Mhb5fpA1T0', 'https://openreview.net/forum?id=RIcYTbpO38', 'https://openreview.net/forum?id=U7VW3KBm34', 'https://openreview.net/forum?id=GlpawHh80l', 'https://openreview.net/forum?id=mXpNp8MMr5', 'https://openreview.net/forum?id=tBROYsEz9G', 'https://openreview.net/forum?id=3UWuFoksGb', 'https://openreview.net/forum?id=xJEd8PkdNz', 'https://openreview.net/forum?id=gLARhFLE0F', 'https://openreview.net/forum?id=XwiA1nDahv', 'https://openreview.net/forum?id=aPNwsJgnZJ', 'https://openreview.net/forum?id=nFI3wFM9yN', 'https://openreview.net/forum?id=pw2ssoOTpo', 'https://openreview.net/forum?id=

Okay, great - I also have a URL getter function that gets the URLs of all the webpages listed under these links:

https://openreview.net/group?id=ICLR.cc/2024/Conference#tab-accept-oral

https://openreview.net/group?id=ICLR.cc/2024/Conference#tab-accept-spotlight

https://openreview.net/group?id=ICLR.cc/2024/Conference#tab-accept-poster

https://openreview.net/group?id=ICLR.cc/2024/Conference#tab-reject

https://openreview.net/group?id=ICLR.cc/2023/Conference

https://openreview.net/group?id=ICLR.cc/2023/Conference#notable-top-25-

https://openreview.net/group?id=ICLR.cc/2023/Conference#poster

https://openreview.net/group?id=ICLR.cc/2023/Conference#submitted 

https://openreview.net/group?id=ICLR.cc/2022/Conference#submitted

https://openreview.net/group?id=ICLR.cc/2022/Conference#spotlight-submissions

https://openreview.net/group?id=ICLR.cc/2022/Conference#poster-submissions

https://openreview.net/group?id=ICLR.cc/2022/Conference#submitted-submissions

https://openreview.net/group?id=ICLR.cc/2021/Conference#submitted-submissions

https://openreview.net/group?id=ICLR.cc/2021/Conference#spotlight-presentations

https://openreview.net/group?id=ICLR.cc/2021/Conference#poster-presentations


This is the Python code that does that:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

def setup_selenium():
    """Create and return a Chrome WebDriver configured with headless options."""
    chrome_opts = Options()
    # Chrome command-line switches, added one at a time.
    for switch in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
        chrome_opts.add_argument(switch)
    return webdriver.Chrome(options=chrome_opts)

def switch_to_tab_with_js(driver, tab_id):
    """Activate the tab whose link is ``a[href="#<tab_id>"]`` via a scripted click.

    The click is issued through ``driver.execute_script`` rather than a
    WebElement click. Any failure is printed and swallowed; nothing is
    returned either way.
    """
    try:
        print(f"Switching to {tab_id} tab using JavaScript...")
        click_script = f"document.querySelector('a[href=\"#{tab_id}\"]').click();"
        driver.execute_script(click_script)
        # Fixed pause so the page can update after the tab change.
        time.sleep(5)
    except Exception as err:
        print(f"Failed to switch to {tab_id} tab: {err}")

def get_paper_urls_from_page(driver):
    """Return the forum URLs of the papers listed on the currently shown page.

    Waits up to 20 seconds for the title links (``div.note h4 a``) to be
    present, reads each link's ``href`` and keeps the ones containing
    ``forum?id=``.

    Args:
        driver: Selenium WebDriver already displaying a paper listing.

    Returns:
        A list of URL strings. An empty list is returned when the wait times
        out or any other error occurs (the error is printed).
    """
    try:
        # Wait for the paper titles to appear (with a timeout of 20 seconds)
        paper_elements = WebDriverWait(driver, 20).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div.note h4 a'))
        )
        print(f"Found {len(paper_elements)} papers on the page.")

        hrefs = (element.get_attribute("href") for element in paper_elements)

        # get_attribute() yields None for an anchor without an href. Skip
        # those links instead of letting the substring test raise TypeError,
        # which would throw away every URL found on this page.
        return [url for url in hrefs if url and "forum?id=" in url]
    except Exception as e:
        print(f"Error: {e}")
        return []

def go_to_next_page(driver):
    """Click the pagination "next" arrow and report whether that succeeded.

    Returns:
        True after the arrow link (``li.right-arrow > a``) became clickable
        within 10 seconds and was clicked; False if anything went wrong, which
        the caller treats as "no further pages".
    """
    try:
        next_arrow = (By.CSS_SELECTOR, 'li.right-arrow > a')
        WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable(next_arrow)
        ).click()
        # Fixed pause so the next page of results can load.
        time.sleep(3)
    except Exception:
        print("No more pages to navigate.")
        return False
    return True

def scrape_all_pages(driver, tab_id):
    """Collect the forum URLs from every page of one tab.

    Switches to ``tab_id`` first, then alternates between reading the current
    page and clicking "next" until ``go_to_next_page`` reports failure.

    Returns:
        A set of the URLs gathered across all visited pages.
    """
    switch_to_tab_with_js(driver, tab_id)

    collected = set()
    page_number = 1
    has_more = True
    while has_more:
        print(f"Scraping page {page_number} of {tab_id} tab...")
        collected.update(get_paper_urls_from_page(driver))
        # A falsy result means there is no further page to visit.
        has_more = go_to_next_page(driver)
        page_number += 1

    return collected

def scrape_multiple_tabs(base_url, tab_ids):
    """Scrape several tabs of one conference page with a single browser.

    Args:
        base_url: URL of the conference group page to load.
        tab_ids: Iterable of tab ids, visited in the given order.

    Returns:
        A set with the union of the forum URLs found on all tabs.
    """
    driver = setup_selenium()
    try:
        driver.get(base_url)
        time.sleep(5)  # Give the page some time to load

        combined_paper_urls = set()
        for tab_id in tab_ids:
            print(f"Scraping {tab_id} tab...")
            combined_paper_urls.update(scrape_all_pages(driver, tab_id))
        return combined_paper_urls
    finally:
        # Always shut the browser down, even if loading or scraping raised,
        # so a failed run does not leave a Chrome process behind.
        driver.quit()

def save_urls_to_file(filename, urls):
    """Write every URL in ``urls`` on its own line to ``filename``.

    The file is overwritten. A one-line summary with the number of URLs is
    printed afterwards.
    """
    with open(filename, 'w') as out:
        out.writelines(f"{url}\n" for url in urls)
    print(f"Saved {len(urls)} URLs to {filename}")

# Each stanza below loads one year's ICLR group page, visits the listed tabs
# in order through scrape_multiple_tabs, and writes the combined set of forum
# URLs to "<year>_paper_urls.txt". The tab ids are the fragment ids of the tab
# links on that year's page (they are matched as a[href="#<tab_id>"]).

# Scraping for 2024
base_url_2024 = "https://openreview.net/group?id=ICLR.cc/2024/Conference"
tabs_2024 = ["accept-oral", "accept-spotlight", "accept-poster", "reject"]

all_unique_paper_urls_2024 = scrape_multiple_tabs(base_url_2024, tabs_2024)
save_urls_to_file("2024_paper_urls.txt", all_unique_paper_urls_2024)

# Scraping for 2023
base_url_2023 = "https://openreview.net/group?id=ICLR.cc/2023/Conference"
tabs_2023 = ["notable-top-25-", "poster", "submitted"]

all_unique_paper_urls_2023 = scrape_multiple_tabs(base_url_2023, tabs_2023)
save_urls_to_file("2023_paper_urls.txt", all_unique_paper_urls_2023)

# Scraping for 2022
base_url_2022 = "https://openreview.net/group?id=ICLR.cc/2022/Conference"
tabs_2022 = ["spotlight-submissions", "poster-submissions", "submitted-submissions"]

all_unique_paper_urls_2022 = scrape_multiple_tabs(base_url_2022, tabs_2022)
save_urls_to_file("2022_paper_urls.txt", all_unique_paper_urls_2022)

# Scraping for 2021
base_url_2021 = "https://openreview.net/group?id=ICLR.cc/2021/Conference"
tabs_2021 = ["submitted-submissions", "spotlight-presentations", "poster-presentations"]

all_unique_paper_urls_2021 = scrape_multiple_tabs(base_url_2021, tabs_2021)
save_urls_to_file("2021_paper_urls.txt", all_unique_paper_urls_2021)



One important thing to note: for the year 2024 we have the following metrics with scores: soundness, presentation, contribution, rating, confidence.

For the years 2023 and 2022, the metrics are:

Correctness: 3: Some of the paper’s claims have minor issues. A few statements are not well-supported, or require small changes to be made correct.
Technical Novelty And Significance: 4: The contributions are significant, and do not exist in prior works.
Empirical Novelty And Significance: 2: The contributions are only marginally significant or novel.
Recommendation: 8: accept, good paper
Confidence: 3: You are fairly confident in your assessment. It is possible that you did not understand some parts of the submission or that you are unfamiliar with some pieces of related work. Math/other details were not carefully checked.

and this

Correctness: 3: Some of the paper’s claims have minor issues. A few statements are not well-supported, or require small changes to be made correct.
Technical Novelty And Significance: 4: The contributions are significant, and do not exist in prior works.
Empirical Novelty And Significance: 3: The contributions are significant and somewhat new. Aspects of the contributions exist in prior work.
Recommendation: 6: marginally above the acceptance threshold
Confidence: 4: You are confident in your assessment, but not absolutely certain. It is unlikely, but not impossible, that you did not understand some 


and this:

Correctness: 4: All of the claims and statements are well-supported and correct.
Technical Novelty And Significance: 3: The contributions are significant and somewhat new. Aspects of the contributions exist in prior work.
Empirical Novelty And Significance: 3: The contributions are significant and somewhat new. Aspects of the contributions exist in prior work.
Recommendation: 8: accept, good paper
Confidence: 3: You are fairly confident in your assessment. It is possible that you did not understand some parts of the submission or that you are unfamiliar with some pieces of related work. Math/other details were not carefully checked.


And then for the year 2021 we only have these numeric scores:

Rating: 7: Good paper, accept
Confidence: 4: The reviewer is confident but not absolutely certain that the evaluation is correct


Give me updated code that has a better URL getter function, and then also update the code so that the metrics we scan for change based on the year. It should still extract the PDF and produce the Markdown output the same way - it's just the CSV file that changes. This is the current code - you will have to change it quite a bit:


# %% [markdown]
# # Scraper for ICLR Papers through OpenReview

# %%
import os
import re
import time
import logging
import csv
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.common.exceptions import (
    TimeoutException,
    NoSuchElementException,
    ElementNotInteractableException,
)
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup
from markdownify import markdownify as md
from pdfminer.high_level import extract_text
from webdriver_manager.chrome import ChromeDriverManager
import glob

# %%
# Setting up

# Configure Logging
# Root logger: INFO and above is appended to scraper.log.
logging.basicConfig(
    filename="scraper.log",
    filemode="a",
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Also log to console
# A second handler on the root logger, using the same message format as the
# log file, so every record goes to both destinations.
console = logging.StreamHandler()
console.setLevel(logging.INFO)
formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
console.setFormatter(formatter)
logging.getLogger().addHandler(console)

# Constants
# BASE_URL is prefixed to PDF links that do not start with "http";
# HEADERS is sent with the PDF download requests.
BASE_URL = "https://openreview.net"
HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; DataScraper/1.0; +https://yourdomain.com/)"
}
# All output lives under DOWNLOAD_DIR, one sub-directory per artifact type.
DOWNLOAD_DIR = "data/iclr_2024"
HTML_DIR = os.path.join(DOWNLOAD_DIR, "HTML")
PDF_DIR = os.path.join(DOWNLOAD_DIR, "PDF")
MARKDOWN_DIR = os.path.join(DOWNLOAD_DIR, "Markdown")
# NOTE(review): IMAGES_DIR is created below but not referenced by any other
# code visible in this cell - confirm whether it is still needed.
IMAGES_DIR = os.path.join(DOWNLOAD_DIR, "Image")

# Create directories if they don't exist
for directory in [DOWNLOAD_DIR, HTML_DIR, PDF_DIR, MARKDOWN_DIR, IMAGES_DIR]:
    os.makedirs(directory, exist_ok=True)

def setup_selenium():
    """Build a headless Chrome WebDriver.

    The chromedriver executable path comes from
    ``ChromeDriverManager().install()`` and is handed to Selenium through a
    ``Service`` object.
    """
    opts = Options()
    for switch in (
        "--headless",
        "--disable-gpu",
        "--no-sandbox",
        "--window-size=1920,1080",
        "--disable-dev-shm-usage",
    ):
        opts.add_argument(switch)

    chromedriver = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=chromedriver, options=opts)

def fetch_html(driver, url, timeout=20):
    """Load ``url`` in the browser and return the rendered page source.

    Args:
        driver: Selenium WebDriver used to load the page.
        url: Address to open.
        timeout: Seconds to wait for an element with class ``note`` to appear.

    Returns:
        The page source as a string, or None when the wait timed out or any
        other error occurred (both cases are logged).
    """
    try:
        logging.info(f"Fetching URL: {url}")
        driver.get(url)

        # The page counts as loaded once an element with class "note" exists.
        note_locator = (By.CLASS_NAME, "note")
        WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located(note_locator)
        )

        # Scroll to the bottom and pause briefly so late content can load.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)

        rendered = driver.page_source
        logging.info(f"Successfully fetched URL: {url}")
    except TimeoutException:
        logging.error(f"Timeout while loading {url}")
        rendered = None
    except Exception as err:
        logging.error(f"Error fetching {url}: {err}")
        rendered = None
    return rendered

def parse_paper_info(soup):
    """Pull title, authors, publication date and PDF link out of a forum page.

    Args:
        soup: BeautifulSoup tree of a paper's forum page.

    Returns:
        A dict with the keys ``title`` (str, "N/A" if missing), ``authors``
        (list of str, possibly empty), ``publication_date`` (str, "N/A" if
        missing) and ``pdf_url`` (absolute URL string or None).
    """
    # Title: <h2 class="citation_title">
    title_tag = soup.find("h2", class_="citation_title")
    title = title_tag.text.strip() if title_tag else "N/A"

    # Authors: text of every link inside <div class="forum-authors">
    authors = []
    authors_tag = soup.find("div", class_="forum-authors")
    if authors_tag:
        authors = [link.text.strip() for link in authors_tag.find_all("a")]

    # Publication date: the "Published: ..." text next to the calendar icon
    publication_date = "N/A"
    calendar_icon = soup.find("span", class_="glyphicon-calendar")
    if calendar_icon and calendar_icon.parent:
        found = re.search(
            r"Published:\s*(.*?)(?:,|$)", calendar_icon.parent.text.strip()
        )
        if found:
            publication_date = found.group(1)

    # PDF link: made absolute when the href does not start with "http"
    pdf_url = None
    pdf_anchor = soup.find("a", class_="citation_pdf_url")
    if pdf_anchor and "href" in pdf_anchor.attrs:
        href = pdf_anchor["href"]
        pdf_url = href if href.startswith("http") else BASE_URL + href

    return {
        "title": title,
        "authors": authors,
        "publication_date": publication_date,
        "pdf_url": pdf_url,
    }

def download_pdf(pdf_url, save_path, timeout=60):
    """Download the PDF at ``pdf_url`` and write it to ``save_path``.

    Args:
        pdf_url: Absolute URL of the PDF.
        save_path: Destination file path; overwritten if it exists.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.
            Without one, a stalled server would block the calling worker
            indefinitely.

    Returns:
        True when the file was downloaded and written, False when the request
        failed (HTTP error status, connection problem or timeout). The
        failure is logged.
    """
    try:
        logging.info(f"Downloading PDF: {pdf_url}")
        response = requests.get(pdf_url, headers=HEADERS, timeout=timeout)
        response.raise_for_status()
        with open(save_path, "wb") as f:
            f.write(response.content)
        logging.info(f"Downloaded PDF to {save_path}")
        return True
    except requests.RequestException as e:
        # Timeouts are RequestException subclasses, so they land here too.
        logging.error(f"Error downloading PDF from {pdf_url}: {e}")
        return False

def extract_sections_from_pdf(pdf_path):
    """Return ``(abstract, introduction)`` text found in the PDF.

    The extracted text is collapsed to single spaces, then each section is
    taken as the text between its heading keyword and the next recognised
    heading keyword (or the end of the text). Matching is case-insensitive.

    Returns:
        A 2-tuple of strings. A section that is not found is "N/A"; if
        anything raises, both are "N/A" and the error is logged.
    """
    abstract_pattern = r"(?is)abstract\s*(.*?)\s*(?:(introduction|1\.\s*Introduction|2\.\s*Methods|methods|conclusion|related work|acknowledgments|references|$))"
    introduction_pattern = r"(?is)(introduction|1\.\s*Introduction)\s*(.*?)\s*(?:(conclusion|related work|methods|acknowledgments|references|2\.\s*Methods|$))"

    try:
        logging.info(f"Extracting sections from PDF: {pdf_path}")
        # Collapse every run of whitespace (including newlines) to one space.
        flat_text = re.sub(r"\s+", " ", extract_text(pdf_path))

        abstract_hit = re.search(abstract_pattern, flat_text)
        introduction_hit = re.search(introduction_pattern, flat_text)

        abstract = "N/A"
        if abstract_hit:
            abstract = abstract_hit.group(1).strip()

        introduction = "N/A"
        if introduction_hit:
            # Group 1 is the heading itself; group 2 is the section body.
            introduction = introduction_hit.group(2).strip()

        logging.info(f"Extracted Abstract and Introduction from {pdf_path}")
        return abstract, introduction
    except Exception as err:
        logging.error(f"Error extracting text from PDF {pdf_path}: {err}")
        return "N/A", "N/A"

def convert_to_markdown(text, header):
    """Return ``text`` as a Markdown section under a level-2 ``header``.

    The result is the heading line, a blank line, the text (or the literal
    "N/A" placeholder) and a trailing newline.
    """
    return (
        f"## {header}\n\nN/A\n"
        if text == "N/A"
        else f"## {header}\n\n{text}\n"
    )

def save_markdown(content, filename):
    """Write ``content`` as UTF-8 to ``filename`` inside ``MARKDOWN_DIR``.

    Errors are logged rather than raised; the function always returns None.
    """
    try:
        target = os.path.join(MARKDOWN_DIR, filename)
        with open(target, "w", encoding="utf-8") as out:
            out.write(content)
        logging.info(f"Saved Markdown to {target}")
    except Exception as err:
        logging.error(f"Error saving Markdown file {filename}: {err}")

def _note_author(note):
    """Return the signature text of a note, or "N/A" when it has none."""
    signatures = note.find("span", class_="signatures")
    if not signatures:
        return "N/A"
    # Prefer the last nested <span>; fall back to the whole signature text
    # when the signature has no nested spans.
    inner_spans = signatures.find_all("span")
    if inner_spans:
        return inner_spans[-1].text.strip()
    return signatures.text.strip()


def _note_content_fields(note):
    """Return ``{field name: Markdown value}`` for a note's content fields."""
    fields = {}
    content_div = note.find("div", class_="note-content")
    if not content_div:
        return fields
    # Only the direct <div> children of the content block are fields.
    for field in content_div.find_all("div", recursive=False):
        name_tag = field.find("strong", class_="note-content-field")
        if not name_tag:
            continue
        # The value is held in a <div> or, failing that, a <span>.
        value_tag = field.find("div", class_="note-content-value") or field.find(
            "span", class_="note-content-value"
        )
        value = md(str(value_tag)).strip() if value_tag else ""
        fields[name_tag.text.strip(":").strip()] = value
    return fields


def extract_reviewer_responses(soup):
    """Extract all notes (reviews, meta-reviews, decisions, comments) in order.

    Every ``div.note`` element carrying a ``data-id`` attribute is visited
    once, in document order. Notes without an ``invitation`` span are
    skipped.

    Args:
        soup: BeautifulSoup tree of a paper's forum page.

    Returns:
        A list of dicts with the keys ``type`` ("Official Review" when the
        note has a "Soundness" field, otherwise the invitation text),
        ``author`` and ``content`` (field name -> Markdown text).
    """
    responses = []

    # find_all() searches the whole tree, so notes nested inside other notes
    # are already part of this list. Walking each note's descendants again
    # would append every nested note a second time.
    note_divs = soup.find_all("div", class_="note", attrs={"data-id": re.compile(".*")})

    for note in note_divs:
        # NOTE(review): find() returns the first match among all descendants,
        # so this assumes a note's own invitation/signature/content elements
        # come before those of any reply nested inside it - confirm against
        # the page markup.
        invitation = note.find("span", class_="invitation")
        if not invitation:
            continue

        content = _note_content_fields(note)
        if "Soundness" in content:
            note_type = "Official Review"
        else:
            note_type = invitation.text.strip()

        responses.append(
            {"type": note_type, "author": _note_author(note), "content": content}
        )

    return responses


def save_reviewer_responses(responses, filename):
    """Render the responses as Markdown, in list order, and save the result.

    Each response becomes a ``### <type> <1-based index>`` section with an
    author line followed by one bold-labelled paragraph per content field.
    Errors are logged, not raised.
    """
    try:
        chunks = ["## Reviewer Responses\n\n"]
        for position, entry in enumerate(responses, start=1):
            chunks.append(f"### {entry['type']} {position}\n")
            chunks.append(f"**Author:** {entry['author']}\n\n")
            chunks.extend(
                f"**{name}:**\n{value}\n\n"
                for name, value in entry["content"].items()
            )
            # Blank line separating this section from the next one.
            chunks.append("\n")
        save_markdown("".join(chunks), filename)
        logging.info(f"Saved reviewer responses to {filename}")
    except Exception as err:
        logging.error(f"Error saving reviewer responses to {filename}: {err}")


def save_paper_metadata(paper_info, filename):
    """Write the paper's title, authors and publication date as Markdown.

    ``paper_info`` must provide the keys ``title``, ``authors`` (iterable of
    str) and ``publication_date``. Errors are logged, not raised.
    """
    try:
        lines = [
            f"# {paper_info['title']}\n\n",
            f"**Authors:** {', '.join(paper_info['authors'])}\n\n",
            f"**Publication Date:** {paper_info['publication_date']}\n\n",
        ]
        save_markdown("".join(lines), filename)
        logging.info(f"Saved metadata to {filename}")
    except Exception as err:
        logging.error(f"Error saving metadata to {filename}: {err}")

def scrape_paper(driver, paper_url):
    """Fetch one paper's forum page and PDF and store both on disk.

    The HTML goes to ``HTML_DIR/<paper_id>.html`` and the PDF to
    ``PDF_DIR/<paper_id>.pdf``, where ``paper_id`` is everything after
    ``id=`` in the URL ("unknown" if the URL has no ``id=``). Returns None.
    """
    logging.info(f"Starting scraping for paper: {paper_url}")
    html = fetch_html(driver, paper_url)
    if not html:
        logging.warning(f"Failed to retrieve HTML for {paper_url}. Skipping.")
        return

    id_match = re.search(r"id=(.+)", paper_url)
    paper_id = "unknown"
    if id_match:
        paper_id = id_match.group(1)

    # Store the rendered page; a write failure is logged and scraping goes on.
    html_filename = os.path.join(HTML_DIR, f"{paper_id}.html")
    try:
        with open(html_filename, "w", encoding="utf-8") as out:
            out.write(html)
        logging.info(f"Saved HTML to {html_filename}")
    except Exception as err:
        logging.error(f"Error saving HTML for {paper_url}: {err}")

    # The PDF link is read from the page that was just fetched.
    paper_info = parse_paper_info(BeautifulSoup(html, "html.parser"))
    pdf_url = paper_info["pdf_url"]
    if not pdf_url:
        logging.warning(f"No PDF URL found for {paper_url}.")
        return

    if download_pdf(pdf_url, os.path.join(PDF_DIR, f"{paper_id}.pdf")):
        logging.info(f"Successfully scraped paper: {paper_id}")

def parse_paper(paper_id):
    """Turn one paper's saved HTML and PDF into Markdown files.

    Writes ``<paper_id>_metadata.md`` always, ``<paper_id>_sections.md`` when
    the PDF exists and ``<paper_id>_responses.md`` when at least one response
    was extracted. Returns None; returns early if the HTML cannot be read.
    """
    html_filename = os.path.join(HTML_DIR, f"{paper_id}.html")
    pdf_path = os.path.join(PDF_DIR, f"{paper_id}.pdf")

    # Load the saved forum page.
    try:
        with open(html_filename, "r", encoding="utf-8") as src:
            soup = BeautifulSoup(src.read(), "html.parser")
    except Exception as err:
        logging.error(f"Error reading HTML file {html_filename}: {err}")
        return

    # Metadata
    save_paper_metadata(parse_paper_info(soup), f"{paper_id}_metadata.md")

    # Abstract / introduction from the PDF
    if not os.path.exists(pdf_path):
        logging.warning(f"PDF not found for paper ID {paper_id}. Skipping section extraction.")
    else:
        abstract, introduction = extract_sections_from_pdf(pdf_path)
        sections_md = (
            convert_to_markdown(abstract, "Abstract")
            + "\n"
            + convert_to_markdown(introduction, "Introduction")
        )
        save_markdown(sections_md, f"{paper_id}_sections.md")

    # Reviews, decisions and comments
    responses = extract_reviewer_responses(soup)
    if not responses:
        logging.info(f"No reviewer responses found for paper ID {paper_id}.")
    else:
        save_reviewer_responses(responses, f"{paper_id}_responses.md")

    logging.info(f"Completed parsing for paper ID: {paper_id}")

# Default CSV score columns and the Markdown field label each is read from.
_DEFAULT_SCORE_FIELDS = (
    ("soundness", "Soundness"),
    ("presentation", "Presentation"),
    ("contribution", "Contribution"),
    ("review_rating", "Rating"),
    ("confidence", "Confidence"),
)


def _read_paper_title(paper_id):
    """Return the title stored in ``<paper_id>_metadata.md``, or "N/A"."""
    metadata_file = os.path.join(MARKDOWN_DIR, f"{paper_id}_metadata.md")
    try:
        with open(metadata_file, "r", encoding="utf-8") as f:
            metadata = f.read()
    except Exception as e:
        logging.error(f"Error reading metadata for {paper_id}: {e}")
        return "N/A"
    title_match = re.search(r"# (.+)", metadata)
    return title_match.group(1).strip() if title_match else "N/A"


def _parse_responses_markdown(responses_md, score_patterns):
    """Return ``(decision, {column: [score, ...]})`` parsed from responses text."""
    decision = "N/A"
    scores = {column: [] for column in score_patterns}

    # Each response starts with a "### <type> <index>" heading line.
    for section in re.split(r"^### ", responses_md, flags=re.MULTILINE):
        if not section.strip():
            continue
        header_match = re.match(r"(\w+.*?)\n", section)
        if not header_match:
            # E.g. the leading "## Reviewer Responses" chunk.
            continue
        header = header_match.group(1).strip()
        content = section[header_match.end():]

        if header.startswith("Decision"):
            decision_match = re.search(
                r"\*\*Decision:\*\*\s*\n*(.+?)(?:\n\n|\Z)", content, re.DOTALL
            )
            decision = decision_match.group(1).strip() if decision_match else "N/A"
        elif header.startswith("Official Review"):
            # One entry per review and column; "N/A" keeps the lists aligned
            # when a review lacks a field.
            for column, pattern in score_patterns.items():
                found = pattern.search(content)
                scores[column].append(found.group(1) if found else "N/A")
        # Any other section type is not part of the CSV.

    return decision, scores


def aggregate_csv(csv_filename="decisions_and_scores.csv", score_fields=None):
    """Aggregate decisions and scores from all ``*_responses.md`` files into a CSV.

    One row is written per responses file found in ``MARKDOWN_DIR``. Each
    score column holds the string form of a list with one entry per
    "Official Review" section: the leading integer of that field, or "N/A".

    Args:
        csv_filename: Name of the CSV file created inside ``DOWNLOAD_DIR``.
        score_fields: Optional mapping (or iterable of pairs) from CSV column
            name to the review field label to read, e.g.
            ``{"rating": "Rating", "confidence": "Confidence"}``. Defaults to
            soundness, presentation, contribution, review_rating (label
            "Rating") and confidence. Labels are matched case-insensitively.

    Returns:
        None. Failures are logged rather than raised.
    """
    if score_fields is None:
        score_fields = _DEFAULT_SCORE_FIELDS
    score_fields = dict(score_fields)

    csv_path = os.path.join(DOWNLOAD_DIR, csv_filename)
    fieldnames = ["paperid", "title", "decision"] + list(score_fields)

    # Compile once; the same patterns are applied to every review section.
    score_patterns = {
        column: re.compile(
            r"\*\*" + re.escape(label) + r":\*\*\s*(\d+)", re.IGNORECASE
        )
        for column, label in score_fields.items()
    }

    try:
        with open(csv_path, "w", newline='', encoding="utf-8") as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()

            for resp_file in glob.glob(os.path.join(MARKDOWN_DIR, "*_responses.md")):
                paper_id = os.path.basename(resp_file).replace("_responses.md", "")
                title = _read_paper_title(paper_id)

                # An unreadable file still yields a row, with no decision and
                # empty score lists.
                try:
                    with open(resp_file, "r", encoding="utf-8") as f:
                        responses_md = f.read()
                except Exception as e:
                    logging.error(f"Error reading responses for {paper_id}: {e}")
                    responses_md = ""

                decision, scores = _parse_responses_markdown(responses_md, score_patterns)

                row = {"paperid": paper_id, "title": title, "decision": decision}
                row.update({column: str(values) for column, values in scores.items()})
                writer.writerow(row)

        logging.info(f"Aggregated CSV saved to {csv_path}")
    except Exception as e:
        logging.error(f"Error creating CSV file {csv_path}: {e}")


def _scrape_paper_with_own_driver(url):
    """Scrape one paper with a browser that lives only for this task."""
    driver = setup_selenium()
    try:
        scrape_paper(driver, url)
    finally:
        driver.quit()


def process_papers_parallel_scrape(paper_urls, max_workers=4):
    """Scrape multiple papers in parallel on a thread pool.

    Each task starts its own browser inside the worker thread and closes it
    when the task ends, so at most ``max_workers`` browsers are alive at any
    time regardless of how many URLs are queued.

    Args:
        paper_urls: Iterable of forum URLs to scrape.
        max_workers: Size of the thread pool.

    Returns:
        None. A failure while scraping one paper is logged and does not stop
        the remaining papers.
    """
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(_scrape_paper_with_own_driver, url) for url in paper_urls
        ]

        for future in futures:
            try:
                future.result()
            except Exception as e:
                logging.error(f"Error scraping a paper: {e}")

def process_all_papers_parsing():
    """Run ``parse_paper`` for every ``*.html`` file found in ``HTML_DIR``.

    The paper id handed to ``parse_paper`` is the file name with ".html"
    removed.
    """
    html_pattern = os.path.join(HTML_DIR, "*.html")
    for path in glob.glob(html_pattern):
        parse_paper(os.path.basename(path).replace(".html", ""))

def run_aggregation():
    """Build the decisions-and-scores CSV by calling ``aggregate_csv`` with its defaults.

    Meant to run after parsing, because ``aggregate_csv`` reads the Markdown
    files that the parsing step writes.
    """
    aggregate_csv()

def get_paper_urls_from_page(driver, page_url):
    """Load ``page_url`` and return the forum hrefs found on it.

    Every ``<a>`` with an ``href`` is inspected; an href is kept when it
    contains ``forum?id=`` and does not contain ``&noteId=``. The hrefs are
    returned exactly as written in the page and may contain duplicates.
    """
    driver.get(page_url)
    time.sleep(3)  # Fixed pause so the page can load (adjust as necessary)

    soup = BeautifulSoup(driver.page_source, "html.parser")

    forum_hrefs = []
    for anchor in soup.find_all("a", href=True):
        href = anchor["href"]
        if "&noteId=" in href:
            # Link carrying a noteId parameter rather than a plain forum link.
            continue
        if "forum?id=" in href:
            forum_hrefs.append(href)
    return forum_hrefs

def url_getter(limit=15):
    """Return absolute forum URLs scraped from the ICLR 2024 group page.

    Args:
        limit: Maximum number of URLs to return (prototyping cap). Pass None
            to return every URL found.

    Returns:
        A list of unique absolute URLs. The hrefs are sorted before the cap is
        applied so that repeated runs over the same page pick the same papers.
    """
    from urllib.parse import urljoin

    base_url = "https://openreview.net/group?id=ICLR.cc/2024/Conference"  # The URL of the OpenReview ICLR page
    driver = setup_selenium()

    all_paper_urls = []
    try:
        # Modify the range as needed to scrape multiple pages
        for page_number in range(1):
            page_url = f"{base_url}&page={page_number}"
            all_paper_urls.extend(get_paper_urls_from_page(driver, page_url))
    finally:
        # Close the browser even when loading or parsing a page raised.
        driver.quit()

    unique_urls = sorted(set(all_paper_urls))

    # urljoin leaves absolute hrefs untouched and resolves relative ones
    # against the site root, with or without a leading slash.
    return [urljoin("https://openreview.net", url) for url in unique_urls[:limit]]

# %%
# Experimenting with the scraper

# Option 1: a fixed list of paper URLs. Uncomment this list and comment out
# the url_getter() call below to scrape only these papers.
# paper_urls = [
#     "https://openreview.net/forum?id=KS8mIvetg2",
#     "https://openreview.net/forum?id=7Ttk3RzDeu",
#     "https://openreview.net/forum?id=ANvmVS2Yr0",
#     "https://openreview.net/forum?id=ekeyCgeRfC",
# ]

# Option 2: collect the paper URLs from the OpenReview page, print them one
# per line, then scrape them with a pool of 8 worker threads.
paper_urls = url_getter()
print('\n'.join(paper_urls))
process_papers_parallel_scrape(paper_urls, max_workers=8)

# %%
# After scraping, parse every saved HTML page into Markdown files and
# aggregate the decisions and scores into the CSV.
process_all_papers_parsing()
aggregate_csv()



In [19]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

def setup_selenium():
    """Create and return a Chrome WebDriver configured with headless options."""
    chrome_opts = Options()
    # Chrome command-line switches, added one at a time.
    for switch in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
        chrome_opts.add_argument(switch)
    return webdriver.Chrome(options=chrome_opts)

def switch_to_tab_with_js(driver, tab_id):
    """Activate the tab whose link is ``a[href="#<tab_id>"]`` via a scripted click.

    The click is issued through ``driver.execute_script`` rather than a
    WebElement click. Any failure is printed and swallowed; nothing is
    returned either way.
    """
    try:
        print(f"Switching to {tab_id} tab using JavaScript...")
        click_script = f"document.querySelector('a[href=\"#{tab_id}\"]').click();"
        driver.execute_script(click_script)
        # Fixed pause so the page can update after the tab change.
        time.sleep(5)
    except Exception as err:
        print(f"Failed to switch to {tab_id} tab: {err}")

def get_paper_urls_from_page(driver, *, timeout=20):
    """Return the forum URLs linked from paper titles on the driver's current page.

    Waits for ``div.note h4 a`` elements to be present, reads each element's
    ``href`` and keeps those containing ``"forum?id="`` (intended to exclude
    PDF links).

    NOTE(review): the selector is not limited to the active tab. The captured
    run of this cell reports 286 matches on a single page, so links from other
    tab panes are presumably picked up as well -- confirm against the page
    markup and scope the selector if that is not wanted.

    NOTE(review): an earlier cell calls this name as
    ``get_paper_urls_from_page(driver, page_url)``, which does not fit this
    signature. ``timeout`` is keyword-only so that such a call raises instead
    of silently treating the URL as a timeout.

    Args:
        driver: WebDriver already showing the listing page.
        timeout: Maximum seconds to wait for the title links. Keyword-only;
            defaults to 20.

    Returns:
        A list of URL strings. On any exception the error is printed and an
        empty list is returned.
    """
    try:
        # Wait for the paper titles to appear
        paper_elements = WebDriverWait(driver, timeout).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div.note h4 a'))
        )
        print(f"Found {len(paper_elements)} papers on the page.")

        hrefs = [element.get_attribute("href") for element in paper_elements]

        # get_attribute returns None when the attribute is absent. Skip such
        # anchors: `"forum?id=" in None` would raise TypeError and the handler
        # below would then throw away every URL found on this page.
        return [url for url in hrefs if url and "forum?id=" in url]
    except Exception as e:
        print(f"Error: {e}")
        return []

def go_to_next_page(driver):
    """Click the pagination "next" arrow on the driver's current page.

    Waits up to 10 seconds for ``li.right-arrow > a`` to become clickable,
    clicks it, then sleeps 3 seconds so the next page can load.

    Returns:
        True after a successful click. False if anything raises (including
        the wait timing out), which is reported as "no more pages".
    """
    try:
        next_arrow_locator = (By.CSS_SELECTOR, 'li.right-arrow > a')
        wait = WebDriverWait(driver, 10)
        wait.until(EC.element_to_be_clickable(next_arrow_locator)).click()
        time.sleep(3)  # give the next page time to load
    except Exception:
        print("No more pages to navigate.")
        return False
    return True

def scrape_all_pages(driver, tab_id):
    """Collect paper URLs from every page of one tab.

    Switches to ``tab_id``, then alternates between reading the current page
    and clicking "next" until ``go_to_next_page`` reports that it could not
    advance.

    Returns:
        A set with the URLs gathered across all visited pages.
    """
    switch_to_tab_with_js(driver, tab_id)

    collected = set()
    page_number = 1
    has_next = True

    while has_next:
        print(f"Scraping page {page_number} of {tab_id} tab...")
        collected |= set(get_paper_urls_from_page(driver))

        # A falsy result means there was no further page to move to.
        has_next = go_to_next_page(driver)
        page_number += 1

    return collected

def scrape_multiple_tabs(base_url, tab_ids):
    """Scrape several tabs of one page and return the union of their paper URLs.

    Starts a fresh browser via ``setup_selenium``, loads ``base_url``, runs
    ``scrape_all_pages`` for each entry of ``tab_ids`` in order, and always
    shuts the browser down afterwards.

    Args:
        base_url: URL of the page that hosts the tabs.
        tab_ids: Iterable of tab identifiers passed to ``scrape_all_pages``.

    Returns:
        A set containing the URLs collected from all tabs.
    """
    driver = setup_selenium()
    combined_paper_urls = set()

    # try/finally so the browser process is closed even when scraping raises
    # or the run is interrupted (e.g. KeyboardInterrupt during a long scrape);
    # otherwise each aborted run leaves a headless Chrome behind.
    try:
        driver.get(base_url)
        time.sleep(5)  # Give the page some time to load

        for tab_id in tab_ids:
            print(f"Scraping {tab_id} tab...")
            combined_paper_urls.update(scrape_all_pages(driver, tab_id))
    finally:
        driver.quit()

    return combined_paper_urls

def save_urls_to_file(filename, urls):
    """Write URLs to a text file, one per line, replacing any existing file.

    Args:
        filename: Path of the file to write.
        urls: Iterable of URLs. A set/frozenset is written in sorted order so
            that repeated runs over the same URLs produce the same file; any
            other iterable (including a generator) keeps its own order.
    """
    if isinstance(urls, (set, frozenset)):
        # Set iteration order varies between interpreter runs; sort it so the
        # output is reproducible and diff-friendly.
        ordered_urls = sorted(urls, key=str)
    else:
        # Materialise once so one-shot iterables can be both written and counted.
        ordered_urls = list(urls)

    with open(filename, 'w', encoding='utf-8') as f:
        f.writelines(f"{url}\n" for url in ordered_urls)
    print(f"Saved {len(ordered_urls)} URLs to {filename}")

# Each section below runs when the cell executes: it scrapes the listed tabs
# of one ICLR conference page (a separate browser session per year, started
# inside scrape_multiple_tabs) and writes the combined URL set to
# "<year>_paper_urls.txt" in the current working directory.
#
# Every tab id must equal the fragment of that tab's link on the page
# (the link is located via a[href="#<tab id>"]).
# NOTE(review): the tab ids differ from year to year and, unlike 2024, the
# 2021-2023 lists name no oral-type tab -- confirm against each year's page
# that no tab you intend to scrape is missing.

# Scraping for 2024
base_url_2024 = "https://openreview.net/group?id=ICLR.cc/2024/Conference"
tabs_2024 = ["accept-oral", "accept-spotlight", "accept-poster", "reject"]

all_unique_paper_urls_2024 = scrape_multiple_tabs(base_url_2024, tabs_2024)
save_urls_to_file("2024_paper_urls.txt", all_unique_paper_urls_2024)

# Scraping for 2023
# NOTE(review): "notable-top-25-" ends with a hyphen; presumably that is the
# fragment the page really uses -- verify.
base_url_2023 = "https://openreview.net/group?id=ICLR.cc/2023/Conference"
tabs_2023 = ["notable-top-25-", "poster", "submitted"]

all_unique_paper_urls_2023 = scrape_multiple_tabs(base_url_2023, tabs_2023)
save_urls_to_file("2023_paper_urls.txt", all_unique_paper_urls_2023)

# Scraping for 2022
base_url_2022 = "https://openreview.net/group?id=ICLR.cc/2022/Conference"
tabs_2022 = ["spotlight-submissions", "poster-submissions", "submitted-submissions"]

all_unique_paper_urls_2022 = scrape_multiple_tabs(base_url_2022, tabs_2022)
save_urls_to_file("2022_paper_urls.txt", all_unique_paper_urls_2022)

# Scraping for 2021
base_url_2021 = "https://openreview.net/group?id=ICLR.cc/2021/Conference"
tabs_2021 = ["submitted-submissions", "spotlight-presentations", "poster-presentations"]

all_unique_paper_urls_2021 = scrape_multiple_tabs(base_url_2021, tabs_2021)
save_urls_to_file("2021_paper_urls.txt", all_unique_paper_urls_2021)


Scraping accept-oral tab...
Switching to accept-oral tab using JavaScript...
Scraping page 1 of accept-oral tab...
Found 286 papers on the page.
Scraping page 2 of accept-oral tab...
Found 286 papers on the page.
Scraping page 3 of accept-oral tab...
Found 286 papers on the page.
Scraping page 4 of accept-oral tab...
Found 258 papers on the page.
No more pages to navigate.
Scraping accept-spotlight tab...
Switching to accept-spotlight tab using JavaScript...
Scraping page 1 of accept-spotlight tab...
Found 258 papers on the page.
Scraping page 2 of accept-spotlight tab...
Found 258 papers on the page.
Scraping page 3 of accept-spotlight tab...
Found 258 papers on the page.
Scraping page 4 of accept-spotlight tab...
Found 258 papers on the page.
Scraping page 5 of accept-spotlight tab...
Found 258 papers on the page.
Scraping page 6 of accept-spotlight tab...
Found 258 papers on the page.
Scraping page 7 of accept-spotlight tab...
Found 258 papers on the page.
Scraping page 8 of accept-

KeyboardInterrupt: 