In [1]:
import os
import glob
import shutil
import requests
import time
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import ElementNotInteractableException
from selenium.webdriver import ChromeOptions, Chrome
from selenium.webdriver.chrome.webdriver import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [2]:
def init_chrome_driver(path_to_chromedriver, temp_download_folder):
    """Start a headless Chrome driver that saves downloads into ``temp_download_folder``.

    Args:
        path_to_chromedriver: Path to the chromedriver executable.
        temp_download_folder: Directory Chrome is told to use as its default
            download location.

    Returns:
        The started ``Chrome`` WebDriver instance.
    """
    # The service object tells Selenium which chromedriver executable to launch.
    service = Service(executable_path=path_to_chromedriver)

    options = ChromeOptions()
    # Ask for headless mode through the command-line switch. The
    # ``options.headless`` property is deprecated and absent from newer
    # Selenium 4 releases, where assigning to it would silently do nothing.
    options.add_argument("--headless")
    # Kept from the original setup, where it was added "just to avoid errors".
    options.add_argument("--no-sandbox")
    # Present a regular desktop user agent so the session is less likely to be flagged as a bot.
    options.add_argument("--user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36")
    # Fix the window size. (The code works properly without this option.)
    options.add_argument("--window-size=1280,1024")
    options.add_experimental_option("prefs", {
        # Change the default download folder.
        "download.default_directory": temp_download_folder,
        # Never show a "save as" dialog; nothing could answer it in a headless run.
        "download.prompt_for_download": False,
        # Download PDFs instead of rendering them in Chrome's built-in viewer,
        # so that navigating to a PDF link always produces a file on disk.
        "plugins.always_open_pdf_externally": True,
    })

    return Chrome(options=options, service=service)

In [3]:
def page_transition(driver, url):
    """Navigate ``driver`` to ``url`` and block until the document has loaded.

    Args:
        driver: The Selenium WebDriver used for navigation.
        url: Address of the page to open.

    Raises:
        selenium.common.exceptions.TimeoutException: If the page is not ready
            within 30 seconds.
    """
    # Upper bound, in seconds, for each wait below.
    wait = WebDriverWait(driver=driver, timeout=30)

    driver.get(url)

    # ``wait.until`` needs a callable that takes the driver and returns a truthy
    # value once the condition holds. Passing the bare
    # ``EC.presence_of_all_elements_located`` factory (without a locator) made
    # the wait succeed immediately without checking anything.
    wait.until(lambda d: d.execute_script("return document.readyState") == "complete")
    wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))

In [4]:
def get_parsed_html(driver, url):
    """Open ``url`` with ``driver`` and return the page parsed by BeautifulSoup.

    Args:
        driver: The Selenium WebDriver used to load the page.
        url: Address of the page whose HTML is wanted.

    Returns:
        A ``BeautifulSoup`` object built from the driver's current page source
        with the ``html.parser`` backend.
    """
    # Navigate first; the page source is read from wherever the driver ends up.
    page_transition(driver, url)

    # Hand the parser UTF-8 encoded bytes of the rendered page.
    encoded_source = driver.page_source.encode("utf-8")
    return BeautifulSoup(encoded_source, "html.parser")

In [5]:
def download_pdf(driver, pdf_url, temp_download_folder, save_path, timeout_second=60, poll_interval=1.0):
    """Download ``pdf_url`` through the browser and move the file to ``save_path``.

    The browser saves into ``temp_download_folder``; this function waits until a
    new, completed file appears there and then moves it.

    Args:
        driver: Object with a ``get(url)`` method that triggers the download.
        pdf_url: Address of the PDF file.
        temp_download_folder: Folder the browser downloads into.
        save_path: Final destination path of the downloaded file.
        timeout_second: Maximum number of seconds to wait for the download.
        poll_interval: Seconds to sleep between checks of the folder, and once
            more after the file is detected before it is moved.

    Raises:
        TimeoutError: If no completed download shows up within ``timeout_second``.
    """
    def _finished_files():
        # Files still being written by Chrome carry a '.crdownload' extension.
        pattern = os.path.join(glob.escape(temp_download_folder), "*")
        return {
            path for path in glob.glob(pattern)
            if os.path.isfile(path) and not path.endswith(".crdownload")
        }

    # Remember what is already in the folder so a leftover file from an earlier
    # (failed) run is never mistaken for this download.
    preexisting = _finished_files()

    driver.get(pdf_url)

    deadline = time.monotonic() + timeout_second
    while True:
        new_files = _finished_files() - preexisting
        if new_files:
            break
        if time.monotonic() >= deadline:
            raise TimeoutError("Some error might have occured through the download processes.")
        time.sleep(poll_interval)

    # Short grace period before touching the file, as in the original flow.
    time.sleep(poll_interval)

    # glob already returns paths that include the folder, so they are used as
    # they are; joining them with the folder again breaks relative folders.
    downloaded_path = max(new_files, key=os.path.getmtime)
    shutil.move(downloaded_path, save_path)

In [6]:
# Locations used throughout the notebook, resolved against the current working directory.
path_to_chromedriver = os.path.abspath("../apps/chromedriver")
temp_save_folder = os.path.abspath("../tmp") # The temporary folder for saving paper PDFs
save_folder = os.path.abspath("../data/raw/acl/2020") # The folder for saving all downloaded slide images and paper PDFs.

# Create both folders up front; existing folders are left as they are.
for folder in (temp_save_folder, save_folder):
    os.makedirs(folder, exist_ok=True)

# Start the browser with the temporary folder as its download target.
driver = init_chrome_driver(path_to_chromedriver, temp_save_folder)

In [7]:
papertalk_url = "https://papertalk.org"
path_to_acl_2020 = "/papertalks?journal=acl_2020&selected_date=all&sort=descending&order_by=date"

# Open the first search-result page and keep its parsed HTML.
search_result_url = "".join([papertalk_url, path_to_acl_2020])
html = get_parsed_html(driver, search_result_url)

# The page count is read from the second-to-last "page-link" element
# (presumably the last one is the "next" control -- confirm against the site).
pagination_links = driver.find_elements(By.CLASS_NAME, "page-link")
last_page_label = pagination_links[-2].text
num_search_page = int(last_page_label)

In [8]:
acl_2020_paper_title_dict = {}
acl_2020_slide_link_dict = {}
acl_2020_paper_link_dict = {}
for_dataframe_list = [] # A list to make a dataframe later.
skipped_paper_idx_list = [] # A list of paper indexes which are skipped for some reason.
paper_idx = 0 # The start paper index of each page.

# ``implicitly_wait`` does not pause the script; it only sets how long element
# lookups keep retrying. It used to be called inside the loop as if it were a
# sleep. The 1-second setting is kept (once) because later element lookups in
# this session run with it; the actual throttling is done with ``time.sleep``.
driver.implicitly_wait(1)

# Process through each result page.
for i in range(num_search_page):
    # Move to the search result page and get the parsed html.
    html = get_parsed_html(driver, papertalk_url+path_to_acl_2020+"&page={}".format(i+1))
    # Get the html corresponding to each card.
    each_card_list = html.find_all("div", class_="pp-card-header")

    # Process through each card in a page.
    for j, each_card in enumerate(each_card_list):
        # Get the paper title, slide link, and paper link.
        title_anchor = each_card.find("a", class_="text-muted")
        paper_title = title_anchor.find("h5").get_text(strip=True)
        slide_link = title_anchor.get("href")
        paper_link = each_card.find_all("a", class_="px-1")[-1].get("href")
        acl_2020_paper_title_dict[paper_idx+j] = paper_title
        acl_2020_slide_link_dict[paper_idx+j] = slide_link
        acl_2020_paper_link_dict[paper_idx+j] = paper_link

        # A paper without a paper link is regarded as invalid and will be skipped afterwards.
        # ``get("href")`` yields None when the attribute is absent, so both
        # None and the empty string count as "no link".
        valid = bool(paper_link)
        if not valid:
            skipped_paper_idx_list.append(paper_idx+j)

        # Append the contents to the list.
        for_dataframe_list.append([paper_title, papertalk_url+slide_link, paper_link, valid])

    # Update the start index.
    paper_idx += len(each_card_list)

    # Wait for 1 second not to send too much traffic to the website.
    time.sleep(1)

In [9]:
# Custom exception class for the following process.
class OverTimeException(Exception):
    """Signals that an operation exceeded the time it was allowed to take."""

In [18]:
num_acl_papers = len(acl_2020_paper_link_dict) # Get the number of papers detected.
no_slide_card_list = [] # A list of paper indexes which have no slide card.
allowed_wait_time = 60 # The maximum allowed time of waiting for hovering the mouse.
image_request_timeout = 30 # Seconds to wait for the image server before giving up.


def _has_zero_sized_slide_area(style_text):
    """Tell whether an inline style sets ``width`` or ``height`` to zero.

    Only the first three declarations are inspected, as before. Whitespace
    around property names and values is ignored, and values that are not plain
    numbers (optionally suffixed with ``px``) are treated as non-zero instead of
    raising.
    """
    for declaration in style_text.split(";")[:3]:
        prop, _, value = declaration.partition(":")
        if prop.strip().lower() not in ("width", "height"):
            continue
        number = value.strip().lower()
        if number.endswith("px"):
            number = number[:-2]
        try:
            if float(number) == 0:
                return True
        except ValueError:
            # e.g. "auto" or "100%": not a zero-sized area.
            continue
    return False


for idx, slide_link in acl_2020_slide_link_dict.items():
    # Skip the paper whose index is stored in the skip list.
    if idx in skipped_paper_idx_list:
        print("Skip the {:3d}th/{:3d} paper because there is no paper pdf link.".format(idx, num_acl_papers))
        continue

    # Move to each papertalk page.
    page_transition(driver, papertalk_url+slide_link)

    # Get the embedded iframe element and switch the driver to it.
    iframe = driver.find_element(By.TAG_NAME, "iframe")
    driver.switch_to.frame(iframe)

    # A slide area whose width or height is zero means the page has no slide card.
    slide_area_style = driver.find_element(By.CLASS_NAME, "slp__slides").get_attribute("style") or ""
    if _has_zero_sized_slide_area(slide_area_style):
        no_slide_card_list.append(idx)
        # Change the valid value of corresponding element in the dataframe list.
        for_dataframe_list[idx][-1] = False
        print("Skip the {:3d}th/{:3d} paper because there are no slide images.".format(idx, num_acl_papers))
        continue

    print("Start processing the {:3d}th/{:3d} paper.".format(idx, num_acl_papers))

    # Make a save folder.
    paper_save_folder = os.path.join(save_folder, "paper{:04d}".format(idx))
    os.makedirs(os.path.join(paper_save_folder, "img"), exist_ok=True)

    # Get the time when the process starts.
    start_t = time.time()
    time.sleep(5)

    # Hover the mouse on the slide contents to display the control arrow for moving on to the next slide.
    while True:
        # If the process takes more time than the allowed wait time, raise an error and stop the process.
        if time.time() - start_t > allowed_wait_time:
            raise OverTimeException("The driver could not hover the mouse on the element.")

        try:
            # Move the mouse cursor on to the embedded slide to display the total number of slide pages.
            ActionChains(driver).move_to_element(
                driver.find_element(By.CLASS_NAME, "slp__slidesControls")
            ).perform()
        except ElementNotInteractableException:
            # The player is not ready for interaction yet; try again shortly.
            time.sleep(1)
            continue

        # Hovering succeeded.
        break

    # Get the total number of slide pages of the presentation.
    num_pages = int(driver.find_elements(By.CLASS_NAME, "slp__slideStats--darker")[-1].text)

    for n in range(num_pages):
        print("\tThe page {:3d}/{:3d}".format(n, num_pages))

        # Get the image link of the displayed slide.
        image_link = driver.find_element(By.CLASS_NAME, "slp__slidesPlayer__content").find_element(By.TAG_NAME, "img").get_attribute("src")

        # Save the slide page as a png image.
        save_img_path = os.path.join(paper_save_folder, "img", "page{:03d}.png".format(n))
        # A timeout keeps one stalled request from hanging the whole run.
        response = requests.get(image_link, timeout=image_request_timeout)
        if response.ok:
            with open(save_img_path, 'wb') as f:
                f.write(response.content)
        else:
            # Do not store an error page under an image file name.
            print("\t\tFailed to download the slide image (HTTP {}): {}".format(response.status_code, image_link))

        # Detect the button for transition to the next slide page and click it.
        # NOTE(review): this absolute XPath depends on the player's exact DOM layout.
        next_button = driver.find_element(By.XPATH, '//*[@id="player"]/div/div[7]/div[2]/div[5]/div[4]')
        next_button.click()

        # Wait for 1 second not to send too much traffic to the website.
        time.sleep(1)

# Add the paper indexes having no slide card to the skip list (without duplicating entries).
skipped_paper_idx_list.extend(
    no_slide_idx for no_slide_idx in no_slide_card_list
    if no_slide_idx not in skipped_paper_idx_list
)

Start processing the 159th/903 paper.
	The page   0/ 24
	The page   1/ 24
	The page   2/ 24
	The page   3/ 24
	The page   4/ 24
	The page   5/ 24
	The page   6/ 24
	The page   7/ 24
	The page   8/ 24
	The page   9/ 24
	The page  10/ 24
	The page  11/ 24
	The page  12/ 24
	The page  13/ 24
	The page  14/ 24
	The page  15/ 24
	The page  16/ 24
	The page  17/ 24
	The page  18/ 24
	The page  19/ 24
	The page  20/ 24
	The page  21/ 24
	The page  22/ 24
	The page  23/ 24
Start processing the 160th/903 paper.
	The page   0/ 48
	The page   1/ 48
	The page   2/ 48
	The page   3/ 48
	The page   4/ 48
	The page   5/ 48
	The page   6/ 48
	The page   7/ 48
	The page   8/ 48
	The page   9/ 48
	The page  10/ 48
	The page  11/ 48
	The page  12/ 48
	The page  13/ 48
	The page  14/ 48
	The page  15/ 48
	The page  16/ 48
	The page  17/ 48
	The page  18/ 48
	The page  19/ 48
	The page  20/ 48
	The page  21/ 48
	The page  22/ 48
	The page  23/ 48
	The page  24/ 48
	The page  25/ 48
	The page  26/ 48
	The p

In [19]:
# Show the indexes of the papers whose page had no slide card.
no_slide_card_list

[298]

In [20]:
# Show the indexes of all skipped papers (no paper link, or no slide card).
skipped_paper_idx_list

[13, 37, 39, 42, 107, 298]

In [22]:
# Download the paper PDF of every paper that has not been skipped.
for idx, pdf_link in acl_2020_paper_link_dict.items():
    # Skip the paper whose index is stored in the skip list.
    if idx in skipped_paper_idx_list:
        print("Skip the {:3d}th/{:3d} paper.".format(idx, num_acl_papers))
        continue

    print("Start processing the {:3d}th/{:3d} paper.".format(idx, num_acl_papers))

    # Make a save folder.
    paper_save_folder = os.path.join(save_folder, "paper{:04d}".format(idx))
    os.makedirs(paper_save_folder, exist_ok=True)

    # If the link points to an arxiv page instead of a PDF file, replace it with the arxiv PDF link.
    if "https://arxiv.org" in pdf_link and "pdf" not in pdf_link:
        # Move to the arxiv page.
        page_transition(driver, pdf_link)
        # Get the PDF link on the arxiv page.
        # NOTE(review): this takes the second anchor inside the "full-text"
        # element, which depends on the current arxiv page layout.
        pdf_link = driver.find_element(By.CLASS_NAME, "full-text").find_elements(By.TAG_NAME, "a")[1].get_attribute("href")
        # Update the pdf link of the corresponding element in the dataframe list.
        for_dataframe_list[idx][2] = pdf_link

    # Format only the file name. Formatting the joined path would also
    # interpret any braces that happen to be part of the save folder path.
    pdf_save_path = os.path.join(paper_save_folder, "paper{:04d}.pdf".format(idx))

    # Download the PDF file from each link.
    download_pdf(driver, pdf_link, temp_save_folder, pdf_save_path)

Start processing the   0th/903 paper.
Start processing the   1th/903 paper.
Start processing the   2th/903 paper.
Start processing the   3th/903 paper.
Start processing the   4th/903 paper.
Start processing the   5th/903 paper.
Start processing the   6th/903 paper.
Start processing the   7th/903 paper.
Start processing the   8th/903 paper.
Start processing the   9th/903 paper.
Start processing the  10th/903 paper.
Start processing the  11th/903 paper.
Start processing the  12th/903 paper.
Skip the  13th/903 paper.
Start processing the  14th/903 paper.
Start processing the  15th/903 paper.
Start processing the  16th/903 paper.
Start processing the  17th/903 paper.
Start processing the  18th/903 paper.
Start processing the  19th/903 paper.
Start processing the  20th/903 paper.
Start processing the  21th/903 paper.
Start processing the  22th/903 paper.
Start processing the  23th/903 paper.
Start processing the  24th/903 paper.
Start processing the  25th/903 paper.
Start processing the  26

In [23]:
# Turn the collected per-paper records into a table and store it next to the downloaded data.
columns = ["Paper Title", "Slide Link", "Paper Link", "Valid"]
df = pd.DataFrame(for_dataframe_list, columns=columns)

data_info_path = os.path.join(save_folder, "data_info.csv")
df.to_csv(data_info_path)

In [None]:
# Show the indexes of all papers that were skipped.
skipped_paper_idx_list