# Requirements:

- Selenium for Python (element-locating reference: https://selenium-python.readthedocs.io/locating-elements.html)
- ChromeDriver (download page: https://sites.google.com/a/chromium.org/chromedriver/downloads)

Note from Audrey: I did not use this script at all! It is simply Luke's Python scraping script pared down for ACM only. Instead of iterating through pages (which this script does), PyLitReview_v4 selects "All Citations" to download at once. Do not use this notebook to replicate Audrey's data collection. 

In [2]:
from urllib.request import urlopen, Request

import pandas as pd
import numpy as np

from html.parser import HTMLParser
import tqdm
import math

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, ElementNotInteractableException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.wait import WebDriverWait

from pybtex.database.input import bibtex
import pybtex.errors
pybtex.errors.set_strict_mode(False)

import itertools
from itertools import permutations 

import os
import time
# import config
from enum import Enum

import bibtexparser

In [3]:
# Debug verbosity for the crawler:
# 0: No screenshots
# 1: One screenshot for each query (recommended)
# 2: Screenshots of different steps to find out why the crawler might not work
# NOTE(review): the code visible in this notebook only tests `DEBUG > 0`, so
# levels 1 and 2 currently behave the same here.
DEBUG = 2

# Upper bound on the number of ACM result pages fetched per query
# (saveACMBib requests 50 results per page).
acm_maxpage = 39

# NOTE(review): neither list is read or written by the code visible in this
# notebook; presumably left over from the multi-library script - confirm
# before removing.
GLOBAL_ERROR_LIST = []
urls = []

# chrome_options = Options()
# chrome_options.add_argument("--headless=new")

# Settings for crawling


In [4]:
class SearchWhere(Enum):
    """Part of a publication record that the keywords are searched in."""

    Title = 1
    Abstract = 2
    # Keywords have to be in Title OR Abstract
    TitleAbstract = 3
    Text = 4
class Library(Enum):
    """Digital libraries the crawler can be pointed at."""

    IEEE = 1
    ACM = 2
    ScienceDirect = 3

# NOTE(review): year_min / year_max are not referenced by the code visible in
# this notebook - confirm whether the ACM query is meant to be year-filtered.
year_min = 2015 # Set to earliest year which should be crawled
year_max = 2024 # Set to latest year which should be crawled

# List of keyword groups. Each inner list is one query; saveACMBib runs one
# search per group and getURL joins the terms of a group with "AND".
# keywords = [['behavior change'], ['behaviour change']]
keywords = [['dataset']]

In [5]:
keywords

[['dataset']]

# Setup for crawler

### Function to crawl: crawl(keywords_list, library, searchWhere)


In [6]:
def setupCrawler(dl_folder):
    """Create a Chrome WebDriver whose downloads are saved to ``dl_folder``.

    Args:
        dl_folder: Directory (``str`` or path-like) Chrome should download
            into. Any other value (for example ``None`` or a ``Library``
            member) falls back to ``./acm``, the folder in which
            ``saveACMBib`` renames the exported ``acm.bib``.

    Returns:
        A ``webdriver.Chrome`` instance with a 1920x1080 window.
    """
    # Previously the parameter was ignored and Chrome always received an empty
    # download directory; resolve a real folder instead.
    if isinstance(dl_folder, (str, os.PathLike)) and os.fspath(dl_folder):
        target = os.fspath(dl_folder)
    else:
        target = "acm"
    # Chrome expects an absolute path for download.default_directory.
    target = os.path.abspath(target)
    os.makedirs(target, exist_ok=True)

    options = webdriver.ChromeOptions()
    options.add_argument('window-size=1920,1080')
    options.add_experimental_option(
        "prefs", {"download.default_directory": target}
    )
    driver = webdriver.Chrome(options=options)
    print("Driver setup complete.")
    return driver

def crawl(keywords_list, library, searchWhere):
    """Crawl ``library`` once for every keyword group in ``keywords_list``.

    Args:
        keywords_list: List of keyword groups; each group is a list of search
            terms that end up in one query.
        library: ``Library`` member selecting the digital library. Only
            ``Library.ACM`` is handled; anything else just prints a notice.
        searchWhere: ``SearchWhere`` member selecting the field to search.
    """
    print(f"Start crawling {library}")
    if library != Library.ACM:
        print(f"Library {library} not yet supported")
        return

    # Spaces inside a term become "+" so the term can be embedded in the URL.
    url_ready = [
        [term.replace(" ", "+") for term in group] for group in keywords_list
    ]
    # The second parameter of saveACMBib is the download folder, not the
    # library (the enum member was passed here before). "./acm" is the folder
    # in which saveACMBib looks for the exported acm.bib.
    saveACMBib(url_ready, "./acm", searchWhere)

def getURL(keywords, library, searchWhere, concatentation="AND"):
    URL = ""
    search = ""
    if library == Library.ACM:
        titleSearch = "doSearch?AllField="
        for i, keyword in enumerate(keywords):
            search += f"%22{keyword}%22"
            if (i < len(keywords)-1):
                search += f"+{concatentation}+"
        match searchWhere:
            case SearchWhere.Title:
                print("Searching ACM for title only")
                titleSearch = f"doSearch?fillQuickSearch=false&expand=dl&field1=Title&text1={search}"
            case SearchWhere.Abstract:
                print("Searching ACM for abstract only")
                titleSearch = f"doSearch?fillQuickSearch=false&expand=dl&field1=Abstract&text1={search}"
            case SearchWhere.TitleAbstract:
                print("ACM does not support searching for keywords in Title OR Abstract. Please use Title and Abstract search seperately.")
            case SearchWhere.Text | _:
                print("Quicksearching ACM")
        URL = f"https://dl.acm.org/action/{titleSearch}&SeriesKeyAnd=imwut&startPage="
        return URL

    else:
        print(f"Library {library} not yet supported")
    return URL

# ACM


In [7]:
def loadACMBib(toOpen, driver):
    """Open one ACM result page and click through its citation export.

    Args:
        toOpen: Full URL of the result page to load.
        driver: Selenium WebDriver used to load the page and click elements.

    Raises:
        selenium.common.exceptions.NoSuchElementException: If one of the
            export controls is missing from the page.
        selenium.common.exceptions.TimeoutException: If the cookie dialog is
            still visible after 10 seconds.
    """
    # Base class of Selenium's own errors; imported here because the file's
    # import cell only pulls in two specific subclasses.
    from selenium.common.exceptions import WebDriverException

    driver.get(toOpen)

    try:
        # Only accept necessary cookies to resolve cookie popup
        driver.find_element(By.ID, "CybotCookiebotDialogBodyLevelButtonLevelOptinDeclineAll").click()
    except WebDriverException:
        # Best effort: any Selenium failure here is treated as "no popup",
        # but KeyboardInterrupt / SystemExit are no longer swallowed.
        print("No cookie popup found, continuing.")

    # Wait for the cookie dialog to disappear
    WebDriverWait(driver, 10).until(EC.invisibility_of_element((By.ID, "CybotCookiebotDialogBodyContent")))

    # Presumably the "select all results" checkbox - confirm against the page.
    driver.find_element(by=By.CLASS_NAME, value="item-results__checkbox").click()
    time.sleep(5)
    # First descendant of the button bar shown once items are selected
    # (presumably the "export citations" button).
    driver.find_element(by=By.CLASS_NAME, value="item-results__buttons.visible").find_elements(by=By.XPATH, value=".//*")[0].click()
    time.sleep(20)
    # First descendant of the inline list (presumably the download control of
    # the export dialog); the fixed sleep gives the download time to finish.
    driver.find_element(by=By.CLASS_NAME, value="rlist--inline.separator").find_elements(by=By.XPATH, value=".//*")[0].click()
    time.sleep(20)
    
def saveACMBib(keywords_list, dl_folder, searchWhere = SearchWhere.Text):
    driver = setupCrawler(dl_folder)
    for keywords in keywords_list:
        print(f"Search for: {keywords}")
        ACM_URL = getURL(keywords, Library.ACM, searchWhere)
        if DEBUG > 0: print(ACM_URL)
        driver.get(ACM_URL)#put here the adress of your page
        time.sleep(3)
        navbar = driver.find_elements(by=By.CLASS_NAME, value="search-result__nav-container")
        navbar = navbar[0]
        navelements = navbar.find_elements(by=By.XPATH, value=".//*")
        foundResults = False
        for nav_element in navelements:
            if "RESULTS" in nav_element.text: 
                foundResults = True
        if foundResults == False: 
            print("Only people in results - next keyword")
            continue
        name = ""
        for word in keywords:
            name += f"{word}"
        match searchWhere:
            case SearchWhere.Title:
                name += "_TitleOnly" 
            case SearchWhere.Abstract:
                name += "_AbstractOnly" 
            case SearchWhere.TitleAbstract:
                print("Stopping")
                break 
                # name += "_TitleAbstract"
            case SearchWhere.Text | _:
                name += ""  
        if DEBUG > 0: driver.save_screenshot(f"./acm_{name}.png")
        # get amount of results for for-loop

        try:
            results = driver.find_element(by=By.CLASS_NAME, value="result__count")
            results = results.text.split(" ")[0]
            if "," in results:
                results = results.replace(",", "")
            results = int(results)
        except NoSuchElementException:
            results = 0
        r = np.min([math.ceil(results / 50), acm_maxpage])
        # Loop through all pages and save resulting bib files
        for i in tqdm.tqdm(range(r)):
            # toOpen = ACM_URL + str(i) 
            toOpen = ACM_URL + str(i) + '&pageSize=50'
            driver = setupCrawler(dl_folder) # I think this is unnecessary - if something breaks with ACM try uncommenting this line first
            loadACMBib(toOpen, driver)
            print(toOpen)
            try:
                os.rename('./acm/acm.bib', f'./acm/acm_{name.replace("*","")}_page{i}.bib')
            except FileNotFoundError:
                print("Only 1 bib entry in that file.")

In [8]:
# No results for now
# crawl(keywords, Library.ACM, SearchWhere.Title)

In [None]:
# Run the ACM crawl for the configured keywords in SearchWhere.Text mode
# (the quick-search branch of getURL).
crawl(keywords, Library.ACM, SearchWhere.Text)