# Requirements:

- https://selenium-python.readthedocs.io/locating-elements.html
- https://sites.google.com/a/chromium.org/chromedriver/downloads

Instead of looping through pages, uses "All Results" button under "Export Citations"


In [164]:
import os
import time
from enum import Enum
import shutil
import pandas as pd

from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, ElementNotInteractableException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.wait import WebDriverWait

from pybtex.database.input import bibtex
import pybtex.errors
pybtex.errors.set_strict_mode(False)

# from urllib.request import urlopen, Request
# from selenium.webdriver.common.keys import Keys
# from selenium.webdriver.chrome.service import Service
# import bibtexparser
# from html.parser import HTMLParser
# from itertools import permutations 


In [165]:
# 0: No Screenshots
# 1: One Screenshot for each query (recommended)
# 2: Screenshots of different steps to find out why crawler might not work
DEBUG = 2

acm_maxpage = 39

GLOBAL_ERROR_LIST = []
urls = []

# chrome_options = Options()
# chrome_options.add_argument("--headless=new")


# Settings for crawling


In [166]:
class SearchWhere(Enum):
        Title = 1
        Abstract = 2
        TitleAbstract = 3 #Keywords have to be in Title OR Abstract
        Anywhere = 4
        Fulltext = 5
class Library(Enum):
        IEEE = 1
        ACM = 2
        ScienceDirect = 3

year_min = 2015 # Set to earliest year which should be crawled
year_max = 2024 # Set to latest year whichh should be crawled


# Setup for crawler

### function to crawl: crawl(keywords, LIBRARY, titlesearch)


In [167]:
# Define read and write permissions (in octal notation)
directory_path = os.getcwd()+ '/acm-query-dl'
new_permissions = 644  # read and write
os.chmod(directory_path, new_permissions)

def setupCrawler(dl_folder):
    options = webdriver.ChromeOptions()
    options.add_argument('window-size=1920,1080')
    dl = os.getcwd() + '/acm-query-dl'
    p = {"download.default_directory": dl}
    options.add_experimental_option("prefs", p)
    op = webdriver.ChromeOptions()
    driver = webdriver.Chrome(options=options)
    print("Driver setup complete.")
    return driver

def crawl(keywords_list, library, searchWhere):
    print(f"Start crawling {library}")
    if library == Library.ACM:
        keywords = [[item.replace(" ", "+") for item in keywords] for keywords in keywords_list]
        saveACMBib(keywords, Library.ACM, searchWhere)
    else:
        print(f"Library {library} not yet supported")

def getURL(keywords, library, searchWhere, concatentation="AND"):
    URL = ""
    search = ""
    if library == Library.ACM:
        titleSearch = "doSearch?AllField="
        for i, keyword in enumerate(keywords):
            search += f"%22{keyword}%22"
            if (i < len(keywords)-1):
                search += f"+{concatentation}+"
        match searchWhere:
            case SearchWhere.Title:
                print("Searching ACM for title only")
                titleSearch = f"doSearch?fillQuickSearch=false&expand=dl&field1=Title&text1={search}"
            case SearchWhere.Abstract:
                print("Searching ACM for abstract only")
                titleSearch = f"doSearch?fillQuickSearch=false&expand=dl&field1=Abstract&text1={search}"
            case SearchWhere.TitleAbstract:
                print("ACM does not support searching for keywords in Title OR Abstract. Please use Title and Abstract search seperately.")
            case SearchWhere.Anywhere:
                print("Searching ACM for anywhere")
                titleSearch=f"doSearch?fillQuickSearch=false&target=advanced&expand=dl&field1=AllField&text1={search}"
            case SearchWhere.Fulltext:
                titleSearch=f"doSearch?fillQuickSearch=false&target=advanced&expand=dl&field1=Fulltext&text1={search}"
        URL = f"https://dl.acm.org/action/{titleSearch}&SeriesKeyAnd=imwut&startPage="
        return URL

    else:
        print(f"Library {library} not yet supported")
    return URL

# ACM


In [168]:
def loadACMBib (toOpen, driver):
    driver.get(toOpen)#put here the adress of your page
    # delay = 3 # seconds
    
    try: 
        # Only accept necessary cookies to resolve cookie popup
        driver.find_element(By.ID, "CybotCookiebotDialogBodyLevelButtonLevelOptinDeclineAll").click()
    except: 
        print("No cookie popup found, continuing.")
        pass

    # Wait for the cookie dialog to disappear
    WebDriverWait(driver, 10).until(EC.invisibility_of_element((By.ID, "CybotCookiebotDialogBodyContent")))

    #iterate over middle navbar to see if query found paper results or only people
    driver.find_element(by=By.CLASS_NAME, value="item-results__checkbox").click()
    time.sleep(5)
    driver.find_element(by=By.CLASS_NAME, value="item-results__buttons.visible").find_elements(by=By.XPATH, value=".//*")[0].click()
    time.sleep(20)
    
    # Instead, click "All Results"
    driver.find_element(by=By.ID, value="allResults").click()
    time.sleep(20)
    driver.find_element(by=By.CLASS_NAME, value="downloadBtn").click()
    time.sleep(15)

    # Wait for the popup and the new download button to be visible
    export_div = WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.XPATH, '//*[@id="exportDownloadReady"]/div[2]')))

    # Find the 'a' element inside the located div, which contains our href of interest
    download_now_button = export_div.find_element(By.XPATH, './/a[@class="btn blue searchCiteExport-popup__close pull-right"]')

    # Scroll into view and click the 'a' element
    driver.execute_script("arguments[0].scrollIntoView();", download_now_button)
    download_now_button.click()
    time.sleep(2)
    
def saveACMBib(keywords_list, dl_folder, searchWhere = SearchWhere.Anywhere):
    driver = setupCrawler(dl_folder)
    for keywords in keywords_list:
        print(f"Search for: {keywords}")
        ACM_URL = getURL(keywords, Library.ACM, searchWhere)
        if DEBUG > 0: print(ACM_URL)
        driver.get(ACM_URL)#put here the adress of your page
        time.sleep(3)
        navbar = driver.find_elements(by=By.CLASS_NAME, value="search-result__nav-container")
        navbar = navbar[0]
        navelements = navbar.find_elements(by=By.XPATH, value=".//*")
        foundResults = False
        for nav_element in navelements:
            if "RESULTS" in nav_element.text: 
                foundResults = True
        if foundResults == False: 
            print("Only people in results - next keyword")
            continue
        name = ""
        for word in keywords:
            name += f"{word}"
        match searchWhere:
            case SearchWhere.Title:
                name += "_TitleOnly" 
            case SearchWhere.Abstract:
                name += "_AbstractOnly" 
            case SearchWhere.TitleAbstract:
                print("Stopping")
                break 
            case SearchWhere.Anywhere:
                name += "_Anywhere"
            case SearchWhere.Fulltext:
                name += "_Fulltext"
        if DEBUG > 0: driver.save_screenshot(f"./acm-query-dl/acm_{name}.png")
        # get amount of results for for-loop

        try:
            results = driver.find_element(by=By.CLASS_NAME, value="result__count")
            results = results.text.split(" ")[0]
            if "," in results:
                results = results.replace(",", "")
            results = int(results)
            if results >= 1000:
                print(f'NOTE: More than 1000 search results for keyword, export citations will not be comprehensive: {keywords}')
        except NoSuchElementException:
            results = 0
        
        # Navigate to page
        loadACMBib(ACM_URL, driver)
        print('URL:', ACM_URL)
        try:
            source = os.getcwd() + '/acm-query-dl/acm.bib'
            destination = os.getcwd() + '/acm-query-dl/' + f'/acm_{name.replace("*","8")}.bib'

            shutil.copy(source, destination)  # Copy the file
            os.remove(source)  # Remove the original file

        except FileNotFoundError:
            print("Only 1 bib entry in that file.")

# Implement Search
Input: CSV with columns: keywords, searchwhere

In [171]:
queries_df = pd.read_csv('./acm-queries.csv')

searchtypes = queries_df['searchwhere'].unique()

for searchtype in searchtypes:
    temp_df = queries_df[queries_df['searchwhere'] == searchtype]
    keywords = [[keyword] for keyword in temp_df['keywords']]
    if searchtype == 'Title':
        crawl(keywords, Library.ACM, SearchWhere.Title)
    elif searchtype == 'Abstract':
        crawl(keywords, Library.ACM, SearchWhere.Abstract)
    elif searchtype == 'TitleAbstract':
        break
    elif searchtype == 'Anywhere':
        crawl(keywords, Library.ACM, SearchWhere.Anywhere)
    elif searchtype == 'Fulltext':
        crawl(keywords, Library.ACM, SearchWhere.Fulltext)
    else:
        print('Invalid search term.')

Start crawling Library.ACM
Driver setup complete.
Search for: ['external+dataset*']
https://dl.acm.org/action/doSearch?fillQuickSearch=false&target=advanced&expand=dl&field1=Fulltext&text1=%22external+dataset*%22&SeriesKeyAnd=imwut&startPage=
URL: https://dl.acm.org/action/doSearch?fillQuickSearch=false&target=advanced&expand=dl&field1=Fulltext&text1=%22external+dataset*%22&SeriesKeyAnd=imwut&startPage=
Search for: ['existing+dataset*']
https://dl.acm.org/action/doSearch?fillQuickSearch=false&target=advanced&expand=dl&field1=Fulltext&text1=%22existing+dataset*%22&SeriesKeyAnd=imwut&startPage=
No cookie popup found, continuing.
URL: https://dl.acm.org/action/doSearch?fillQuickSearch=false&target=advanced&expand=dl&field1=Fulltext&text1=%22existing+dataset*%22&SeriesKeyAnd=imwut&startPage=
Search for: ['datasets+used']
https://dl.acm.org/action/doSearch?fillQuickSearch=false&target=advanced&expand=dl&field1=Fulltext&text1=%22datasets+used%22&SeriesKeyAnd=imwut&startPage=
No cookie popup 

Create a loop here that takes in a csv of:
keywords, where they are searched

it coverts the keywords to lists based on where they are searched and searches all of them, creating .bib files

it returns a list of keywords for which there were greater than 1000 results (need a script for this, can modify the original script that goes through pages)

ready to run the next part of the notebook, that adds them to columns. 


# Append search queries to existing CSV
Input: directory of bibtex files, csv on which to append the bibtex files.
Output: csv with appended columns for each bib file

In [173]:
import os
import pandas as pd
from pybtex.database.input import bibtex
from pybtex.database import parse_file as parser
import time

# Path to the dir
directory_path = os.getcwd()+ '/acm-query-dl'
csv_directory_path = os.getcwd() + '/'
csv_filename = 'master_imwut.csv'

# Open CSV as dataframe
df = pd.read_csv(csv_directory_path + csv_filename)

# Iterate through all files in the directory, which contains search queries
for filename in os.listdir(directory_path):
    if filename.endswith('.bib'):
        file_path = os.path.join(directory_path, filename)
        print(file_path)

        # Read the file with pybtex parser
        parser = bibtex.Parser()
        bib_data = parser.parse_file(file_path)
        bib_dois = [key for (key,value) in bib_data.entries.items()]
        print(f'Successfully parsed: {filename}, writing to csv.')

        # Add column to df
        colname = filename.split('.')[0]
        df[colname] = df['id'].isin(bib_dois)
    
# Write the updated DataFrame back to a new CSV file
timestr = time.strftime("%Y%m%d-%H%M%S")
df.to_csv(csv_directory_path + 'query-modified-masters/' + csv_filename.split('.')[0] + f'_modified_{timestr}.csv', index=False)

/Users/audreyxychang/Documents/hci-lab/acm-query-dl/acm_classification+model8_Fulltext.bib
Successfully parsed: acm_classification+model8_Fulltext.bib, writing to csv.
/Users/audreyxychang/Documents/hci-lab/acm-query-dl/acm_training+set_Fulltext.bib
Successfully parsed: acm_training+set_Fulltext.bib, writing to csv.
/Users/audreyxychang/Documents/hci-lab/acm-query-dl/acm_pretrain8_Fulltext.bib
Successfully parsed: acm_pretrain8_Fulltext.bib, writing to csv.
/Users/audreyxychang/Documents/hci-lab/acm-query-dl/acm_existing+dataset8_Fulltext.bib
Successfully parsed: acm_existing+dataset8_Fulltext.bib, writing to csv.
/Users/audreyxychang/Documents/hci-lab/acm-query-dl/acm_LLM_AbstractOnly.bib
Successfully parsed: acm_LLM_AbstractOnly.bib, writing to csv.
/Users/audreyxychang/Documents/hci-lab/acm-query-dl/acm_neural+network_AbstractOnly.bib
Successfully parsed: acm_neural+network_AbstractOnly.bib, writing to csv.
/Users/audreyxychang/Documents/hci-lab/acm-query-dl/acm_review_AbstractOnly.