# Requirements:

- https://selenium-python.readthedocs.io/locating-elements.html
- https://sites.google.com/a/chromium.org/chromedriver/downloads

Instead of looping through pages, uses "All Results" button under "Export Citations"


In [48]:
import os
import time
from enum import Enum
import shutil
import pandas as pd
import random
import csv

from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, ElementNotInteractableException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.wait import WebDriverWait

from pybtex.database.input import bibtex
import pybtex.errors
pybtex.errors.set_strict_mode(False)

# from urllib.request import urlopen, Request
# from selenium.webdriver.common.keys import Keys
# from selenium.webdriver.chrome.service import Service
# import bibtexparser
# from html.parser import HTMLParser
# from itertools import permutations 


In [49]:
# 0: No Screenshots
# 1: One Screenshot for each query (recommended)
# 2: Screenshots of different steps to find out why crawler might not work
DEBUG = 2

acm_maxpage = 39

GLOBAL_ERROR_LIST = []
urls = []

# chrome_options = Options()
# chrome_options.add_argument("--headless=new")


# Settings for crawling


In [50]:
class SearchWhere(Enum):
        Title = 1
        Abstract = 2
        TitleAbstract = 3 #Keywords have to be in Title OR Abstract
        Anywhere = 4
        Fulltext = 5
class Library(Enum):
        IEEE = 1
        ACM = 2
        ScienceDirect = 3

year_min = 2015 # Set to earliest year which should be crawled
year_max = 2024 # Set to latest year whichh should be crawled

# Set random seed
random.seed(5)


# Setup for crawler

### function to crawl: crawl(keywords, LIBRARY, titlesearch)


In [51]:
# Define read and write permissions (in octal notation)
directory_path = os.getcwd()+ '/acm-query-dl'
new_permissions = 644  # read and write
os.chmod(directory_path, new_permissions)

def setupCrawler(dl_folder):
    options = webdriver.ChromeOptions()
    options.add_argument('window-size=1920,1080')
    dl = os.getcwd() + '/acm-query-dl'
    p = {"download.default_directory": dl}
    options.add_experimental_option("prefs", p)
    op = webdriver.ChromeOptions()
    driver = webdriver.Chrome(options=options)
    print("Driver setup complete.")
    return driver

def crawl(keywords_list, library, searchWhere):
    print(f"Start crawling {library}")
    if library == Library.ACM:
        keywords = [[item.replace(" ", "+") for item in keywords] for keywords in keywords_list]
        saveACMBib(keywords, Library.ACM, searchWhere)
    else:
        print(f"Library {library} not yet supported")

def getURL(keywords, library, searchWhere, concatentation="AND"):
    URL = ""
    search = ""
    if library == Library.ACM:
        titleSearch = "doSearch?AllField="
        for i, keyword in enumerate(keywords):
            search += f"%22{keyword}%22"
            if (i < len(keywords)-1):
                search += f"+{concatentation}+"
        match searchWhere:
            case SearchWhere.Title:
                print("Searching ACM for title only")
                titleSearch = f"doSearch?fillQuickSearch=false&expand=dl&field1=Title&text1={search}"
            case SearchWhere.Abstract:
                print("Searching ACM for abstract only")
                titleSearch = f"doSearch?fillQuickSearch=false&expand=dl&field1=Abstract&text1={search}"
            case SearchWhere.TitleAbstract:
                print("ACM does not support searching for keywords in Title OR Abstract. Please use Title and Abstract search seperately.")
            case SearchWhere.Anywhere:
                print("Searching ACM for anywhere")
                titleSearch=f"doSearch?fillQuickSearch=false&target=advanced&expand=dl&field1=AllField&text1={search}"
            case SearchWhere.Fulltext:
                titleSearch=f"doSearch?fillQuickSearch=false&target=advanced&expand=dl&field1=Fulltext&text1={search}"
        URL = f"https://dl.acm.org/action/{titleSearch}&SeriesKeyAnd=imwut&startPage=0&pageSize="
        return URL

    else:
        print(f"Library {library} not yet supported")
    return URL

# ACM


In [52]:
# def loadACMBib(toOpen, driver):
#     driver.get(toOpen)#put here the adress of your page
#     # delay = 3 # seconds
    
#     try: 
#         # Only accept necessary cookies to resolve cookie popup
#         driver.find_element(By.ID, "CybotCookiebotDialogBodyLevelButtonLevelOptinDeclineAll").click()
#     except: 
#         print("No cookie popup found, continuing.")
#         pass

#     # Wait for the cookie dialog to disappear
#     WebDriverWait(driver, 10).until(EC.invisibility_of_element((By.ID, "CybotCookiebotDialogBodyContent")))

#     #iterate over middle navbar to see if query found paper results or only people
#     driver.find_element(by=By.CLASS_NAME, value="item-results__checkbox").click()
#     time.sleep(5 + random.random())
#     driver.find_element(by=By.CLASS_NAME, value="item-results__buttons.visible").find_elements(by=By.XPATH, value=".//*")[0].click()
#     time.sleep(20 + random.random())
    
#     # Instead, click "All Results"
#     driver.find_element(by=By.ID, value="allResults").click()
#     time.sleep(15 + random.random())
#     driver.find_element(by=By.CLASS_NAME, value="downloadBtn").click()
#     time.sleep(40 + random.random())

#     # Wait for the popup and the new download button to be visible
#     export_div = WebDriverWait(driver, 60).until(EC.visibility_of_element_located((By.XPATH, '//*[@id="exportDownloadReady"]/div[2]')))

#     # Find the 'a' element inside the located div, which contains our href of interest
#     download_now_button = export_div.find_element(By.XPATH, './/a[@class="btn blue searchCiteExport-popup__close pull-right"]')

#     # Scroll into view and click the 'a' element
#     driver.execute_script("arguments[0].scrollIntoView();", download_now_button)
#     download_now_button.click()
#     time.sleep(2 + random.random())

def saveACMOA(toOpen, driver):
    '''Create CSV of open access articles by DOI'''
    driver.get(toOpen)#put here the adress of your page
    # delay = 3 # seconds
    
    try: 
        # Only accept necessary cookies to resolve cookie popup
        driver.find_element(By.ID, "CybotCookiebotDialogBodyLevelButtonLevelOptinDeclineAll").click()
    except: 
        print("No cookie popup found, continuing.")
        pass

    # Wait for the cookie dialog to disappear
    WebDriverWait(driver, 10).until(EC.invisibility_of_element((By.ID, "CybotCookiebotDialogBodyContent")))

    # Find all elements with the class name `search__item issue-item-container`
    ul_element = driver.find_element(By.CLASS_NAME, 'search-result__xsl-body.items-results.rlist--inline')

    # Find all <li> elements within the <ul> element with the class name `search__item issue-item-container`
    li_elements = ul_element.find_elements(By.CLASS_NAME, 'search__item.issue-item-container')

    oa_list = []

    for element in li_elements:
        # Extract the DOI
        doi_element = element.find_element(By.CLASS_NAME, 'issue-item__doi')
        doi = doi_element.get_attribute('href').split('org/')[-1]

        # Check for access type
        restricted = False
        open_access = False

        try:
            element.find_element(By.CLASS_NAME, 'get-access')
            restricted = True
        except:
            pass

        try:
            element.find_element(By.CLASS_NAME, 'btn--icon.simple-tooltip__block--b.blue.btn')
            open_access = True
        except:
            pass

        # Append the extracted data to the list
        oa_list.append([doi, restricted, open_access])

    # Write the data to a CSV file
    with open('acm-query-dl/acm-opensource.csv', 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['DOI', 'Restricted', 'Open Access'])
        writer.writerows(oa_list)

    time.sleep(2 + random.random())
    
def saveACMBib(keywords_list, dl_folder, searchWhere = SearchWhere.Anywhere):
    driver = setupCrawler(dl_folder)
    for keywords in keywords_list:
        print(f"Search for: {keywords}")
        ACM_URL = getURL(keywords, Library.ACM, searchWhere)
        if DEBUG > 0: print(ACM_URL)
        driver.get(ACM_URL)#put here the adress of your page
        time.sleep(3 + random.random())
        navbar = driver.find_elements(by=By.CLASS_NAME, value="search-result__nav-container")
        navbar = navbar[0]
        navelements = navbar.find_elements(by=By.XPATH, value=".//*")
        foundResults = False
        for nav_element in navelements:
            if "RESULTS" in nav_element.text: 
                foundResults = True
        if foundResults == False: 
            print("Only people in results - next keyword")
            continue
        name = ""
        for word in keywords:
            name += f"{word}"
        match searchWhere:
            case SearchWhere.Title:
                name += "_TitleOnly" 
            case SearchWhere.Abstract:
                name += "_AbstractOnly" 
            case SearchWhere.TitleAbstract:
                print("Stopping")
                break 
            case SearchWhere.Anywhere:
                name += "_Anywhere"
            case SearchWhere.Fulltext:
                name += "_Fulltext"
        if DEBUG > 0: driver.save_screenshot(f"./acm-query-dl/acm-opensource_{name}.png")
        # get amount of results for for-loop

        try:
            results = driver.find_element(by=By.CLASS_NAME, value="result__count")
            results = results.text.split(" ")[0]
            if "," in results:
                results = results.replace(",", "")
            results = int(results)
            if results >= 1000:
                print(f'NOTE: More than 1000 search results for keyword, export citations will not be comprehensive: {keywords}')
        except NoSuchElementException:
            results = 0
        
        # Navigate to page
        print("New URL displaying all results:", ACM_URL+str(results))
        saveACMOA(ACM_URL+str(results), driver)
        try:
            source = os.getcwd() + '/acm-query-dl/acm-opensource.csv'
            destination = os.getcwd() + '/acm-query-dl/' + f'/acm-opensource_{name.replace("*","8")}.csv'

            shutil.copy(source, destination)  # Copy the file
            os.remove(source)  # Remove the original file
            print('REMOVED.')

        except FileNotFoundError:
            print("Only 1 bib entry in that file.")

# Implement Search
Input: CSV with columns: keywords, searchwhere

In [53]:
# # Clear directory acm-queries-dl and move all files to acm-queries-archive

# source_dir = os.getcwd() + '/acm-query-dl'
# target_dir = os.getcwd() + '/acm-query-archive'
    
# old_query_dl_files = os.listdir(source_dir)

# new_permissions = 0o777
# os.chmod(target_dir, new_permissions)

# for file_name in old_query_dl_files:
#     if not file_name.startswith('.'):
#         # If the file doesn't already exist, then move the file over
#         shutil.move(os.path.join(source_dir, file_name), os.path.join(target_dir, file_name))

In [54]:
queries_df = pd.read_csv('./acm-queries.csv')

searchtypes = queries_df['searchwhere'].unique()

for searchtype in searchtypes:
    temp_df = queries_df[queries_df['searchwhere'] == searchtype]
    keywords = [[keyword] for keyword in temp_df['keywords']]
    if searchtype == 'Title':
        crawl(keywords, Library.ACM, SearchWhere.Title)
    elif searchtype == 'Abstract':
        crawl(keywords, Library.ACM, SearchWhere.Abstract)
    elif searchtype == 'TitleAbstract':
        break
    elif searchtype == 'Anywhere':
        crawl(keywords, Library.ACM, SearchWhere.Anywhere)
    elif searchtype == 'Fulltext':
        crawl(keywords, Library.ACM, SearchWhere.Fulltext)
    else:
        print('Invalid search term.')

Start crawling Library.ACM
Driver setup complete.
Search for: ['dataset']
Searching ACM for title only
https://dl.acm.org/action/doSearch?fillQuickSearch=false&expand=dl&field1=Title&text1=%22dataset%22&SeriesKeyAnd=imwut&startPage=0&pageSize=
New URL displaying all results: https://dl.acm.org/action/doSearch?fillQuickSearch=false&expand=dl&field1=Title&text1=%22dataset%22&SeriesKeyAnd=imwut&startPage=0&pageSize=13
REMOVED.
Search for: ['survey']
Searching ACM for title only
https://dl.acm.org/action/doSearch?fillQuickSearch=false&expand=dl&field1=Title&text1=%22survey%22&SeriesKeyAnd=imwut&startPage=0&pageSize=
New URL displaying all results: https://dl.acm.org/action/doSearch?fillQuickSearch=false&expand=dl&field1=Title&text1=%22survey%22&SeriesKeyAnd=imwut&startPage=0&pageSize=5
No cookie popup found, continuing.
REMOVED.
Start crawling Library.ACM
Driver setup complete.
Search for: ['we+collect']
https://dl.acm.org/action/doSearch?fillQuickSearch=false&target=advanced&expand=dl&fie

Create a loop here that takes in a csv of:
keywords, where they are searched

it coverts the keywords to lists based on where they are searched and searches all of them, creating .bib files

it returns a list of keywords for which there were greater than 1000 results (need a script for this, can modify the original script that goes through pages)

ready to run the next part of the notebook, that adds them to columns. 


# Take union of all CSVs

In [75]:
# Path to dir
folder_path = './acm-query-dl/'

# List to store the dataframes
dataframes = []

# Iterate through the files in the folder
for filename in os.listdir(folder_path):
    # Check if the filename starts with 'acm-openaccess-'
    if filename.startswith('acm-opensource') and filename.endswith('.csv'):
        # Read the CSV file into a dataframe
        df = pd.read_csv(os.path.join(folder_path, filename))
        # Append the dataframe to the list
        dataframes.append(df)

# Concatenate all the dataframes
merged_df = pd.concat(dataframes, ignore_index=True)

# Drop duplicate rows
unique_df = merged_df.drop_duplicates()

# Check for violations
violations = unique_df[(unique_df['Restricted'] == unique_df['Open Access'])]

if not violations.empty:
    print("Rows with violations found:")
    print(violations)
else:
    print("No violations found.")

# If no violations, delete "restrictd column"
unique_df = unique_df.rename(columns={'DOI': 'id'})
unique_df.drop(['Restricted'], inplace=True, axis=1)

print(unique_df.columns, unique_df.head())

# Load the master CSV
master_imwut_df = pd.read_csv('master_imwut.csv')['id']

# Perform a left join on the "DOI" column
combined_df = pd.merge(master_imwut_df, unique_df, on='id', how='left')
combined_df.fillna(value="NaN", inplace=True)

# Save the combined DataFrame to a new CSV file
combined_df.to_csv('master_imwut_oa_id.csv', index=False)

print('Final combined CSV file created successfully.')

No violations found.
Index(['id', 'Open Access'], dtype='object')                 id  Open Access
0  10.1145/3678591        False
1  10.1145/3610891         True
2  10.1145/3643541         True
3  10.1145/3678577        False
4  10.1145/3569478        False
Final combined CSV file created successfully.


# Append search queries to existing CSV
Input: directory of bibtex files, csv on which to append the bibtex files.
Output: csv with appended columns for each bib file

In [236]:
# import os
# import pandas as pd
# from pybtex.database.input import bibtex
# from pybtex.database import parse_file as parser
# import time

# # Path to the dir
# directory_path = os.getcwd()+ '/acm-query-dl'
# csv_directory_path = os.getcwd() + '/'
# csv_filename = 'master_imwut.csv'

# # Open CSV as dataframe
# df = pd.read_csv(csv_directory_path + csv_filename)

# # Iterate through all files in the directory, which contains search queries
# for filename in os.listdir(directory_path):
#     if filename.endswith('.bib'):
#         file_path = os.path.join(directory_path, filename)
#         print(file_path)

#         # Read the file with pybtex parser
#         parser = bibtex.Parser()
#         bib_data = parser.parse_file(file_path)
#         bib_dois = [key for (key,value) in bib_data.entries.items()]
#         print(f'Successfully parsed: {filename}, writing to csv.')

#         # Add column to df
#         colname = filename.split('.')[0]
#         df[colname] = df['id'].isin(bib_dois)
    
# # Write the updated DataFrame back to a new CSV file
# timestr = time.strftime("%Y%m%d-%H%M%S")
# # df.to_csv(csv_directory_path + 'query-modified-masters/' + csv_filename.split('.')[0] + f'_modified_{timestr}.csv', index=False)

# # Copy queries
# # queries_df.to_csv(csv_directory_path + 'query-modified-masters/' + 'acm-queries' + f'_{timestr}.csv', index=False)

/Users/audreyxychang/Documents/hci-lab/acm-query-dl/acm_we+collect_Fulltext.bib
Successfully parsed: acm_we+collect_Fulltext.bib, writing to csv.
/Users/audreyxychang/Documents/hci-lab/acm-query-dl/acm_training+set_Fulltext.bib
Successfully parsed: acm_training+set_Fulltext.bib, writing to csv.
/Users/audreyxychang/Documents/hci-lab/acm-query-dl/acm_pretrain_Fulltext.bib
Successfully parsed: acm_pretrain_Fulltext.bib, writing to csv.
/Users/audreyxychang/Documents/hci-lab/acm-query-dl/acm_we+trained_Fulltext.bib
Successfully parsed: acm_we+trained_Fulltext.bib, writing to csv.
/Users/audreyxychang/Documents/hci-lab/acm-query-dl/acm_we+annotated_Fulltext.bib
Successfully parsed: acm_we+annotated_Fulltext.bib, writing to csv.
/Users/audreyxychang/Documents/hci-lab/acm-query-dl/acm_LLM_AbstractOnly.bib
Successfully parsed: acm_LLM_AbstractOnly.bib, writing to csv.
/Users/audreyxychang/Documents/hci-lab/acm-query-dl/acm_pre-trained_Fulltext.bib
Successfully parsed: acm_pre-trained_Fulltext

In [55]:
# Iterate through all CSVs, and make a column of open access

# Check that there is no overlap between the restricted and open access columns

# Generate a CSV of all the doi's and their restricted / open access statuses

Unnamed: 0,DOI,Restricted,Open Access
0,10.1145/3678505,False,True
1,10.1145/3678593,False,True
2,10.1145/3678517,False,True
3,10.1145/3678577,True,False
4,10.1145/3659594,False,True
...,...,...,...
541,10.1145/3264956,True,False
542,10.1145/3161196,True,False
543,10.1145/3130954,False,True
544,10.1145/3214274,True,False
