# ACM CRAWLER
Jupyter Notebook to crawl the ACM digital library for publications containing links to repositories hosted on GitHub  
To run the crawl a second time, restart the kernel first; otherwise Scrapy raises a ReactorNotRestartable error (https://github.com/scrapy/scrapy/issues/2594).

In [None]:
import re
import yaml
import scrapy
import time
import w3lib.html
from datetime import datetime
from pymongo import MongoClient
from scrapy.crawler import CrawlerProcess

Crawling started on 30.09.2020.  
ACM limits every search to 2000 results, so nothing beyond start page 40 (at 50 results per page) can be crawled. Past that limit the site shows this notice:  
"In order to show you the most relevant results, 
we have omitted some entries very similar to the 2000 already displayed.
Please refine the results either via the facet filters on the left or edit 
your search in Advanced Search via the button at the top."  
To keep the number of publications per query manageable,
the search period from 2008 to 2020 is split into
search intervals. Depending on the number of results,
at least one start URL is specified for each year.  
0 Results for: [All: github.com] AND [Publication Date: (01/01/1936 TO 12/31/2007)] (therefore search starts in 2008)  
results for 2008 to 2012: 586 (crawled: 586, total: 586)  
results for 2013: 705 (crawled: 705, total: 1291)  
results for 2014: 1,163 (crawled: 1163, total: 2454)  
results for 2015: 1,900 (crawled: 1900, total: 4354)  
results for 01.2016 to 06.2016: 1,228 (crawled: 1228, total: 5582)  
results for 07.2016 to 09.2016: 740 (crawled: 740, total: 6322)  
results for 10.2016 to 12.2016: 855 (crawled: 855, total: 7177)  
results for 01.2017 to 03.2017: 534 (crawled: 534, total: 7711)  
results for 04.2017 to 06.2017: 912 (crawled: 912, total: 8623)  
results for 07.2017 to 09.2017: 1,099 (crawled: 1099, total: 9722)  
results for 10.2017 to 12.2017: 1,175  (crawled: 1175, total: 10897)  
results for 01.2018 to 03.2018: 648 (crawled: 648, total: 11545)  
results for 04.2018 to 06.2018: 1,201 (crawled: 1201, total: 12746)  
results for 07.2018 to 09.2018: 1,106 (crawled: 1106, total: 13852)  
results for 10.2018 to 12.2018: 1,364 (crawled: 1364, total: 15216)  
results for 01.2019 to 03.2019: 700 (crawled: 700, total: 15916)  
results for 04.2019 to 06.2019: 1,415 (crawled: 1415, total: 17331)  
results for 07.2019 to 09.2019: 1,353 (crawled: 1353, total: 18684)  
results for 10.2019 to 12.2019: 1,239 (crawled: 1239, total: 19923)  
results for 01.2020 to 03.2020: 653 (crawled: 653, total: 20576)  
results for 04.2020 to 06.2020: 1,163 (crawled: 1163, total: 21739)  
results for 07.2020 to 09.2020: 1,064 (crawled: 1064, total: 22803)  
results for 10.2020: 22 (crawled: 22, total: 22825)  
results for 10.2020: 818 (crawled: 818, total: 23643)  
results for 11.2020 to 12.2020: 597 (crawled: 597, total: 24240)  
total results: 22,852 (note: the running total of crawled entries listed above ends at 24,240)  

In [None]:
class SpiderACM(scrapy.Spider):
    """Crawl ACM Digital Library search result pages that mention github.com.

    Every result entry (``div.issue-item__content``) on a search page is
    turned into one document and inserted into the module-level MongoDB
    collection ``db`` (created in a later notebook cell). The spider then
    follows the "next" pagination link until start page 40 is reached.

    The URLs to crawl are listed in ``start_urls``; already processed queries
    are kept there as comments to document the crawl history.
    """

    name = 'SpiderACM'

    # Kept as a class attribute for backward compatibility. Scrapy itself does
    # not read an upper-case spider attribute, so the value is also passed on
    # through ``custom_settings`` below, where it actually takes effect.
    USER_AGENT = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:55.0) Gecko/20100101 Firefox/80.0'

    custom_settings = {
        'METAREFRESH_ENABLED': False,
        'USER_AGENT': USER_AGENT,
        # Wait a fixed 180 seconds between consecutive requests. Letting the
        # downloader do the waiting avoids calling time.sleep() inside the
        # callback, which would block the whole Twisted reactor.
        'DOWNLOAD_DELAY': 180,
        'RANDOMIZE_DOWNLOAD_DELAY': False,
    }

    # Line-wrap artefact found inside titles and journal references.
    _WRAP_ARTIFACT = '\n                     '

    start_urls = [
        # NOTE: every entry ends with a comma so that several URLs can be
        # uncommented at once without being silently concatenated.
        # processed:
        # 'https://dl.acm.org/action/doSearch?AllField=github.com&pageSize=50&AfterYear=2008&BeforeYear=2012&expand=all',
        # 'https://dl.acm.org/action/doSearch?fillQuickSearch=false&expand=all&AfterMonth=1&AfterYear=2013&BeforeMonth=12&BeforeYear=2013&AllField=github.com&startPage=0&pageSize=50',
        # 'https://dl.acm.org/action/doSearch?fillQuickSearch=false&expand=all&AfterMonth=1&AfterYear=2014&BeforeMonth=12&BeforeYear=2014&AllField=github.com&startPage=0&pageSize=50',
        # 'https://dl.acm.org/action/doSearch?fillQuickSearch=false&expand=all&AfterMonth=1&AfterYear=2015&BeforeMonth=12&BeforeYear=2015&AllField=github.com&startPage=0&pageSize=50',
        # last site with 50 results was not indicated on page 36, therefore the following extra request:
        # 'https://dl.acm.org/action/doSearch?fillQuickSearch=false&expand=all&AfterMonth=1&AfterYear=2015&BeforeMonth=12&BeforeYear=2015&AllField=github.com&startPage=37&pageSize=50',
        # 'https://dl.acm.org/action/doSearch?fillQuickSearch=false&expand=all&AfterMonth=1&AfterYear=2016&BeforeMonth=6&BeforeYear=2016&AllField=github.com&pageSize=50',
        # 'https://dl.acm.org/action/doSearch?fillQuickSearch=false&expand=all&AfterMonth=7&AfterYear=2016&BeforeMonth=9&BeforeYear=2016&AllField=github.com&startPage=0&pageSize=50',
        # 'https://dl.acm.org/action/doSearch?fillQuickSearch=false&expand=all&AfterMonth=10&AfterYear=2016&BeforeMonth=12&BeforeYear=2016&AllField=github.com&startPage=0&pageSize=50',
        # 'https://dl.acm.org/action/doSearch?fillQuickSearch=false&expand=all&AfterMonth=1&AfterYear=2017&BeforeMonth=3&BeforeYear=2017&AllField=github.com&startPage=0&pageSize=50',
        # 'https://dl.acm.org/action/doSearch?fillQuickSearch=false&expand=all&AfterMonth=4&AfterYear=2017&BeforeMonth=6&BeforeYear=2017&AllField=github.com&startPage=0&pageSize=50',
        # 'https://dl.acm.org/action/doSearch?fillQuickSearch=false&expand=all&AfterMonth=7&AfterYear=2017&BeforeMonth=9&BeforeYear=2017&AllField=github.com&startPage=0&pageSize=50',
        # 'https://dl.acm.org/action/doSearch?fillQuickSearch=false&expand=all&AfterMonth=10&AfterYear=2017&BeforeMonth=12&BeforeYear=2017&AllField=github.com&startPage=0&pageSize=50',
        # 'https://dl.acm.org/action/doSearch?fillQuickSearch=false&expand=all&AfterMonth=1&AfterYear=2018&BeforeMonth=3&BeforeYear=2018&AllField=github.com&startPage=0&pageSize=50',
        # 'https://dl.acm.org/action/doSearch?fillQuickSearch=false&expand=all&AfterMonth=4&AfterYear=2018&BeforeMonth=6&BeforeYear=2018&AllField=github.com&startPage=0&pageSize=50',
        # 'https://dl.acm.org/action/doSearch?fillQuickSearch=false&expand=all&AfterMonth=7&AfterYear=2018&BeforeMonth=9&BeforeYear=2018&AllField=github.com&startPage=0&pageSize=50',
        # 'https://dl.acm.org/action/doSearch?fillQuickSearch=false&expand=all&AfterMonth=10&AfterYear=2018&BeforeMonth=12&BeforeYear=2018&AllField=github.com&startPage=0&pageSize=50',
        # 'https://dl.acm.org/action/doSearch?fillQuickSearch=false&expand=all&AfterMonth=1&AfterYear=2019&BeforeMonth=3&BeforeYear=2019&AllField=github.com&startPage=0&pageSize=50',
        # last page not in pagination
        # 'https://dl.acm.org/action/doSearch?fillQuickSearch=false&expand=all&AfterMonth=1&AfterYear=2019&BeforeMonth=3&BeforeYear=2019&AllField=github.com&startPage=13&pageSize=50',
        # 'https://dl.acm.org/action/doSearch?fillQuickSearch=false&expand=all&AfterMonth=4&AfterYear=2019&BeforeMonth=6&BeforeYear=2019&AllField=github.com&startPage=0&pageSize=50',
        # 'https://dl.acm.org/action/doSearch?fillQuickSearch=false&expand=all&AfterMonth=7&AfterYear=2019&BeforeMonth=9&BeforeYear=2019&AllField=github.com&startPage=0&pageSize=50',
        # 'https://dl.acm.org/action/doSearch?fillQuickSearch=false&expand=all&AfterMonth=10&AfterYear=2019&BeforeMonth=12&BeforeYear=2019&AllField=github.com&startPage=0&pageSize=50',
        # 'https://dl.acm.org/action/doSearch?fillQuickSearch=false&expand=all&AfterMonth=1&AfterYear=2020&BeforeMonth=3&BeforeYear=2020&AllField=github.com&startPage=0&pageSize=50',
        # 'https://dl.acm.org/action/doSearch?fillQuickSearch=false&expand=all&AfterMonth=4&AfterYear=2020&BeforeMonth=6&BeforeYear=2020&AllField=github.com&startPage=0&pageSize=50',
        # 'https://dl.acm.org/action/doSearch?fillQuickSearch=false&expand=all&AfterMonth=7&AfterYear=2020&BeforeMonth=9&BeforeYear=2020&AllField=github.com&startPage=0&pageSize=50',
        # 'https://dl.acm.org/action/doSearch?fillQuickSearch=false&expand=all&AfterMonth=10&AfterYear=2020&BeforeMonth=10&BeforeYear=2020&AllField=github.com&startPage=0&pageSize=50',
        # 'https://dl.acm.org/action/doSearch?fillQuickSearch=false&expand=all&AfterMonth=10&AfterYear=2020&BeforeMonth=10&BeforeYear=2020&AllField=github.com&startPage=0&pageSize=50',
        # 'https://dl.acm.org/action/doSearch?fillQuickSearch=false&expand=all&AfterMonth=11&AfterYear=2020&BeforeMonth=12&BeforeYear=2020&AllField=github.com&startPage=0&pageSize=50',
        # next:
    ]

    def parse(self, response):
        """Store every result on a search page and follow the next page.

        Args:
            response: Scrapy response of one ACM search result page.

        Yields:
            At most one follow-up request for the next result page. No request
            is yielded when there is no "next" link or when that link points
            to start page 40.
        """
        for sel in response.xpath('//div[has-class("issue-item__content")]'):
            # no duplicate check as titles and dois may be None
            db.insert_one(self._build_post(sel))

        next_page = response.xpath('//a[has-class("pagination__btn--next")]/@href').get()
        if next_page is not None and 'startPage=40' not in next_page:
            # Pacing between pages is handled by DOWNLOAD_DELAY (see custom_settings).
            yield response.follow(next_page, callback=self.parse)

    def _unwrap(self, text):
        """Replace the line-wrap artefact in *text* by a single space; pass None/'' through."""
        if text:
            return text.replace(self._WRAP_ARTIFACT, ' ')
        return text

    def _build_post(self, sel):
        """Build the MongoDB document for one search result entry.

        Args:
            sel: selector of one ``div.issue-item__content`` element.

        Returns:
            dict with the publication metadata. Fields that are not present in
            the entry are None, an empty string or an empty list; a missing
            title link no longer aborts the page.
        """
        title = self._unwrap(sel.xpath('.//span[has-class("hlFld-Title")]/a/text()').get())
        journal_ref = self._unwrap(sel.xpath('.//span[has-class("epub-section__title")]/text()').get())

        # The title link may be absent; keep the entry and store None instead
        # of failing on None.replace() / str + None.
        href = sel.xpath('.//span[has-class("hlFld-Title")]/a/@href').get()
        doi_from_link = href.replace('/doi/', '') if href else None
        link = 'https://dl.acm.org' + href if href else None

        published = sel.xpath('.//span[has-class("dot-separator")]/span[1]/text()').get()
        if published:
            published = published.replace(', ', '')

        pdf_url = sel.xpath('.//a[has-class("btn--icon simple-tooltip__block--b red btn")]/@href').get()
        if pdf_url:
            pdf_url = 'https://dl.acm.org' + pdf_url

        doi = ''
        doi_url = sel.xpath('.//a[has-class("issue-item__doi dot-separator")]/text()').get()
        if doi_url:
            doi_url = w3lib.html.remove_tags(doi_url).replace("\n", "")
            # Everything after the last 'doi.org/' is the DOI; leave it empty
            # when the text does not contain that marker.
            _, marker, tail = doi_url.rpartition('doi.org/')
            if marker:
                doi = tail

        # abstract segment from the highlights box (first paragraph only)
        abstract = sel.xpath('.//div[has-class("abstract-text")]/p').get()
        summary = w3lib.html.remove_tags(abstract) if abstract else ''

        # full_text segment from the highlights box (all paragraphs joined)
        full_text = ''.join(sel.xpath('.//div[has-class("full-text")]/p').getall())
        if full_text:
            full_text = w3lib.html.remove_tags(full_text)

        return {
            'source': 'acm',
            'request_date': datetime.now(),
            'arxiv_id': None,
            'doi': doi,
            'doi_from_link': doi_from_link,
            'title': title,
            'published': published,
            'updated': None,
            'url': link,
            'doi_url': doi_url,
            'pdf_url': pdf_url,
            'primary_category': None,
            'all_categories': None,
            'journal_ref': journal_ref,
            'total_citations': sel.xpath('.//span[has-class("citation")]/span/text()').getall(),
            'total_downloads': sel.xpath('.//span[has-class("metric")]/span/text()').getall(),
            'summary': summary,
            'full_text_extract': full_text,
        }

In [None]:
# Load the crawler configuration; it names the MongoDB database and the
# collection that receive the crawled publications.
with open("config.yaml", 'r', encoding='utf-8') as stream:
    try:
        params = yaml.safe_load(stream)
    except yaml.YAMLError as exc:
        # Report the parse problem, then re-raise: carrying on would only fail
        # on the next statement with a misleading NameError for `params`.
        print(exc)
        raise

# Dictionary-style access works for any database/collection name, including
# names that are not valid Python identifiers or that collide with attributes
# of the client/database objects.
db = MongoClient()[params['database']['name']][params['database']['collections']['publications']]

In [None]:
# Create a crawler process, register SpiderACM with it and start the crawl.
# As noted at the top of this notebook, running the crawl a second time
# requires a kernel restart (ReactorNotRestartable is raised otherwise).
process = CrawlerProcess()
process.crawl(SpiderACM)
process.start()