# Collecting the arXiv publications related to COVID-19

The publication data were collected from the [arXiv COVID-19 search page](https://arxiv.org/covid19search), which lists the arXiv papers related to COVID-19.

In [1]:
# Importing the required libraries.
import scrapy, re, csv, pandas as pd
from scrapy.crawler import CrawlerProcess
from scrapy import Selector

## 1. Getting the data from its URL

In [2]:
# URL of the target page (the arXiv COVID-19 search listing).
# The spider issues its first request to this address.
url = "https://arxiv.org/covid19search"

In [3]:
# Creating the repository of data.
# Module-level list that accumulates the scraped records: the spider appends
# one dict per paper to it while the crawl runs.
data = []

In [4]:
# Definition of the Spider class.
class SpiderArXiv(scrapy.Spider):
    """Spider that collects the paper listing from the arXiv COVID-19 search.

    The crawl starts at the module-level ``url``, follows every pagination
    link found on that page, and appends one dict per listed paper to the
    module-level ``data`` list. Each record holds the keys ``id``,
    ``subject_areas``, ``title``, ``authors``, ``abstract`` and ``date``.
    """

    name = "arXiv_covid"

    def start_requests(self):
        """Request the landing page and pass it on to ``parse_links``.

        Yields:
            A single ``scrapy.Request`` for ``url``; the CSS selector used to
            locate the pagination links is forwarded through ``cb_kwargs``.
        """
        # Getting the URLs of papers' list.
        args = {"css": "ul.pagination-list > li > a.pagination-link::attr(href)"}
        yield scrapy.Request(url=url, callback=self.parse_links, cb_kwargs=args)

    def parse_links(self, response, css):
        """Follow every pagination link found in ``response``.

        Args:
            response: Response of the landing page.
            css: CSS selector that yields the ``href`` of each pagination link.

        Yields:
            One request per extracted link, handled by ``parse_paper``.
        """
        # Extracting the relative URLs.
        links = response.css(css).getall()

        # Getting the list of papers contained in each page.
        args = {"css": "ol.breathe-horizontal > li.arxiv-result"}
        for link in links:
            yield response.follow(url=link, callback=self.parse_paper, cb_kwargs=args)

    def parse_paper(self, response, css):
        """Extract one record per paper listed in ``response``.

        Args:
            response: Response of one page of the paper listing.
            css: CSS selector that matches the element of each listed paper.

        Every record is appended to the module-level ``data`` list; nothing
        is returned or yielded.
        """
        # Creating the list of CSS Selector.
        css_list = {"id": "p.list-title > a::text",
                    "subject_areas": "div.tags > span.tag::attr(data-tooltip)",
                    "title": "p.title ::text",
                    "authors": "p.authors > a::text",
                    "abstract": "p.abstract > span.abstract-full ::text",
                    "date": "p.is-size-7::text"}

        # Extracting the data from paper's HTML. The nested selectors are
        # queried directly, so each paper does not have to be serialised to a
        # string and parsed a second time.
        for paper in response.css(css):
            record = {}
            record["id"] = paper.css(css_list["id"]).get()
            record["subject_areas"] = paper.css(css_list["subject_areas"]).getall()
            record["title"] = "".join(
                paper.css(css_list["title"]).getall()).strip().replace("\n", "")
            record["authors"] = paper.css(css_list["authors"]).getall()

            # "△ Less" is presumably the label of the link that collapses the
            # full abstract. It is removed BEFORE whitespace is collapsed and
            # stripped; stripping first would leave the whitespace that
            # preceded the label as a trailing space in the stored abstract.
            abstract = "".join(
                paper.css(css_list["abstract"]).getall()).replace("△ Less", "")
            record["abstract"] = re.sub(r"\s+", " ", abstract).strip()

            record["date"] = "".join(
                paper.css(css_list["date"]).getall()).strip().replace("\n", "")
            data.append(record)

In [5]:
# Executing the spider.
# A CrawlerProcess is created without custom settings, the spider class is
# scheduled on it, and the crawl is started. The records parsed by the spider
# end up in the module-level `data` list.
# NOTE(review): process.start() presumably blocks until the crawl finishes,
# and the underlying Twisted reactor likely cannot be started twice, so
# re-running this cell may require a kernel restart — confirm against the
# Scrapy documentation.
process = CrawlerProcess()
process.crawl(SpiderArXiv)
process.start()

2020-07-02 20:45:36 [scrapy.utils.log] INFO: Scrapy 1.7.3 started (bot: scrapybot)
2020-07-02 20:45:36 [scrapy.utils.log] INFO: Versions: lxml 4.4.0.0, libxml2 2.9.9, cssselect 1.0.3, parsel 1.5.1, w3lib 1.20.0, Twisted 19.2.1, Python 3.6.10 |Anaconda, Inc.| (default, Mar 25 2020, 23:51:54) - [GCC 7.3.0], pyOpenSSL 19.1.0 (OpenSSL 1.1.1f  31 Mar 2020), cryptography 2.8, Platform Linux-4.15.0-76-generic-x86_64-with-debian-buster-sid
2020-07-02 20:45:36 [scrapy.crawler] INFO: Overridden settings: {}
2020-07-02 20:45:36 [scrapy.extensions.telnet] INFO: Telnet Password: 4dbe84ab3189d520
2020-07-02 20:45:36 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.logstats.LogStats']
2020-07-02 20:45:36 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downlo

In [6]:
# Reporting how many paper records the crawl gathered.
n_records = len(data)
print("Number of records collected: {}.".format(n_records))

Number of records collected: 1000.


## 2. Saving the data collected

In [7]:
# Building a DataFrame from the collected records and writing it to the
# raw-data CSV file, with every field quoted and without the index column.
papers_raw = pd.DataFrame(data)
papers_raw.to_csv("../../data/raw/arxiv_raw.csv", quoting=csv.QUOTE_ALL, index=False)