# Collecting the arXiv publications related to COVID-19

The data on COVID-19-related publications were collected from the [arXiv COVID-19 search page](https://arxiv.org/covid19search).

In [None]:
# Importing the required libraries.
import scrapy, re, csv, pandas as pd
from scrapy.crawler import CrawlerProcess
from scrapy import Selector

## 1. Getting the data from its URL

In [None]:
# Determining the URL of the target page.
# This is the first page requested by the spider; further pages are reached
# from it through the "Next" pagination link.
url = "https://arxiv.org/covid19search"

In [None]:
# Creating the repository of data.
# Module-level list to which the spider appends one dict per paper; it is
# later counted and exported to a CSV file.
data = []

In [None]:
# Definition of Spider class.
class SpiderArXiv(scrapy.Spider):
    """Scrapy spider that collects the papers listed on the arXiv COVID-19 search.

    Starting from the module-level ``url``, it parses every result page,
    appends one dict per paper to the module-level ``data`` list and then
    follows the "Next" pagination link until no such link is found.
    """

    name = "arXiv_covid"

    # CSS selector of one paper entry within a result page.
    _PAPERS_CSS = "ol.breathe-horizontal > li.arxiv-result"

    # CSS selectors of the collected fields, relative to one paper entry.
    _FIELDS_CSS = {
        "id": "p.list-title > a::text",
        "subject_areas": "div.tags > span.tag::attr(data-tooltip)",
        "title": "p.title ::text",
        "authors": "p.authors > a::text",
        "abstract": "p.abstract > span.abstract-full ::text",
        "date": "p.is-size-7::text",
    }

    def start_requests(self):
        """Yield the request for the first result page."""
        yield scrapy.Request(
            url=url,
            callback=self.parse_paper,
            cb_kwargs={"css": self._PAPERS_CSS},
        )

    @staticmethod
    def _joined_text(paper, css):
        """Return the text matched by ``css`` in ``paper`` as one stripped line."""
        return "".join(paper.css(css).extract()).strip().replace("\n", "")

    def parse_paper(self, response, css):
        """Store the papers of one result page and follow the "Next" link.

        Args:
            response: Response of a result page.
            css: CSS selector matching the paper entries in ``response``.

        Yields:
            The request for the next result page, when a "Next" link exists.
        """
        fields = self._FIELDS_CSS

        # Each matched entry is queried directly; serialising it to HTML and
        # parsing it again is not needed to run the field selectors.
        for paper in response.css(css):
            # The "△ Less" toggle label is dropped before collapsing the
            # whitespace and stripping, so the blank that preceded the label
            # does not remain at the end of the abstract.
            abstract = "".join(paper.css(fields["abstract"]).extract())
            abstract = re.sub(r"\s+", " ", abstract.replace("△ Less", "")).strip()

            data.append({
                "id": paper.css(fields["id"]).extract_first(),
                "subject_areas": paper.css(fields["subject_areas"]).extract(),
                "title": self._joined_text(paper, fields["title"]),
                "authors": paper.css(fields["authors"]).extract(),
                "abstract": abstract,
                "date": self._joined_text(paper, fields["date"]),
            })

        # Extracting the URL within the button "Next".
        link = response.css("a.pagination-next::attr(href)").extract_first()

        # Getting the list of papers contained in the next page, reusing the
        # same entry selector that was used for the current page.
        if link:
            yield response.follow(
                url=link,
                callback=self.parse_paper,
                cb_kwargs={"css": css},
            )

In [None]:
# Executing the spider.
# A crawler process is created without custom settings, the spider class is
# scheduled on it, and the crawl is started.
# NOTE(review): `process.start()` presumably blocks until the crawl has ended
# and may not be restartable within the same kernel — confirm before
# re-running this cell.
process = CrawlerProcess()
process.crawl(SpiderArXiv)
process.start()

In [None]:
# Reporting how many paper records the crawl has stored.
print(f"Number of records collected: {len(data)}.")

## 2. Saving the data collected

In [None]:
# Writing the collected records to the raw-data CSV file: the row index is
# left out and every field is quoted.
pd.DataFrame(data).to_csv(
    "../../data/raw/arxiv_raw.csv",
    index=False,
    quoting=csv.QUOTE_ALL,
)