# arXiv Analyzing
### Purpose: 
1. Discover trending research topics
2. Discover how papers are related/build on each other (evolution of a topic)
3. Discover open questions/things to research/missing pieces of research puzzle
4. For fun!

### Mechanics:
Based on Tim Head's code on "Analysing the arXiv" http://betatim.github.io/posts/analysing-the-arxiv/


In [215]:
# Import stuff
import re
import time
import urllib.request
import requests
import urllib
import datetime
import feedparser
feedparser._FeedParserMixin.namespaces['http://a9.com/-/spec/opensearch/1.1/'] = 'opensearch'
feedparser._FeedParserMixin.namespaces['http://arxiv.org/schemas/atom'] = 'arxiv'
import xml.etree.ElementTree as ET

import matplotlib.pylab as plt
import pandas as pd
pd.set_option('mode.chained_assignment','warn')
import numpy as np

In [258]:
# Build up dataframe of paper titles, id's, abstracts, created dates, cateogry, and doi

OAI = "{http://www.openarchives.org/OAI/2.0/}"
ARXIV = "{http://arxiv.org/OAI/arXiv/}"

#Harvest from quantum physics
def harvest(beginDate,endDate):

    arxiv="physics:quant-ph"
    df = pd.DataFrame(columns=("title", "abstract", "authors", "categories", "created", "id", "doi"))
    base_url = "http://export.arxiv.org/oai2?verb=ListRecords&"
    url = (base_url +
           "from="+beginDate+"&until="+endDate+"&" +
           "metadataPrefix=arXiv&set=%s"%arxiv)

    while True:
        print("fetching", url)
        try:
            response = urllib.request.urlopen(url).read()

        except urllib.error.HTTPError as e:
            if e.code == 503:
                to = int(e.hdrs.get("retry-after", 30))
                print("Got 503. Retrying after {0:d} seconds.".format(to))

                time.sleep(to)
                continue

            else:
                raise

        root = ET.fromstring(response)

        for record in root.find(OAI+'ListRecords').findall(OAI+"record"):
            authors = []
            arxiv_id = record.find(OAI+'header').find(OAI+'identifier')
            meta = record.find(OAI+'metadata')
            info = meta.find(ARXIV+"arXiv")
            created = info.find(ARXIV+"created").text
            created = datetime.datetime.strptime(created, "%Y-%m-%d")
            categories = info.find(ARXIV+"categories").text
            try:
                for names in info[3]:
                    authors.append(names[1].text+' '+names[0].text)
            except:
                authors.append('')


            # if there is more than one DOI use the first one
            # often the second one (if it exists at all) refers
            # to an eratum or similar
            doi = info.find(ARXIV+"doi")
            if doi is not None:
                doi = doi.text.split()[0]

            contents = {'title': info.find(ARXIV+"title").text,
                        'id': info.find(ARXIV+"id").text,#arxiv_id.text[4:],
                        'abstract': info.find(ARXIV+"abstract").text.strip(),
                        'authors': authors,
                        'created': created,
                        'categories': categories.split(),
                        'doi': doi,
                        }

            df = df.append(contents, ignore_index=True)

        # The list of articles returned by the API comes in chunks of
        # 1000 articles. The presence of a resumptionToken tells us that
        # there is more to be fetched.
        token = root.find(OAI+'ListRecords').find(OAI+"resumptionToken")
        if token is None or token.text is None:
            break

        else:
            url = base_url + "resumptionToken=%s"%(token.text)
            
    return df

### Import from arXiv
Import all articles from quant-ph over date range

In [259]:
startDate = '2019-01-01'
endDate = datetime.date.today().__str__()

# Import dataset
df = harvest(startDate, endDate)

fetching http://export.arxiv.org/oai2?verb=ListRecords&from=2019-01-01&until=2020-07-21&metadataPrefix=arXiv&set=physics:quant-ph


IndexError: child index out of range

In [None]:
df.head(), df.tail()

In [None]:
# Save DataFrame
df.to_pickle('./arxiv_articles.pkl')

### Import from Saved

In [None]:
# Load DataFrame
df = pd.read_pickle('./arxiv_articles.pkl')

### Filter Articles
Pick interested articles based on keywords or authors.

In [28]:
authors = ['Monroe', 'Richerme', 'Hayes']
keywords = ['trapped ion', 'trapped ions', 'ion trap', 'ion traps']

np.sum(df['abstract'] == keywords[0])

0

In [31]:
df['author']

KeyError: 'author'