# arXiv Analyzing
### Purpose: 
1. Discover trending research topics
2. Discover how papers are related/build on each other (evolution of a topic)
3. Discover open questions/things to research/missing pieces of research puzzle
4. For fun!

### Mechanics:
Based on Tim Head's code on "Analysing the arXiv" http://betatim.github.io/posts/analysing-the-arxiv/


In [4]:
# Import stuff
import re
import time
import urllib.request
import requests
import urllib
import datetime
import feedparser
feedparser._FeedParserMixin.namespaces['http://a9.com/-/spec/opensearch/1.1/'] = 'opensearch'
feedparser._FeedParserMixin.namespaces['http://arxiv.org/schemas/atom'] = 'arxiv'
import xml.etree.ElementTree as ET

import matplotlib.pylab as plt
import pandas as pd
pd.set_option('mode.chained_assignment','warn')
import numpy as np

In [141]:
# Build up dataframe of paper titles, id's, abstracts, created dates, cateogry, and doi

OAI = "{http://www.openarchives.org/OAI/2.0/}"
ARXIV = "{http://arxiv.org/OAI/arXiv/}"
W3 = "{http://www.w3.org/2005/Atom}"

#Harvest from quantum physics
def harvest(beginDate,endDate):

    arxiv="physics:quant-ph"
    df = pd.DataFrame(columns=("title", "abstract", "categories", "created", "id", "doi"))
    base_url = "http://export.arxiv.org/oai2?verb=ListRecords&"
    url = (base_url +
           "from="+beginDate+"&until="+endDate+"&" +
           "metadataPrefix=arXiv&set=%s"%arxiv)
    
    while True:
        print("fetching", url)
        try:
            response = urllib.request.urlopen(url).read()
            
        except urllib.error.HTTPError as e:
            if e.code == 503:
                to = int(e.hdrs.get("retry-after", 30))
                print("Got 503. Retrying after {0:d} seconds.".format(to))

                time.sleep(to)
                continue
                
            else:
                raise

        root = ET.fromstring(response)

        '''for entry in feed.entries:
            arxiv_id = entry.id.split('/abs/')[-1]
            published = entry.published
            meta = record.find(OAI+'metadata')
            info = meta.find(ARXIV+"arXiv")
            created = info.find(ARXIV+"created").text
            created = datetime.datetime.strptime(created, "%Y-%m-%d")
            categories = info.find(ARXIV+"categories").text
            '''
            
        for record in root.find(OAI+'ListRecords').findall(OAI+"record"):
            arxiv_id = record.find(OAI+'header').find(OAI+'identifier')
            meta = record.find(OAI+'metadata')
            info = meta.find(ARXIV+"arXiv")
            created = info.find(ARXIV+"created").text
            created = datetime.datetime.strptime(created, "%Y-%m-%d")
            categories = info.find(ARXIV+"categories").text
            for author in record.find(W3+"entry").findall(W3+'author'):
                print(author.find(W3+'name').text)
        

            # if there is more than one DOI use the first one
            # often the second one (if it exists at all) refers
            # to an eratum or similar
            doi = info.find(ARXIV+"doi")
            if doi is not None:
                doi = doi.text.split()[0]
                
            contents = {'title': info.find(ARXIV+"title").text,
                        'id': info.find(ARXIV+"id").text,#arxiv_id.text[4:],
                        'abstract': info.find(ARXIV+"abstract").text.strip(),
                        'created': created,
                        'categories': categories.split(),
                        'doi': doi,
                        }

            df = df.append(contents, ignore_index=True)

        # The list of articles returned by the API comes in chunks of
        # 1000 articles. The presence of a resumptionToken tells us that
        # there is more to be fetched.
        token = root.find(OAI+'ListRecords').find(OAI+"resumptionToken")
        if token is None or token.text is None:
            break

        else:
            url = base_url + "resumptionToken=%s"%(token.text)
            
    return df

In [65]:
url = 'http://export.arxiv.org/api/query?search_query=all:electron&start=0&max_results=1'
data = urllib.request.urlopen(url).read()
root = ET.fromstring(data)
data

b'<?xml version="1.0" encoding="UTF-8"?>\n<feed xmlns="http://www.w3.org/2005/Atom">\n  <link href="http://arxiv.org/api/query?search_query%3Dall%3Aelectron%26id_list%3D%26start%3D0%26max_results%3D1" rel="self" type="application/atom+xml"/>\n  <title type="html">ArXiv Query: search_query=all:electron&amp;id_list=&amp;start=0&amp;max_results=1</title>\n  <id>http://arxiv.org/api/cHxbiOdZaP56ODnBPIenZhzg5f8</id>\n  <updated>2020-07-20T00:00:00-04:00</updated>\n  <opensearch:totalResults xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">160082</opensearch:totalResults>\n  <opensearch:startIndex xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">0</opensearch:startIndex>\n  <opensearch:itemsPerPage xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">1</opensearch:itemsPerPage>\n  <entry>\n    <id>http://arxiv.org/abs/cond-mat/0102536v1</id>\n    <updated>2001-02-28T20:12:09Z</updated>\n    <published>2001-02-28T20:12:09Z</published>\n    <title>Impact of Electron-Electron C

In [138]:
for author in root.find("{http://www.w3.org/2005/Atom}entry").findall('{http://www.w3.org/2005/Atom}author'):
    print(author.find('{http://www.w3.org/2005/Atom}name').text)

David Prendergast
M. Nolan
Claudia Filippi
Stephen Fahy
J. C. Greer


In [214]:
beginDate = '2020-05-01'
endDate = datetime.date.today().__str__()

OAI = "{http://www.openarchives.org/OAI/2.0/}"
ARXIV = "{http://arxiv.org/OAI/arXiv/}"

arxiv="physics:quant-ph"
df = pd.DataFrame(columns=("title", "abstract", "authors", "categories", "created", "id", "doi"))
base_url = "http://export.arxiv.org/oai2?verb=ListRecords&"
url = (base_url +
       "from="+beginDate+"&until="+endDate+"&" +
       "metadataPrefix=arXiv&set=%s"%arxiv)

while True:
    print("fetching", url)
    try:
        response = urllib.request.urlopen(url).read()

    except urllib.error.HTTPError as e:
        if e.code == 503:
            to = int(e.hdrs.get("retry-after", 30))
            print("Got 503. Retrying after {0:d} seconds.".format(to))

            time.sleep(to)
            continue

        else:
            raise

    root = ET.fromstring(response)

    for record in root.find(OAI+'ListRecords').findall(OAI+"record"):
        authors = []
        arxiv_id = record.find(OAI+'header').find(OAI+'identifier')
        meta = record.find(OAI+'metadata')
        info = meta.find(ARXIV+"arXiv")
        created = info.find(ARXIV+"created").text
        created = datetime.datetime.strptime(created, "%Y-%m-%d")
        categories = info.find(ARXIV+"categories").text
        for names in info.find(OAI+'authors').findall(OAI+'author'):
            authors.append(names[1].text+' '+names[0].text)
            

        # if there is more than one DOI use the first one
        # often the second one (if it exists at all) refers
        # to an eratum or similar
        doi = info.find(ARXIV+"doi")
        if doi is not None:
            doi = doi.text.split()[0]

        contents = {'title': info.find(ARXIV+"title").text,
                    'id': info.find(ARXIV+"id").text,#arxiv_id.text[4:],
                    'abstract': info.find(ARXIV+"abstract").text.strip(),
                    'authors': authors,
                    'created': created,
                    'categories': categories.split(),
                    'doi': doi,
                    }

        df = df.append(contents, ignore_index=True)

    # The list of articles returned by the API comes in chunks of
    # 1000 articles. The presence of a resumptionToken tells us that
    # there is more to be fetched.
    token = root.find(OAI+'ListRecords').find(OAI+"resumptionToken")
    if token is None or token.text is None:
        break

    else:
        url = base_url + "resumptionToken=%s"%(token.text)

fetching http://export.arxiv.org/oai2?verb=ListRecords&from=2020-05-01&until=2020-07-20&metadataPrefix=arXiv&set=physics:quant-ph
Got 503. Retrying after 600 seconds.


KeyboardInterrupt: 

In [None]:
df.head()

In [154]:
for child in record:
    print(child)
    for c in child:
        print(c)
        for cc in c:
            print(cc)

<Element '{http://www.openarchives.org/OAI/2.0/}header' at 0x7f8f537207c8>
<Element '{http://www.openarchives.org/OAI/2.0/}identifier' at 0x7f8f5613bdb8>
<Element '{http://www.openarchives.org/OAI/2.0/}datestamp' at 0x7f8f5613ba98>
<Element '{http://www.openarchives.org/OAI/2.0/}setSpec' at 0x7f8f5613bc28>
<Element '{http://www.openarchives.org/OAI/2.0/}metadata' at 0x7f8f5613bbd8>
<Element '{http://arxiv.org/OAI/arXiv/}arXiv' at 0x7f8f5613bb38>
<Element '{http://arxiv.org/OAI/arXiv/}id' at 0x7f8f56141048>
<Element '{http://arxiv.org/OAI/arXiv/}created' at 0x7f8f56141188>
<Element '{http://arxiv.org/OAI/arXiv/}updated' at 0x7f8f56141138>
<Element '{http://arxiv.org/OAI/arXiv/}authors' at 0x7f8f561411d8>
<Element '{http://arxiv.org/OAI/arXiv/}title' at 0x7f8f561416d8>
<Element '{http://arxiv.org/OAI/arXiv/}categories' at 0x7f8f56141728>
<Element '{http://arxiv.org/OAI/arXiv/}comments' at 0x7f8f56141778>
<Element '{http://arxiv.org/OAI/arXiv/}license' at 0x7f8f561417c8>
<Element '{http:/

In [184]:
record[1],record[1][0], record[1][0][3]

(<Element '{http://www.openarchives.org/OAI/2.0/}metadata' at 0x7f8f5613bbd8>,
 <Element '{http://arxiv.org/OAI/arXiv/}arXiv' at 0x7f8f5613bb38>,
 <Element '{http://arxiv.org/OAI/arXiv/}authors' at 0x7f8f561411d8>)

In [178]:
for author in record[1][0][3]:
    print(author)

<Element '{http://arxiv.org/OAI/arXiv/}author' at 0x7f8f56141228>
<Element '{http://arxiv.org/OAI/arXiv/}author' at 0x7f8f56141318>
<Element '{http://arxiv.org/OAI/arXiv/}author' at 0x7f8f56141408>
<Element '{http://arxiv.org/OAI/arXiv/}author' at 0x7f8f561414f8>
<Element '{http://arxiv.org/OAI/arXiv/}author' at 0x7f8f561415e8>


In [211]:
for names in info.find('{http://arxiv.org/OAI/arXiv/}authors').findall('{http://arxiv.org/OAI/arXiv/}author'):
    print(names[0].text, names[1].text)

Li Qin
Li Chengqing
Wu Chunhui
Long Dongyang
Wang Changji


In [205]:
info.find('{http://arxiv.org/OAI/arXiv/}authors')[0][2]

IndexError: child index out of range

### Import from arXiv
Import all articles from quant-ph over date range

In [142]:
startDate = '2020-05-01'
endDate = datetime.date.today().__str__()

# Import dataset
df = harvest(startDate, endDate)

fetching http://export.arxiv.org/oai2?verb=ListRecords&from=2020-05-01&until=2020-07-20&metadataPrefix=arXiv&set=physics:quant-ph


AttributeError: 'NoneType' object has no attribute 'findall'

In [8]:
df.head(), df.tail()

Unnamed: 0,title,abstract,categories,created,id,doi
5421,Single-photon-level sub-Doppler pump-probe spe...,We propose and demonstrate pump-probe spectros...,"[physics.atom-ph, quant-ph]",2020-07-16,2007.08452,
5422,Achieving fair sampling in quantum annealing,Sampling all ground states of a Hamiltonian wi...,[quant-ph],2020-07-16,2007.08487,
5423,Optical spin initialization of spin-3/2 silico...,Silicon vacancies in silicon carbide have been...,"[quant-ph, cond-mat.mes-hall]",2020-07-14,2007.08516,
5424,Ground State Laser Cooling Beyond the Lamb-Dic...,We propose a laser cooling scheme that allows ...,[quant-ph],1997-06-07,quant-ph/9706017,10.1209/0295-5075/23/1/001
5425,On the Concept of Quantum State Reduction: Inc...,The argument is re-examined that the program o...,[quant-ph],1998-02-09,quant-ph/9802022,10.4288/jafpos1956.11.107


In [18]:
# Save DataFrame
df.to_pickle('./arxiv_articles.pkl')

### Import from Saved

In [None]:
# Load DataFrame
df = pd.read_pickle('./arxiv_articles.pkl')

### Filter Articles
Pick interested articles based on keywords or authors.

In [28]:
authors = ['Monroe', 'Richerme', 'Hayes']
keywords = ['trapped ion', 'trapped ions', 'ion trap', 'ion traps']

np.sum(df['abstract'] == keywords[0])

0

In [31]:
df['author']

KeyError: 'author'