In [16]:
import requests
import urllib.parse
from bs4 import BeautifulSoup

In [18]:
my_query = urllib.parse.quote_plus("Ecole Polytechnique Fédérale de Lausanne", safe='')

In [19]:
my_query

'Ecole+Polytechnique+F%C3%A9d%C3%A9rale+de+Lausanne'

In [20]:
url = 'http://export.arxiv.org/api/query?search_query={}&start=0&max_results=10'.format(my_query)
data = requests.get(url)
print(data)

<Response [200]>


In [21]:
soup = BeautifulSoup(data.text, "xml")

In [22]:
print(soup.prettify())

<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
 <link href="http://arxiv.org/api/query?search_query%3DEcole%20Polytechnique%20F%C3%A9d%C3%A9rale%20de%20Lausanne%26id_list%3D%26start%3D0%26max_results%3D10" rel="self" type="application/atom+xml"/>
 <title type="html">
  ArXiv Query: search_query=Ecole Polytechnique Fédérale de Lausanne&amp;id_list=&amp;start=0&amp;max_results=10
 </title>
 <id>
  http://arxiv.org/api/cwrNr0BwGObcbdhTHZSGQGzCjKo
 </id>
 <updated>
  2016-12-07T00:00:00-05:00
 </updated>
 <totalResults>
  64826
 </totalResults>
 <startIndex>
  0
 </startIndex>
 <itemsPerPage>
  10
 </itemsPerPage>
 <entry>
  <id>
   http://arxiv.org/abs/1608.03324v1
  </id>
  <updated>
   2016-08-11T00:26:15Z
  </updated>
  <published>
   2016-08-11T00:26:15Z
  </published>
  <title>
   Architecture Diagrams: A Graphical Language for Architecture Style
  Specification
  </title>
  <summary>
   Architecture styles characterise families of architectures sha

In [24]:
soup.totalResults.text

'64826'

In [79]:
def num_results(query):
    query = urllib.parse.quote_plus(query, safe='')
    url = 'http://export.arxiv.org/api/query?search_query={}&start=0&max_results=10'.format(query)
    data = requests.get(url)
    soup = BeautifulSoup(data.text, "xml")
    print(soup.totalResults.text, " results for query ", query)

In [31]:
num_results("Swiss Federal Institute of Technology Lausanne")

1196209  results for query  Swiss+Federal+Institute+of+Technology+Lausanne


In [32]:
num_results("Ecole Polytechnique fédérale de Lausanne")

64826  results for query  Ecole+Polytechnique+f%C3%A9d%C3%A9rale+de+Lausanne


In [33]:
num_results("HEC Lausanne")

355  results for query  HEC+Lausanne


In [34]:
num_results("Eidgenössische Technische Hochschule Zürich")

428  results for query  Eidgen%C3%B6ssische+Technische+Hochschule+Z%C3%BCrich


In [35]:
num_results("ETHZ")

97  results for query  ETHZ


In [36]:
num_results("Swiss Federal Institute of Technology Zurich")

1196214  results for query  Swiss+Federal+Institute+of+Technology+Zurich


In [37]:
num_results("EPFL")

296  results for query  EPFL


In [40]:
len(soup.findAll("entry"))

10

In [72]:
def results(query):
    query = urllib.parse.quote_plus(query, safe='')
    url = 'http://export.arxiv.org/api/query?search_query={}&start=0&max_results=300'.format(query)
    data = requests.get(url)
    soup = BeautifulSoup(data.text, "xml")
    return [s.text for s in soup.findAll("summary")]

In [73]:
res = results("EPFL")

In [55]:
len(res)

297

In [83]:
import pandas as pd

In [None]:
zip()

In [86]:
def extract_df(query):
    query = urllib.parse.quote_plus(query, safe='')
    url = 'http://export.arxiv.org/api/query?search_query={}&start=0&max_results=300'.format(query)
    data = requests.get(url)
    soup = BeautifulSoup(data.text, "xml")
    summaries = [s.text for s in soup.findAll("summary")]
    titles = [s.text for s in soup.findAll("title")]
    ids = [s.text for s in soup.findAll("id")]
    published = [s.text for s in soup.findAll("published")]
    data = list(zip(ids, titles, summaries, published))
    df = pd.DataFrame(data, columns=["id", "title", "summary", "published"])
    return df

In [87]:
df = extract_df("EPFL")

In [88]:
df.head()

Unnamed: 0,id,title,summary,published
0,http://arxiv.org/api/r9qQvvV+R7PE5ahGrCZwSPO/ZME,ArXiv Query: search_query=EPFL&id_list=&start=...,We report the discovery of three new cases o...,2011-10-25T14:27:28Z
1,http://arxiv.org/abs/1110.5514v1,Three QSOs acting as strong gravitational lenses,We report our progress in scaling deductive ...,2016-11-23T03:16:55Z
2,http://arxiv.org/abs/1611.07625v1,An Update on Deductive Synthesis and Repair in...,This is a writeup of lectures given at the E...,2016-01-19T17:07:40Z
3,http://arxiv.org/abs/1601.05000v2,EPFL Lectures on Conformal Field Theory in D>=...,In this paper we study the existence of solu...,2016-04-15T07:40:48Z
4,http://arxiv.org/abs/1604.04387v1,An Elliptic System with Degenerate Coercivity,"In this work, we have studied the invariants...",1993-11-17T15:45:00Z


In [89]:
df["abstract"] = df.title + "\n" + df.summary

In [90]:
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import re
import stop_words
import string

In [91]:
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [92]:
from nltk.corpus import wordnet

def get_wordnet_tag(treebank_tag):
    """ enables to map pos tags from treebank to wordnet valid pos tags 
    (in order to feed the lemmatizer) """
    
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:  # if starts with N or others (noun by default)
        return wordnet.NOUN

In [93]:
def lemmatize(token):
    """ final method to lemmatize using wordnet lemmatizer """
    
    trbk_tag = pos_tag([token])[0][1]  # get the treebank tag from pos_tag method
    wrdnt_tag = get_wordnet_tag(treebank_tag=trbk_tag)  # translate it to wordnet tag
    return lemmatizer.lemmatize(token, wrdnt_tag)

In [94]:
stop_words_ = stop_words.get_stop_words("en")

In [95]:
punctuation_filter = str.maketrans({key: None for key in string.punctuation})

In [96]:
def nlp_pre_process(text, lemma=False, stem=False):
    """
    Reduces an input text into a list of tokens, 
    using NLP filterings such as normalization, 
    PoS and stop word filtering, stemming.
    text: (str)
    tokens: list(str)
    """
    text = text.lower()  # normalize
    text = text.translate(punctuation_filter)  # remove punctuation
    tokens = word_tokenize(text)  # tokenize
    pos_tags = [pos_tag([t]) for t in tokens]  # PoS tagging
    tokens = [token for [(token, tag)] in pos_tags if tag != "DT"]  # PoS filtering
    tokens = [t for t in tokens if t not in stop_words_]  # stop words filtering
    
    if stem:
        tokens = [stemmer.stem(t) for t in tokens]  # stemming
    
    if lemma:
        tokens = [lemmatize(t) for t in tokens]  # lemmatizing
    
    return " ".join(tokens)

In [97]:
df["abstract_processed"] = df.abstract.apply(lambda t: nlp_pre_process(t, lemma=True))

## Topic Extraction

In [75]:
from gensim.models import LdaModel
from gensim import corpora
import logging
#logging.basicConfig(format='%(message)s', level=logging.INFO)
logging.getLogger("gensim").setLevel(logging.WARNING)



In [98]:
# define a term-document matrix
final_text = [t.split(" ") for t in df.abstract_processed.values]
dictionary = corpora.Dictionary(final_text)
corpus = [dictionary.doc2bow(text) for text in final_text]

In [113]:
# run model
lda = LdaModel(corpus, num_topics=3, id2word=dictionary, passes=5)

In [115]:
lda.print_topics(num_words=8)

[(0,
  '0.010*"energy" + 0.010*"magic" + 0.009*"gammaray" + 0.008*"emission" + 0.008*"vhe" + 0.007*"data" + 0.007*"telescope" + 0.007*"observation"'),
 (1,
  '0.007*"galaxy" + 0.006*"model" + 0.004*"state" + 0.004*"show" + 0.004*"sample" + 0.004*"problem" + 0.004*"system" + 0.004*"use"'),
 (2,
  '0.015*"galaxy" + 0.007*"image" + 0.007*"model" + 0.007*"use" + 0.006*"mass" + 0.006*"cluster" + 0.005*"quasar" + 0.005*"star"')]

In [110]:
lda.get_document_topics

TypeError: get_document_topics() missing 1 required positional argument: 'bow'