### Setup

In [118]:
import numpy as np
import os
import pdftotext
from PyPDF2 import PdfFileReader
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import re
import spacy
import pandas as pd
from py2neo import Graph
from py2neo.bulk import create_nodes
from neo4j import GraphDatabase
from methods import *
import arxiv
import urllib.request as libreq
import tarfile

Generalised uncertainty relations for angular momentum and spin in quantum geometry
Uncertainty Relations for Angular Momentum


http://arxiv.org/abs/1505.00049v2


In [None]:
# Query the arXiv export API for a single entry matching a DOI and print the raw response.
import urllib, urllib.request
url = 'http://export.arxiv.org/api/query?search_query=all:10.1088/0004-637X/704/1/496&start=0&max_results=1'
# Use the response as a context manager so the HTTP connection is always
# released, even if reading or decoding fails.
with urllib.request.urlopen(url) as data:
    print(data.read().decode('utf-8'))

#### Initialize Graph Database

In [119]:
class Neo4jConnection:
    """Thin convenience wrapper around a neo4j driver.

    The driver is built once in the constructor. ``query`` opens one session per
    call and always closes it; ``close`` shuts the driver down.
    """

    def __init__(self, uri, user, pwd):
        """Remember the connection settings and try to create the driver.

        A failure while creating the driver is printed instead of raised, which
        leaves the driver unset (``query`` then fails its assertion).
        """
        self.__uri, self.__user, self.__pwd = uri, user, pwd
        self.__driver = None
        credentials = (self.__user, self.__pwd)
        try:
            self.__driver = GraphDatabase.driver(self.__uri, auth=credentials)
        except Exception as e:
            print("Failed to create the driver:", e)

    def close(self):
        """Close the driver if one was created; otherwise do nothing."""
        if self.__driver is None:
            return
        self.__driver.close()

    def query(self, query, parameters=None, db=None):
        """Run ``query`` and return its records as a list.

        Args:
            query: query text handed to ``session.run``.
            parameters: optional parameters handed to ``session.run``.
            db: optional database name; when given it is passed as ``database=``
                while opening the session.

        Returns:
            The list of records, or ``None`` when opening the session or running
            the query raised (the error is printed, not propagated).

        Raises:
            AssertionError: if the driver was never initialised.
        """
        assert self.__driver is not None, "Driver not initialized!"
        session = None
        response = None
        try:
            if db is None:
                session = self.__driver.session()
            else:
                session = self.__driver.session(database=db)
            response = list(session.run(query, parameters))
        except Exception as e:
            print("Query failed:", e)
        finally:
            # The session is closed on success and on failure alike.
            if session is not None:
                session.close()
        return response


# Bolt connection used by the cells below. Settings can be overridden through the
# NEO4J_URI / NEO4J_USER / NEO4J_PASSWORD environment variables so credentials
# do not have to live in the notebook.
# NOTE(review): the literal fallbacks keep the notebook running exactly as before,
# but the password is still a committed secret — rotate it and drop the default.
conn = Neo4jConnection(uri=os.environ.get("NEO4J_URI", "bolt://localhost:7687/"),
                       user=os.environ.get("NEO4J_USER", "neo4j"),
                       pwd=os.environ.get("NEO4J_PASSWORD", "berjis89"))

In [None]:

# py2neo handle on the same server. Settings can be overridden through the
# NEO4J_HTTP_URI / NEO4J_USER / NEO4J_PASSWORD environment variables.
# NOTE(review): the literal fallbacks preserve the previous behaviour, but the
# password is still a committed secret — rotate it and drop the default.
graph = Graph(
    os.environ.get("NEO4J_HTTP_URI", "http://localhost:7474/"),
    auth=(os.environ.get("NEO4J_USER", "neo4j"), os.environ.get("NEO4J_PASSWORD", "berjis89")),
)
# Smoke test: a trivial query that needs no data in the database.
graph.run("UNWIND range(1, 3) AS n RETURN n, n * n as n_sq")

In [142]:
# Search arXiv using a DOI as the query text (at most 10 hits, sorted by
# submission date) and print each hit.
doi = "10.1051/0004-6361:200810797"
search = arxiv.Search(query=doi, max_results=10, sort_by=arxiv.SortCriterion.SubmittedDate)

for result in search.results():
    print(result)

In [144]:
def extract_file(folder, file):
    """Unpack a tar archive into ``folder``.

    Args:
        folder: destination directory for the archive contents.
        file: path of the archive. Names ending in ``tar.gz`` are opened as
            gzip-compressed tar, names ending in ``tar`` as plain tar.

    Returns:
        None. A path with any other ending is ignored.

    Raises:
        tarfile.TarError / OSError: if the archive cannot be read or extracted.
    """
    if file.endswith("tar.gz"):
        mode = "r:gz"
    elif file.endswith("tar"):
        mode = "r:"
    else:
        # Unsupported extension: stay a no-op.
        return

    # The context manager closes the archive even if extraction fails.
    with tarfile.open(file, mode) as tar:
        # Both archive kinds are extracted into `folder`; the plain-tar branch
        # previously ignored it and unpacked into the current directory.
        if hasattr(tarfile, "data_filter"):
            # Archives are downloaded from the network, so use the "data"
            # extraction filter where this Python provides it.
            tar.extractall(path=folder, filter="data")
        else:
            tar.extractall(path=folder)
    
    
def get_paper(p, baseURL="/home/amir/Projects/papyrus/hoopoe/Texs/"):
    """
    Download the source archive of a paper from arXiv and unpack it under ``baseURL``.

    The paper is looked up by ``p["arxiv_id"]`` when that is non-empty, otherwise
    by ``p["title"]`` (only results whose title matches exactly are kept). Each
    match is saved as ``<baseURL>/<paper_id>.tar.gz`` and extracted into
    ``<baseURL>/<paper_id>``, where ``paper_id`` is the DOI, or the arXiv short
    id when the entry has no DOI, with ``/`` replaced by ``_``.

    Args:
        p: mapping with the keys ``"arxiv_id"`` and ``"title"``.
        baseURL: directory that receives the archives and extracted folders.

    Returns:
        True if at least one paper was downloaded and extracted, False if neither
        an id nor a title was given or the search found no matching paper.
    """
    arxiv_id = p.get("arxiv_id") or ""
    title = p.get("title") or ""
    papers = []

    if arxiv_id != "":
        search = arxiv.Search(id_list=[arxiv_id])
        # An unknown id yields no results; use a default so that case is
        # reported as "not found" instead of escaping as StopIteration.
        match = next(iter(search.results()), None)
        if match is not None:
            papers.append(match)
    elif title != "":
        search = arxiv.Search(
            query="ti:%22" + title + "%22",
            max_results=10,
            sort_by=arxiv.SortCriterion.SubmittedDate,
        )
        papers.extend(result for result in search.results() if result.title == title)
    else:
        return False

    for paper in papers:
        # Not every arXiv entry carries a DOI; fall back to the arXiv short id
        # so the file name can always be built.
        paper_id = (paper.doi or paper.get_short_id()).replace("/", "_")
        # os.path.join keeps the path correct whether or not baseURL ends in "/".
        filename = os.path.join(baseURL, paper_id + ".tar.gz")
        paper.download_source(filename=filename)
        folder = os.path.join(baseURL, paper_id)
        extract_file(folder, filename)

    # Report success only when something was actually fetched.
    return bool(papers)

def get_all_papers(paper_folder):
    """
    Intended entry point for fetching all papers cited by the paper in ``paper_folder``.

    At present it only runs ``extract_info`` on the folder and discards the two
    values it gets back; nothing is downloaded and nothing is returned.
    """
    info, extra = extract_info(paper_folder)
    

def add_node(g, N, labels={"Paper"}, keys=[]):
    """
    Create graph nodes for the first ``N`` paper folders returned by ``get_data("Texs")``.

    Each folder is passed to ``extract_info``; the values of ``keys`` are read
    from the first element of its result and bulk-inserted with ``create_nodes``.

    Args:
        g: graph object; ``g.auto()`` is passed to ``create_nodes`` and
            ``g.nodes.match(...)`` is used to count nodes afterwards.
        N: maximum number of paper folders to process. Fewer nodes are created
            when fewer folders are available.
        labels: labels given to the created nodes. The first label in iteration
            order is used for the count that is printed.
        keys: property names to copy from each paper's info; defaults to
            ``["id", "title", "authors"]`` when empty.

    Returns:
        The same ``g`` that was passed in.
    """
    papers = get_data("Texs")

    if not keys:
        keys = ["id", "title", "authors"]

    data = []
    # Walk the folders that were actually found. The previous loop called
    # get_paper() with no arguments (a TypeError) and never used `papers`.
    for folder_path in papers[:max(N, 0)]:
        info, _ = extract_info(folder_path)
        data.append([info[k] for k in keys])

    create_nodes(g.auto(), data, labels=labels, keys=keys)
    x = g.nodes.match(list(labels)[0]).count()
    print(x, " nodes added to the graph ", g)

    return g

In [53]:
# Lookup request for get_paper: search by title, no arXiv id supplied.
paper = dict(title="Uncertainty Relations for Angular Momentum", arxiv_id="")

In [124]:
# Download and unpack the paper described by `paper`.
# NOTE(review): the recorded output below is a DOI-like string, while get_paper as
# defined in this notebook returns True/False — the output presumably predates
# the current definition; re-run to confirm.
get_paper(paper)

'10.1088_1367-2630_17_9_093046'

In [145]:
# List what get_data finds for "Texs"; the recorded output is a list of folder paths.
# get_data is not defined in the visible cells — presumably it comes from `methods`.
papers = get_data("Texs")
papers

['/home/amir/Projects/papyrus/hoopoe/Texs/1511_03498',
 '/home/amir/Projects/papyrus/hoopoe/Texs/2112_13970',
 '/home/amir/Projects/papyrus/hoopoe/Texs/2112',
 '/home/amir/Projects/papyrus/hoopoe/Texs/2112_14214']

In [147]:
# Extract the structured info for the third listed paper folder.
# NOTE(review): the recorded run of this cell ended in "IndexError: list index out
# of range"; whether papers[2] or something inside extract_info raised is not
# visible here — confirm before relying on a0/b0 below.
a0,b0 = extract_info(folder=papers[2])

IndexError: list index out of range

In [139]:
# Show the "introduction --cts" entry of a0; the recorded output is a list of
# [citation key, list, list of DOIs] triples.
a0["introduction --cts"]

[['Espaillat14', [], ['10.2458/azu_uapress_9780816531240-ch022']],
 ['Andrews11', [], ['10.1088/2041-8205/742/1/L5']],
 ['Pietu05', [], ['10.1051/0004-6361:20042050']],
 ['Hughes07', [], ['10.1086/518885']],
 ['Casassus13', [], ['10.1038/nature11769']],
 ['Avenhaus14', [], ['10.1088/0004-637X/781/2/87']],
 ['Espaillat14', [], ['10.2458/azu_uapress_9780816531240-ch022']],
 ['Carmona14', [], ['10.1051/0004-6361/201322534']],
 ['Bruderer14', [], ['10.1051/0004-6361/201322857']],
 ['vanderMarel2015', [], ['10.1051/0004-6361/201525658']],
 ['Birnstiel2012', [], ['10.1051/0004-6361/201219262']],
 ['AlexanderPP6', [], ['10.2458/azu_uapress_9780816531240-ch021']],
 ['Owen11', [], ['10.1111/j.1365-2966.2010.17818.x']],
 ['Rosenfeld14', [], ['10.1088/0004-637X/782/2/62']],
 ['Rosenfeld14', [], ['10.1088/0004-637X/782/2/62']],
 ['crida06', [], ['10.1016/j.icarus.2005.10.007']],
 ['pm04', [], ['10.1051/0004-6361:200400053']],
 ['Fouchet07', [], ['10.1051/0004-6361:20077586']],
 ['Zhu12', [], ['10.

In [None]:
# List the keys available in a0.
a0.keys()

In [None]:
# Show the 'results --cts' entry of a0.
a0['results --cts']

In [None]:
# Extract the structured info for the fourth listed paper folder.
a1,b1 = extract_info(folder=papers[3])

In [None]:
# Show the "introduction --cts" entry of a0 again (same expression as an earlier cell).
a0["introduction --cts"]

In [None]:
# Keep only the first element of every author entry, for each of the two papers.
a0_f = [author[0] for author in a0['authors']]
a1_f = [author[0] for author in a1['authors']]

In [None]:
# Copy each paper's info without its 'sections' entry.
# Note: this rebinds a0_f / a1_f, which the previous cell had set to lists.
a0_f = {key: a0[key] for key in a0.keys() if key != 'sections'}
a1_f = {key: a1[key] for key in a1.keys() if key != 'sections'}


In [None]:
# List the keys that remain in a0_f.
a0_f.keys() 

In [None]:
# Bind `data` to the key view of a0 (keys only, no values).
data=a0.keys()

In [None]:
# Show the 'results --cts' entry of a0_f.
a0_f['results --cts']

In [None]:
# Empty table with one column per paper attribute.
df = pd.DataFrame(
    columns=["doi", "title", "url", "authors", "abstract", "citations"],
)

In [None]:
# Display the (still empty) paper table.
df

In [None]:
# Create, if it does not exist yet, a constraint named `papers` that makes the
# `doi` property unique across `Paper` nodes.
# NOTE(review): the `ON ... ASSERT` form may depend on the Neo4j server version — confirm.
conn.query('CREATE CONSTRAINT papers IF NOT EXISTS ON (p:Paper) ASSERT p.doi IS UNIQUE')

In [None]:
# Bulk-create `Paper` nodes from `data`.
# NOTE(review): `data` was last bound to `a0.keys()` in the visible cells, and
# `graph` is passed directly whereas add_node passes `g.auto()` together with
# `keys` — confirm this call does what is intended.
create_nodes(graph, data, labels={"Paper"})

In [None]:
# Preview the first rows of a_df.
# NOTE(review): `a_df` is not assigned in any cell visible here — confirm where it comes from.
a_df.head()

In [None]:
# Display b.
# NOTE(review): `b` is not assigned at top level in any cell visible here — confirm where it comes from.
b

In [None]:
# Run get_data on the third paper folder and display the result.
# Note: this rebinds `paper`, which an earlier cell set to the lookup dict for get_paper.
paper = get_data(papers[2])
paper

In [None]:
# Show the "Introduction" entry under "Sections" of a.
# NOTE(review): `a` is not assigned at top level in any cell visible here — confirm where it comes from.
a["Sections"]["Introduction"]

In [None]:
# Split the text `t` into sentences and print them.
# NOTE(review): in the visible cells `t` is only assigned further down (extract_text),
# so on a fresh top-to-bottom run this cell has no `t` yet.
sent = sent_tokenize(t) 
print(sent)

In [None]:
# Tokenize `t`, POS-tag the tokens, then build and print the named-entity chunk tree.
# NOTE(review): in the visible cells `t` is only assigned further down (extract_text).
words = word_tokenize(t)
words_tagged = nltk.pos_tag(words)

tree = nltk.ne_chunk(words_tagged, binary=False)
print(tree)


In [None]:
# Relation-extraction experiment: print every ORG–LOC relation found by
# nltk.sem.extract_rels, using the IN pattern as the filter between the two entities.
IN = re.compile(r'.*\bin\b(?!\b.+ing)')
#for doc in nltk.corpus.ieer.parsed_docs('NYT_19980315'):

#VAN = re.compile(words_tagged, re.VERBOSE)
# NOTE(review): `sent_tagged` is not assigned in any cell visible here — confirm
# where it comes from and that its items are what corpus='ieer' expects.
for doc in sent_tagged:
    for rel in nltk.sem.extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern = IN):
        print(nltk.sem.rtuple(rel))

In [None]:
# Fetch the 'ieer' resource through the NLTK downloader.
nltk.download('ieer')

In [None]:
# spaCy demo: load the "en_core_web_sm" pipeline, run it over a sample sentence and
# print each token's text, pos_ and dep_ attributes.
# Note: this rebinds `doc`, which the relation-extraction cell uses as its loop variable.
nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
for token in doc:
    print(token.text, token.pos_, token.dep_)

### Pipeline:

In [None]:
# Collect the entries get_data finds for "pdfs".
# Note: this rebinds `data`, which an earlier cell set to a0.keys().
data = get_data("pdfs")

In [None]:
# Show the first entry of `data`.
data[0]

In [None]:
# Extract the text of the second entry of `data`, passing "pdftotext_cli" as the second argument.
# extract_text is not defined in the visible cells — presumably it comes from `methods`.
t = extract_text(data[1], "pdftotext_cli")

In [None]:
# Print the full extracted text (can be long — consider printing a slice instead).
print(t)

In [None]:
# Run extract_entities over the extracted text and display what it returns.
# extract_entities is not defined in the visible cells — presumably it comes from `methods`.
extract_entities(t)

In [None]:
# Split the extracted text into sentences.
# `preserve_line` is a word_tokenize option; sent_tokenize only accepts the text
# and the language, so passing it raised a TypeError.
tt = sent_tokenize(t, language='english')

In [None]:
# Tokenize, POS-tag and NE-chunk the eleventh sentence of the text.
# `tt` holds the sentences; `t[10]` was only the eleventh character of the raw text.
words = word_tokenize(tt[10])
tags = nltk.pos_tag(words)
tree = nltk.ne_chunk(tags, binary=True)

In [None]:
# Render the chunk tree with its draw() method.
tree.draw()

In [None]:
# Print the named-entity chunk tree of the eleventh sentence.
# ne_chunk works on POS-tagged tokens, so the raw sentence string is tokenized
# and tagged first instead of being passed in directly.
print(nltk.ne_chunk(nltk.pos_tag(word_tokenize(tt[10])), binary=True))