### Setup

In [4]:
import numpy as np
import os
import pdftotext
from PyPDF2 import PdfFileReader
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import re
import spacy
import pandas as pd
from py2neo import Graph
from py2neo.bulk import create_nodes


#### Initialize Graph Database

In [5]:
graph = Graph("http://localhost:7474/", auth=("neo4j", "berjis89"))
graph.run("UNWIND range(1, 3) AS n RETURN n, n * n as n_sq")

n,n_sq
1,1
2,4
3,9


In [7]:
class MyDict(dict):
    def __init__(self,*args,**kwargs):
        super().__init__(*args,**kwargs)
    def __getitem__(self,key):
        return dict.__getitem__(self, key.lower())
    def __setitem__(self,key,value):
        return dict.__setitem__(self,key.lower(), value)


In [357]:
def get_data(folder_name):
    """
    This function assumes that the folder "folder_name" is stored inside the the notebooks folder.
    Params:
    folder_name: string
    Returns: the paths to the data files in the given folder as a list.
    """
    arr = os.listdir(folder_name)
    return [os.path.join(os.getcwd(),folder_name, a) for a in arr]


def pdftotext_wrapper(input_file, options=None, output_file=None):
    """
    This function wraps the pdftotext command line tool.
    Params:
    input_file: string of path to the input pdf file.
    output_file: string of path to the output text file.
    options: string
    Returns: the text as a string.
    """
    if options is None:
        options = ""

    if output_file is None:
        output_file = ""

    check = os.popen("pdftotext " + options + " " + input_file + " " + output_file).read()
    if check == "":
        return "Success"


def extract_text(path, method):
    """
    This function extracts the text from a pdf file.
    Params:
    path: string
    method: string
    Returns: the text as a string.
    """
    if method == "pdftotext_cli":
        file_name = path.replace(os.path.dirname(data[0])+"/", "").replace(".pdf", "")
        output_dir = os.path.join(os.getcwd(), "texts")
        output_file = os.path.join(output_dir, file_name + ".txt")
        pdftotext_wrapper(data[0], "-raw", output_file) 
        with open(output_file, 'r') as f:
            #return f.read()
            return f.read().replace("\n", " ")
            #return f.readlines()

    if method == "pdftotext_python":
        with open(path, "rb") as f:
            return pdftotext.PDF(f)

    if method == "pypdf2":
        text = []
        with open(path, "rb") as f:
            pdf = PdfFileReader(f)
            text = [pdf.getPage(i).extractText() for i in range(pdf.numPages)]
            return text

def extract_entities(quote):
    words = word_tokenize(quote)
    tags = nltk.pos_tag(words)
    tree = nltk.ne_chunk(tags, binary=False)
    return set(
        " ".join(i[0] for i in t)
        for t in tree if hasattr(t, "label") and t.label() != "NE"
    )

def extract_info(folder, source="Tex"):
    """
    This function extracts the information from a paper using different methods and returns it as a dictionary with the following keys:
    {
        "Author/Authors": string,
        "Title": string,
        "Year": string,
        "Journal": string,
        "Volume": string,
        "Pages": string,
        "Abstract": string,
        "Sections": list of strings,
        "References_Sections": list of pairs of strings (refrence, section),
        "refrences": list of strings,
        "Keywords": string,
        "Language": string,
        "Source": string,
        }
    Params:
    paper: string
    source: string
    Returns: a dictionary with the extracted information.

    Reg Tips:
    1. r"\\author.*?\\\\" -> mathc from \authors command till the first \\ using lazy match in the Tex file.


    """
    info = MyDict()

    if source != "Tex":
        return info
    

    contents = get_data(folder)
 
    tex_files = [ct for ct in contents if os.path.splitext(ct)[1] == '.tex']
    bib_files = [ct for ct in contents if os.path.splitext(ct)[1] == '.bib']


    text = ""
    for tex_file in tex_files:
        with open(tex_file, "r") as f:
            temp = f.read()
            text+= "\n"+temp
    
    refs = ""
    ref_style=""
    if len(bib_files) != 0:
        ref_style="file"
        for bib_file in bib_files:
            with open(bib_file, "r") as f:
                temp = f.read()
                refs+= "\n"+temp
        # Adding an "@" sign at the end of the refs text, it will help in the extracting data using regex:
        refs+="\n @"
    else:
        ref_style="bibitem"
        refs = re.findall(r"\\bibitem.*?\\end", text, re.DOTALL)[0]
        text = text.replace(refs,"")
        refs+="\n \\bibitem"

    # Remove all comments:
    #text = re.sub(r"\%.*?\n", "", text,  re.DOTALL)

    #tmp = re.findall(r"\@\w+.[\s]*?.?(?="+"2017AREPS..45..359J"+r")(.*?)\@", refs, re.DOTALL)
    #print(tmp)
    
    # Extracting the author(s)
    #authors = re.findall(r"\\author\{(.*?)\}", text)
 
    Author_main = re.findall(r"\\author\[(.*?)\]", text)
    temp_authors = re.findall(r"\\author.*?\\\\", text, re.DOTALL)
    temp_authors = re.findall(r"\].*?\\\\", temp_authors[0], re.DOTALL)
    temp_authors = re.findall(r"(([A-Zéúßäüö]\.?\s?)*([A-Zéúßäüö][a-zéúßäüö]+\.?\s?)+([A-Zéúßäüö]\.?\s?[a-zéúßäüö]*)*)", temp_authors[0], re.DOTALL)
    Authors = [aut[0] for aut in temp_authors]


    info["Author_main"] = "; ".join(Author_main)
    info["Authors"] = "; ".join(Authors)

    # Extracting the title
    title_temp = re.findall(r"\\title.*?]", text, re.DOTALL)
    title = re.findall(r"\[(.*?)]", title_temp[0], re.DOTALL)
    info["Title"] = title[0]

    # Extracting the Abstract
    abstract_temp = re.findall(r"\\begin{abstract}(.*?)\\end{abstract}", text, re.DOTALL)
    
    # remove comments
    abstract = re.sub(r"\%.*?\n", "", abstract_temp[0],  re.DOTALL)
    info["Abstract"] = abstract


    # Find titles of sections
    section_titles = re.findall(r"\\section{(.*?)}", text, re.DOTALL)

    

    # Extract text of each section:

    sections_text = []
    Sections = MyDict()
    for s_t in section_titles:
        section_grammer= r"\\section{" + s_t + "}" + r"(.*?)" + r"\\section"
        temp =  re.findall(section_grammer, text, re.DOTALL)
        sections_text.append(temp)
        Sections[s_t] = MyDict({"text": temp[0]})
        # Same data saved as flatten version to use in neo4j, later we need to disscuss
        info[s_t + " --text"]=temp[0]
        
    info["Sections"] = Sections

    # Extracting the citeations in each section:
   
    if ref_style=="file":
        info, citations_info = extract_file_ref_style(info, refs)
    if ref_style=="bibitem":
        info, citations_info = extract_bibitem_ref_style(info, refs)
    
    return info, citations_info

def extract_bibitem_ref_style(info, refs):
    cts = []
    for s_t in info["Sections"]:
        cts_mtch = []
        
        # Citation style: \cite[][]{ref1, ref2, ref3}     
        temp_citep = re.findall(r"\\citep.*?(\[.*?\])?{(.*?)}", info["Sections"][s_t]["text"], re.DOTALL)
        if len(temp_citep)!=0:
            cts_mtch.extend(temp_citep) 
        
        # Citation style: \citet[][]{ref1, ref2, ref3} 
        temp_citet = re.findall(r"\\citet.*?(\[.*?\])?{(.*?)}", info["Sections"][s_t]["text"], re.DOTALL)
        if len(temp_citet)!=0:
            cts_mtch.extend(temp_citet)
        
        # Citation style: \citealp[][]{ref1, ref2, ref3} 
        temp_citealp = re.findall(r"\\citealp.*?(\[.*?\])?{(.*?)}", info["Sections"][s_t]["text"], re.DOTALL)
        if len(temp_citealp)!=0:
            cts_mtch.extend(temp_citealp)

        cts_in_text = []
        for c in cts_mtch:
            if len(c)>=1:
                if type(c[-1])==tuple:
                    cts_in_text.append(list(c[-1]))
                else:
                    cts_in_text.append(c[-1])
        
         
        info["Sections"][s_t]["citations"] = [t.split(",") for t in cts_in_text]
    
        # Same data saved as flatten version to use in neo4j, later we need to disscuss
        if len(cts_in_text)!=0:
            cts = [t.split(",") for t in cts_in_text]

        cts_flatten_temp= [c for ct in cts for c in ct]
        cts_flatten = []
        for c in cts_flatten_temp:
            if type(c) is list:
                cts_flatten.append(c[0].strip())
            else:
                cts_flatten.append(c.strip())

        cts_info = []
        for bib_item in cts_flatten:
            bib_item_section = re.findall(r"\{[\s]*?"+bib_item+r"[\s]*?\}.*?\\bibitem", refs, re.DOTALL)
            title = re.findall(r"(?i)title.*?{(.*?)}", bib_item_section[0],re.DOTALL)
            doi = re.findall(r"(?i)doi.*?{(.*?)}", bib_item_section[0],re.DOTALL)
            
            cts_info.append([bib_item,title, doi])

        info[s_t + " --cts"] = cts_info
    
    
    return info, refs


def extract_file_ref_style(info, refs):
    cts = []
    for s_t in info["Sections"]:
        cts_mtch = []
        
        # Citation style: \cite[][]{ref1, ref2, ref3}     
        temp_citep = re.findall(r"\\citep.*?(\[.*?\])?{(.*?)}", info["Sections"][s_t]["text"], re.DOTALL)
        if len(temp_citep)!=0:
            cts_mtch.extend(temp_citep) 
        
        # Citation style: \citet[][]{ref1, ref2, ref3} 
        temp_citet = re.findall(r"\\citet.*?(\[.*?\])?{(.*?)}", info["Sections"][s_t]["text"], re.DOTALL)
        if len(temp_citet)!=0:
            cts_mtch.extend(temp_citet)
        
        # Citation style: \citealp[][]{ref1, ref2, ref3} 
        temp_citealp = re.findall(r"\\citealp.*?(\[.*?\])?{(.*?)}", info["Sections"][s_t]["text"], re.DOTALL)
        if len(temp_citealp)!=0:
            cts_mtch.extend(temp_citealp)

        cts_in_text = []
        for c in cts_mtch:
            if len(c)>=1:
                if type(c[-1])==tuple:
                    cts_in_text.append(list(c[-1]))
                else:
                    cts_in_text.append(c[-1])
        
        info["Sections"][s_t]["citations"] = [t.split(",") for t in cts_in_text]
    
                # Same data saved as flatten version to use in neo4j, later we need to disscuss
        if len(cts_in_text)!=0:
            cts = [t.split(",") for t in cts_in_text]

        cts_flatten_temp= [c for ct in cts for c in ct]
        cts_flatten = []
        for c in cts_flatten_temp:
            if type(c) is list:
                cts_flatten.append(c[0].strip())
            else:
                cts_flatten.append(c.strip())

        cts_info = []
        for bib_item in cts_flatten:
            bib_item_section = re.findall(r"\@\w+.[\s]*?.?(?=[\s]*?"+c+r")(.*?)\@", refs, re.DOTALL)
            title = re.findall(r"(?i)title.*?{(.*?)}", bib_item_section[0],re.DOTALL)
            doi = re.findall(r"(?i)doi.*?{(.*?)}", bib_item_section[0],re.DOTALL)
            cts_info.append([bib_item,title, doi])

        info[s_t + " --cts"] = cts_info

    citations_info = MyDict()
    # Adding citations from bib files to the sections:
    bib_cts = []
    all_bib_cts = []
    for s_t in info["Sections"]:
        all_cts = info["Sections"][s_t]["citations"]
        for cts in all_cts:
            for ct in cts:
                if len(ct)==1:
                    tmp = re.findall(r"(\@\w+.[\s]*?.?(?=[\s]*?"+ct[0]+r")(.*?))\@", refs, re.DOTALL)
                else:
                    tmp = re.findall(r"(\@\w+.[\s]*?.?(?=[\s]*?"+ct+r")(.*?))\@", refs, re.DOTALL)
              #  print(tmp)
                bib_cts.append(tmp)
            all_bib_cts.append(bib_cts)
        citations_info[s_t] = all_bib_cts
    
    return info, citations_info

In [206]:
papers = get_data("Texs")
papers[0]

'/home/amir/Projects/papyrus/hoopoe/Texs/1511_03498'

In [349]:
a0,b0 = extract_info(folder=papers[0])

In [353]:
a0["introduction --cts"]

[['Espaillat14', [], ['10.2458/azu_uapress_9780816531240-ch022']],
 ['Andrews11', [], ['10.1088/2041-8205/742/1/L5']],
 ['Pietu05', [], ['10.1051/0004-6361:20042050']],
 ['Hughes07', [], ['10.1086/518885']],
 ['Casassus13', [], ['10.1038/nature11769']],
 ['Avenhaus14', [], ['10.1088/0004-637X/781/2/87']],
 ['Espaillat14', [], ['10.2458/azu_uapress_9780816531240-ch022']],
 ['Carmona14', [], ['10.1051/0004-6361/201322534']],
 ['Bruderer14', [], ['10.1051/0004-6361/201322857']],
 ['vanderMarel2015', [], ['10.1051/0004-6361/201525658']],
 ['Birnstiel2012', [], ['10.1051/0004-6361/201219262']],
 ['AlexanderPP6', [], ['10.2458/azu_uapress_9780816531240-ch021']],
 ['Owen11', [], ['10.1111/j.1365-2966.2010.17818.x']],
 ['Rosenfeld14', [], ['10.1088/0004-637X/782/2/62']],
 ['Rosenfeld14', [], ['10.1088/0004-637X/782/2/62']],
 ['crida06', [], ['10.1016/j.icarus.2005.10.007']],
 ['pm04', [], ['10.1051/0004-6361:200400053']],
 ['Fouchet07', [], ['10.1051/0004-6361:20077586']],
 ['Zhu12', [], ['10.

In [334]:
a0['introduction --cts']


[['Espaillat14', [], ['10.2458/azu_uapress_9780816531240-ch022']],
 ['Andrews11', [], ['10.1088/2041-8205/742/1/L5']],
 ['Pietu05', [], ['10.1051/0004-6361:20042050']],
 ['Hughes07', [], ['10.1086/518885']],
 ['Casassus13', [], ['10.1038/nature11769']],
 ['Avenhaus14', [], ['10.1088/0004-637X/781/2/87']],
 ['Espaillat14', [], ['10.2458/azu_uapress_9780816531240-ch022']],
 ['Carmona14', [], ['10.1051/0004-6361/201322534']],
 ['Bruderer14', [], ['10.1051/0004-6361/201322857']],
 ['vanderMarel2015', [], ['10.1051/0004-6361/201525658']],
 ['Birnstiel2012', [], ['10.1051/0004-6361/201219262']],
 ['AlexanderPP6', [], ['10.2458/azu_uapress_9780816531240-ch021']],
 ['Owen11', [], ['10.1111/j.1365-2966.2010.17818.x']],
 ['Rosenfeld14', [], ['10.1088/0004-637X/782/2/62']],
 ['Rosenfeld14', [], ['10.1088/0004-637X/782/2/62']],
 ['crida06', [], ['10.1016/j.icarus.2005.10.007']],
 ['pm04', [], ['10.1051/0004-6361:200400053']],
 ['Fouchet07', [], ['10.1051/0004-6361:20077586']],
 ['Zhu12', [], ['10.

In [303]:
a=[]
b = [3,67]
a = a+b
a

[3, 67]

In [358]:
a1,b1 = extract_info(folder=papers[3])

In [363]:
a1.keys()

dict_keys(['author_main', 'authors', 'title', 'abstract', 'introduction --text', 'physical model and numerical setup --text', 'summary of previous results on the pebble isolation mass for circular planets --text', 'results of hydrodynamical simulations --text', 'a simple fitting formula for the pebble isolation mass for eccentric planets --text', 'discussion and conclusions --text', 'sections', 'introduction --cts', 'physical model and numerical setup --cts', 'summary of previous results on the pebble isolation mass for circular planets --cts', 'results of hydrodynamical simulations --cts', 'a simple fitting formula for the pebble isolation mass for eccentric planets --cts', 'discussion and conclusions --cts'])

In [379]:
faltted_a0 = {key:a0[key] for key in a0.keys() if key=='introduction --cts'}
faltted_a1 = {key:a1[key] for key in a1.keys() if key=='introduction --cts'}
data = [faltted_a0,faltted_a1]

In [375]:
faltted_a0.keys()

dict_keys(['author_main'])

In [380]:
create_nodes(graph.auto(), data, labels={"Paper"})

ClientError: [Statement.TypeError] Collections containing collections can not be stored in properties.

In [None]:
a_df.head()

In [None]:
b

In [None]:

paper = get_data(papers[2])
paper


In [None]:
a["Sections"]["Introduction"]

In [None]:
sent = sent_tokenize(t) 
print(sent)

In [None]:
#sent = nltk.corpus.treebank.tagged_sents()[22]
words = word_tokenize(t)
words_tagged = nltk.pos_tag(words)

#print(nltk.ne_chunk(words_tagged, binary=False))
tree = nltk.ne_chunk(words_tagged, binary=False)
print(tree)


In [None]:
IN = re.compile(r'.*\bin\b(?!\b.+ing)')
#for doc in nltk.corpus.ieer.parsed_docs('NYT_19980315'):

#VAN = re.compile(words_tagged, re.VERBOSE)
for doc in sent_tagged:
    for rel in nltk.sem.extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern = IN):
        print(nltk.sem.rtuple(rel))

In [None]:
nltk.download('ieer')

In [None]:
nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
for token in doc:
    print(token.text, token.pos_, token.dep_)

### Pipline:

In [None]:
data = get_data("pdfs")

In [None]:
data[0]

In [None]:
t = extract_text(data[1], "pdftotext_cli")

In [None]:
print(t)

In [None]:
extract_entities(t)

In [None]:
tt = sent_tokenize(t, language='english', preserve_line=False)

In [None]:
words = word_tokenize(t[10])
tags = nltk.pos_tag(words)
tree = nltk.ne_chunk(tags, binary=True)

In [None]:
tree.draw()

In [None]:
print(nltk.ne_chunk(tt[10], binary=True))