### Setup

In [2]:
import numpy as np
import os
import pdftotext
from PyPDF2 import PdfFileReader
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import re
import spacy



In [201]:
def get_data(folder_name):
    """
    This function assumes that the folder "folder_name" is stored inside the the notebooks folder.
    Params:
    folder_name: string
    Returns: the paths to the data files in the given folder as a list.
    """
    arr = os.listdir(folder_name)
    return [os.path.join(os.getcwd(),folder_name, a) for a in arr]


def pdftotext_wrapper(input_file, options=None, output_file=None):
    """
    This function wraps the pdftotext command line tool.
    Params:
    input_file: string of path to the input pdf file.
    output_file: string of path to the output text file.
    options: string
    Returns: the text as a string.
    """
    if options is None:
        options = ""

    if output_file is None:
        output_file = ""

    check = os.popen("pdftotext " + options + " " + input_file + " " + output_file).read()
    if check == "":
        return "Success"


def extract_text(path, method):
    """
    This function extracts the text from a pdf file.
    Params:
    path: string
    method: string
    Returns: the text as a string.
    """
    if method == "pdftotext_cli":
        file_name = path.replace(os.path.dirname(data[0])+"/", "").replace(".pdf", "")
        output_dir = os.path.join(os.getcwd(), "texts")
        output_file = os.path.join(output_dir, file_name + ".txt")
        pdftotext_wrapper(data[0], "-raw", output_file) 
        with open(output_file, 'r') as f:
            #return f.read()
            return f.read().replace("\n", " ")
            #return f.readlines()

    if method == "pdftotext_python":
        with open(path, "rb") as f:
            return pdftotext.PDF(f)

    if method == "pypdf2":
        text = []
        with open(path, "rb") as f:
            pdf = PdfFileReader(f)
            text = [pdf.getPage(i).extractText() for i in range(pdf.numPages)]
            return text

def extract_entities(quote):
    words = word_tokenize(quote)
    tags = nltk.pos_tag(words)
    tree = nltk.ne_chunk(tags, binary=False)
    return set(
        " ".join(i[0] for i in t)
        for t in tree if hasattr(t, "label") and t.label() != "NE"
    )

def extract_info(paper, source="Tex"):
    """
    This function extracts the information from a paper using different methods and returns it as a dictionary with the following keys:
    {
        "Author/Authors": string,
        "Title": string,
        "Year": string,
        "Journal": string,
        "Volume": string,
        "Pages": string,
        "Abstract": string,
        "Sections": list of strings,
        "References_Sections": list of pairs of strings (refrence, section),
        "refrences": list of strings,
        "Keywords": string,
        "Language": string,
        "Source": string,
        }
    Params:
    paper: string
    source: string
    Returns: a dictionary with the extracted information.

    Reg Tips:
    1. r"\\author.*?\\\\" -> mathc from \authors command till the first \\ using lazy match in the Tex file.


    """
    info = {}

    if source != "Tex":
        return info
    
    with open(paper, "r") as f:
        text = f.read()
    

    # Remove all comments:
     # remove comments
    text = re.sub(r"\%.*?\n", "", text,  re.DOTALL)



    # Extracting the author(s)
    #authors = re.findall(r"\\author\{(.*?)\}", text)
 
    Author_main = re.findall(r"\\author\[(.*?)\]", text)
    temp_authors = re.findall(r"\\author.*?\\\\", text, re.DOTALL)
    temp_authors = re.findall(r"\].*?\\\\", temp_authors[0], re.DOTALL)
    temp_authors = re.findall(r"(([A-Zéúßäüö]\.?\s?)*([A-Zéúßäüö][a-zéúßäüö]+\.?\s?)+([A-Zéúßäüö]\.?\s?[a-zéúßäüö]*)*)", temp_authors[0], re.DOTALL)
    Authors = [aut[0] for aut in temp_authors]


    info["Author_main"] = "; ".join(Author_main)
    info["Authors"] = "; ".join(Authors)

    # Extracting the title
    title_temp = re.findall(r"\\title.*?]", text, re.DOTALL)
    title = re.findall(r"\[(.*?)]", title_temp[0], re.DOTALL)
    info["Title"] = title[0]

    # Extracting the Abstract
    abstract_temp = re.findall(r"\\begin{abstract}(.*?)\\end{abstract}", text, re.DOTALL)
    
    # remove comments
    abstract = re.sub(r"\%.*?\n", "", abstract_temp[0],  re.DOTALL)
    print(abstract)

    info["Abstract"] = abstract[0]

    return info

In [155]:
papers = get_data("Texs")
paper = get_data(papers[2])
paper[13]


'/home/amir/Projects/papyrus/hoopoe/Texs/2112_14214/paper.tex'

In [202]:
a = extract_info(paper[13], "Tex")

TypeError: expected string or bytes-like object

In [193]:
a

{'Author_main': 'Chametla et al.',
 'Authors': 'Raúl O. Chametla; Frédéric S. Masset; Clément Baruteau; Bertram Bitsch',
 'Title': 'eccentricity influences the pebble isolation mass',
 'Abstract': ' '}

In [78]:
sent = sent_tokenize(t) 
print(sent)

NameError: name 't' is not defined

In [50]:
#sent = nltk.corpus.treebank.tagged_sents()[22]
words = word_tokenize(t)
words_tagged = nltk.pos_tag(words)

#print(nltk.ne_chunk(words_tagged, binary=False))
tree = nltk.ne_chunk(words_tagged, binary=False)
print(tree)


(S
  Evaluating/VBG
  covid-19/JJ
  vaccine/NN
  efficacy/NN
  and/CC
  safety/NN
  in/IN
  the/DT
  post-authorisation/NN
  phase/NN
  When/WRB
  covid-19/NN
  vaccines/NNS
  were/VBD
  first/RB
  authorised/VBN
  ,/,
  regulators/NNS
  required/VBD
  post-authorisation/NN
  studies/NNS
  to/TO
  tackle/VB
  important/JJ
  uncertainties/NNS
  about/IN
  efficacy/NN
  and/CC
  safety/NN
  ./.
  But/CC
  these/DT
  studies/NNS
  may/MD
  have/VB
  little/JJ
  practical/JJ
  value/NN
  unless/IN
  there/EX
  is/VBZ
  greater/JJR
  engagement/NN
  and/CC
  scrutiny/NN
  from/IN
  the/DT
  wider/NN
  scientific/JJ
  community/NN
  ,/,
  argue/VBP
  (PERSON Christof/NNP Prugger/NNP)
  and/CC
  colleagues/NNS
  (PERSON Christof/NNP Prugger/NNP)
  ,/,
  1/CD
  (PERSON Angela/NNP Spelsberg/NNP)
  ,/,
  2/CD
  (PERSON Ulrich/NNP Keil/NNP)
  ,/,
  3/CD
  (PERSON Juan/NNP Erviti/NNP)
  ,/,
  4/CD
  (PERSON Peter/NNP Doshi5/NNP Expedited/NNP)
  approval/NN
  pathways/NNS
  have/VBP
  been/VBN
  in

In [52]:
IN = re.compile(r'.*\bin\b(?!\b.+ing)')
#for doc in nltk.corpus.ieer.parsed_docs('NYT_19980315'):

#VAN = re.compile(words_tagged, re.VERBOSE)
for doc in sent_tagged:
    for rel in nltk.sem.extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern = IN):
        print(nltk.sem.rtuple(rel))

AttributeError: 'tuple' object has no attribute 'text'

In [30]:
nltk.download('ieer')

[nltk_data] Downloading package ieer to /home/amir/nltk_data...
[nltk_data]   Unzipping corpora/ieer.zip.


True

In [53]:
nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
for token in doc:
    print(token.text, token.pos_, token.dep_)

OSError: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a Python package or a valid path to a data directory.

### Pipline:

In [5]:
data = get_data("pdfs")

In [27]:
data[0]

'/home/amir/Projects/papyrus/hoopoe/pdfs/bmj-2021-067570.full.pdf'

In [45]:
t = extract_text(data[1], "pdftotext_cli")

In [46]:
print(t)

Evaluating covid-19 vaccine efficacy and safety in the post-authorisation phase When covid-19 vaccines were first authorised, regulators required post-authorisation studies to tackle important uncertainties about efficacy and safety. But these studies may have little practical value unless there is greater engagement and scrutiny from the wider scientific community, argue Christof Prugger and colleagues Christof Prugger, 1 Angela Spelsberg, 2 Ulrich Keil, 3 Juan Erviti, 4 Peter Doshi5 Expedited approval pathways have been increasingly used over the past 30 years to bring new medicines to market. The basic premise has been to give patients earlier access to medicines, often achieved by relying on less robust forms of evidence at the time of approval, such as showing efficacy against surrogate endpoints rather than patient outcomes.1 Expedited approvals are often coupled with requirements to conduct post-authorisation studies to confirm that the medicines safely provide the anticipated b

In [17]:
extract_entities(t)

{'ACCESS',
 'ANALYSIS Landbauwissenschaft',
 'AZD1222',
 'Aachen',
 'Access',
 'Adults',
 'Afek A',
 'Aggarwal A',
 'American',
 'Aminawung JA',
 'Amit S',
 'Anderson EJ',
 'Andrews N',
 'Angela Spelsberg',
 'AstraZeneca',
 'Avorn J',
 'BMJ',
 'BMJ Opinion',
 'BNT162b2',
 'Baltimore',
 'Barda N',
 'Barnéoud L. Ce',
 'Basic',
 'Berlin',
 'BioNTech',
 'Brazil',
 'Bujkiewicz S.',
 'C4591001',
 'C4591011',
 'C4591012',
 'COVID Vaccine Trial Group',
 'Centre Aachen',
 'Centres',
 'ChAdOx1',
 'Christof Prugger',
 'Ciani O',
 'Cite',
 'Clemens SAC',
 'Clin Epidemiol',
 'Clinical',
 'Clinical Data',
 'Clinical Trial Group',
 'Clinical Trials Register',
 'Cochrane Methods Innovations Fund',
 'Colleges',
 'Dagan N',
 'Darrow JJ',
 'Davis C',
 'Doshi P',
 'Doshi P. Canada',
 'Doshi P. Pfizer',
 'Drug Administration',
 'EMA',
 'ENCePP',
 'EU',
 'Efficacy',
 'England',
 'Epidemiology',
 'EudraVigilance',
 'Europe',
 'European',
 'European Clinical Trials Register',
 'European Medicines',
 'European

In [47]:
tt = sent_tokenize(t, language='english', preserve_line=False)

TypeError: sent_tokenize() got an unexpected keyword argument 'preserve_line'

In [55]:
words = word_tokenize(t[10])
tags = nltk.pos_tag(words)
tree = nltk.ne_chunk(tags, binary=True)

In [56]:
tree.draw()

In [2]:
print(nltk.ne_chunk(tt[10], binary=True))

NameError: name 'nltk' is not defined