In [1]:
# add autoreload
%load_ext autoreload
%autoreload 2
import sickle
from tqdm import tqdm
import os
import requests
import lxml
import bs4
import random
from time import sleep


In [89]:
# Lower-case search terms used to flag cardiology-related records.
tlist = [
    "cardiovascular",
    "cardiogram",
    "cardiology",
    "heart",
    "vascular",
]

# Selects which repository-specific URL scheme the link-building cell applies.
institute = "LUMC"

# OAI endpoint to harvest. Other endpoints noted by the author:
#   https://scholarlypublications.universiteitleiden.nl/oai2
#   http://dspace.library.uu.nl/oai/dissertation
#   https://pure.uvt.nl/ws/oai?metadataPrefix=oai_dc
base_url = "https://oai.narcis.nl/oai2"

# Directory that downloaded PDFs are written to and later read from.
pdf_path = "//Ds/data/LAB/laupodteam/AIOS/Bram/language_modeling/MEDICAL_TEXT/RAW/PhDTheses"

In [90]:
# Sickle client bound to the endpoint configured in `base_url`.
sickler = sickle.Sickle(base_url)


In [91]:
# Issue a ListSets request; the result is iterated in the next cell.
sets = sickler.ListSets()

In [92]:
# Map every set's spec (its identifier) to its display name.
Sets = {entry.setSpec: entry.setName for entry in sets}

In [93]:
Sets

{}

In [94]:
# Keep the specs of all sets whose name hints at dissertations / PhD theses.
thesis_hints = ('diss', 'phd', 'thesis')
Sets_to_mine = []
for set_spec, set_name in Sets.items():
    lowered_name = set_name.lower()
    if not any(hint in lowered_name for hint in thesis_hints):
        continue
    print(f'Set: {set_spec} contains diss')
    Sets_to_mine.append(set_spec)

In [50]:
Sets_to_mine

[]

In [None]:
# get records
# Harvest the non-deleted oai_dc records of every selected set, grouped by set spec.
from collections import defaultdict

records_lists = defaultdict(list)
for set_to_mine in tqdm(Sets_to_mine):
    harvested = sickler.ListRecords(
        metadataPrefix='oai_dc',
        ignore_deleted=True,
        set=set_to_mine,  # e.g. dissertation com_1874_298213
    )
    for rec in harvested:
        records_lists[set_to_mine].append(rec)

In [None]:
# Peek at the identifiers of the first harvested record (None when nothing was harvested).
# Reading `meta` here only worked through leftover kernel state: `meta` is first
# assigned inside the link-building loop in a later cell, so a fresh
# Restart & Run All raised NameError.
next(
    (
        rec.get_metadata().get('identifier')
        for recs in records_lists.values()
        for rec in recs
    ),
    None,
)

In [None]:
# Build `link_list`: one dict per relevant record with its landing page, PDF
# links, optional summary links and basic metadata. Problems are collected in
# `error_list` instead of interrupting the harvest.

LEIDEN_REPO_URL = "https://scholarlypublications.universiteitleiden.nl"


def _first_or_none(meta, key):
    """Return the first value stored under ``key`` in a metadata dict, or None."""
    values = meta.get(key) or []
    return values[0] if values else None


def _is_relevant(meta, terms):
    """Decide whether a record should be downloaded.

    A record is relevant when any term occurs (case-insensitively) in one of
    its 'subject' or 'description' values, unless its first 'rights' value
    mentions an embargo. Missing fields and empty (None) values are skipped.
    """
    lowered_terms = [term.lower() for term in terms]

    def mentions(field):
        return any(
            term in (text or '').lower()
            for text in (meta.get(field) or [])
            for term in lowered_terms
        )

    relevant = mentions('subject') or mentions('description')
    rights = meta.get('rights') or []
    if rights and 'embargo' in (rights[0] or '').lower():
        return False
    return relevant


def _scrape_leiden_downloads(landing_url):
    """Scrape a Leiden landing page for its download links.

    Looks at the first anchor of every <li class="ubl-file-view"> and returns
    a dict mapping the lower-cased anchor label ('full text',
    'summary in dutch', 'summary in english') to an absolute URL. Labels that
    are not present on the page are absent from the dict.

    Raises the requests exception for network errors and non-2xx responses.
    """
    wanted = ('full text', 'summary in dutch', 'summary in english')
    response = requests.get(landing_url, timeout=30)
    # Random pause between 0.5 and 2.5 seconds between consecutive page requests.
    sleep(round(random.uniform(0.5, 2.5), 2))
    response.raise_for_status()
    soup = bs4.BeautifulSoup(response.text, 'html.parser')

    downloads = {}
    for item in soup.find_all('li', {'class': 'ubl-file-view'}):
        anchor = item.a
        if anchor is None or not anchor.contents:
            continue
        first_child = anchor.contents[0]
        if not isinstance(first_child, str):
            # The label is expected to be the anchor's leading text node.
            continue
        label = first_child.strip().lower()
        href = anchor.get('href')
        if label in wanted and href:
            downloads[label] = f"{LEIDEN_REPO_URL}{href}"
    return downloads


if institute not in ('LUMC', 'UU'):
    # Previously an unknown institute left every link variable unbound.
    raise ValueError(f'Unsupported institute: {institute!r}')

link_list = []
error_list = []
for set_to_mine in Sets_to_mine:
    relevant_counter = 0
    for record in tqdm(records_lists[set_to_mine]):
        meta = record.get_metadata()
        if not _is_relevant(meta, tlist):
            continue
        relevant_counter += 1

        # Reset per record so a failure can never reuse the previous record's links.
        link = linkPdf = linkPdfAlt = None
        DutchSummaryLink = EnglishSummaryLink = None
        identifiers = meta.get('identifier') or []

        try:
            if institute == 'LUMC':
                # Prefer the first identifier that is a URL; fall back to the last one.
                source_url = next(
                    (i for i in identifiers if i and ('http:' in i or 'https:' in i)),
                    None,
                )
                if source_url is None:
                    error_list.append(f'No link found for {identifiers}')
                    source_url = identifiers[-1]

                doc_id = source_url.split('/')[-1]
                # NOTE(review): the direct download URL uses the handle id + 2; this
                # offset comes from the original code and is not verified here. The
                # scraped 'full text' link below is kept as the alternative.
                doc_id_int = int(doc_id) + 2

                link = f"{LEIDEN_REPO_URL}/handle/1887/{doc_id}"
                linkPdf = f"{LEIDEN_REPO_URL}/access/item%3A{doc_id_int}/download"

                downloads = _scrape_leiden_downloads(link)
                linkPdfAlt = downloads.get('full text')
                DutchSummaryLink = downloads.get('summary in dutch')
                EnglishSummaryLink = downloads.get('summary in english')
            else:  # institute == 'UU'
                link = identifiers[0]
                baselink = link.replace('dspace.library.uu.nl/', 'dspace.library.uu.nl/bitstream/')
                linkPdf = baselink + '/full.pdf'
                creator = _first_or_none(meta, 'creator')
                if creator:
                    # Alternative file name: the part of the first creator before the comma.
                    linkPdfAlt = baselink + '/' + creator.split(',')[0].lower() + '.pdf'
        except Exception as e:
            error_list.append(f'Error: {e} for link: {link}, with meta data: {identifiers}')

        if link is None or linkPdf is None:
            # Nothing downloadable could be derived for this record.
            continue

        entry = {
            'Set': set_to_mine,
            'Link': link,
            'PdfLink': linkPdf,
            'PdfLinkAlt': linkPdfAlt,
            'DutchSummaryLink': DutchSummaryLink,
            'EnglishSummaryLink': EnglishSummaryLink,
            'Title': _first_or_none(meta, 'title'),
            'Description': _first_or_none(meta, 'description'),
            'Date': _first_or_none(meta, 'date'),
            'Language': _first_or_none(meta, 'language'),
        }
        # Each field falls back to None on its own, so one missing field no
        # longer wipes the others or the summary links.
        missing = [k for k in ('Title', 'Description', 'Date', 'Language') if entry[k] is None]
        if missing:
            error_list.append(f'Error: missing metadata {missing} for link: {link}')
        link_list.append(entry)
    print(f'Found {relevant_counter} relevant records in set: {set_to_mine}')

In [None]:
len(link_list)

In [None]:
# remove duplicate entries, based on the title
title_set = set()
unique_link_list = []
for el in link_list:
    try:
        title = el['Title']
        if title not in title_set:
            unique_link_list.append(el)
            title_set.add(title)
    except:
        pass

In [None]:
len(unique_link_list), len(link_list)

In [None]:
# now we parse the link list and download the pdfs

# Browser-like User-Agent header.
# NOTE(review): no request in this notebook is given `headers`; confirm whether the downloads should send it.
headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.107 Safari/537.36' }

# add sleep

def pdf_writer(pdf_url, _pdf_path, headers=None, timeout=60):
    """Stream ``pdf_url`` to ``_pdf_path``.

    Parameters
    ----------
    pdf_url : str or None
        URL to download. A falsy value returns False without any request.
    _pdf_path : str
        Destination file path.
    headers : dict, optional
        Extra HTTP headers for the request.
    timeout : float, optional
        Timeout in seconds passed to ``requests.get``.

    Returns
    -------
    bool
        True when the server answered 200 and the file was written, else False.

    Raises
    ------
    requests.RequestException, OSError
        Network and file-system errors propagate to the caller.
    """
    if not pdf_url:
        return False

    # Write to a temporary name first: the callers treat an existing target
    # file as "already downloaded", so a truncated file must never be left there.
    tmp_path = f'{_pdf_path}.part'
    with requests.get(pdf_url, stream=True, headers=headers, timeout=timeout) as r:
        if r.status_code != 200:
            return False
        try:
            with open(tmp_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024):
                    if chunk:  # filter out keep-alive new chunks
                        f.write(chunk)
        except BaseException:
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
            raise
    os.replace(tmp_path, _pdf_path)
    return True

pdf_error_list = []
skipped_list = []
src_list = []
rcode_list = []
extract_full_text = True


def _download_summary(url, target, label):
    """Download one summary PDF unless ``target`` already exists.

    ``label`` ('dutch' / 'english') is only used in the error messages.
    Failures are appended to ``pdf_error_list``; nothing is raised.
    """
    if os.path.isfile(target):
        skipped_list.append(f'Pdf already exists: {target}')
        return
    try:
        if not pdf_writer(url, target):
            pdf_error_list.append(f'Error: No success for {label} summary: {url}')
    except Exception as e:
        pdf_error_list.append(f'Error: {e} for {label} summary: {url}')
    sleep(1)


for link in tqdm(unique_link_list):
    # MAIN PDF
    pdf_name = link['Link'].split('/')[-1] + '.pdf'
    _pdf_path = os.path.join(pdf_path, pdf_name)
    # Summary files share the main file's stem; splitext only touches the
    # extension, unlike str.replace which would also hit '.pdf' in a directory name.
    _pdf_stem = os.path.splitext(_pdf_path)[0]

    if extract_full_text:
        # check if pdf already exists
        if os.path.isfile(_pdf_path):
            skipped_list.append(f'Pdf already exists: {_pdf_path}')
        else:
            # try the original link first, then the alternative one
            try:
                pdf_url = link['PdfLink']
                pdf_url_alt = link['PdfLinkAlt']
                if pdf_url and pdf_writer(pdf_url, _pdf_path):
                    # Only log a source once the download actually succeeded.
                    src_list.append(f'Pdf downloaded from original link: {pdf_url}')
                elif pdf_url_alt and pdf_writer(pdf_url_alt, _pdf_path):
                    src_list.append(f'Pdf downloaded from alternative link: {pdf_url_alt}')
                else:
                    pdf_error_list.append(f'Error: No success for both : {link["PdfLinkAlt"]} and {link["PdfLink"]}')
            except Exception as e:
                pdf_error_list.append(f'Error: {e} for link: {link["Link"]}')
            sleep(1)

    if link['DutchSummaryLink'] is not None:
        # The existence check is on the summary's own path, not the main PDF's.
        _download_summary(link['DutchSummaryLink'], _pdf_stem + '_dutch_summary.pdf', 'dutch')

    if link['EnglishSummaryLink'] is not None:
        _download_summary(link['EnglishSummaryLink'], _pdf_stem + '_english_summary.pdf', 'english')

print(f'Found {len(pdf_error_list)} errors while downloading pdfs')
print(f'Skipped {len(skipped_list)} pdfs because they already existed')

## Extract PDF text

In [None]:
# read in pdf and extract text
# We want to ignore page decoration (headers, footers, page numbers, etc.) and extract only the body text.
import pytesseract
from PyPDF2 import PdfReader
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO
import re

In [None]:
def pdf_to_text(path, backend='pypdf'):
    '''Extract text from a pdf document.

    Source: https://towardsdatascience.com/pdf-preprocessing-with-python-19829752af9f

    Parameters
    ----------
    path : str
        Path of the pdf file.
    backend : {'pypdf', 'pdfminer', 'pytesseract'}
        Extraction library to use.

    Returns
    -------
    list of str or str
        'pypdf' returns one string per page; 'pdfminer' and 'pytesseract'
        return a single string. Callers that index pages need 'pypdf'.

    Raises
    ------
    ValueError
        If ``backend`` is not one of the supported names (this previously
        returned None silently).
    '''
    if backend not in ('pdfminer', 'pypdf', 'pytesseract'):
        raise ValueError(
            f"Unknown backend {backend!r}; expected 'pdfminer', 'pypdf' or 'pytesseract'"
        )

    if backend == 'pdfminer':
        manager = PDFResourceManager()
        retstr = StringIO()
        layout = LAParams(all_texts=False, detect_vertical=True)
        device = TextConverter(manager, retstr, laparams=layout)
        try:
            interpreter = PDFPageInterpreter(manager, device)
            with open(path, 'rb') as filepath:
                for page in PDFPage.get_pages(filepath, check_extractable=True):
                    interpreter.process_page(page)
            return retstr.getvalue()
        finally:
            # Release the converter and buffer even when a page fails to parse.
            device.close()
            retstr.close()

    if backend == 'pypdf':
        reader = PdfReader(path)
        # NOTE(review): the positional 0 is passed straight to extract_text; its
        # meaning depends on the installed PyPDF2 version. Confirm against its docs.
        return [p.extract_text(0) for p in reader.pages]

    # backend == 'pytesseract'
    # NOTE(review): image_to_string is handed the pdf path as-is; confirm that the
    # installed pytesseract accepts pdf input without converting pages to images.
    return pytesseract.image_to_string(path)

In [None]:
# File names (not full paths) of every entry in `pdf_path` ending in '.pdf'.
Files = [f for f in os.listdir(pdf_path) if f.endswith('.pdf')]

In [None]:
# Pick one pdf for inspection; with backend='pypdf' `Text` is a list with one string per page.
# NOTE(review): index 245 is hard-coded and raises IndexError when fewer than 246 pdfs are present.
_File = Files[245]
Text = pdf_to_text(os.path.join(pdf_path, _File), backend='pypdf')

## Extract Summary from text

In [None]:
def _pages_containing(pages, *needles):
    """Return the indices of pages whose lower-cased text contains any needle."""
    return [
        idx
        for idx, page in enumerate(pages)
        if any(needle in page.lower() for needle in needles)
    ]


# Candidate pages per section, found by keyword on the lower-cased page text.
page_nums_samenvatting = _pages_containing(Text, 'samenvatting')
page_nums_dankwoord = _pages_containing(Text, 'dankwoord', 'nawoord')
page_nums_empty = [idx for idx, page in enumerate(Text) if not page.strip()]
page_nums_ToC = _pages_containing(Text, 'content')
print(page_nums_samenvatting, page_nums_dankwoord, page_nums_empty, page_nums_ToC, _File)
# extract the text from the pdfs based on the page_nums:
# if there is a page number, we assume that text on that page is relevant

# then we want to find the delimiters for the different sections



In [None]:
Text[10]

In [None]:
def get_samenvatting_page(txts, page_nums_ToC):
    """Read the page number of the 'Samenvatting' entry from the table of contents.

    Parameters
    ----------
    txts : sequence of str
        Page texts of one document, indexed by page.
    page_nums_ToC : iterable of int
        Indices of pages that may hold the table of contents. Only indices
        greater than 1 are considered, and the smallest of those is used.

    Returns
    -------
    int or None
        The number printed after 'Samenvatting' on that page, or None when
        there is no candidate page or no such entry on it.
    """
    candidates = [n for n in page_nums_ToC if n > 1]
    if not candidates:
        return None
    ToC_page = txts[min(candidates)]
    # re.findall returns a list, so int(re.findall(...)) always raised TypeError;
    # take the first match instead. Dot leaders ('Samenvatting ..... 153') and
    # any capitalisation are accepted as well.
    match = re.search(r'Samenvatting[\s.]+(\d+)', ToC_page, flags=re.IGNORECASE)
    return int(match.group(1)) if match else None

# first element in ToC, after the samenvatting



## Translate the remainder

In [None]:
# get all text starting from the first introduction header, after the table of contents.
