In [2]:
# add autoreload
%load_ext autoreload
%autoreload 2
import sickle
from tqdm import tqdm

In [3]:
tlist = ['cardiovascular', 'cardiogram', 'cardiology', 'heart', 'vascular']
base_url = 'http://dspace.library.uu.nl/oai/dissertation'
pdf_path = '//Ds/data/LAB/laupodteam/AIOS/Bram/language_modeling/MEDICAL_TEXT/RAW/PhDTheses'

In [4]:
sickler = sickle.Sickle(base_url)


In [5]:
sets = sickler.ListSets()

In [6]:
Sets = {}
for s in sets:
    Sets[s.setSpec]  = s.setName    

In [7]:
Sets_to_mine = []
for key, val in Sets.items():
    if 'umc' in val.lower():
        print(f'Set: {key} contains umcu')
        Sets_to_mine.append(key)
    elif 'diss' in val.lower():
        print(f'Set: {key} contains diss')
        Sets_to_mine.append(key)
    elif 'phd' in val.lower():
        print(f'Set: {key} contains phd') 
        Sets_to_mine.append(key)
    

Set: dissertation contains diss
Set: com_1874_298213 contains umcu
Set: col_1874_298214 contains umcu


In [49]:
link_list = []
error_list = []
records_list = []
for set_to_mine in tqdm(Sets_to_mine):
    records = sickler.ListRecords(metadataPrefix='oai_dc', ignore_deleted=True, set=set_to_mine) # dissertation com_1874_298213

    relevant_counter = 0
    for r in tqdm(records):
        records_list.append(r)      
        meta = r.get_metadata()
        relevant = False
        
        try:
            if any([t in subj for subj in meta['subject'] for t in tlist]):
                relevant = True
        except:
            pass
            
        try:
            if any([t in subj for subj in meta['description'] for t in tlist]):
                relevant = True
        except:
            pass

        try:
            if 'embargo' in meta['rights'][0].lower():
                relevant = False
        except:
            pass   
        
        if relevant:
            relevant_counter += 1
            link = meta['identifier'][0]
            baselink = link.replace('dspace.library.uu.nl/', 'dspace.library.uu.nl/bitstream/')
            linkPdf = baselink + '/full.pdf'
            try:
                linkPdfAlt = baselink +'/'+meta['creator'][0].split(',')[0].lower()+'.pdf'
            except:
                linkPdfAlt = None
            try:
                link_list.append({'Set':set_to_mine, 
                                'Link': link, 
                                'PdfLink': linkPdf,
                                'PdfLinkAlt': linkPdfAlt,
                                'Title': meta['title'][0],
                                'Description': meta['description'][0],
                                'Date': meta['date'][0],
                                'Language': meta['language'][0],
                                }
                                )
            except Exception as e:
                link_list.append({'Set':set_to_mine, 
                                'Link': link, 
                                'PdfLink': linkPdf,                                
                                'PdfLinkAlt': linkPdfAlt,
                                'Title': "whoopsie",
                                'Description': "daisey",
                                'Date': "in the present",
                                'Language': "klingon?",
                                }
                                )
                error_list.append(f'Error: {e} for link: {link}')
    print(f'Found {relevant_counter} relevant records in set: {set_to_mine}')      

10398it [01:19, 130.52it/s]<?, ?it/s]
 33%|███▎      | 1/3 [01:19<02:39, 79.74s/it]

Found 975 relevant records in set: dissertation


3463it [00:27, 127.50it/s]
 67%|██████▋   | 2/3 [01:46<00:48, 48.85s/it]

Found 784 relevant records in set: com_1874_298213


3463it [00:24, 141.88it/s]
100%|██████████| 3/3 [02:11<00:00, 43.83s/it]

Found 784 relevant records in set: col_1874_298214





In [50]:
# remove duplicate entries, based on the title
title_set = set()
unique_link_list = []
for el in link_list:
    try:
        title = el['Title']
        if title not in title_set:
            unique_link_list.append(el)
            title_set.add(title)
    except:
        pass

In [51]:
len(unique_link_list), len(link_list)

(970, 2543)

In [57]:
# now we parse the link list and download the pdfs
import requests
import os
headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.107 Safari/537.36' }

# add sleep
from time import sleep

def pdf_writer(pdf_url, _pdf_path):
    r = requests.get(pdf_url, stream=True)
    with open(_pdf_path, 'wb') as f:
        for i,chunk in enumerate(r.iter_content(chunk_size=1024)): 
            if chunk: # filter out keep-alive new chunks
                f.write(chunk)
    return i

pdf_error_list = []
skipped_list = []
for link in tqdm(unique_link_list):
    pdf_url = link['PdfLink']
    pdf_name = link['Link'].split('/')[-1] + '.pdf'
    _pdf_path = os.path.join(pdf_path, pdf_name)
    
    # check if pdf already exists
    if os.path.isfile(_pdf_path):
        skipped_list.append(f'Pdf already exists: {_pdf_path}')
        continue
    # try to download the pdf
    try:
        request_code = requests.get(pdf_url, headers=headers).status_code
        if request_code == 404:
            # try alternative link
            pdf_url = link['PdfLinkAlt']
            request_code = requests.get(pdf_url, headers=headers).status_code
        if request_code == 200:
            i = pdf_writer(pdf_url, _pdf_path)
        else:
            pdf_error_list.append(f'Error: got request code {request_code} : {link["Link"]}, {link["PdfLink"]}')
    
    except Exception as e:
        pdf_error_list.append(f'Error: {e} for link: {link["Link"]}')
        
    sleep(1)

print(f'Found {len(pdf_error_list)} errors while downloading pdfs')
print(f'Skipped {len(skipped_list)} pdfs because they already existed')

100%|██████████| 970/970 [38:58<00:00,  2.41s/it]  

Found 542 errors while downloading pdfs
Skipped 155 pdfs because they already existed





In [58]:
pdf_error_list

['Error: got request code 404 : https://dspace.library.uu.nl/handle/1874/515, https://dspace.library.uu.nl/bitstream/handle/1874/515/full.pdf',
 'Error: got request code 404 : https://dspace.library.uu.nl/handle/1874/1705, https://dspace.library.uu.nl/bitstream/handle/1874/1705/full.pdf',
 'Error: got request code 404 : https://dspace.library.uu.nl/handle/1874/26201, https://dspace.library.uu.nl/bitstream/handle/1874/26201/full.pdf',
 'Error: got request code 404 : https://dspace.library.uu.nl/handle/1874/26611, https://dspace.library.uu.nl/bitstream/handle/1874/26611/full.pdf',
 'Error: got request code 404 : https://dspace.library.uu.nl/handle/1874/27029, https://dspace.library.uu.nl/bitstream/handle/1874/27029/full.pdf',
 'Error: got request code 404 : https://dspace.library.uu.nl/handle/1874/27482, https://dspace.library.uu.nl/bitstream/handle/1874/27482/full.pdf',
 'Error: got request code 404 : https://dspace.library.uu.nl/handle/1874/27576, https://dspace.library.uu.nl/bitstream

## Extract PDF text

In [139]:
# read in pdf and extract text
# We want to ignore all the decorum and only extract the text (i.e. no page number etc.)
import pytesseract
from PyPDF2 import PdfReader
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO
import re

In [102]:
def pdf_to_text(path, backend='pypdf'):
    '''Extract text from pdf documents
        Source: https://towardsdatascience.com/pdf-preprocessing-with-python-19829752af9f
    '''

    if backend=='pdfminer':
        manager = PDFResourceManager()
        retstr = StringIO()
        layout = LAParams(all_texts=False, detect_vertical=True)
        device = TextConverter(manager, retstr, laparams=layout)
        interpreter = PDFPageInterpreter(manager, device)
        with open(path, 'rb') as filepath:
            for page in PDFPage.get_pages(filepath, check_extractable=True):
                interpreter.process_page(page)
        text = retstr.getvalue()
        device.close()
        retstr.close()
        return text
    elif backend=='pypdf':
        reader = PdfReader(path)
        return [p.extract_text(0) for p in reader.pages]
    elif backend=='pytesseract':
        return pytesseract.image_to_string(path)

In [90]:
Files = [f for f in os.listdir(pdf_path) if f.endswith('.pdf')]

In [130]:
_File = Files[113]
Text = pdf_to_text(os.path.join(pdf_path, _File), backend='pypdf')

## Extract Summary from text

In [135]:
page_nums_samenvatting = [i for i,p in enumerate(Text) if 'samenvatting' in p.lower()]
page_nums_dankwoord = [i for i,p in enumerate(Text) if any([t in p.lower() for t in ['dankwoord', 'nawoord']])]
page_nums_empty = [i for i,p in enumerate(Text) if p.strip()=='']
page_nums_ToC = [i for i,p in enumerate(Text) if 'contents' in p.lower()]
print(page_nums_samenvatting, page_nums_dankwoord, page_nums_empty, page_nums_ToC, _File)
# extract the text from the pdfs based on the page_nums:
# if there is a page number, we assume that text on that page is relevant

# then we want to find the delimiters for the different sections



[2, 4, 148, 150, 152] [4, 158] [5, 48] [4] 215182.pdf


In [141]:
ToC_num = min([n for n in page_nums_ToC if n>1])
ToC_page = Text[ToC_num]
re.findall(r'Samenvatting[\s\t]+(\d+)', ToC_page)

['149']

## Translate the remainder