In [87]:
# add autoreload
%load_ext autoreload
%autoreload 2
import sickle
from tqdm import tqdm
import os

import requests
import lxml
import bs4
import random
from time import sleep

import oai

import pandas as pd

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [88]:
source = 'Tilburg'

In [89]:
tlist = ['athero', 
         'plaque', 
         'cardiovascular',
         'cardiogram', 
         'cardiology', 
         'cardiologie',
         'hartvaten',
         'klinsch',
         'medische',
         'hartvaat',
         'heart', 
         'vascular',
         'angiogram', 
         'cardiologie', 
         'hartziekte', 
         'vaatziekte',
         'medicine',
         'disease', 
         'medical', 
         'therapy',
         'therapeutic',
         'diagnosic',
         'clinical',
         'surgical', 
         'metabolic',
         'myocard',]

tlist = ['recht', 'wetten', 'juridisch', 'wetboek', 'jurisprudentie', 'precedent', 'wet bibop', 'wetboek van strafrecht', 'wetboek van strafvordering']

base_url =   oai.sources[source]['link'] #'https://repository.ubn.ru.nl/oai/openaire'  https://scholarlypublications.universiteitleiden.nl/oai2, http://dspace.library.uu.nl/oai/dissertation
pdf_path = f'//Ds/data/LAB/laupodteam/AIOS/Bram/language_modeling/MEDICAL_TEXT/RAW/PhDTheses/{source}/'

In [90]:
OpenAIRE_institutes = ['VU', 'UVA', 'Maastricht', 'Tilburg', 'RUG', 'UTwente', 'TUE', 'UU', 'Erasmus']


In [91]:
sickler = sickle.Sickle(base_url)


In [92]:
sets = sickler.ListSets()

In [93]:
Sets = {}
for s in sets:
    Sets[s.setSpec]  = s.setName    

In [94]:
keywords = ['clinical', 'medisch', 'medical', 'dissertation', 'umc', 'medicine',
            'diss', 'phd', 'thesis', 'doctorate', 'dissertatie',
            'doctoraat', 'proefschrift']
if source in OpenAIRE_institutes:
    keywords = keywords + ['publications:withfiles']

Sets_to_mine = []
for key, val in Sets.items():
    #print(key,val)
    if any([c in val.lower() for c in keywords]) | any([c in key.lower() for c in keywords]):
        print(f'Set: {key} contains keyword')
        Sets_to_mine.append(key)

Set: publications:withFiles contains keyword


In [95]:
# get records 
# Beware: this takes a long time.
from collections import defaultdict
records_lists = defaultdict(list)
for set_to_mine in Sets_to_mine:
    #if set_to_mine in ['com_1874_298213']:
    #    continue
    print(f"Mining from set: {set_to_mine}")
    try:
        records = sickler.ListRecords(metadataPrefix='oai_dc', 
                                    ignore_deleted=True, 
                                    set=set_to_mine) # dissertation com_1874_298213
        for record in tqdm(records):
            records_lists[set_to_mine].append(record)
    except Exception as e:
        print(e)
        continue

Mining from set: publications:withFiles


24925it [09:42, 42.82it/s]


In [96]:
filtered_records_lists = defaultdict(list)
excluded_records_lists = defaultdict(list)

cond_list = []

for set_to_mine in Sets_to_mine:
    relevant_counter = 0
    for r in tqdm(records_lists[set_to_mine]):
        meta = r.get_metadata()
        relevant = False
        
        TOPIC = False
        PDF = False
        DOCTORATE = True if 'dissertation' in set_to_mine.lower() else False
        EMBARGO = False

        if source in ['Radboud']: 
            PDF=True
        
        try:
            if any([t in subj.lower() for subj in meta['subject'] for t in tlist]):
                TOPIC = True
        except:
            pass

        try:
            if any([t in subj.lower() for subj in meta['title'] for t in tlist]):
                TOPIC = True
        except:
            pass

        try:
            if any([t in subj.lower() for subj in meta['description'] for t in tlist]):
                TOPIC = True
        except:
            pass                
            
        try:
            if ('pdf' in meta['format'][0].lower()):
                PDF = True
        except:
            pass
        
        try:
            if ('doctoral' in meta['type'][0].lower()) |\
                    ('book' in meta['type'][0].lower()):
                DOCTORATE = True
        except:
            pass

        try:
            if ('embargo' in meta['rights'][0].lower()) |\
                    ('restricted' in meta['rights'][0].lower()):
                EMBARGO = True
        except:
            pass  
        
        cond_list.append({'Topic': TOPIC, 
                          'PDF': PDF, 
                          'Doctorate': DOCTORATE, 
                          'Embargo': EMBARGO}
                         )
        
        if TOPIC & PDF & DOCTORATE & ~EMBARGO:
            relevant_counter += 1
            filtered_records_lists[set_to_mine].append(r)
        else:
            excluded_records_lists[set_to_mine].append(r)

    print(f'Found {relevant_counter} relevant records in set: {set_to_mine}')      

100%|██████████| 24925/24925 [00:02<00:00, 11754.22it/s]

Found 359 relevant records in set: publications:withFiles





In [97]:
types_counts = defaultdict(int)
for r in excluded_records_lists['publications:withFiles']:
    try:
        types_counts[r.metadata['type'][0]] += 1
    except:
        pass

In [98]:
types_counts

defaultdict(int,
            {'article': 12069,
             'book': 5644,
             'bookPart': 1833,
             'other': 362,
             'contributionToPeriodical': 706,
             'workingPaper': 3864,
             'conferenceObject': 88})

In [99]:
conds_df = pd.DataFrame(cond_list)

In [100]:
conds_df.sum()

Topic         1294
PDF          24349
Doctorate     7836
Embargo      10646
dtype: int64

In [101]:
link_list = []
error_list = []

for set_to_mine in filtered_records_lists.keys():
    for record in tqdm(filtered_records_lists[set_to_mine]):
        META = record.get_metadata()
        try:
            List_of_identifiers = META['identifier'] if 'identifier' in META.keys() else ['']
            Title = META['title'] if 'title' in META.keys() else ['']
            Description = META['description'] if 'description' in META.keys() else ['']
            Date = META['date'] if 'date' in META.keys() else ['']
            Language = META['language'] if 'language' in META.keys() else ['']       
            Creator = META['creator'] if 'creator' in META.keys() else ['']
            
            
            List_of_identifiers = [id for id in List_of_identifiers if id is not None]
            
            if len(List_of_identifiers)>0:
                link_list.append({'Set': set_to_mine, 
                                'Link': List_of_identifiers, 
                                'Title': Title,
                                'Description': Description,
                                'Date': Date ,
                                'Language': Language,
                                'Creator': Creator,
                                'Publisher': META.get('publisher'),
                                }
                                )
            else:
                continue
        except Exception as e:
            error_list.append(f"Error parsing {e}: {META}: ")
            pass

100%|██████████| 359/359 [00:00<00:00, 8255.92it/s]


In [102]:
publisher_counts = defaultdict(int)
for r in link_list:
    try:
        publisher_counts[r['Publisher']] += 1
    except:
        pass
publisher_counts

defaultdict(int, {None: 22})

In [106]:
link_list[80]

{'Set': 'publications:withFiles',
 'Link': ['https://research.tilburguniversity.edu/en/publications/8b5ba7c6-2467-4e66-aa35-fa78fe44858c',
  'https://pure.uvt.nl/ws/files/1063325/3240346.pdf',
  'urn:ISBN:9789058503558'],
 'Title': ['Een rechtsvergelijkend onderzoek naar de positie van consumenten op de Nederlandse en Belgische woningbouwmarkt:Bijdrage aan een mogelijke Europese regulering'],
 'Description': [''],
 'Date': ['2008'],
 'Language': ['nld'],
 'Creator': ['Dierikx, M.'],
 'Publisher': ['Wolf Legal Publishers (WLP)']}

In [18]:
import re

In [23]:
def get_pdfs_Leiden(url):
    try:
        r = requests.get(url, timeout=10)
        # random number between 0.5 and 2.5 seconds
        rndSleep = round(random.uniform(1, 5), 2)
        sleep(rndSleep)
        soup = bs4.BeautifulSoup(r.text, 'html.parser')
        found, dsfound, esfound = False, False, False
        for _res in soup.findAll('li', {'class':'ubl-file-view'}):
            if _res.a is not None:
                if _res.a.contents[0].strip().lower() == 'full text':
                    _pdfdir = _res.a['href']
                    found = True
                elif _res.a.contents[0].strip().lower() == 'summary in dutch':
                    _dutch_summary = _res.a['href']
                    dsfound = True
                elif _res.a.contents[0].strip().lower() == 'summary in english':
                    _english_summary = _res.a['href']
                    esfound = True

        linkPdfAlt = f"https://scholarlypublications.universiteitleiden.nl{_pdfdir}" if found else None
        DutchSummaryLink = f"https://scholarlypublications.universiteitleiden.nl{_dutch_summary}" if dsfound else None
        EnglishSummaryLink = f"https://scholarlypublications.universiteitleiden.nl{_english_summary}" if esfound else None
        
        return linkPdfAlt, DutchSummaryLink, EnglishSummaryLink, r.status_code
    except Exception as e:
        return None, None, None, None

In [58]:
def extract_pdf_links(links, institute=None):
    pdf_links = []
    if institute in ['VU', 'UVA', 'UTwente']:
        inclusion_terms = [r'abstract', r'full', r'complete', r'samenvatting', r'summary', r'thesis', r'chapter']
    elif institute in ['Radboud']:
        inclusion_terms = [r'handle', r'bitstream']
    elif institute in ['Maastricht']:
        inclusion_terms = [r'ASSET1', r'c[0-9]{3,4}\.pdf']
    elif institute in ['Tilburg']:
        inclusion_terms = [r'\.pdf']
    elif institute in ['RUG']:
        inclusion_terms = [r'summ\.pdf', r'summary\.pdf',  r'samenv\.pdf', r'samenvat\.pdf', r'[ch][0-9]{1,2}\.pdf', 
                           r'thesis\.pdf', r'proefschrift\.pdf', r'dissertation\.pdf', r'dissertatie\.pdf']
    elif institute in ['TUE']:
        inclusion_terms = [r'summ\.pdf', r'summary\.pdf',  r'samenv\.pdf', r'samenvat\.pdf', r'[ch][0-9]{1,2}\.pdf', 
                           r'thesis\.pdf', r'proefschrift\.pdf', r'dissertation\.pdf', r'dissertatie\.pdf']
        inclusion_terms = inclusion_terms + [r'abstract', r'full', r'complete', r'samenvatting', r'summary', r'thesis', r'chapter']
    elif institute in ['UU']:
        inclusion_terms = [r'dspace\.library\.uu\.nl']
    elif institute in ['Leiden']:
        inclusion_terms = [r'handle']
    elif institute in ['Erasmus']:
        inclusion_terms = [r'files']
    else:
        raise ValueError(f'Institute {institute} not recognized')
    
    inclusion_terms = [re.compile(rs) for rs in inclusion_terms]
    
    if institute in ['UU']:
        _pdf_links = []
        for link in links:
            if (link.lower().startswith('http')):
                baselink = link.replace('dspace.library.uu.nl/', 'dspace.library.uu.nl/bitstream/')
                _pdf_links.append(baselink + '/full.pdf')
        pdf_links = _pdf_links
    elif institute in ['Leiden']:
        _pdf_links = []
        for link in links:
            if (link.lower().startswith('http')) & (any([t.search(link) is not None for t in inclusion_terms])):
                main, dutch_summ, engl_summ, return_code = get_pdfs_Leiden(link)
                if return_code == 429:
                    print("Too many requests. Waiting 60 seconds")
                    sleep(60)
                else:
                    _pdf_links.extend([main, dutch_summ, engl_summ])
        pdf_links = [l for l in _pdf_links if l is not None]
    else:
        for link in links:
            if (link.lower().startswith('http')) & \
                    (link.lower().endswith('.pdf')):
                if any([t.search(link) is not None for t in inclusion_terms]):
                    pdf_links.append(link)        

    return pdf_links

In [59]:
len(link_list)

1119

In [60]:
link_pdf_list = []
for l in tqdm(link_list):
    links = extract_pdf_links(l['Link'], institute=source)
    tmp = []
    for _l in links:
        Creator = l['Creator'][0] if l['Creator'][0] is not None else 'Unknown'
        Date = l['Date'][0] if l['Date'][0] is not None else 'Unknown'
        
        pdfPath = _l.split("/")[-1].replace("%20", "_").rstrip('.pdf')
        pdfPath = source + "_" + Creator + "_" + Date + "_" + pdfPath
        pdfPath = pdfPath.replace(",", "")
        pdfPath = pdfPath.replace(":", "")
        pdfPath = pdfPath.replace(".", "")
        pdfPath = pdfPath.replace(" ", "")
        pdfPath = os.path.join(pdf_path, pdfPath+".pdf")
        tmp.append((_l, pdfPath))
    if len(tmp)==0:
        continue
    l['pdf_links'] = tmp
    link_pdf_list.append(l)

100%|██████████| 1119/1119 [00:00<00:00, 48589.71it/s]


In [62]:
len(link_pdf_list)

1119

In [63]:
# remove duplicate entries, based on the title
title_set = set()
unique_link_list = []
for el in link_pdf_list:
    try:
        title = el['Title'][0]
        if title not in title_set:
            unique_link_list.append(el)
            title_set.add(title)
    except Exception as e:
        pass

In [64]:
len(unique_link_list)

1118

In [65]:
# now we parse the link list and download the pdfs
headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.107 Safari/537.36' }

# add sleep
def pdf_writer(pdf_url, _pdf_path):
    r = requests.get(pdf_url, stream=True)
    if r.status_code == 200:
        with open(_pdf_path, 'wb') as f:
            for i,chunk in enumerate(r.iter_content(chunk_size=1024)): 
                if chunk: # filter out keep-alive new chunks
                    f.write(chunk)
        return True, 200
    else:
        return False, r.status_code

In [66]:
if source=='Radboud':
    filelist = os.listdir(pdf_path)
    pre_list = [f.split("_")[-1] for f in filelist]
if source=='UU':
    filelist = os.listdir(pdf_path)
    pre_list = [f.split(".")[-2] for f in filelist]

pdf_error_list = []
skipped_list = []
src_list = []
rcode_list = []
success_list = []
extract_full_text = True
for link in tqdm(unique_link_list):
    for lt in link['pdf_links']:
        _pdf_path = lt[1]
        pdf_url = lt[0]
        return_code = None

        if source=='Radboud':
            if _pdf_path.split("_")[-1] in pre_list:
                skipped_list.append(f'Pdf already exists: {_pdf_path}')
                continue
        if source=='UU':
            if link['pdf_links'][0][0].split("/")[-2] in pre_list:
                skipped_list.append(f'Pdf already exists: {_pdf_path}')
                continue
            
        if os.path.isfile(_pdf_path):
            skipped_list.append(f'Pdf already exists: {_pdf_path}')
        else:
            # try to download the pdf
            try:
                write_file, return_code = pdf_writer(pdf_url, _pdf_path)
                success_list.append(f'Pdf downloaded: {_pdf_path}')
            except Exception as e:
                pdf_error_list.append(f'Error: {e} for link: {link["Link"]}')  

            if return_code == 429:
                print(f'Rate limit exceeded...sleeping 30 seconds')
                sleep(30)
                continue
            else:
                sleep(4)


100%|██████████| 1118/1118 [3:43:14<00:00, 11.98s/it] 


In [67]:
print(f'Found {len(pdf_error_list)} errors while downloading pdfs')
print(f'Skipped {len(skipped_list)} pdfs because they already existed')

Found 0 errors while downloading pdfs
Skipped 5 pdfs because they already existed
