In [128]:
# add autoreload
%load_ext autoreload
%autoreload 2
import sickle
from tqdm import tqdm
import os
import requests
import lxml
import bs4
import random
from time import sleep

import oai

import pandas as pd

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [129]:
source = 'Leiden'

In [130]:
tlist = ['athero', 
         'plaque', 
         'cardiovascular',
         'cardiogram', 
         'cardiology', 
         'cardiologie',
         'hartvaten',
         'klinsch',
         'medische',
         'hartvaat',
         'heart', 
         'vascular',
         'angiogram', 
         'cardiologie', 
         'hartziekte', 
         'vaatziekte',
         'medicine',
         'disease', 
         'medical', 
         'therapy',
         'therapeutic',
         'diagnosic',
         'clinical',
         'surgical', 
         'metabolic',
         'myocard',]
base_url =   oai.sources[source]['link'] #'https://repository.ubn.ru.nl/oai/openaire'  https://scholarlypublications.universiteitleiden.nl/oai2, http://dspace.library.uu.nl/oai/dissertation
pdf_path = f'//Ds/data/LAB/laupodteam/AIOS/Bram/language_modeling/MEDICAL_TEXT/RAW/PhDTheses/{source}/'

In [131]:
OpenAIRE_institutes = ['VU', 'UVA', 'Maastricht', 'Tilburg', 'RUG', 'UTwente', 'TUE', 'UU']


In [132]:
sickler = sickle.Sickle(base_url)


In [133]:
sets = sickler.ListSets()

In [134]:
Sets = {}
for s in sets:
    Sets[s.setSpec]  = s.setName    

In [135]:
keywords = ['clinical', 'medisch', 'medical', 'dissertation', 'umc', 'medicine',
            'diss', 'phd', 'thesis', 'doctorate', 'dissertatie',
            'doctoraat', 'proefschrift']
if source in OpenAIRE_institutes:
    keywords = keywords + ['publications:withfiles']

Sets_to_mine = []
for key, val in Sets.items():
    #print(key,val)
    if any([c in val.lower() for c in keywords]) | any([c in key.lower() for c in keywords]):
        print(f'Set: {key} contains keyword')
        Sets_to_mine.append(key)

Set: hdl_1887_9744 contains keyword
Set: hdl_1887_85175 contains keyword
Set: hdl_1887_20801 contains keyword
Set: hdl_1887_55785 contains keyword
Set: hdl_1887_20777 contains keyword


In [136]:
# get records 
# Beware: this takes a long time.
from collections import defaultdict
records_lists = defaultdict(list)
for set_to_mine in Sets_to_mine:
    if set_to_mine in ['com_1874_298213']:
        continue
    print(f"Mining from set: {set_to_mine}")
    try:
        records = sickler.ListRecords(metadataPrefix='oai_dc', 
                                    ignore_deleted=True, 
                                    set=set_to_mine) # dissertation com_1874_298213
        for record in tqdm(records):
            records_lists[set_to_mine].append(record)
    except Exception as e:
        print(e)
        continue

Mining from set: hdl_1887_9744


7247it [05:41, 21.19it/s]


Mining from set: hdl_1887_85175


21574it [17:30, 21.32it/s]

In [95]:
filtered_records_lists = defaultdict(list)
excluded_records_lists = defaultdict(list)

cond_list = []

for set_to_mine in Sets_to_mine:
    relevant_counter = 0
    for r in tqdm(records_lists[set_to_mine]):
        meta = r.get_metadata()
        relevant = False
        
        TOPIC = False
        PDF = False
        DOCTORATE = True if 'dissertation' in set_to_mine.lower() else False
        EMBARGO = False

        if source in ['Radboud']: 
            PDF=True
        
        try:
            if any([t in subj.lower() for subj in meta['subject'] for t in tlist]):
                TOPIC = True
        except:
            pass

        try:
            if any([t in subj.lower() for subj in meta['title'] for t in tlist]):
                TOPIC = True
        except:
            pass

        try:
            if any([t in subj.lower() for subj in meta['description'] for t in tlist]):
                TOPIC = True
        except:
            pass                
            
        try:
            if ('pdf' in meta['format'][0].lower()):
                PDF = True
        except:
            pass
        
        try:
            if ('doctoral' in meta['type'][0].lower()) |\
                    ('book' in meta['type'][0].lower()):
                DOCTORATE = True
        except:
            pass

        try:
            if ('embargo' in meta['rights'][0].lower()) |\
                    ('restricted' in meta['rights'][0].lower()):
                EMBARGO = True
        except:
            pass  
        
        cond_list.append({'Topic': TOPIC, 
                          'PDF': PDF, 
                          'Doctorate': DOCTORATE, 
                          'Embargo': EMBARGO}
                         )
        
        if TOPIC & PDF & DOCTORATE & ~EMBARGO:
            relevant_counter += 1
            filtered_records_lists[set_to_mine].append(r)
        else:
            excluded_records_lists[set_to_mine].append(r)

    print(f'Found {relevant_counter} relevant records in set: {set_to_mine}')      

100%|██████████| 10527/10527 [00:04<00:00, 2357.13it/s]


Found 3183 relevant records in set: dissertation


0it [00:00, ?it/s]


Found 0 relevant records in set: com_1874_298213


0it [00:00, ?it/s]


Found 0 relevant records in set: col_1874_27033


0it [00:00, ?it/s]


Found 0 relevant records in set: col_1874_30958


100%|██████████| 3527/3527 [00:01<00:00, 2600.00it/s]

Found 1 relevant records in set: col_1874_298214





In [97]:
types_counts = defaultdict(int)
for r in excluded_records_lists['publications:withFiles']:
    try:
        types_counts[r.metadata['type'][0]] += 1
    except:
        pass

In [89]:
types_counts

defaultdict(int, {})

In [98]:
conds_df = pd.DataFrame(cond_list)

In [99]:
conds_df.sum()

Topic         7518
PDF          10417
Doctorate    10528
Embargo        270
dtype: int64

In [116]:
link_list = []
error_list = []

for set_to_mine in filtered_records_lists.keys():
    for record in tqdm(filtered_records_lists[set_to_mine]):
        META = record.get_metadata()
        try:
            List_of_identifiers = META['identifier'] if 'identifier' in META.keys() else ['']
            Title = META['title'] if 'title' in META.keys() else ['']
            Description = META['description'] if 'description' in META.keys() else ['']
            Date = META['date'] if 'date' in META.keys() else ['']
            Language = META['language'] if 'language' in META.keys() else ['']       
            Creator = META['creator'] if 'creator' in META.keys() else ['']
            
            
            List_of_identifiers = [id for id in List_of_identifiers if id is not None]
            
            if len(List_of_identifiers)>0:
                link_list.append({'Set': set_to_mine, 
                                'Link': List_of_identifiers, 
                                'Title': Title,
                                'Description': Description,
                                'Date': Date ,
                                'Language': Language,
                                'Creator': Creator,
                                'Publisher': META.get('publisher'),
                                }
                                )
            else:
                continue
        except Exception as e:
            error_list.append(f"Error parsing {e}: {META}: ")
            pass

100%|██████████| 3183/3183 [00:00<00:00, 15488.90it/s]
100%|██████████| 1/1 [00:00<?, ?it/s]


In [114]:
publisher_counts = defaultdict(int)
for r in link_list:
    try:
        publisher_counts[r['Publisher']] += 1
    except:
        pass
publisher_counts

defaultdict(int, {None: 439})

In [117]:
link_list[67]

{'Set': 'dissertation',
 'Link': ['https://dspace.library.uu.nl/handle/1874/342'],
 'Title': ['Mycobacteria and the Nramp1 gene in asthma'],
 'Description': ['Allergic diseases, such as allergic asthma, are steadily increasing in developed countries. Most likely, the cause of the rise in allergic diseases must be sought in environmental factors. In this respect, the suggestion that a change in the level and the kind of early childhood infections would be factor influencing the development of allergic diseases has drawn most attention and discussion. This suggested relationship between hygiene and allergic disease is called the \x93hygiene hypothesis\x94. Basically, this hypothesis states that improved hygiene in industrialized societies, with improved public health measures and the use of vaccines and antibiotics has reduced the incidence of infections that normally stimulate the immune system in some way that mitigates against asthma. In line with the hygiene hypothesis it has been su

In [81]:
link_list = []
error_list = []
for r in tqdm(link_list):
    if source == 'LUMC':
        try:
            link = meta['identifier'][-1] 
            # identify first url in list
            found_link = False
            for l in meta['identifier']:
                if ('http:' in l) or ('https:' in l):
                    link = l
                    found_link = True
                    break
            if not found_link:
                error_list.append(f'No link found for {meta["identifier"]}')  

            doc_id = link.split('/')[-1]
            doc_id_int = int(doc_id)+2

            link = f"https://scholarlypublications.universiteitleiden.nl/handle/1887/{doc_id}"
            linkPdf = f"https://scholarlypublications.universiteitleiden.nl/access/item%3A{doc_id_int}/download"

            # extract through link url. The directory can be found in <li class='ubl-file-download'> <a href='...'>
            # only if <a href in ubl-file-view is "full"
            r = requests.get(link)
            # random number between 0.5 and 2.5 seconds
            rndSleep = round(random.uniform(0.5, 2.5), 2)
            sleep(rndSleep)
            soup = bs4.BeautifulSoup(r.text, 'html.parser')
            found, dsfound, esfound = False, False, False
            for _res in soup.findAll('li', {'class':'ubl-file-view'}):
                if _res.a is not None:
                    if _res.a.contents[0].strip().lower() == 'full text':
                        _pdfdir = _res.a['href']
                        found = True
                    elif _res.a.contents[0].strip().lower() == 'summary in dutch':
                        _dutch_summary = _res.a['href']
                        dsfound = True
                    elif _res.a.contents[0].strip().lower() == 'summary in english':
                        _english_summary = _res.a['href']
                        esfound = True

            linkPdfAlt = f"https://scholarlypublications.universiteitleiden.nl{_pdfdir}" if found else None
            DutchSummaryLink = f"https://scholarlypublications.universiteitleiden.nl{_dutch_summary}" if dsfound else None
            EnglishSummaryLink = f"https://scholarlypublications.universiteitleiden.nl{_english_summary}" if esfound else None
            
            try:
                lang = meta['language'][0]
            except:
                lang = None
        except Exception as e:
            error_list.append(f'Error: {e} for link: {link}, with meta data: {meta["identifier"]}')
            pass
            #raise ValueError(f'Could not find pdf link for {link}, with error raised: {e}')

0it [00:00, ?it/s]


In [118]:
import re

In [119]:
def extract_pdf_links(links, institute=None):
    pdf_links = []
    if institute in ['VU', 'UVA', 'UTwente']:
        inclusion_terms = [r'abstract', r'full', r'complete', r'samenvatting', r'summary', r'thesis', r'chapter']
    elif institute in ['Radboud']:
        inclusion_terms = [r'handle', r'bitstream']
    elif institute in ['Maastricht']:
        inclusion_terms = [r'ASSET1', r'c[0-9]{3,4}\.pdf']
    elif institute in ['Tilburg']:
        inclusion_terms = [r'\.pdf']
    elif institute in ['RUG']:
        inclusion_terms = [r'summ\.pdf', r'summary\.pdf',  r'samenv\.pdf', r'samenvat\.pdf', r'[ch][0-9]{1,2}\.pdf', 
                           r'thesis\.pdf', r'proefschrift\.pdf', r'dissertation\.pdf', r'dissertatie\.pdf']
    elif institute in ['TUE']:
        inclusion_terms = [r'summ\.pdf', r'summary\.pdf',  r'samenv\.pdf', r'samenvat\.pdf', r'[ch][0-9]{1,2}\.pdf', 
                           r'thesis\.pdf', r'proefschrift\.pdf', r'dissertation\.pdf', r'dissertatie\.pdf']
        inclusion_temrs = inclusion_terms + [r'abstract', r'full', r'complete', r'samenvatting', r'summary', r'thesis', r'chapter']
    elif institute in ['UU']:
        inclusion_terms = [r'dspace\.library\.uu\.nl']
    else:
        raise ValueError(f'Institute {institute} not recognized')
    
    inclusion_terms = [re.compile(rs) for rs in inclusion_terms]
        
    for link in links:
        if (link.lower().startswith('http')) & \
                (link.lower().endswith('.pdf')):
            if any([t.search(link) is not None for t in inclusion_terms]):
                pdf_links.append(link)
    
    if institute in ['UU']:
        _pdf_links = []
        for pdf_link in links:
            if (link.lower().startswith('http')):
                baselink = pdf_link.replace('dspace.library.uu.nl/', 'dspace.library.uu.nl/bitstream/')
                _pdf_links.append(baselink + '/full.pdf')
        pdf_links = _pdf_links
    return pdf_links

In [120]:
link_list

[{'Set': 'dissertation',
  'Link': ['https://dspace.library.uu.nl/handle/1874/88'],
  'Title': ['The Correspondence between Descartes and Henricus Regius'],
  'Description': ['In 1638 the Dutch philosopher and physician Henricus Regius (1598 1679) introduced himself to René Descartes (1596 1650), allegedly because he owed his appointment as professor of theoretical medicine at Utrecht University to his being a Cartesian. During the following years Regius established himself as the main advocate of Cartesianism at Utrecht. In fact, he was the first university professor to teach Cartesian ideas and to publish a number of disputations, which provide a fairly complete picture of Cartesian natural philosophy. \r\n\r\nApart from De Vrijer s theological thesis of 1917 little has been done so far to establish the significance of Regius  work or study the way in which he took up Descartes  ideas and amalgamated them with his own. Although the necessary sources have become available in the past 

In [121]:
link_pdf_list = []
for l in link_list:
    links = extract_pdf_links(l['Link'], institute=source)
    tmp = []
    for _l in links:
        pdfPath = _l.split("/")[-1].replace("%20", "_").rstrip('.pdf')
        pdfPath = source + "_" + l['Creator'][0] + "_" + l['Date'][0] + "_" + pdfPath
        pdfPath = pdfPath.replace(",", "")
        pdfPath = pdfPath.replace(":", "")
        pdfPath = pdfPath.replace(".", "")
        pdfPath = pdfPath.replace(" ", "")
        pdfPath = os.path.join(pdf_path, pdfPath+".pdf")
        tmp.append((_l, pdfPath))
    if len(tmp)==0:
        continue
    l['pdf_links'] = tmp
    link_pdf_list.append(l)

In [122]:
# remove duplicate entries, based on the title
title_set = set()
unique_link_list = []
for el in link_pdf_list:
    try:
        title = el['Title'][0]
        if title not in title_set:
            unique_link_list.append(el)
            title_set.add(title)
    except Exception as e:
        pass

In [123]:
# now we parse the link list and download the pdfs
headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.107 Safari/537.36' }

# add sleep
def pdf_writer(pdf_url, _pdf_path):
    r = requests.get(pdf_url, stream=True)
    if r.status_code == 200:
        with open(_pdf_path, 'wb') as f:
            for i,chunk in enumerate(r.iter_content(chunk_size=1024)): 
                if chunk: # filter out keep-alive new chunks
                    f.write(chunk)
        return True, 200
    else:
        return False, r.status_code

In [125]:
len(unique_link_list)

3050

In [126]:
if source=='Radboud':
    filelist = os.listdir(pdf_path)
    pre_list = [f.split("_")[-1] for f in filelist]
if source=='UU':
    filelist = os.listdir(pdf_path)
    pre_list = [f.split(".")[-2] for f in filelist]

pdf_error_list = []
skipped_list = []
src_list = []
rcode_list = []
success_list = []
extract_full_text = True
for link in tqdm(unique_link_list):
    for lt in link['pdf_links']:
        _pdf_path = lt[1]
        pdf_url = lt[0]
        return_code = None

        if source=='Radboud':
            if _pdf_path.split("_")[-1] in pre_list:
                skipped_list.append(f'Pdf already exists: {_pdf_path}')
                continue
        if source=='UU':
            if link['pdf_links'][0][0].split("/")[-2] in pre_list:
                skipped_list.append(f'Pdf already exists: {_pdf_path}')
                continue
            
        if os.path.isfile(_pdf_path):
            skipped_list.append(f'Pdf already exists: {_pdf_path}')
        else:
            # try to download the pdf
            try:
                write_file, return_code = pdf_writer(pdf_url, _pdf_path)
                success_list.append(f'Pdf downloaded: {_pdf_path}')
            except Exception as e:
                pdf_error_list.append(f'Error: {e} for link: {link["Link"]}')  

            if return_code == 429:
                print(f'Rate limit exceeded...sleeping 30 seconds')
                sleep(30)
                continue
            else:
                sleep(4)


100%|██████████| 3050/3050 [2:58:01<00:00,  3.50s/it]  


In [127]:
print(f'Found {len(pdf_error_list)} errors while downloading pdfs')
print(f'Skipped {len(skipped_list)} pdfs because they already existed')

Found 0 errors while downloading pdfs
Skipped 492 pdfs because they already existed
