In [98]:
# add autoreload
%load_ext autoreload
%autoreload 2
import sickle
from tqdm import tqdm
import os
import requests
import lxml
import bs4
import random
from time import sleep

import oai

import pandas as pd

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [99]:
# Institute to harvest: used as the key into oai.sources and in output paths / file names.
source = 'Maastricht'

In [100]:
# Topic terms (Dutch and English). A record counts as on-topic when any of these
# occurs as a substring of a lower-cased subject, title or description.
tlist = ['athero',
         'plaque',
         'cardiovascular',
         'cardiogram',
         'cardiology',
         'cardiologie',
         'hartvaten',
         'klinisch',      # was 'klinsch': the typo could not match the correctly spelled word
         'medische',
         'hartvaat',
         'heart',
         'vascular',
         'angiogram',
         'hartziekte',
         'vaatziekte',
         'medicine',
         'disease',
         'medical',
         'therapy',
         'therapeutic',
         'diagnostic',    # was 'diagnosic': the typo could not match the correctly spelled word
         'clinical',
         'surgical',
         'metabolic',
         'myocard',
         ]

# OAI endpoint of the selected institute. Endpoints noted previously:
#   https://repository.ubn.ru.nl/oai/openaire
#   https://scholarlypublications.universiteitleiden.nl/oai2
#   http://dspace.library.uu.nl/oai/dissertation
base_url = oai.sources[source]['link']

# Directory in which the downloaded PDFs of this institute are stored.
pdf_path = f'//Ds/data/LAB/laupodteam/AIOS/Bram/language_modeling/MEDICAL_TEXT/RAW/PhDTheses/{source}/'

In [109]:
# Sources for which the extra set keyword 'publications:withfiles' is added during set selection.
OpenAIRE_institutes = ['VU', 'UVA', 'Maastricht']


In [110]:
# Sickle client bound to the selected repository endpoint (base_url).
sickler = sickle.Sickle(base_url)


In [111]:
# Request the sets offered by the repository; each item exposes setSpec and setName.
sets = sickler.ListSets()

In [112]:
# Map each set identifier (setSpec) to its display name (setName).
Sets = {oai_set.setSpec: oai_set.setName for oai_set in sets}

In [113]:
# Terms that mark a set as worth harvesting (matched against set name and set spec).
keywords = [
    'clinical', 'medisch', 'medical', 'dissertation',
    'diss', 'phd', 'thesis', 'doctorate',
    'doctoraat', 'proefschrift',
]
if source in OpenAIRE_institutes:
    keywords += ['publications:withfiles']

# Keep every set whose name or spec contains at least one keyword (case-insensitive).
Sets_to_mine = []
for key, val in Sets.items():
    name_hit = any(kw in val.lower() for kw in keywords)
    spec_hit = any(kw in key.lower() for kw in keywords)
    if name_hit or spec_hit:
        print(f'Set: {key} contains keyword')
        Sets_to_mine.append(key)

Set: publications:withFiles contains keyword


In [114]:
# Harvest all (non-deleted) oai_dc records of every selected set, grouped per set.
from collections import defaultdict

records_lists = defaultdict(list)
for set_to_mine in tqdm(Sets_to_mine):
    print(f"Mining from set: {set_to_mine}")
    # Set names used with other repositories: dissertation, com_1874_298213
    harvest_options = {'metadataPrefix': 'oai_dc',
                       'ignore_deleted': True,
                       'set': set_to_mine}
    records = sickler.ListRecords(**harvest_options)
    for record in records:
        records_lists[set_to_mine].append(record)

  0%|          | 0/1 [00:00<?, ?it/s]

Mining from set: publications:withFiles


100%|██████████| 1/1 [15:21<00:00, 921.12s/it]


In [132]:
def _mentions_any_term(values, terms):
    """Return True when any of `terms` is a substring of any lower-cased string in `values`.

    `values` may be None or contain non-string entries (e.g. None); those entries
    are skipped instead of invalidating the whole field.
    """
    return any(term in value.lower()
               for value in (values or [])
               if isinstance(value, str)
               for term in terms)


def _first_value_lower(meta, key):
    """Return the lower-cased first entry of ``meta[key]``; '' when missing, empty or not a string."""
    values = meta.get(key) or []
    first = values[0] if values else None
    return first.lower() if isinstance(first, str) else ''


filtered_records_lists = defaultdict(list)
excluded_records_lists = defaultdict(list)

# One dict of condition flags per examined record (over all sets), for later inspection.
cond_list = []

for set_to_mine in Sets_to_mine:
    relevant_counter = 0
    for r in tqdm(records_lists[set_to_mine]):
        meta = r.get_metadata()
        # Not used by this loop; kept because another cell of the notebook reads `relevant`.
        relevant = False

        # On-topic when a topic term occurs in any subject, title or description.
        TOPIC = any(_mentions_any_term(meta.get(field), tlist)
                    for field in ('subject', 'title', 'description'))

        # For Radboud the PDF condition is always treated as met;
        # otherwise only the first 'format' entry is inspected.
        PDF = (source in ['Radboud']) or ('pdf' in _first_value_lower(meta, 'format'))

        # Only the first 'type' entry is inspected.
        record_type = _first_value_lower(meta, 'type')
        DOCTORATE = ('doctoral' in record_type) or ('book' in record_type)

        # Only the first 'rights' entry is inspected.
        rights = _first_value_lower(meta, 'rights')
        EMBARGO = ('embargo' in rights) or ('restricted' in rights)

        cond_list.append({'Topic': TOPIC,
                          'PDF': PDF,
                          'Doctorate': DOCTORATE,
                          'Embargo': EMBARGO}
                         )

        # Boolean operators instead of `&` / `~`: `~` on a bool is integer inversion
        # (deprecated since Python 3.12) and only gave the right truth value by accident.
        if TOPIC and PDF and DOCTORATE and not EMBARGO:
            relevant_counter += 1
            filtered_records_lists[set_to_mine].append(r)
        else:
            excluded_records_lists[set_to_mine].append(r)

    print(f'Found {relevant_counter} relevant records in set: {set_to_mine}')

  0%|          | 0/21600 [00:00<?, ?it/s]

100%|██████████| 21600/21600 [00:03<00:00, 6280.05it/s]

Found 1814 relevant records in set: publications:withFiles





In [133]:
# Count the first 'type' entry of every excluded record, over all mined sets.
# (Previously the set name 'publications:withFiles' was hard-coded, which silently
# produced an empty tally for sources that expose differently named sets.)
types_counts = defaultdict(int)
for excluded_records in excluded_records_lists.values():
    for r in excluded_records:
        type_values = r.metadata.get('type') or []
        # Records without a type entry are left out of the tally.
        if type_values:
            types_counts[type_values[0]] += 1

In [134]:
# Show how many excluded records there are per record type.
types_counts

defaultdict(int,
            {'bookPart': 931,
             'article': 9945,
             'book': 6050,
             'workingPaper': 2464,
             'contributionToPeriodical': 309,
             'other': 73,
             'conferenceObject': 14})

In [137]:
# One row per examined record, one boolean column per filter condition.
conds_df = pd.DataFrame(cond_list)

In [138]:
# Number of records for which each condition is True.
conds_df.sum()

Topic         5671
PDF          21497
Doctorate     8795
Embargo        318
dtype: int64

In [139]:
# For every retained record, collect its identifiers together with descriptive metadata.
link_list = []
error_list = []

for set_to_mine, kept_records in filtered_records_lists.items():
    for record in tqdm(kept_records):
        META = record.get_metadata()

        try:
            # Missing fields default to a one-element list holding an empty string.
            identifiers = [ident
                           for ident in META.get('identifier', [''])
                           if ident is not None]
            entry = {'Set': set_to_mine,
                     'Link': identifiers,
                     'Title': META.get('title', ['']),
                     'Description': META.get('description', ['']),
                     'Date': META.get('date', ['']),
                     'Language': META.get('language', ['']),
                     'Creator': META.get('creator', [''])}

            # Records without any identifier are skipped.
            if identifiers:
                link_list.append(entry)
        except Exception as e:
            error_list.append(f"Error parsing {e}: {META}: ")

100%|██████████| 1814/1814 [00:00<00:00, 15982.43it/s]


In [69]:
# Legacy per-institute link extraction with branches for 'LUMC', 'UU' and 'Radboud'.
# NOTE(review): this cell re-initialises link_list and error_list (discarding any earlier
#   contents) and never appends to link_list itself.
# NOTE(review): `relevant` and `relevant_counter` are not defined in this cell; they must
#   already exist in the kernel, otherwise the loop body raises NameError.
# NOTE(review): there is no branch for source == 'Maastricht'.
link_list = []
error_list = []
for set_to_mine in Sets_to_mine:
    for r in tqdm(filtered_records_lists[set_to_mine]):
        meta = r.get_metadata()
        
        if relevant:
            relevant_counter += 1
            if source == 'LUMC':
                try:
                    # Fall back to the last identifier when none of them looks like a URL.
                    link = meta['identifier'][-1] 
                    # identify first url in list
                    found_link = False
                    for l in meta['identifier']:
                        if ('http:' in l) or ('https:' in l):
                            link = l
                            found_link = True
                            break
                    if not found_link:
                        error_list.append(f'No link found for {meta["identifier"]}')  

                    # The last path segment of the link is used as the numeric document id.
                    doc_id = link.split('/')[-1]
                    # presumably the download item id is the handle id + 2 — TODO confirm
                    doc_id_int = int(doc_id)+2

                    link = f"https://scholarlypublications.universiteitleiden.nl/handle/1887/{doc_id}"
                    linkPdf = f"https://scholarlypublications.universiteitleiden.nl/access/item%3A{doc_id_int}/download"

                    # extract through link url. The directory can be found in <li class='ubl-file-download'> <a href='...'>
                    # only if <a href in ubl-file-view is "full"
                    # NOTE(review): rebinds the loop variable `r` (the record) to the HTTP response.
                    r = requests.get(link)
                    # random number between 0.5 and 2.5 seconds
                    rndSleep = round(random.uniform(0.5, 2.5), 2)
                    sleep(rndSleep)
                    soup = bs4.BeautifulSoup(r.text, 'html.parser')
                    found, dsfound, esfound = False, False, False
                    # Pick up the hrefs of the full text and of the Dutch / English summaries.
                    for _res in soup.findAll('li', {'class':'ubl-file-view'}):
                        if _res.a is not None:
                            if _res.a.contents[0].strip().lower() == 'full text':
                                _pdfdir = _res.a['href']
                                found = True
                            elif _res.a.contents[0].strip().lower() == 'summary in dutch':
                                _dutch_summary = _res.a['href']
                                dsfound = True
                            elif _res.a.contents[0].strip().lower() == 'summary in english':
                                _english_summary = _res.a['href']
                                esfound = True

                    linkPdfAlt = f"https://scholarlypublications.universiteitleiden.nl{_pdfdir}" if found else None
                    DutchSummaryLink = f"https://scholarlypublications.universiteitleiden.nl{_dutch_summary}" if dsfound else None
                    EnglishSummaryLink = f"https://scholarlypublications.universiteitleiden.nl{_english_summary}" if esfound else None
                    
                    try:
                        lang = meta['language'][0]
                    except:
                        lang = None
                except Exception as e:
                    # NOTE(review): `link` may be unbound or stale here when the failure happened
                    #   before it was assigned in this iteration.
                    error_list.append(f'Error: {e} for link: {link}, with meta data: {meta["identifier"]}')
                    pass
                    #raise ValueError(f'Could not find pdf link for {link}, with error raised: {e}')
            elif source == 'UU':                
                # The PDF location is derived from the first identifier by inserting 'bitstream/'.
                link = meta['identifier'][0]
                baselink = link.replace('dspace.library.uu.nl/', 'dspace.library.uu.nl/bitstream/')
                linkPdf = baselink + '/full.pdf'

                # Alternative file name built from the lower-cased part of the first creator before the comma.
                try:
                    linkPdfAlt = baselink +'/'+meta['creator'][0].split(',')[0].lower()+'.pdf'
                except:
                    linkPdfAlt = None
                
                DutchSummaryLink = None
                EnglishSummaryLink = None
                try:
                    lang = meta['language'][0]
                except:
                    lang = None

            elif source == 'Radboud':
                try:
                    found_link = False
                    found_pdf_link = False
                    # URLs whose last path segment ends with '.pdf' become linkPdf, other URLs become link;
                    # when several match, the last one wins.
                    for l in meta['identifier']:
                        if l is not None:
                            if (('http:' in l) or ('https:' in l)) & (l.split("/")[-1].endswith('.pdf')):
                                linkPdf = l
                                found_pdf_link = True                            
                            elif (('http:' in l) or ('https:' in l)):
                                link = l
                                found_link = True                           

                    if not found_link:                        
                        error_list.append(f'No link found for {meta["identifier"]}')  
                        continue
                    if not found_pdf_link:
                        error_list.append(f'No pdf link found for {meta["identifier"]}')
                        linkPdf = None

                    linkPdfAlt = None
                    DutchSummaryLink = None
                    EnglishSummaryLink = None
                    try:
                        lang = meta['language'][0]
                    except:
                        lang = None
                except Exception as e:
                    error_list.append(f'Error: {e} for link: {link}, with meta data: {meta["identifier"]}')
                    continue
                    #raise ValueError(f'Could not find pdf link for {link}, with error raised: {e}')                


100%|██████████| 886/886 [00:00<00:00, 8949.35it/s]


In [154]:
import re

In [160]:
# Scratch check: a case-insensitive pattern finds 'bla' inside a mixed-case string.
tstring = "De BLidieBLa bl"
re_str = re.compile(r'bla', flags=re.IGNORECASE)

match_found = re_str.search(tstring)
match_found is not None

True

In [140]:
def extract_pdf_links(links, institute=None):
    """Select the links that point to PDF files relevant for `institute`.

    A link is kept when it starts with 'http' and ends with '.pdf' (both checked
    case-insensitively) and at least one institute-specific pattern is found in
    it (case-sensitive regex search on the original link).

    Parameters
    ----------
    links : iterable of str
        Candidate URLs. Entries that are not strings (e.g. None) are skipped.
    institute : str, optional
        'VU', 'UVA', 'Radboud' or 'Maastricht'. For any other value, including
        the default None, no institute-specific pattern is applied and every
        http(s) '.pdf' link is returned. (This case used to fail with an
        UnboundLocalError.)

    Returns
    -------
    list of str
        The selected links, in their original order.
    """
    if institute in [r'VU', r'UVA']:
        inclusion_terms = [r'abstract', r'full', r'complete', r'samenvatting', r'summary', r'thesis', r'chapter']
    elif institute in [r'Radboud']:
        inclusion_terms = [r'handle', r'bitstream']
    elif institute in ['Maastricht']:
        inclusion_terms = [r'ASSET1', r'c[0-9]{3,4}\.pdf']
    else:
        inclusion_terms = None

    patterns = None
    if inclusion_terms is not None:
        patterns = [re.compile(rs) for rs in inclusion_terms]

    pdf_links = []
    for link in links or []:
        if not isinstance(link, str):
            continue
        lowered = link.lower()
        if not (lowered.startswith('http') and lowered.endswith('.pdf')):
            continue
        if patterns is None or any(p.search(link) is not None for p in patterns):
            pdf_links.append(link)
    return pdf_links

In [141]:
def _pdf_file_stem(url):
    """Return the last path segment of `url` with '%20' replaced by '_' and a trailing '.pdf' removed."""
    stem = url.split("/")[-1].replace("%20", "_")
    # str.rstrip('.pdf') strips any trailing '.', 'p', 'd' or 'f' characters
    # (e.g. 'proof.pdf' -> 'proo'); remove the extension as a suffix instead.
    if stem.lower().endswith('.pdf'):
        stem = stem[:-len('.pdf')]
    return stem


def _first_text(values):
    """Return the first entry of `values` as a string; '' when the list is empty or the entry is None."""
    first = values[0] if values else None
    return '' if first is None else str(first)


# Attach to every entry the list of (pdf url, local target path) pairs.
# Note: the dicts of link_list are extended in place with the key 'pdf_links'.
link_pdf_list = []
for l in link_list:
    links = extract_pdf_links(l['Link'], institute=source)
    tmp = []
    for _l in links:
        pdfPath = "_".join([source,
                            _first_text(l['Creator']),
                            _first_text(l['Date']),
                            _pdf_file_stem(_l)])
        # Drop punctuation / whitespace, and path separators that would otherwise
        # turn part of the creator or date into a (non-existent) sub-directory.
        for unwanted in (",", ":", ".", " ", "/", "\\"):
            pdfPath = pdfPath.replace(unwanted, "")
        pdfPath = os.path.join(pdf_path, pdfPath+".pdf")
        tmp.append((_l, pdfPath))
    l['pdf_links'] = tmp
    link_pdf_list.append(l)

In [84]:
# Remove duplicate entries, based on the first title of each entry.
# Entries without a usable title cannot be compared; they are kept instead of being
# silently dropped or collapsed into a single "empty title" entry.
title_set = set()
unique_link_list = []
for el in link_pdf_list:
    titles = el.get('Title') or []
    title = titles[0] if titles else None
    if not title:
        unique_link_list.append(el)
        continue
    if title not in title_set:
        unique_link_list.append(el)
        title_set.add(title)

In [85]:
# now we parse the link list and download the pdfs
headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.107 Safari/537.36' }

# Throttling (sleeping) between requests is left to the calling loop.
def pdf_writer(pdf_url, _pdf_path, timeout=60):
    """Download `pdf_url` and store it at `_pdf_path`.

    The body is streamed into a temporary '<_pdf_path>.part' file that is renamed
    to `_pdf_path` only after the download completed, so an interrupted or failed
    download never leaves a truncated file at the final location.

    Parameters
    ----------
    pdf_url : str
        URL of the file to download.
    _pdf_path : str
        Target path of the downloaded file.
    timeout : float, optional
        Timeout in seconds passed to `requests.get` (default 60).

    Returns
    -------
    tuple
        (True, 200) when the file was written, otherwise (False, <HTTP status code>).

    Raises
    ------
    requests.RequestException
        On connection problems or timeouts.
    OSError
        When the file cannot be written.
    """
    # `headers` was defined for these requests but never sent; pass it along.
    # The context manager releases the streamed connection in all cases.
    with requests.get(pdf_url, stream=True, headers=headers, timeout=timeout) as r:
        if r.status_code != 200:
            return False, r.status_code

        tmp_path = _pdf_path + '.part'
        try:
            with open(tmp_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024):
                    if chunk: # filter out keep-alive new chunks
                        f.write(chunk)
            os.replace(tmp_path, _pdf_path)
        except BaseException:
            # Also covers KeyboardInterrupt: remove the partial file, then re-raise.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
            raise
    return True, 200

In [86]:
pdf_error_list = []
skipped_list = []
src_list = []
rcode_list = []
extract_full_text = True

MAX_RATE_LIMIT_RETRIES = 3   # extra attempts for a file after an HTTP 429
RATE_LIMIT_SLEEP = 30        # seconds to wait after an HTTP 429
REQUEST_SLEEP = 4            # seconds to wait between regular downloads

# Download every pdf of every unique entry, skipping files that already exist.
for link in tqdm(unique_link_list):
    for lt in link['pdf_links']:
        pdf_url, _pdf_path = lt[0], lt[1]
        return_code = None

        if os.path.isfile(_pdf_path):
            skipped_list.append(f'Pdf already exists: {_pdf_path}')
            continue

        # A rate-limited file is retried after sleeping instead of being abandoned.
        for attempt in range(1 + MAX_RATE_LIMIT_RETRIES):
            return_code = None
            try:
                write_file, return_code = pdf_writer(pdf_url, _pdf_path)
            except Exception as e:
                pdf_error_list.append(f'Error: {e} for link: {link["Link"]}')
                break

            if return_code != 429:
                break
            print(f'Rate limit exceeded...sleeping {RATE_LIMIT_SLEEP} seconds')
            sleep(RATE_LIMIT_SLEEP)

        if return_code == 429:
            # Still rate limited after all attempts; already slept, so move on.
            pdf_error_list.append(f'Error: still rate limited (429) after {1 + MAX_RATE_LIMIT_RETRIES} attempts for link: {pdf_url}')
            continue

        if return_code not in (None, 200):
            # Failed downloads used to go unrecorded; keep track of them.
            pdf_error_list.append(f'Error: HTTP status {return_code} for link: {pdf_url}')
        sleep(REQUEST_SLEEP)


 24%|██▍       | 573/2408 [1:11:15<2:46:35,  5.45s/it] 

Rate limit exceeded...sleeping 30 seconds


 25%|██▍       | 597/2408 [1:14:48<3:00:43,  5.99s/it]

Rate limit exceeded...sleeping 30 seconds


 25%|██▌       | 608/2408 [1:16:15<2:52:30,  5.75s/it]

Rate limit exceeded...sleeping 30 seconds


 27%|██▋       | 647/2408 [1:21:00<2:45:26,  5.64s/it]

Rate limit exceeded...sleeping 30 seconds


 28%|██▊       | 664/2408 [1:23:12<2:43:09,  5.61s/it]

Rate limit exceeded...sleeping 30 seconds


 28%|██▊       | 680/2408 [1:25:24<2:45:02,  5.73s/it]

Rate limit exceeded...sleeping 30 seconds


 30%|███       | 733/2408 [1:31:44<2:30:23,  5.39s/it]

Rate limit exceeded...sleeping 30 seconds


 32%|███▏      | 771/2408 [1:36:39<3:25:14,  7.52s/it]


KeyboardInterrupt: 

In [87]:
  
# Summarise the download run: number of errors and number of files that were already present.
n_download_errors = len(pdf_error_list)
n_already_present = len(skipped_list)
print(f'Found {n_download_errors} errors while downloading pdfs')
print(f'Skipped {n_already_present} pdfs because they already existed')

Found 0 errors while downloading pdfs
Skipped 0 pdfs because they already existed
