In [272]:
# Read in the PDFs and extract their text.
# We want to skip all the page furniture and keep only the running text (i.e. no page numbers etc.).
import pytesseract
from PyPDF2 import PdfReader
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
import fitz
from PIL import Image
import ftfy

import matplotlib.pyplot as plt

from io import StringIO
import re
import os

from tqdm import tqdm
from typing import List, Tuple

In [2]:
# Use the Tesseract binary from the standard Windows install location when it
# is actually present. On any other machine the setting is left untouched so
# that pytesseract keeps its own default lookup instead of pointing at a
# non-existent file.
_tesseract_exe = 'C:/Program Files/Tesseract-OCR/tesseract.exe'
if os.path.isfile(_tesseract_exe):
    pytesseract.pytesseract.tesseract_cmd = _tesseract_exe

In [None]:
# Institute abbreviation: selects the input folder below and is stored in the
# 'institute' field of every record produced further down.
institute = 'UU'
# Folder on the network share that holds this institute's raw thesis PDFs.
pdf_path = (
    '//Ds/data/LAB/laupodteam/AIOS/Bram/language_modeling/'
    'MEDICAL_TEXT/RAW/PRETRAINING/PhDTheses/'
    f'{institute}'
)

In [4]:
def pdf_to_text(path, backends=('pypdf', 'fitz')):
    '''Extract the text of a PDF document, one string per page.

    The backends named in `backends` are tried in a fixed order
    (pypdf, fitz, pdfminer, pytesseract); the first one that does not raise
    provides the result.

    Args:
        path: location of the PDF file.
        backends: container with the names of the backends that may be used.

    Returns:
        A tuple ``(pages, error)``. ``pages`` is a list with the text of each
        page. ``error`` is ``None`` when the first attempted backend worked,
        otherwise a string with the failure messages of the backends that
        were tried before the successful one.

    Raises:
        RuntimeError: if none of the requested backends could read the file.

    Source: https://towardsdatascience.com/pdf-preprocessing-with-python-19829752af9f
    '''
    def pypdfer(_path):
        reader = PdfReader(_path)
        return [p.extract_text(0) for p in reader.pages]

    def fitzer(_path):
        pdf_file = fitz.open(_path)
        try:
            pages = []
            for _p in pdf_file:
                # `get_text` is the current PyMuPDF name; `getText` is the
                # legacy alias that older installations still provide.
                get_text = getattr(_p, 'get_text', None) or _p.getText
                pages.append(get_text())
            return pages
        finally:
            pdf_file.close()

    def pdfminer(_path):
        manager = PDFResourceManager()
        retstr = StringIO()
        layout = LAParams(all_texts=False, detect_vertical=True)
        device = TextConverter(manager, retstr, laparams=layout)
        try:
            interpreter = PDFPageInterpreter(manager, device)
            pages = []
            with open(_path, 'rb') as filepath:
                for page in PDFPage.get_pages(filepath, check_extractable=True):
                    interpreter.process_page(page)
                    # Hand out the buffer per page so the result has the same
                    # list-of-pages shape as the other backends.
                    pages.append(retstr.getvalue())
                    retstr.seek(0)
                    retstr.truncate(0)
            return pages
        finally:
            device.close()
            retstr.close()

    def ocr(_path):
        # Tesseract works on images, so every page is rendered first.
        pdf_file = fitz.open(_path)
        try:
            zoom = fitz.Matrix(300 / 72, 300 / 72)  # PDF units are 1/72 inch -> ~300 dpi
            pages = []
            for _p in pdf_file:
                get_pixmap = getattr(_p, 'get_pixmap', None) or _p.getPixmap
                pix = get_pixmap(matrix=zoom)
                image = Image.frombytes('RGB', (pix.width, pix.height), pix.samples)
                # Tesseract expects three-letter language codes ('eng', not 'en').
                pages.append(pytesseract.image_to_string(image, lang='eng'))
            return pages
        finally:
            pdf_file.close()

    # (name expected in `backends`, label used in messages, extraction function)
    attempts = (
        ('pypdf', 'PyPDF', pypdfer),
        ('fitz', 'PyMuPDF', fitzer),
        ('pdfminer', 'pdfminer', pdfminer),
        ('pytesseract', 'PyTesseract', ocr),
    )

    error = ""
    for backend, label, extract in attempts:
        if backend not in backends:
            continue
        try:
            return extract(path), (error or None)
        except Exception as exc:
            error = error + f"\n {label} failed: {exc}"

    # Nothing worked: fail loudly with everything that was collected instead of
    # returning a value the caller cannot unpack.
    raise RuntimeError(
        f"Could not extract text from {path}:" + (error or " no known backend requested")
    )
        
# https://arxiv.org/abs/2308.13418

In [None]:
# We want to ignore 
# Title pages
# Table of contents
# Reference lists 
# Acknowledgements
# List of abbreviations
# List of figures

# Remove
# all empty lines and lines that contain only numbers
# all pages with fewer than K words

# We want to extract
# body, summary(english), summary(dutch)

In [255]:
# Lower-cased page openings that switch summary capturing on or off. Each
# heading is listed in its normal spelling and in a variant with a space
# inside the first word (presumably an extraction artefact - TODO confirm).
_DUTCH_SUMMARY_START_MARKERS = (
    's amenvatting', 'samenvatting',
    'nederlandse samenvatting', 'n ederlandse samenvatting',
)
_ENGLISH_SUMMARY_START_MARKERS = ('s ummary', 'summary', 'english summary')
_SUMMARY_STOP_MARKERS = (
    'd ankwoord',
    'na woord',
    'a cknowledgment',
    'c ontents',
    't able of contents',
    'l ist of figures',
    'l ist of abbreviations',
    'a cknowledgements',
    'r eferences',
    'dankwoord',
    'nawoord',
    'acknowledgment',
    'contents',
    'table of contents',
    'list of figures',
    'list of abbreviations',
    'acknowledgements',
    'references',
)


def _collect_section_pages(pages, start_markers, stop_markers, max_scount):
    '''Return the pages between a start heading and the next stop heading.'''
    collected = []
    capture = False
    scount = 0
    for page in pages:
        opening = page.lower()
        if opening.startswith(start_markers):
            capture = True
            # NOTE: the opening page is counted here and again below, so it
            # weighs twice against `max_scount` (behaviour kept as it was).
            scount += 1
        elif opening.startswith(stop_markers):
            capture = False
        if capture:
            scount += 1
            collected.append(page)
        if scount >= max_scount:
            break
    return collected


def extract_summary(Texts: List[str], max_scount: int=20) -> Tuple[str, str]:
    '''Pull the English and the Dutch summary out of a list of page texts.

    A page whose lower-cased text starts with a summary heading switches
    capturing on; a page that starts with one of the stop headings
    (acknowledgements, contents, references, ...) switches it off again.
    Pages in between are collected. Scanning stops once the internal page
    counter reaches `max_scount`.

    Args:
        Texts: page texts of one document, in reading order.
        max_scount: upper bound for the page counter of each summary.

    Returns:
        ``(english_summary, dutch_summary)``; the collected pages of each
        summary joined with a newline, or an empty string when no matching
        heading was found.
    '''
    samenvatting = _collect_section_pages(
        Texts, _DUTCH_SUMMARY_START_MARKERS, _SUMMARY_STOP_MARKERS, max_scount)
    summary = _collect_section_pages(
        Texts, _ENGLISH_SUMMARY_START_MARKERS, _SUMMARY_STOP_MARKERS, max_scount)
    return '\n'.join(summary), '\n'.join(samenvatting)

In [264]:
# Clean-up patterns applied to every page by `extractor`.
re_numbers_at_start_of_sentence = re.compile(r'(\d+)\n')  # Digits directly in front of a line break (removed together with the break)
re_numbers_at_start_of_string = re.compile(r'^(\d+)')  # Digits at the very start of a string
re_lines_with_only_numbers = re.compile(r'^\s*\d+\s*$', re.MULTILINE)  # Lines that contain only numbers
re_multiple_newlines = re.compile(r'\n+')
re_empty_lines = re.compile(r'\n\s*\n')
re_empty_lines_start = re.compile(r'^\s*\n')
re_empty_lines_end = re.compile(r'\n\s*$')
re_multiple_spaces = re.compile(r'\s+')  # Any run of whitespace, line breaks included


def extractor(Text: List[str], min_words: int=100) -> Tuple[List[str], List[int]]:
    '''Clean the page texts of one document and keep only body-like pages.

    Steps, in order:
      1. repair the text with ftfy and drop pages of 50 words or fewer;
      2. remove page-number residue and empty lines;
      3. drop single lines that start with a number or contain "doi:10";
      4. collapse all whitespace into single spaces and drop pages of 50
         words or fewer;
      5. drop pages that contain a phrase typical for references, lists of
         figures/abbreviations/publications, copyright or title pages,
         tables of contents or acknowledgements, and pages that mention
         "chapter "/"hoofdstuk " more than once;
      6. drop pages that do not have more than `min_words` words.

    Args:
        Text: page texts of one document.
        min_words: a page is kept only when it has more words than this
            after cleaning.

    Returns:
        ``(pages, num_words)``: the cleaned pages and, for each of them, its
        word count.
    '''
    # Threshold of the two early filters; only `min_words` is configurable.
    min_raw_words = 50

    pages = [ftfy.fix_text(t) for t in Text]
    pages = [p for p in pages if len(p.split()) > min_raw_words]

    line_cleanups = (
        (re_numbers_at_start_of_sentence, ''),
        (re_numbers_at_start_of_string, ''),
        (re_lines_with_only_numbers, ''),
        (re_multiple_newlines, '\n'),
        (re_empty_lines, '\n'),
        (re_empty_lines_start, ''),
        (re_empty_lines_end, ''),
    )
    for pattern, replacement in line_cleanups:
        pages = [pattern.sub(replacement, p) for p in pages]

    # Ignore lines that start with a number or have "doi:10" in them. This has
    # to happen while the line breaks still exist: once whitespace is
    # collapsed a page is one single line and the test would discard the
    # whole page instead of the offending line.
    pages = [
        '\n'.join(
            line for line in p.split('\n')
            if not re_numbers_at_start_of_string.search(line)
            and 'doi:10' not in line.lower()
        )
        for p in pages
    ]

    pages = [re_multiple_spaces.sub(' ', p) for p in pages]
    pages = [p for p in pages if len(p.split()) > min_raw_words]

    # A page containing any of these phrases is dropped as a whole.
    reference_phrases = ['references', 'literature', 'bibliography', 'referenties', 'literatuurlijst']
    figure_phrases = ['list of figures', 'lijst van figuren']
    abbreviation_phrases = ['list of abbreviations', 'lijst van afkortingen']
    copyright_phrases = ['all rights reserved', 'no part of this publication may be reproduced', 'copyright', 'uitgeverij']
    phd_phrases = ['volgens besluit van het college voor promoties', 'de graad van doctor aan']
    toc_phrases = ['inhoudsopgave', 'table of contents']
    # Both spellings; the original list repeated the British one.
    acknowledgement_phrases = ['acknowledgements', 'acknowledgments', 'dankwoord', 'dankbetuiging']
    publication_phrases = ['list of publications', 'lijst van publicaties', 'bibliography', 'bibliografie']
    excluded_phrases = (
        reference_phrases + figure_phrases + abbreviation_phrases
        + copyright_phrases + phd_phrases + toc_phrases
        + acknowledgement_phrases + publication_phrases
    )
    # More than one chapter mention on a page is taken as a sign of a table of contents.
    chapter_phrases = ['chapter ', 'hoofdstuk ']

    def _is_body_page(page: str) -> bool:
        lowered = page.lower()
        if any(phrase in lowered for phrase in excluded_phrases):
            return False
        return sum(lowered.count(phrase) for phrase in chapter_phrases) < 2

    pages = [p for p in pages if _is_body_page(p)]

    pages = [p for p in pages if len(p.split()) > min_words]
    num_words = [len(p.split()) for p in pages]

    return pages, num_words

In [None]:
# All PDFs of the institute folder. Sorted so that every run walks the files
# in the same order; the extension check ignores case so '.PDF' is included.
Files = sorted(f for f in os.listdir(pdf_path) if f.lower().endswith('.pdf'))

ListOfTexts = []
ListOfSummaries = []
ListOfNumWordLists = []
for _File in tqdm(Files):
    _path = os.path.join(pdf_path, _File)
    try:
        Text, error = pdf_to_text(_path)
        SummaryEnglish, SummaryDutch = extract_summary(Text)
        CleanedText, NumWordList = extractor(Text, min_words=25)
    except Exception as e:
        # One unreadable file must not abort the whole run; report and go on.
        # tqdm.write keeps the message from breaking the progress bar.
        tqdm.write(f"Error processing {_File}: {e}")
        continue
    if error:
        # Text was obtained, but only after an earlier backend had failed.
        tqdm.write(f"Backend fallback for {_File}: {error}")

    # One record per cleaned page; 'pseudo_pagenum' is the index within the
    # cleaned pages, not the page number in the PDF.
    for k, page in enumerate(CleanedText):
        ListOfTexts.append({
            'institute': institute,
            'file': _File,
            'pseudo_pagenum': k,
            'text': page
        })
    ListOfSummaries.append({
        'institute': institute,
        'file': _File,
        'summary_english': SummaryEnglish,
        'summary_dutch': SummaryDutch
    })
    ListOfNumWordLists.append({
        'institute': institute,
        'file': _File,
        'num_words_per_page': NumWordList
    })
            

  3%|▎         | 32/1223 [02:10<52:35,  2.65s/it]  