In [1]:
import PyPDF2
import os
import pandas as pd
from PyPDF2 import PdfFileReader
import pikepdf
import datetime
import re
from dateutil.tz import tzutc, tzoffset
import sys
import camelot as cam

# Compiled pattern for PDF metadata dates of the form D:YYYYMMDDHHmmSSOHH'mm'
# (e.g. "D:20120321183444+07'00'"). The leading "D:" and the whole timezone
# part are optional; year through second are required.
# NOTE: the hyphen is placed first in the tz_offset character class so that it
# is a literal '-'. Written as "[+-zZ]" it would be the range '+'..'z', which
# also matches digits and most letters.
pdf_date_pattern = re.compile(''.join([
    r"(D:)?",
    r"(?P<year>\d\d\d\d)",
    r"(?P<month>\d\d)",
    r"(?P<day>\d\d)",
    r"(?P<hour>\d\d)",
    r"(?P<minute>\d\d)",
    r"(?P<second>\d\d)",
    r"(?P<tz_offset>[-+zZ])?",
    r"(?P<tz_hour>\d\d)?",
    r"'?(?P<tz_minute>\d\d)?'?"]))

def transform_date(date_str):
    """Convert a PDF date string into a timezone-aware ``datetime``.

    PDF metadata stores dates as ``D:YYYYMMDDHHmmSSOHH'mm'``, for example
    ``D:20120321183444+07'00'``. Format reference:
    http://www.verypdf.com/pdfinfoeditor/pdf-date-format.htm

    This is a helper for ``pdf_to_df``; it is not part of the table
    extraction itself.

    :param date_str: pdf date string
    :return: timezone-aware datetime object, or ``None`` when ``date_str``
        does not match ``pdf_date_pattern``
    :raises ValueError: if the matched digits do not form a valid date/time
        (raised by ``datetime.datetime``)
    """
    match = pdf_date_pattern.match(date_str)
    if match is None:
        return None

    parts = match.groupdict()

    # Pull the timezone pieces out first; what remains in `parts` are exactly
    # the keyword arguments datetime() expects (year ... second).
    sign = parts.pop('tz_offset')
    # The hour/minute groups are optional in the pattern, so a sign can be
    # present without them; treat a missing component as zero.
    tz_hour = int(parts.pop('tz_hour') or 0)
    tz_minute = int(parts.pop('tz_minute') or 0)

    if sign is None or sign.lower() == 'z':  # no offset given, or Z/z: UTC
        tzinfo = tzutc()
    else:
        multiplier = 1 if sign == '+' else -1
        tzinfo = tzoffset(None, multiplier * (3600 * tz_hour + 60 * tz_minute))

    fields = {name: int(value) for name, value in parts.items()}
    return datetime.datetime(tzinfo=tzinfo, **fields)


def pdf_to_df(directory, output_csv='Pdf_to_csv.csv'):
    """Collect the tables of every PDF in ``directory`` into one DataFrame.

    Two PDF libraries are used per file: PyPDF2 for the document metadata
    (author, title, creation date) and camelot (``lattice`` flavor, all pages)
    for the tables. The first row of every extracted table is dropped
    (presumably the header row - confirm against the source PDFs), the first
    three columns are renamed to the fixed column names below, and the
    metadata is added as constant columns.

    :param directory: folder to scan; only entries ending in ``.pdf``
        (case-insensitive) are processed, in sorted order
    :param output_csv: path the combined frame is written to as CSV; pass
        ``None` to skip writing
    :return: combined DataFrame with one row per extracted table row
    """
    table_columns = [
        'Prioriteret \nanvendelse',
        'Præparatnavn \n(lægemiddelnavn)',
        'Dosering og dispenseringsform',
    ]
    output_columns = table_columns + ['author', 'title', 'creation date']

    frames = []
    # Sorted so the row order of the result does not depend on the order in
    # which the operating system happens to list the directory.
    for file in sorted(os.listdir(directory)):
        if not file.lower().endswith('.pdf'):
            continue
        path = os.path.join(directory, file)

        # PyPDF2: document metadata. Values are read while the file is still
        # open because PyPDF2 may resolve them lazily from the stream.
        with open(path, 'rb') as pdfFileObj:
            print(pdfFileObj)
            info = PdfFileReader(pdfFileObj).getDocumentInfo()
            author = info.author if info is not None else None
            title = info.title if info is not None else None
            raw_date = info.get('/CreationDate') if info is not None else None
        creation_date = transform_date(str(raw_date)) if raw_date is not None else None

        # camelot: table extraction.
        tables = cam.read_pdf(
            path,
            pages='all',
            password=None,
            flavor='lattice',
            suppress_stdout=False,
            layout_kwargs={},
        )
        if len(tables) == 0:
            # Nothing to extract from this file.
            continue

        # Drop the first row of each table, then stack the tables. (The
        # previous label-based drop on the stacked frame had this same effect,
        # because every table's index restarts at 0.)
        df = pd.concat([table.df.iloc[1:] for table in tables])

        # Name as many leading columns as the tables actually have; some PDFs
        # only contain two-column tables.
        df = df.rename(columns=dict(zip(df.columns, table_columns)))

        # Metadata columns, identical for every row of this file.
        df['author'] = author
        df['title'] = title
        df['creation date'] = creation_date
        frames.append(df)

    if frames:
        combined = pd.concat(frames, ignore_index=True)
        # Expected columns first (created empty if absent), then any extras.
        extras = [col for col in combined.columns if col not in output_columns]
        df_final = combined.reindex(columns=output_columns + extras)
    else:
        df_final = pd.DataFrame(columns=output_columns)

    if output_csv is not None:
        df_final.to_csv(output_csv, index=False)
    return df_final


# Entry point: extract every PDF found in the 'MedPDF' folder. `directory` and
# `df` are left at module level so that later notebook cells can inspect them.
if __name__ == "__main__":
    directory = "MedPDF"
    df = pdf_to_df(directory=directory)
    
"""
   A challenge in PDF scraping is the assumption that all tables share the same structured
   three-column format. As the tables in "test2.pdf" demonstrate, some tables have
   only two columns. This can be addressed by working out what each column means across the different table
   structures, which would lay the foundation for a more accurate column naming convention.
"""

C:\Users\ander\anaconda3\envs\BDP\lib\site-packages\numpy\.libs\libopenblas.GK7GX5KEQ4F6UYO3P26ULGBQYHGQO7J4.gfortran-win_amd64.dll
C:\Users\ander\anaconda3\envs\BDP\lib\site-packages\numpy\.libs\libopenblas.WCDJNK7YVMPZQ2ME2ZZHJJRJ3JIKNDB7.gfortran-win_amd64.dll
  stacklevel=1)


<_io.BufferedReader name='MedPDF\\Test.pdf'>
<_io.BufferedReader name='MedPDF\\test2.pdf'>


'\n   Chalenges within the field of pdf scrapping, is the assumphoin the all the tables have structured\n   table format with three columns. As it is demonstrated in the pdf from "test2.pdf", some of the tables will have \n   only two columns. This is a problem, which can be targetted by extracting the different meaning across different table \n   structures. This will lay a foundation for a more accurate naming convention.\n'

In [2]:
# Summarise the combined frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75 entries, 0 to 74
Data columns (total 6 columns):
 #   Column                          Non-Null Count  Dtype 
---  ------                          --------------  ----- 
 0   Prioriteret 
anvendelse         75 non-null     object
 1   Præparatnavn 
(lægemiddelnavn)  75 non-null     object
 2   Dosering og dispenseringsform   57 non-null     object
 3   author                          75 non-null     object
 4   title                           75 non-null     object
 5   creation date                   75 non-null     object
dtypes: object(6)
memory usage: 3.6+ KB


In [3]:
# Preview the first five rows of the combined frame.
df.head()

Unnamed: 0,Prioriteret \nanvendelse,Præparatnavn \n(lægemiddelnavn),Dosering og dispenseringsform,author,title,creation date
0,Anvend som 1. valg \ntil mindst 80 % af \npati...,Hyrimoz (adalimumab),Induktion: \n80 mg s.c. i uge 0 og 40 mg i uge...,Medicinrådet,Medicinrådets lægemiddelrek. og behandlingsvej...,2022-01-18 10:00:56+01:00
1,2. valg,Taltz (ixekizumab),"Induktion: \n160 mg s.c. i uge 0, 80 mg i uge ...",Medicinrådet,Medicinrådets lægemiddelrek. og behandlingsvej...,2022-01-18 10:00:56+01:00
2,3. valg,Cosentyx (secukinumab),"Induktion: \n300 mg s.c. i uge 0, 1, 2, 3 og 4...",Medicinrådet,Medicinrådets lægemiddelrek. og behandlingsvej...,2022-01-18 10:00:56+01:00
3,4. valg,Tremfya (guselkumab),Induktion: \n100 mg s.c. i uge 0 og 4 \nVedlig...,Medicinrådet,Medicinrådets lægemiddelrek. og behandlingsvej...,2022-01-18 10:00:56+01:00
4,5. valg,Skyrizi (risankizumab),Induktion: \n150 mg s.c. i uge 0 og 4 \nVedli...,Medicinrådet,Medicinrådets lægemiddelrek. og behandlingsvej...,2022-01-18 10:00:56+01:00
