## PDFPlumber
Using PDFPlumber to process PDF files: the table content of each PDF is extracted to its own CSV, and the per-PDF CSVs are then combined into a single CSV.

See blog post https://adamsramblings.xyz

December 2020, Adam Squire

```
/
--/csvs       
  --/pdfplumber_csvs -- holds csvs that are generated during processing
  --/camelot_csvs   -- holds csvs that are generated during processing
--/notebooks  -- holds notebooks
--/outputs    
  --/csv      -- holds output csvs
  --/sqldb    -- holds output sql dbs
--/scripts    -- holds scripts
--/srchtml    -- holds source html
--/srcpdfs    -- holds source pdfs
--/urls       -- holds urls
```

In [None]:
import pandas as pd
import pdfplumber
import os
import sys
import numpy as np
import glob
import re
from datetime import datetime
# Start of the run; the final print statement reports the elapsed time from here
startTime = datetime.now()

# Base path: every input and output folder is resolved relative to this
path = "../"

# Every file holds the same columns but the header text varies slightly from
# file to file, so all outputs are conformed to this single set of names
headers = [
    'Date',
    'Time',
    'Town / Village',
    'Area',
    'Occupation (Where Relevant)',
    'Description',
    'Page',
    'File',
    'ReportYear',
]

# Some PDFs have no bottom rule under the table on the first page. PDFPlumber
# then cannot find the end of the table and treats the penultimate row as the
# last one. Map the file name to the position of a line at the bottom of the
# page so that an explicit horizontal line can be supplied for page one.
line = {
    "UFOReports2006WholeoftheUK.pdf": 521,
    "UFOReports2004WholeoftheUK.pdf": 552.875,
}

# Some PDFs repeat the header row on every page. For the files named here the
# header is taken from the first page only and dropped from all later pages.
drop = [
    "ufo_report_2008.pdf",
    "UFOReport2000.pdf",
    "UFOReport1999.pdf",
    "UFOReport1998.pdf",
]


def make_df(page_number,table,dropheader=False):
    """Build a cleaned DataFrame from one page's table as prepared by PDFPlumber.

    Args:
        page_number (int): Zero-based number of the page being processed.
        table (list): List of rows representing the table. ``None`` or an
            empty list (no table on the page) yields an empty DataFrame.
        dropheader (bool): True for files that repeat the header row on
            every page; the first row of pages after the first is dropped.

    Returns:
        pandas.DataFrame: The table data with tabs and new lines removed,
        fully empty rows removed, split Town / Area cells repaired, the
        header row removed and a 1-based ``Page`` column added.
    """
    # nothing was extracted for this page: hand back an empty frame so the
    # caller can still concatenate it with the other pages
    if not table:
        return pd.DataFrame()

    df = pd.DataFrame(table)
    # remove tabs, new lines and carriage returns, both the real control
    # characters and their escaped two-character spellings
    df.replace(to_replace=[r"\\t|\\n|\\r", "\t|\n|\r"], value=["", ""], regex=True, inplace=True)
    # convert empty strings to NaN so that fully empty rows can be dropped
    df.replace("", np.nan, inplace=True)
    df.dropna(axis=0, how='all', inplace=True)

    # Column 2 = Town / Village, column 3 = Area. In some rows the first
    # character of Area has been cut off and glued to the end of Town
    # ("BirminghamW" / "est Midlands"). Only repair when the columns exist.
    if 2 in df.columns and 3 in df.columns:
        town = df[2]
        area = df[3]
        # both symptoms must be present, so the character is always moved as
        # a whole: never duplicated into Area or stripped from a clean Town
        stray = (town.str.contains(r'\w[A-Z]$', na=False)
                 & area.str.contains(r'^[a-z]', na=False))
        # compute both repairs from the untouched values before assigning
        fixed_area = town.str[-1] + area
        fixed_town = town.str[:-1]
        df.loc[stray, 3] = fixed_area[stray]
        df.loc[stray, 2] = fixed_town[stray]

    # page_number is zero based, increment by 1 to match the PDF document
    df['Page'] = page_number + 1
    # the first page always carries the header row; later pages only carry
    # one for the files flagged through dropheader
    if page_number == 0 or dropheader:
        df = df.iloc[1:]
    return df
    
def process_pdf(file):
    """Extract the tables of one source PDF with PDFPlumber and write them to a CSV.

    The PDF is read from the ``srcpdfs`` folder under the base path and the
    CSV is written to ``csvs/pdfplumber_csvs`` under the same name with a
    ``.csv`` extension. ``File`` and ``ReportYear`` columns are added and the
    columns are renamed to the shared ``headers``.

    Args:
        file (str): File name of the PDF inside the ``srcpdfs`` folder.

    Returns:
        None

    Raises:
        ValueError: If the file name contains no digits to use as the year.
    """
    # the report year is the first run of digits in the file name
    year_match = re.search(r'\d+', file)
    if year_match is None:
        raise ValueError("No year found in file name " + file)
    year = year_match.group(0)

    # whether repeated headers must be dropped is a per-file property, so it
    # is decided once rather than for every page
    dropheaders = file in drop

    data = []
    with pdfplumber.open(os.path.join(path, "srcpdfs", file)) as pdf:
        for page_number, page in enumerate(pdf.pages):
            settings = {}
            # custom setting: supply the missing bottom line of the table on
            # the first page of the files listed in `line`
            if page_number == 0 and file in line:
                settings["explicit_horizontal_lines"] = [line[file]]
            table = page.extract_table(table_settings=settings)
            # no table could be extracted from this page: report and skip it
            # rather than failing the whole file
            if not table:
                print("No table found on page ", page_number + 1, " of ", file)
                continue
            data.append(make_df(page_number, table, dropheaders))

    if not data:
        print("No tables found in ", file)
        return

    # same base name as the PDF, with a csv extension
    csv_file = os.path.join(path, "csvs", "pdfplumber_csvs",
                            os.path.splitext(file)[0] + ".csv")
    # concatenate the page tables and add the per-file columns
    df = pd.concat(data, axis=0)
    df['File'], df['ReportYear'] = file, year
    # conform the column names
    df.columns = headers
    df.to_csv(csv_file, index=False, header=True)
    print("CSV File Written to ",csv_file)


# List the source folder in sorted order so that runs are reproducible
files = sorted(os.listdir(os.path.join(path, "srcpdfs")))
# for each pdf file call process_pdf(); match on the extension only, so that
# names which merely contain ".pdf" (e.g. "report.pdf.bak") are not picked up
for file in files:
    if file.endswith(".pdf"):
        print("Processing file ",file)
        process_pdf(file)


# Collect the per-PDF csv files in sorted order so that the row order of the
# combined file does not depend on the file system
files = sorted(glob.glob(os.path.join(path, "csvs","pdfplumber_csvs","*.csv")))
if files:
    full_file = os.path.join(path,"outputs","csv","ufo_all_data.csv")
    # read every created csv file and combine them into a single data frame
    # to create a single csv file
    df = pd.concat((pd.read_csv(f, header=0, names=headers) for f in files), ignore_index=True)
    df.to_csv(full_file, index=False,header=True)
    print("Created file ",full_file)
    print("File has",len(df)," rows")
    # number of rows contributed by each source file; printed explicitly
    # because a bare expression inside this block is never displayed
    print(df.groupby('File')['File'].count())
else:
    print("No CSV files found")

print("Time to complete",datetime.now() - startTime)