## Camelot no cli
```
Uses camelot (Python API, no CLI) to extract the table content of PDF files into a single CSV.
See blog post: https://adamsramblings.xyz
December 2020
Adam Squire

/
--/csvs       
  --/pdfplumber_csvs -- holds csvs that are generated during processing
  --/camelot_csvs   -- holds csvs that are generated during processing
--/notebooks  -- holds notebooks
--/outputs    
  --/csv      -- holds output csvs
  --/sqldb    -- holds output sql dbs
--/scripts    -- holds scripts
--/srchtml    -- holds source html
--/srcpdfs    -- holds source pdfs
--/urls       -- holds urls
```

In [None]:
import pandas as pd
import numpy as np
import os
import re
import camelot
from datetime import datetime
# Wall-clock start, used to report the total run time at the end of the script
startTime = datetime.now()

# Column names applied to the combined data frame (nine in total); the last three
# (Page, File, ReportYear) are the columns this script adds to every extracted table
headers = ['Date','Time','Town / Village','Area','Occupation (Where Relevant)','Description','Page','File','ReportYear']

# These files contain repeated table headers on every page
drop = ["ufo_report_2008.pdf","UFOReport2000.pdf","UFOReport1999.pdf","UFOReport1998.pdf"]

# Project root, relative to the working directory; srcpdfs/ and outputs/ are resolved beneath it
base_path = '../'

# Full path of the combined output csv file
full_file = os.path.join(base_path,"outputs","csv","ufo_actvity_data_camelot_nocli.csv")

# Get pdf file list.
# os.listdir returns every directory entry in arbitrary order, so keep only the
# *.pdf names (stray files such as .DS_Store would otherwise break processing)
# and sort them so the row order of the output is reproducible between runs.
files = sorted(
    name
    for name in os.listdir(os.path.join(base_path,"srcpdfs"))
    if name.lower().endswith(".pdf")
)

# Cleaned per-table data frames, accumulated while the pdfs are processed
data = []

# Extract every table from every source pdf, clean it and collect it in `data`
for file in files:
    print("Processing file ",file)
    # get the year from the filename: the first run of digits
    # (raw string: '\d' in a plain literal is an invalid escape sequence)
    year_match = re.search(r'\d+', file)
    if year_match is None:
        # No digits means no report year; skip with a message instead of
        # failing the whole run with a bare IndexError
        print("Skipping file ",file," - no year found in the file name")
        continue
    year = year_match.group(0)
    # True when this file repeats the table header row on every page
    dropheader = file in drop
    tables = camelot.read_pdf(os.path.join(base_path,"srcpdfs",file), pages='1-end')
    for table in tables:
        # remove tab, new line and carriage return, both as literal backslash
        # sequences (first pattern) and as real control characters (second pattern)
        frame = table.df.replace(to_replace=[r"\\t|\\n|\\r", "\t|\n|\r"], value=["",""], regex=True)
        # convert empty strings to NaN, then remove rows that are entirely empty
        frame = frame.replace("", np.nan).dropna(axis=0, how='all')
        # tag every row with its page number, source file name and report year
        frame['Page'] = table.page
        frame['File'] = file
        frame['ReportYear'] = year
        # the first remaining row is a header row on page 1 of every file, and on
        # every page of the files listed in `drop` (assumes camelot page numbers
        # start at 1, as the pages='1-end' argument suggests), so it is dropped there
        if table.page == 1 or dropheader:
            data.append(frame[1:])
        else:
            data.append(frame)
        
# join the data to a single data frame
df = pd.concat(data, ignore_index=True)
# set the header row
df.columns = headers

# Two or more capitalised words glued together without a separator,
# e.g. the "LondonLondon" in "South East LondonLondon"
glued_words = r'([A-Z][a-z]+)([A-Z][a-z]+)+'
glued_words_at_end = glued_words + '$'

# rows whose Town / Village ends in glued words; Town / Village is only
# rewritten in the last step, so this mask is valid for both steps that use it
town_glued_at_end = df['Town / Village'].str.count(glued_words_at_end) > 0

# find instances of null area and repeated last word in Town / Village
# and replace the Area value with the last word from Town / Village
area_missing = df['Area'].isnull() & (df['Town / Village'].str.count(glued_words) > 0)
df.loc[area_missing, 'Area'] = df['Town / Village'].str.split().str[-1]

# strip the repeated word from Area (removes the first word-like match only).
# regex=True is passed explicitly: since pandas 2.0 str.replace defaults to
# regex=False, which would treat the pattern as literal text and change nothing.
area_glued = (df['Area'].str.count(glued_words_at_end) > 0) & town_glued_at_end
df.loc[area_glued, 'Area'] = df['Area'].str.replace(r"(\w[a-z]+)", "", n=1, regex=True)

# strip the repeated word from the end of Town / Village
df.loc[town_glued_at_end, 'Town / Village'] = df['Town / Village'].str.replace(r"\w[a-z]+$", "", n=1, regex=True)

# generate output csv file
df.to_csv(full_file, index=False, header=True)
print("Created file ",full_file)
print("File has",len(df)," rows")
# get row counts per source file; printed explicitly because a bare expression
# that is not the last statement produces no output
print(df.groupby('File')['File'].count())
print("Time to complete",datetime.now() - startTime)