In [28]:
import camelot
import csv, os, requests, time, zipfile
from datetime import datetime

# Address of the processors-list PDF (hosted on omma.ok.gov); this value is
# passed to download_omma_pdf() at the bottom of the cell.
url = 'https://omma.ok.gov/sites/g/files/gmc736/f/omma_processors_list.pdf'

def current_time():
    """Return the local wall-clock time formatted as an ``HH:MM:SS`` string."""
    return datetime.now().strftime("%H:%M:%S")

def time_delta(start_time):
    """Return the time elapsed since ``start_time`` as a ``timedelta``.

    ``start_time`` is an ``HH:MM:SS`` string such as the one returned by
    ``current_time()``.  The current time is reduced to the same format
    before subtracting, so the result has one-second resolution.

    The timestamps carry no date.  When ``start_time`` is later in the day
    than the current time, the clock is assumed to have passed midnight once
    and the result is wrapped into the range [0, 24h) instead of going
    negative.  Durations of 24 hours or more cannot be represented.

    Raises:
        ValueError: if ``start_time`` is not in ``HH:MM:SS`` format.
    """
    from datetime import timedelta  # only needed for the midnight wrap

    fmt = "%H:%M:%S"
    # Round-trip "now" through the same format so both operands share the
    # placeholder date that strptime assigns and differ only by time of day.
    now = datetime.strptime(datetime.now().strftime(fmt), fmt)
    elapsed = now - datetime.strptime(start_time, fmt)
    if elapsed.days < 0:
        elapsed += timedelta(days=1)
    return elapsed
    

def download_omma_pdf(url):
    """Download the PDF at ``url`` and hand it to the conversion step.

    The response body is written to ``omma_processor_list.pdf`` in the
    current working directory; ``csv_to_pdf`` is then called on that file
    and its result is returned.

    Args:
        url: Address of the PDF to fetch.

    Returns:
        Whatever ``csv_to_pdf`` returns for the saved file.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.RequestException: on connection problems or a timeout.
    """
    pdf_name = 'omma_processor_list.pdf'
    start_time = current_time()
    print(f'Started download from {url} at {start_time}...')
    # Without a timeout requests may wait indefinitely on a stalled server.
    r = requests.get(url, timeout=60)
    print(f'HTTP Response: {r.status_code}')
    # Stop here on an error status instead of saving an error page under a
    # .pdf name and failing later inside the PDF parser.
    r.raise_for_status()
    with open(pdf_name, 'wb') as f:
        f.write(r.content)
    # time_delta() yields an H:MM:SS duration, so it is not labelled "seconds".
    print(f'Download complete. {time_delta(start_time)} elapsed')
    return csv_to_pdf(pdf_name)
    
    
def csv_to_pdf(file):
    """Extract the tables of a PDF with Camelot and export them as zipped CSVs.

    Despite the name, the conversion direction is PDF -> CSV.  All pages of
    ``file`` are parsed, the tables are exported with compression enabled
    under the base name ``omma_processors_list.csv``, and the archive
    ``omma_processors_list.zip`` is then passed to ``unzip_csv``.

    Args:
        file: Path of the PDF to parse.

    Returns:
        Whatever ``unzip_csv`` returns for the exported archive.

    Raises:
        IndexError: if Camelot finds no tables in the PDF.
    """
    start_time = current_time()
    file_path = os.path.abspath(file)
    print(f'Starting conversion PDF -> CSV for file {file_path}')
    tables = camelot.read_pdf(file, pages='all')
    # Fail with an explicit message when nothing was extracted.  IndexError
    # is kept because that is what indexing the empty result used to raise.
    if len(tables) == 0:
        raise IndexError(f'No tables found in {file_path}')
    tables.export('omma_processors_list.csv', f='csv', compress=True)
    print(time_delta(start_time))
    return unzip_csv('omma_processors_list.zip')
    
def unzip_csv(file):
    """Unpack a ZIP archive into ``omma_processors`` and merge its CSV files.

    The archive is extracted into an ``omma_processors`` directory below the
    current working directory, and that directory is then passed to
    ``consolidate_csv_files``.

    Args:
        file: Path of the ZIP archive to unpack.

    Returns:
        Whatever ``consolidate_csv_files`` returns for the unpack directory.
    """
    start_time = current_time()
    unpack_directory = os.path.join(os.getcwd(), 'omma_processors')
    print(f'Unpacking ZIP file to destination {unpack_directory}...')
    with zipfile.ZipFile(file, 'r') as zip_ref:
        zip_ref.extractall(unpack_directory)
        # Count the archive's own file members rather than listing the
        # directory, which may still hold files from an earlier run.
        number_of_files = sum(
            1 for member in zip_ref.infolist() if not member.is_dir()
        )
    print(f'ZIP file unpacked at {unpack_directory}, {number_of_files} files unpacked')
    print(time_delta(start_time))
    return consolidate_csv_files(unpack_directory)
    
    
def consolidate_csv_files(csv_directory):
    """Merge every ``.csv`` file under ``csv_directory`` into a single file.

    The merged file, ``consolidated_processors_list.csv``, is written to the
    parent directory of ``csv_directory``.  It starts with a fixed header
    line; the lines of each input file are then copied unchanged, except for
    lines that begin with that same header text, which are dropped.

    Afterwards the working directory is changed to that parent directory and
    ``clean_consolidated_csv`` is run on the merged file.

    Args:
        csv_directory: Directory (searched recursively) holding the CSV files.
    """
    start_time = current_time()
    abspath = os.path.abspath(csv_directory)
    dname = os.path.dirname(abspath)

    csv_header = "name,license_no,email,phone,city,zip,county"
    csv_out = os.path.join(dname, 'consolidated_processors_list.csv')
    print('Consolidating CSV files...')

    # Collect full paths so files in sub-directories can be opened and no
    # chdir into the input directory is needed.
    csv_paths = []
    for dirpath, _dirnames, filenames in os.walk(abspath):
        for filename in filenames:
            if filename.endswith('.csv'):
                csv_paths.append(os.path.join(dirpath, filename))

    # The with-block guarantees the merged file is flushed and closed before
    # the cleaning step below reads it back.
    with open(csv_out, 'w') as consolidated_csv:
        consolidated_csv.write(csv_header + '\n')
        for path in csv_paths:
            with open(path) as csv_in:
                for line in csv_in:
                    if line.startswith(csv_header):
                        continue
                    consolidated_csv.write(line)
    print(f'Consolidated CSV file saved at {csv_out}...')
    print(time_delta(start_time))
    # clean_consolidated_csv uses paths relative to the working directory,
    # so move next to the merged file before calling it.
    os.chdir(dname)
    clean_consolidated_csv('consolidated_processors_list.csv')

    
def clean_consolidated_csv(file):
    """Normalise the consolidated CSV and write it to ``final_file.csv``.

    The first row of ``file`` is discarded as a header.  For every other row
    the first cell is expected to hold two lines: the name, and a
    ``<label>: <trade name>`` line.  These are split into separate ``name``
    and ``trade_name`` columns, the next six cells are stripped of
    surrounding whitespace, and a constant ``license_type`` of
    ``'Processor'`` is appended.

    Rows that do not have that shape (no second line in the first cell, no
    colon on that line, or fewer than seven cells) are skipped, and the
    number of skipped rows is printed.
    NOTE(review): presumably the skipped rows are repeated per-page table
    headers; confirm that no real entries without a trade-name line are lost.

    Args:
        file: Path of the consolidated CSV to read.  The output is always
            written to ``final_file.csv`` in the current working directory.
    """
    csv_header = ('name', 'trade_name', 'license_no', 'email', 'phone', 'city',
                  'zip_code', 'county', 'license_type')
    print('Cleaning consolidated CSV')
    skipped = 0
    # newline='' lets the csv module do its own line-ending handling, as its
    # documentation requires for both reading and writing.
    with open(file, 'r', newline='') as source, open('final_file.csv', 'w', newline='') as output:
        reader = csv.reader(source)
        writer = csv.writer(output)

        next(reader, None)  # drop the input header; tolerate an empty file
        writer.writerow(csv_header)

        for row in reader:
            try:
                name_lines = row[0].split('\n')
                name = name_lines[0].strip()
                # Split on the first colon only so trade names that contain
                # a colon themselves are kept whole.
                trade_name = name_lines[1].split(':', 1)[1].strip()
                license_no = row[1].strip()
                email = row[2].strip()
                phone = row[3].strip()
                city = row[4].strip()
                zip_code = row[5].strip()
                county = row[6].strip()
            except IndexError:
                skipped += 1
                continue
            writer.writerow((name, trade_name, license_no, email, phone, city,
                             zip_code, county, 'Processor'))
    if skipped:
        print(f'Skipped {skipped} row(s) that did not match the expected layout.')
    print('Consolidation complete.')
    
# Run the whole pipeline. Each step calls the next one itself:
# download -> PDF to CSV export -> unzip -> consolidate -> clean.
download_omma_pdf(url)

Started download from https://omma.ok.gov/sites/g/files/gmc736/f/omma_processors_list.pdf at 21:50:55...
HTTP Response: 200
Download complete. 0:00:08 seconds elapsed
Starting conversion PDF -> CSV for file /Users/ivorybook/code/jupyter_notebook/ok_mmj/omma_processor_list.pdf
0:03:24
Unpacking ZIP file to destination /Users/ivorybook/code/jupyter_notebook/ok_mmj/omma_processors...
ZIP file unpacked at /Users/ivorybook/code/jupyter_notebook/ok_mmj/omma_processors, 94 files unpacked
0:00:00
Consolidating CSV files...
Consolidated CSV file saved at /Users/ivorybook/code/jupyter_notebook/ok_mmj/consolidated_processors_list.csv...
0:00:00
Cleaning consolidated CSV
Consolidation complete.
