This notebook will let you analyze all certificate documents uploaded to [californiascouting.org](https://californiascouting.org) and query for the following information:

| Query |
| --- |
| This certificate is presented to who? |
| For completing the California Child Abuse Mandated Reporter Online Training for what? |
| Hours of education earned? |
| What is the date of completion? |
| What is the certificate number? |

In [25]:
import os
import io
from urllib.parse import urlparse

import boto3
import pandas as pd
import requests
from PyPDF2 import PdfReader, PdfWriter
from PyPDF2.errors import PdfReadError

# CSV file read by upload_files(); it must contain an 'upload_doc_url' column.
input_csv_filename = "bsa-glaac-ca-ab506-certificates-input2.csv"
# Name of the S3 bucket the downloaded documents are uploaded to.
bucket_name = "bsa-ca-ab506-training"
# Key prefix for objects that contain only the second page of a multi-page PDF.
second_page_only_prefix = 'second_page_only/'

# boto3 session bound to a named AWS profile; upload_files() uses it to create the S3 resource.
session = boto3.Session(profile_name='bsa-ca-ab506-textract')

In [31]:
def upload_from_url(url: str, bucket, timeout: float = 30):
    """Download one document and store it in the given S3 bucket.

    Only URLs under https://californiascouting.org/ are fetched. When the
    download is a PDF with more than one page, only its second page is
    uploaded, under the ``second_page_only_prefix`` key prefix (Textract
    synchronous document analysis only works on one page PDFs). Single-page
    PDFs and files that cannot be read as PDFs are uploaded unchanged under
    their base file name.

    Args:
        url: Address of the document to download.
        bucket: Object exposing ``upload_fileobj(fileobj, key)``, e.g. a
            boto3 S3 ``Bucket`` resource.
        timeout: Seconds to wait for the server before ``requests`` raises,
            so a stalled download cannot hang the whole batch.

    Returns:
        None. A line is printed for every upload and for every skipped URL.

    Raises:
        requests.RequestException: If the download fails or times out.
    """
    if not url.startswith('https://californiascouting.org/'):
        print(f'Skipped {url}: not a californiascouting.org URL')
        return

    r = requests.get(url, timeout=timeout)
    if r.status_code != 200:
        print(f'Skipped {url}: HTTP status {r.status_code}')
        return

    # get file name
    file_name = os.path.basename(urlparse(url).path)

    # load file
    raw_data = r.content

    # Default: upload the download as-is under its own name.
    key, payload = file_name, raw_data

    # detect pages
    try:
        reader = PdfReader(io.BytesIO(raw_data))
        if len(reader.pages) > 1:
            # extracting the 2nd page
            # (Textract synchronous document analysis only works on one page PDFs)
            writer = PdfWriter()
            writer.add_page(reader.pages[1])
            with io.BytesIO() as bytes_stream:
                writer.write(bytes_stream)
                single_page = bytes_stream.getvalue()
            # Switch to the prefixed key only once extraction has succeeded,
            # so a failure above can never store the raw file under the
            # second-page prefix.
            key, payload = f"{second_page_only_prefix}{file_name}", single_page
    except PdfReadError:
        # not a readable PDF, saving the file directly to S3
        key, payload = file_name, raw_data

    bucket.upload_fileobj(io.BytesIO(payload), key)
    print(f'Uploaded {key}')

def upload_files(bucket_name: str, input_csv_filename: str, session: boto3.Session, testing=False) -> pd.DataFrame:
    """Upload every document listed in the input CSV that is not in the bucket yet.

    The CSV must contain an ``upload_doc_url`` column. A ``document`` column
    holding the base file name of each URL is inserted right after it when
    the CSV does not already have one. Documents whose name already exists
    as a key in the bucket (with or without ``second_page_only_prefix``) are
    skipped, so the function can be re-run to resume an interrupted upload.

    Args:
        bucket_name: Name of the target S3 bucket.
        input_csv_filename: Path of the CSV file to read.
        session: boto3 session used to create the S3 resource.
        testing: When true, rows are sorted by URL in descending order and at
            most six pending documents are uploaded.

    Returns:
        The full input table (not only the rows uploaded in this run),
        including the ``document`` column.
    """
    s3 = session.resource('s3')
    bucket = s3.Bucket(bucket_name)

    # load full URL list
    df = pd.read_csv(input_csv_filename)
    if testing:
        df.sort_values(by=['upload_doc_url'], ascending=False, ignore_index=True, inplace=True)
    if 'document' not in df:
        df.insert(
            loc=df.columns.get_loc('upload_doc_url') + 1,
            column='document',
            value=df['upload_doc_url'].apply(lambda x: os.path.basename(urlparse(x).path))
        )

    # check which have already been uploaded (we will filter these out when re-running this script)
    # Strip the prefix only where it really is a prefix; a plain replace()
    # would also remove the same text from the middle of a key.
    prefix_len = len(second_page_only_prefix)
    uploaded = {
        obj.key[prefix_len:] if obj.key.startswith(second_page_only_prefix) else obj.key
        for obj in bucket.objects.all()
    }
    upload_df = df[~df['document'].isin(uploaded)]
    if testing:
        upload_df = upload_df.head(6)

    # upload each URL to the bucket
    object_count = len(upload_df.index)
    # Count positions within the filtered list: the DataFrame index labels
    # still refer to rows of the full table and would give misleading
    # progress such as "57 of 3".
    for position, url in enumerate(upload_df['upload_doc_url'], start=1):
        print(f"{position} of {object_count}")
        upload_from_url(url, bucket)
    return df

# Upload the documents that are not in the bucket yet and keep the returned input table.
input_df = upload_files(bucket_name, input_csv_filename, session)