In [None]:
'''
This script converts data from DOCCS' M187 incarcerated individual death forms into a table format. It first
classifies each form by layout type using a Microsoft Azure AI Document Intelligence model, then extracts
fields from each form using a Document Intelligence model custom-built for that layout type. Finally,
following a manual review of the extracted fields, it standardizes and cleans the fields that need to be converted
to the appropriate types (e.g. float, date, time).

To run this code, perform the following steps:
1. Install the packages in the cell below on your local machine.
2. Insert your Azure blob key, URL, and container name for the M187s in the cell below. Also insert your Azure formrecognizer endpoint and key.
3. Upload the M187s as PDFs to your Azure blob container, using a separate file for each M187 (first page only).
4. Create a document classification model in FormRecognizer, using a few M187 samples to train the model. Also create document extraction
models for each M187 type (boldface and non-boldface). To match the field names used throughout this code pipeline, use the following
field names in your FormRecognizer models:

        "DECEASED_NAME", "DIN", "NYSID", "FACILITY_NAME", "CODE", \
        "REPORT_DATE", "REPORTING_OFFICIAL_NAME", "HEIGHT", "HEIGHT_FT", "HEIGHT_IN", "ETHNICITY", "WEIGHT", "RACE", "SEX", \
        "SENTENCE", "BIRTH_DATE", "SENTENCE_DATE", "ARREST_CHARGES", "DATE_ARREST", "DATE_CONVICTION", "CONVICTION_CHARGES", \
        "HOSPITAL_NAME", "CHIEF_ADMIN_OFFICER_NAME", "AMBULANCE_RESCUE_SQUAD_NAME", "DATETIME_ADMITTED", "DATE_OF_LAST_ADMISSION", \
        "DEATH_DATE", "DEATH_TIME", "TERMINAL_INCIDENT_LOCATION", "REPORTED_IMMEDIATE_CAUSE_OF_DEATH", "FACILITY_ADMINISTRATORS_REPORT_OF_DEATH_CIRCUMSTANCES", \
        "STAFF_INCARCERATED_WITNESSES", "OFFICER_SUPERVISING_DEATH_LOCATION", "ASSIGNED_HOUSING_UNIT", "HOUSING_UNIT_TYPE", \
        "AUTOPSY_DATE", "AUTOPSY_TIME", "AUTOPSY_DATETIME", "AUTOPSY_LOCATION", "MEDICAL_EXAMINER_CORONER_NAME", "AUTOPSY_PERFORMED_YES", "AUTOPSY_PERFORMED_NO", \
        "SUPERVISION_PRIOR_TO_INCIDENT_ACTIVE", "SUPERVISION_PRIOR_TO_INCIDENT_CONSTANT", "SUPERVISION_PRIOR_TO_INCIDENT_GENERAL",
        "SUBSTANCE_ABUSE_DRUG", "SUBSTANCE_ABUSE_ALCOHOL", "SUBSTANCE_ABUSE_UNKNOWN", "SUBSTANCE_ABUSE_NO", \
        "MEDICAL_TREATMENT", "PSYCH_TREATMENT", "NO_TREATMENT", "DATE_LAST_CONTACT", "MEDICAL_CONTACT", "PSYCH_CONTACT", \
        "INTAKE_SCREENING_YES", "INTAKE_SCREENING_NO", "FIELD_CONFIDENCE_DICT", "FACILITY_CODE_2D", "WEIGHT_LBS_F", "DATE_OF_BIRTH_D", \
        "DEATH_TIME_T", "DATE_ARREST_D", "DATE_CONVICTION_D", "SENTENCE_DATE_D", "AUTOPSY_DATE_D", "DATETIME_ADMITTED_DT", "DATE_OF_LAST_ADMISSION_D"

5. Once you have created the classification and extraction models, fill in the names of the models under #model IDs in the cell below.
'''

In [None]:
# import libraries
import os
import re
import pandas as pd
import numpy as np
import json
import datetime as dt
import pyodbc
import time

from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential

from azure.storage.blob import BlobClient
from azure.storage.blob import BlobServiceClient, ContentSettings
from azure.storage.blob import ContainerClient

# Azure account info
# Each setting is read from an environment variable of the same name when one is set, so real
# secrets do not have to be saved inside the notebook. When the variable is absent the placeholder
# below is used and must be replaced by hand before running.
AZURE_BLOB_ACCOUNT_KEY = os.getenv("AZURE_BLOB_ACCOUNT_KEY", "[INSERT ACCOUNT KEY HERE]")
AZURE_BLOB_ACCOUNT_URL = os.getenv("AZURE_BLOB_ACCOUNT_URL", "[INSERT ACCOUNT URL HERE]")
AZURE_FORM_RECOGNIZER_ENDPOINT = os.getenv("AZURE_FORM_RECOGNIZER_ENDPOINT", "[INSERT FORMRECOGNIZER ENDPOINT HERE]")
AZURE_FORM_RECOGNIZER_KEY = os.getenv("AZURE_FORM_RECOGNIZER_KEY", "[INSERT FORMRECOGNIZER KEY HERE]")
AZURE_BLOB_M187_CONTAINER = os.getenv("AZURE_BLOB_M187_CONTAINER", "[INSERT CONTAINER NAME HERE]")

# shared Form Recognizer client used by the classification and extraction functions
document_analysis_client = DocumentAnalysisClient(
    endpoint=AZURE_FORM_RECOGNIZER_ENDPOINT, credential=AzureKeyCredential(AZURE_FORM_RECOGNIZER_KEY)
)

# model IDs
AZURE_FORM_RECOGNIZER_CLASSIFIER_ID = "[INSERT CLASSIFIER MODEL NAME HERE]"
BOLDFACE_MODEL_ID = "[INSERT BOLDFACE EXTRACTION MODEL NAME HERE]"
NON_BOLDFACE_MODEL_ID = "[INSERT NON-BOLDFACE EXTRACTION MODEL NAME HERE]"

# variable lists
# RAW_* lists name the columns as extracted from the forms; DATE_COL_LIST names the typed date columns.
# NOTE(review): the column schema built in classify_and_extract_info() names the typed birth-date column
# "DATE_OF_BIRTH_D" and contains no "REPORT_DATE_D" / "DEATH_DATE_D" -- confirm DATE_COL_LIST matches the
# reviewed table before relying on it.
RAW_DATE_COL_LIST = ["REPORT_DATE", "BIRTH_DATE", "SENTENCE_DATE", "DATE_ARREST", "DATE_CONVICTION", "DATE_OF_LAST_ADMISSION", "DEATH_DATE", "AUTOPSY_DATE"]
DATE_COL_LIST = ["REPORT_DATE_D", "BIRTH_DATE_D", "SENTENCE_DATE_D", "DATE_ARREST_D", "DATE_CONVICTION_D", "DATE_OF_LAST_ADMISSION_D", "DEATH_DATE_D", "AUTOPSY_DATE_D"]
RAW_TIME_COL_LIST = ["DEATH_TIME", "AUTOPSY_TIME"]
RAW_DATETIME_COL_LIST = ["AUTOPSY_DATETIME", "DATETIME_ADMITTED"]

In [None]:
# reads names of M187 forms to process
def read_file_names_from_blob_container():
    '''
    Lists the PDF blobs stored in the M187 Azure blob container.

    Returns:
        list of str: names of the blobs whose last dot-separated segment is "pdf"
        (compared case-insensitively), in the order the container lists them.
    '''

    # connect to M187 forms blob container
    m187_container = ContainerClient(
        account_url=AZURE_BLOB_ACCOUNT_URL,
        container_name=AZURE_BLOB_M187_CONTAINER,
        credential=AZURE_BLOB_ACCOUNT_KEY,
    )

    def has_pdf_extension(blob_name):
        # the text after the final "." decides whether the blob counts as a PDF
        return blob_name.split('.')[-1].lower() == 'pdf'

    # keep only the PDF files found in the container
    return [entry.name for entry in m187_container.list_blobs() if has_pdf_extension(entry.name)]

In [None]:
def delete_blob_from_blob_container(blob_name):
    '''
    Removes one blob from the M187 Azure blob container.

    Args:
        blob_name: name of the blob to delete.

    Returns:
        None
    '''

    # open the M187 container and delete the requested blob in one step
    ContainerClient(
        account_url=AZURE_BLOB_ACCOUNT_URL,
        container_name=AZURE_BLOB_M187_CONTAINER,
        credential=AZURE_BLOB_ACCOUNT_KEY,
    ).delete_blob(blob=blob_name)

In [None]:
def save_csv_to_blob_container(df, file_name):
    '''
    Saves a Pandas DataFrame to the M187 Azure blob container as a pipe-delimited CSV.

    Args:
        df: DataFrame to serialize (written without its index, using "|" as the separator).
        file_name: name of the blob to create; an existing blob with that name is overwritten.

    Returns:
        None
    '''

    # create the BlobServiceClient object and a client for the target blob
    blob_service_client = BlobServiceClient(AZURE_BLOB_ACCOUNT_URL, credential=AZURE_BLOB_ACCOUNT_KEY)
    blob_client = blob_service_client.get_blob_client(container=AZURE_BLOB_M187_CONTAINER, blob=file_name)

    # The previous version also built ContentSettings(content_encoding='UTF-8') here but never
    # passed it to the upload; that dead local has been removed, the upload call is unchanged.
    csv_text = df.to_csv(encoding='UTF-8', index=False, sep='|')

    # save file as blob to container, cool storage
    blob_client.upload_blob(csv_text, overwrite=True, content_type="text/csv")
    blob_client.set_standard_blob_tier('Cool')

    return

In [None]:
'''
The following functions classify and extract fields from M187s using custom Azure Document Intelligence models.
'''

In [None]:
def classify_documents(file_list):
    '''
    Classifies each page as (1) boldface first-page, (2) non-boldface first page, or (3) second page and returns dictionary
    with document type corresponding to each filename. Note that documents need to be split into individual pages
    before using this pipeline.

    Args:
        file_list: iterable of blob names located in the AZURE_BLOB_M187_CONTAINER container.

    Returns:
        dict mapping each file name to the doc_type reported by the classifier. A file for which
        the classifier returns no document has no entry in the dictionary.
    '''
    # the CLASSIFIER_ID environment variable, when set, overrides the notebook constant
    classifier_id = os.getenv("CLASSIFIER_ID", AZURE_FORM_RECOGNIZER_CLASSIFIER_ID)

    file_types = {}

    for file in file_list:
        # Read from the configured container (the same one the listing, delete and save helpers use)
        # instead of a hard-coded container name.
        blob_client = BlobClient(
            account_url=AZURE_BLOB_ACCOUNT_URL,
            container_name=AZURE_BLOB_M187_CONTAINER,
            blob_name=file,
            credential=AZURE_BLOB_ACCOUNT_KEY,
        )
        poller = document_analysis_client.begin_classify_document_from_url(
            classifier_id, document_url=blob_client.url
        )
        result = poller.result()

        for page in result.documents: #documents are single-page
            file_types[file] = page.doc_type
    return file_types


In [None]:
def extract_info_from_boldface_form(file, fields_values_df):
    '''
    Uses a custom Azure Document Intelligence model to extract fields from boldface forms (first page only).

    Args:
        file: blob name of the form inside the AZURE_BLOB_M187_CONTAINER container.
        fields_values_df: DataFrame of previously extracted forms to append to.

    Returns:
        fields_values_df with one row appended per document returned by the model. Each row holds
        FORM_TYPE, FORM_URL, every extracted field value, and FIELD_CONFIDENCE_DICT (field name ->
        confidence). The input is returned unchanged when the model returns no documents.
    '''

    # the BOLDFACE_MODEL_ID environment variable, when set, overrides the notebook constant
    model_id = os.getenv("BOLDFACE_MODEL_ID", BOLDFACE_MODEL_ID)
    blob_client = BlobClient(
        account_url=AZURE_BLOB_ACCOUNT_URL,
        container_name=AZURE_BLOB_M187_CONTAINER,
        blob_name=file,
        credential=AZURE_BLOB_ACCOUNT_KEY,
    )

    poller = document_analysis_client.begin_analyze_document_from_url(
        model_id=model_id, document_url=blob_client.url
    )
    result = poller.result()

    rows = []
    for document in result.documents:
        # fresh dictionaries per document so that rows never share or inherit each other's fields
        key_value_dict = {"FORM_TYPE": "boldface", "FORM_URL": blob_client.url}
        key_confidence_dict = {}
        for field_name, field in document.fields.items():
            key_value_dict[field_name] = field.value
            key_confidence_dict[field_name] = field.confidence
        key_value_dict["FIELD_CONFIDENCE_DICT"] = key_confidence_dict
        rows.append(key_value_dict)

    # append all new rows in a single concat
    if rows:
        fields_values_df = pd.concat([fields_values_df, pd.DataFrame(rows)], ignore_index=True)

    return fields_values_df

In [None]:
def extract_info_from_non_boldface_form(file, fields_values_df):
    '''
    Uses a custom Azure Document Intelligence model to extract fields from non-boldface forms (first page only).

    Args:
        file: blob name of the form inside the AZURE_BLOB_M187_CONTAINER container.
        fields_values_df: DataFrame of previously extracted forms to append to.

    Returns:
        fields_values_df with one row appended per document returned by the model. Each row holds
        FORM_TYPE, FORM_URL, every extracted field value, and FIELD_CONFIDENCE_DICT (field name ->
        confidence). The input is returned unchanged when the model returns no documents.
    '''

    # the NON_BOLDFACE_MODEL_ID environment variable, when set, overrides the notebook constant
    model_id = os.getenv("NON_BOLDFACE_MODEL_ID", NON_BOLDFACE_MODEL_ID)
    blob_client = BlobClient(
        account_url=AZURE_BLOB_ACCOUNT_URL,
        container_name=AZURE_BLOB_M187_CONTAINER,
        blob_name=file,
        credential=AZURE_BLOB_ACCOUNT_KEY,
    )

    poller = document_analysis_client.begin_analyze_document_from_url(
        model_id=model_id, document_url=blob_client.url
    )
    result = poller.result()

    rows = []
    for document in result.documents:
        # fresh dictionaries per document so that rows never share or inherit each other's fields
        key_value_dict = {"FORM_TYPE": "non-boldface", "FORM_URL": blob_client.url}
        key_confidence_dict = {}
        for field_name, field in document.fields.items():
            key_value_dict[field_name] = field.value
            key_confidence_dict[field_name] = field.confidence
        key_value_dict["FIELD_CONFIDENCE_DICT"] = key_confidence_dict
        rows.append(key_value_dict)

    # append all new rows in a single concat
    if rows:
        fields_values_df = pd.concat([fields_values_df, pd.DataFrame(rows)], ignore_index=True)

    return fields_values_df


In [None]:
def classify_and_extract_info(file_list, only_process_new_files, current_fields_values_df = None):
    '''
    Uses classify_documents() to determine the layout of each page and then uses extract_info_from_boldface_form()
    and extract_info_from_non_boldface_form() to extract fields from first pages. Returns dataframe with extracted
    fields.

    Args:
        file_list: blob names of the single-page forms to process.
        only_process_new_files: when truthy, rows are appended to current_fields_values_df;
            otherwise extraction starts from an empty table with the full column schema.
        current_fields_values_df: existing table to append to. If it is None the function falls
            back to the empty table even when only_process_new_files is truthy.

    Returns:
        DataFrame holding the starting table plus one row per extracted first page. Files whose
        classification is neither "boldface first page" nor "non-boldface first page" (including
        files the classifier returned nothing for) are skipped.
    '''

    # classify first pages of forms as either boldface or non-boldface so we know which model to use on them
    print('Starting classification')
    classified_files_dict = classify_documents(file_list)
    print('Classification step completed')

    if only_process_new_files and current_fields_values_df is not None:
        # if we only want to process new values, append to the current dataframe
        fields_values_df = current_fields_values_df
    else:
        # otherwise create a new empty dataframe for fields and values
        fields_values_df = pd.DataFrame(columns=[
            "POTENTIAL_DUPLICATE", "FILE_NAME", "FORM_TYPE", "FORM_URL", "MANUAL_REVIEW_DONE", "DECEASED_NAME", "DIN", "NYSID", "FACILITY_NAME", "CODE",
            "REPORT_DATE", "REPORTING_OFFICIAL_NAME", "HEIGHT", "HEIGHT_FT", "HEIGHT_IN", "ETHNICITY", "WEIGHT", "RACE", "SEX",
            "SENTENCE", "BIRTH_DATE", "SENTENCE_DATE", "ARREST_CHARGES", "DATE_ARREST", "DATE_CONVICTION", "CONVICTION_CHARGES",
            "HOSPITAL_NAME", "CHIEF_ADMIN_OFFICER_NAME", "AMBULANCE_RESCUE_SQUAD_NAME", "DATETIME_ADMITTED", "DATE_OF_LAST_ADMISSION",
            "DEATH_DATE", "DEATH_TIME", "TERMINAL_INCIDENT_LOCATION", "REPORTED_IMMEDIATE_CAUSE_OF_DEATH", "FACILITY_ADMINISTRATORS_REPORT_OF_DEATH_CIRCUMSTANCES",
            "STAFF_INCARCERATED_WITNESSES", "OFFICER_SUPERVISING_DEATH_LOCATION", "ASSIGNED_HOUSING_UNIT", "HOUSING_UNIT_TYPE",
            "AUTOPSY_DATE", "AUTOPSY_TIME", "AUTOPSY_DATETIME", "AUTOPSY_LOCATION", "MEDICAL_EXAMINER_CORONER_NAME", "AUTOPSY_PERFORMED_YES", "AUTOPSY_PERFORMED_NO",
            "SUPERVISION_PRIOR_TO_INCIDENT_ACTIVE", "SUPERVISION_PRIOR_TO_INCIDENT_CONSTANT", "SUPERVISION_PRIOR_TO_INCIDENT_GENERAL",
            "SUBSTANCE_ABUSE_DRUG", "SUBSTANCE_ABUSE_ALCOHOL", "SUBSTANCE_ABUSE_UNKNOWN", "SUBSTANCE_ABUSE_NO",
            "MEDICAL_TREATMENT", "PSYCH_TREATMENT", "NO_TREATMENT", "DATE_LAST_CONTACT", "MEDICAL_CONTACT", "PSYCH_CONTACT",
            "INTAKE_SCREENING_YES", "INTAKE_SCREENING_NO", "FIELD_CONFIDENCE_DICT", "FACILITY_CODE_2D", "WEIGHT_LBS_F", "DATE_OF_BIRTH_D",
            "DEATH_TIME_T", "DATE_ARREST_D", "DATE_CONVICTION_D", "SENTENCE_DATE_D", "AUTOPSY_DATE_D", "DATETIME_ADMITTED_DT", "DATE_OF_LAST_ADMISSION_D",
        ])

    # run field extraction models on each form
    for file in file_list:
        # .get(): a file the classifier produced no document for has no entry and is skipped
        doc_type = classified_files_dict.get(file)
        if doc_type == "boldface first page":
            fields_values_df = extract_info_from_boldface_form(file, fields_values_df)
        elif doc_type == "non-boldface first page":
            fields_values_df = extract_info_from_non_boldface_form(file, fields_values_df)

    return fields_values_df


In [None]:
'''
The following functions standardize and clean the data once it has been extracted using the Document Intelligence models.
They are called once prior to the manual review, and once afterwards.
'''

In [None]:
def clean_doccs_facility_name(df):
    '''
    Replaces every FACILITY_NAME value that contains a known DOCCS facility name with that bare name
    (e.g. "ATTICA CORRECTIONAL FACILITY" -> "ATTICA").

    Args:
        df: DataFrame with a string FACILITY_NAME column. Matching is case-sensitive against
            upper-case names. The frame is modified in place.

    Returns:
        The same DataFrame. Values that contain none of the names, and missing values, are left untouched.
    '''
    # Applied in this order, each rule seeing the result of the previous ones.
    facility_names = (
        'ADIRONDACK', 'ALBION', 'ALTONA', 'ARTHUR KILL', 'ATTICA', 'AUBURN', 'BAYVIEW', 'BARE HILL',
        'BEACON', 'BEDFORD HILLS', 'BUFFALO', 'BUTLER', 'CAMP GABRIELS', 'CAMP GEORGETOWN',
        'CAMP PHARSALIA', 'CAPE VINCENT', 'CAYUGA', 'CHATEAUGAY', 'CLINTON', 'COLLINS', 'COXSACKIE',
        'DOWNSTATE', 'EASTERN', 'EDGECOMBE', 'ELMIRA', 'FISHKILL', 'FIVE POINTS', 'FRANKLIN', 'FULTON',
        'GOWANDA', 'GOUVERNEUR', 'GREAT MEADOW', 'GREEN HAVEN', 'GREENE', 'GROVELAND', 'HALE CREEK',
        'HUDSON', 'LAKEVIEW', 'LINCOLN', 'LIVINGSTON', 'LYON MOUNTAIN', 'MARCY', 'MID-ORANGE',
        'MID-STATE', 'MOHAWK', 'MONTEREY', 'MORIAH', 'MOUNT MCGREGOR', 'OGDENSBURG', 'ONEIDA',
        'ORLEANS', 'OTISVILLE', 'QUEENSBORO', 'RIVERVIEW', 'ROCHESTER', 'SHAWANGUNK', 'SING SING',
        'SOUTHPORT', 'SULLIVAN', 'SUMMIT', 'TACONIC', 'ULSTER', 'UPSTATE', 'WALSH', 'WALLKILL',
        'WATERTOWN', 'WASHINGTON', 'WENDE', 'WILLARD', 'WOODBOURNE', 'WYOMING',
    )

    for facility_name in facility_names:
        # literal substring match; na=False keeps missing values out of the mask instead of
        # making the boolean indexing fail
        matches = df['FACILITY_NAME'].str.contains(facility_name, regex=False, na=False)
        df.loc[matches, 'FACILITY_NAME'] = facility_name

    return df

In [None]:
def create_facility_code_2d(df):
    '''
    Fills FACILITY_CODE_2D with the two-character facility code of every row whose FACILITY_NAME
    contains a known facility name.

    Args:
        df: DataFrame with a string FACILITY_NAME column. Matching is case-sensitive against
            upper-case names. The frame is modified in place.

    Returns:
        The same DataFrame. Rows that match no name, and rows with a missing FACILITY_NAME,
        keep whatever FACILITY_CODE_2D value they already had.
    '''
    # (substring to look for, code). Applied in this order, so when a name matches several
    # entries the last matching entry wins.
    facility_codes = (
        ('ADIRONDACK', '23'), ('ALBION', '09'), ('ALTONA', '54'), ('ARTHUR KILL', '15'),
        ('ATTICA', '00'), ('AUBURN', '01'), ('BARE HILL', '56'), ('BAYVIEW', '31'),
        ('BEACON', '34'), ('BEDFORD HILLS', '12'), ('BUFFALO', '88'), ('BUTLER', '52'),
        ('CAMP GABRIELS', '22'), ('CAMP GEORGETOWN', '21'), ('CAMP PHARSALIA', '18'),
        ('CAPE VINCENT', '58'), ('CAYUGA', '55'), ('CHATEAUGAY', '86'), ('CLINTON', '02'),
        ('COLLINS', '47'), ('COXSACKIE', '13'), ('DOWNSTATE', '24'), ('EASTERN', '10'),
        ('EDGECOMBE', '32'), ('ELMIRA', '11'), ('FISHKILL', '05'), ('FIVE POINT', '37'),
        ('FRANKLIN', '53'), ('FULTON', '38'), ('GOWANDA', '45'), ('GOUVERNEUR', '81'),
        ('GREAT MEADOW', '04'), ('GREEN HAVEN', '08'), ('GREENE', '67'), ('GROVELAND', '46'),
        ('HALE CREEK', '85'), ('HUDSON', '27'), ('LAKEVIEW', '60'), ('LINCOLN', '36'),
        ('LIVINGSTON', '80'), ('LYON MOUNTAIN', '59'), ('MARCY', '49'), ('MID-ORANGE', '28'),
        ('MIDSTATE', '48'), ('MID-STATE', '48'),
        ('MOHAWK', '39'),
        ('WALSH', '39'),  # make Walsh have same facility code as Mohawk
        ('MONTEREY', '19'), ('MORIAH', '51'), ('MCGREGOR', '26'), ('OGDENSBURG', '35'),
        ('ONEIDA', '44'), ('ORLEANS', '64'), ('OTISVILLE', '29'), ('QUEENSBORO', '17'),
        ('RIVERVIEW', '57'), ('ROCHESTER', '30'), ('SHAWANGUNK', '68'), ('SING SING', '07'),
        ('SOUTHPORT', '63'), ('SULLIVAN', '69'), ('SUMMIT', '20'), ('TACONIC', '25'),
        ('ULSTER', '61'), ('UPSTATE', '84'), ('WALLKILL', '06'), ('WATERTOWN', '03'),
        ('WASHINGTON', '65'), ('WENDE', '43'), ('WILLARD', '82'), ('WOODBOURNE', '14'),
        ('WYOMING', '66'),
    )

    for name_fragment, facility_code in facility_codes:
        # literal substring match; na=False keeps missing names out of the mask instead of
        # making the boolean indexing fail
        matches = df['FACILITY_NAME'].str.contains(name_fragment, regex=False, na=False)
        df.loc[matches, 'FACILITY_CODE_2D'] = facility_code

    return df

In [None]:
def clean_race_col_create_ethnicity_col(df):
    '''
    Derives ETHNICITY from ethnicity markers written in the RACE field, then standardizes RACE.

    Only rows whose ETHNICITY is an empty string are filled: "NOT HISPANIC" when RACE contains
    "NHP" or "NOT HISP", otherwise "HISPANIC" when RACE contains "HISP". RACE is then mapped to
    BLACK / WHITE / OTHER by substring rules, and a RACE that is exactly "NHP" or "HISPANIC" is blanked.

    Args:
        df: DataFrame with string RACE and ETHNICITY columns (upper-case values). Modified in place.

    Returns:
        The same DataFrame.
    '''
    race_text = df["RACE"]
    ethnicity_blank = df["ETHNICITY"] == ""

    # "NOT HISP..." also contains "HISP", so the negative markers are detected first and excluded
    # from the positive rule; previously such rows were labelled HISPANIC.
    says_not_hispanic = race_text.str.contains("NHP", na=False) | race_text.str.contains("NOT HISP", na=False)
    says_hispanic = race_text.str.contains("HISP", na=False) & ~says_not_hispanic

    df.loc[says_hispanic & ethnicity_blank, "ETHNICITY"] = "HISPANIC"
    df.loc[says_not_hispanic & ethnicity_blank, "ETHNICITY"] = "NOT HISPANIC"

    # NOTE(review): the race rules below are single-letter substring tests applied in sequence, so
    # any value containing "B", "W" or "O" is reclassified (e.g. "NOT HISPANIC" becomes "OTHER").
    # Confirm this is intended for every value that occurs in the data.
    df["RACE"] = df["RACE"].replace("AA", "Black")
    df.loc[df["RACE"].str.contains("B", na=False), "RACE"] = "BLACK"
    df.loc[df["RACE"].str.contains("CAUC", na=False), "RACE"] = "WHITE" # caucasian
    df.loc[df["RACE"].str.contains("W", na=False), "RACE"] = "WHITE"
    df.loc[df["RACE"].str.contains("O", na=False), "RACE"] = "OTHER"
    df["RACE"] = df["RACE"].replace("NHP", "")
    df["RACE"] = df["RACE"].replace("HISPANIC", "")

    return df

In [None]:
def clean_date_time_cols(df, date_cols=None, datetime_cols=None):
    '''
    Standardizes the raw date and datetime columns to text in pandas' default timestamp format.

    Date columns: common extraction glitches are repaired ("//" -> "/", "/022" -> "/2022",
    "-" -> "/"), each value is parsed individually, a midnight time component is removed, and
    years 2030-2099 are moved to 1930-1999. Datetime columns are only parsed.
    Values that cannot be parsed are kept as their original text.

    Args:
        df: DataFrame containing the columns to clean (string values). Modified in place.
        date_cols: names of the date columns; defaults to RAW_DATE_COL_LIST.
        datetime_cols: names of the datetime columns; defaults to RAW_DATETIME_COL_LIST.

    Returns:
        The same DataFrame with the listed columns converted to strings.
    '''
    if date_cols is None:
        date_cols = RAW_DATE_COL_LIST
    if datetime_cols is None:
        datetime_cols = RAW_DATETIME_COL_LIST

    def parse_or_keep(value):
        # explicit replacement for pd.to_datetime(..., errors='ignore'), which pandas has deprecated:
        # return the parsed timestamp, or the untouched input when it is not a recognizable date
        try:
            return pd.to_datetime(value)
        except (ValueError, TypeError, OverflowError):
            return value

    for col in date_cols:
        # clean and standardize date formatting
        df[col] = df[col].str.replace('//', '/', regex=False)
        df[col] = df[col].str.replace('/022', '/2022', regex=False)
        df[col] = df[col].str.replace('-', '/', regex=False)
        df[col] = df[col].apply(parse_or_keep).astype(str)

        # remove time from datetime and ensure dates are in correct century
        df[col] = df[col].str.replace('00:00:00', '', regex=True)
        wrong_century = df[col].str.match('^20([3-9])', na=False)
        df.loc[wrong_century, col] = '19' + df[col].str[2:]

    for col in datetime_cols:
        df[col] = df[col].apply(parse_or_keep).astype(str)

    return df

In [None]:
def clean_remaining_cols(df):
    '''
    Cleans the free-text columns that have no dedicated cleaning function: name, DIN, height,
    weight, sex, cause of death, witnesses, sentence, hospital, housing unit and form URL.

    Args:
        df: DataFrame of extracted forms whose values are upper-case strings (the patterns below
            are written in upper case).

    Returns:
        A cleaned DataFrame. Because of the frame-wide replacements in the housing-unit step it is
        a new object rather than the one passed in.
    '''

    # Name: keep letters, commas and spaces only
    df["DECEASED_NAME"] = df["DECEASED_NAME"].replace('[^a-zA-Z, ]', '', regex=True)

    # DIN: keep letters and digits, drop a leaked "NYSID" label
    df["DIN"] = df["DIN"].replace('[^a-zA-Z0-9]', '', regex=True)
    df["DIN"] = df["DIN"].replace('NYSID', '', regex=True)
    df["DIN"] = df["DIN"].str.upper()
    df["DIN"] = df["DIN"].str.strip()

    # height: the letter O is read as the digit 0, then non-digits are dropped
    df["HEIGHT_FT"] = df["HEIGHT_FT"].str.replace('O', '0', regex=True)
    df["HEIGHT_IN"] = df["HEIGHT_IN"].str.replace('O', '0', regex=True)
    df["HEIGHT_FT"] = df["HEIGHT_FT"].str.replace('[^0-9]', '', regex=True)
    df["HEIGHT_IN"] = df["HEIGHT_IN"].str.replace('[^0-9]', '', regex=True)
    df["HEIGHT_IN"] = df["HEIGHT_IN"].str.replace('17', '7', regex=True)

    df["HEIGHT"] = df["HEIGHT"].str.replace('"""', '"', regex=True)
    df["HEIGHT"] = df["HEIGHT"].str.lstrip('"')
    # non-boldface forms carry feet and inches in separate fields; combine them
    df.loc[df["FORM_TYPE"] == "NON-BOLDFACE", "HEIGHT"] = df["HEIGHT_FT"] + '\' ' + df["HEIGHT_IN"] + '\"'
    # insert a space between the feet mark and the inches; the digit is captured and written back
    # (it used to be discarded, which later reduced a height such as 5'9" to just 5)
    df["HEIGHT"] = df["HEIGHT"].str.replace(r"'([0-9])", r"' \1", regex=True)

    df.loc[df["HEIGHT_FT"] == "0", "HEIGHT_IN"] = ""
    df.loc[df["HEIGHT_FT"] == "0", "HEIGHT_FT"] = ""
    # drop the dangling marks left when the inches value is empty
    df["HEIGHT"] = df["HEIGHT"].str.replace('\' \"', '', regex=True)

    # weight-- in lbs
    df["WEIGHT"] = df["WEIGHT"].str.replace('LBS', '', regex=True)
    # remove the period left behind by "LBS." (this used to be applied to HEIGHT only)
    df["WEIGHT"] = df["WEIGHT"].str.rstrip().str.rstrip('.')
    df["HEIGHT"] = df["HEIGHT"].str.rstrip('.')

    # sex: first letter only; "N" is treated as a misread "M"
    df['SEX'] = df['SEX'].str[:1]
    df['SEX'] = df['SEX'].str.replace("N", "M", regex = False)

    # reported immediate cause of death
    df["REPORTED_IMMEDIATE_CAUSE_OF_DEATH"] = df["REPORTED_IMMEDIATE_CAUSE_OF_DEATH"].str.replace('REPORTED IMMEDIATE CAUSE OF DEATH:', '', regex=True)

    # staff_incarcerated_witnesses
    df["STAFF_INCARCERATED_WITNESSES"] = df["STAFF_INCARCERATED_WITNESSES"].str.replace('DOCCS :', '', regex = True)

    # sentence
    df["SENTENCE"] = df["SENTENCE"].str.replace("YRS", "YEARS", regex = True)
    df["SENTENCE"] = df["SENTENCE"].str.replace(";", ", ", regex = True)

    # hospital name: matched case-insensitively because the text has normally been upper-cased
    # before this function runs, so the mixed-case label "Hospital:" would never be found
    df["HOSPITAL_NAME"] = df["HOSPITAL_NAME"].str.replace('HOSPITAL:', '', case=False, regex=True)

    # housing unit abbreviations (the two replacements below apply to every column of the frame)
    df = df.replace({'REGIONAL MEDICAL UNIT': 'RMU'}, regex=True)
    df = df.replace({'GP': 'GENERAL POPULATION'}, regex=False)
    df.loc[df['HOUSING_UNIT_TYPE'].str.contains('GEN'), 'HOUSING_UNIT_TYPE'] = 'GENERAL POPULATION'
    df["ASSIGNED_HOUSING_UNIT"] = df["ASSIGNED_HOUSING_UNIT"].str.replace(" - ", "-", regex = True)

    # form URL -- make the appropriate phrases lowercase so the PDF will download properly
    # (the dot is escaped so that only a literal ".PDF" is rewritten)
    df["FORM_URL"] = df["FORM_URL"].str.replace(r"\.PDF", ".pdf", regex = True)
    # NOTE(review): only this specific container name is restored to lower case -- confirm it
    # matches the container the forms are actually stored in.
    df["FORM_URL"] = df["FORM_URL"].str.replace("M187-DOCCS-FORMS-BLOB", "m187-doccs-forms-blob", regex = True)

    return df

In [None]:
def flag_duplicates(df):
    '''
    Marks forms that may be duplicates of one another: every row whose DECEASED_NAME also appears
    on another row gets POTENTIAL_DUPLICATE = "TRUE", all other rows get "FALSE".
    During the manual review, users will need to delete all but one form for each duplicate, ideally
    keeping the form with the most information (some duplicate names correspond with different forms).
    '''
    # keep=False flags every member of a group of identical names, not only the repeats
    shares_name = df.duplicated(subset=['DECEASED_NAME'], keep=False)

    # store the flag as upper-case text
    df["POTENTIAL_DUPLICATE"] = shares_name.astype(str).str.upper()
    return df

In [None]:
def clean_df(df):
    '''
    Runs all data cleaning and standardization functions, as well as flags potential duplicates using flag_duplicates().

    Steps: drop rows with no data, convert every value to text, remove placeholder / delimiter
    characters, upper-case everything, run the column-specific cleaners, strip whitespace, flag
    duplicates, and finally blank out null markers and abbreviate checkbox values.

    Args:
        df: DataFrame of extracted (or manually reviewed) form fields.

    Returns:
        A new, cleaned DataFrame in which every value is a string.
    '''

    def apply_to_each_string(frame, func):
        # DataFrame.applymap is deprecated in favor of DataFrame.map; use whichever this pandas has
        elementwise = frame.map if hasattr(frame, "map") else frame.applymap
        return elementwise(lambda s: func(s) if type(s) == str else s)

    # rows must be dropped before the string conversion: afterwards a missing value is text
    # and no row counts as empty any more
    df = df.dropna(how='all')
    # fillna is a no-op when astype(str) has already rendered missing values as text; it guards
    # against pandas versions whose string dtype keeps them missing
    df = df.astype(str).fillna('')
    # only cells that are exactly "nan" are blanked; a substring match also ate the "nan" inside
    # names such as "Fernandez"
    df = df.replace({r'^nan$': ''}, regex=True)
    # the pipe is escaped: as a regex, a bare "|" matches the empty string and removed nothing
    df = df.replace({r'\|': ''}, regex=True)
    df = df.replace({'\r': ''}, regex=True)
    df = df.replace({'DNA': 'DID NOT ANSWER'}, regex=False)

    df = apply_to_each_string(df, str.upper)

    df = clean_doccs_facility_name(df)
    df = clean_race_col_create_ethnicity_col(df)
    df = clean_date_time_cols(df)
    df = clean_remaining_cols(df)

    df = apply_to_each_string(df, str.strip)

    df = flag_duplicates(df)

    # NOTE(review): the replacements below match substrings anywhere in a cell, so free text that
    # happens to contain "NONE" or "SELECTED" is altered as well -- confirm this is acceptable.
    df = df.astype(str)
    df = df.replace({'NaT': ''}, regex=True)
    df = df.replace({'NONE': ''}, regex=True)

    # abbreviate checkbox indicators for ease of checking
    df = df.replace({'UNSELECTED': ''}, regex=True)
    df = df.replace({'SELECTED': 'X'}, regex=True)

    return df

In [None]:
def data_qa_m187s(df):
    '''
    Runs QA on fields after the manual review to check that data is within expected bounds.

    Every check is an assert. The first failing check (or any error raised while evaluating a check) is
    printed and the debugger is opened so the data can be inspected. The dataframe itself is not modified
    and is returned as received.
    '''

    try:
        # date columns
        for col in DATE_COL_LIST:
            assert not (df[col] == '').any(), f'QA failure: {col} should never be missing'
            parsed_dates = pd.to_datetime(df[col])
            months = parsed_dates.dt.month
            # `and`, not `&`: `&` binds tighter than the comparisons and would turn this into a chained comparison
            assert months.min() >= 1 and months.max() <= 12, f'QA Failure: {col} month should always be between 1-12'
            # take the max of the parsed dates; the max of the raw strings is lexicographic, not chronological
            assert parsed_dates.max().date() <= dt.date.today(), f'QA failure: {col} can not be in the future'

        # DIN -- redacted values are exempt from the format checks; each check is evaluated per row and
        # then reduced with .all() so the assert receives a single boolean
        din = df['DIN'].astype(str)
        din_redacted = din == "REDACTED"
        reception_letters = ['A','B','C','D','E','G','H','I','J','N','P','R','S','T','X','Y']
        assert (din_redacted | (din.str.len() == 7)).all(), 'QA Failure: DIN should always be 7 digits and not missing'
        assert (din_redacted | din.str.isalnum()).all(), 'QA Failure: DIN should always be alphanumeric'
        assert (din_redacted | din.str[0:2].str.isnumeric()).all(), 'QA Failure: first two DIN digits should be numeric'
        assert (din_redacted | din.str[2].isin(reception_letters)).all(), 'QA Failure: third DIN digit should be from set of letters used to indicate reception facility'
        assert (din_redacted | din.str[3:].str.isnumeric()).all(), 'QA Failure: last 4 DIN digits should be numeric'

        # sex
        sex_codes = ['M','F','', 'REDACTED']
        assert df['SEX_CODE'].isin(sex_codes).all(), 'QA failure: sex code should be in the predefined set'

        # race
        race_codes = ['BLACK', 'WHITE', 'NATIVE AMERICAN', 'OTHER', 'REDACTED', '']
        assert df['RACE_CODE'].isin(race_codes).all(), 'QA failure: race code should be in the predefined set'

        # ethnicity
        ethnic_group_codes = ['HISPANIC','NOT HISPANIC','UNKNOWN','REDACTED','']
        assert df['ETHNIC_GROUP'].isin(ethnic_group_codes).all(), 'QA failure: ethnic group code should be in the predefined set'

        # weight -- coerce first so blank/non-numeric entries are skipped instead of raising on the comparison
        weights = pd.to_numeric(df['WEIGHT_LBS_F'], errors='coerce')
        assert not (weights < 0).any(), 'QA failure: weight should never be negative'

        # facility code
        facility_2D_codes = ['','23','09','54','15','00','01','56','31','34','12','88','52','22','21','18','58','55','86','02',
                             '47','13','24','10','32','11','05','37','53','38','81','45','04','08','67','46','85','27','60',
                             '36','80','59','49','28','48','39','19','51','26','35','44','64','29','17','57','30','68','07',
                             '63','69','20','25','61','84','06','65','03','43','82','14','66']
        assert df['FACILITY_2D_CODE'].isin(facility_2D_codes).all(), 'QA failure: 2-digit facility code should be in the predefined set'

    except Exception as error:
        # report the failed check and pause in the debugger so the data can be examined interactively
        print("An exception occurred:", error)
        pdb.set_trace()

    return df

In [None]:
def process_raw_forms(only_process_new_files):
    '''
    Entry point for the stage that runs BEFORE the manual review.

    Lists the forms in the blob container, classifies them and extracts their fields, applies the preliminary
    cleaning, marks every row as not yet manually reviewed, and uploads the result to the Azure blob as a CSV.

    only_process_new_files: True to process only M187s that haven't been processed yet; False to drop all
    processed data and reprocess all forms.

    Returns the cleaned dataframe that was uploaded.
    '''
    extracted_table_blob = "M187_DOCCS_extracted_fields_table.csv"

    # list the files currently in the blob container
    form_file_names = read_file_names_from_blob_container()
    print('Files to process:', form_file_names)

    # remove the previous extracted-fields table so a new one can be uploaded below
    # NOTE(review): the old table is deleted before extraction and cleaning run, so a failure in either
    # step leaves the container without a table -- confirm this ordering is intended
    delete_blob_from_blob_container(extracted_table_blob)

    # classify the forms, extract their fields, then clean the extracted table
    cleaned_fields = clean_df(classify_and_extract_info(form_file_names, only_process_new_files))

    # nothing has been manually reviewed at this point
    cleaned_fields["MANUAL_REVIEW_DONE"] = "NO"

    # upload the table to the Azure blob as a CSV
    save_csv_to_blob_container(cleaned_fields, extracted_table_blob)
    print('Processing complete')

    return cleaned_fields

In [None]:
def clean_data_post_processing():
    '''
    The main function for after the manual review step. Performs data QA and final data cleaning before uploading the
    cleaned file to the Azure blob as a CSV.

    Downloads the manually reviewed table, re-runs clean_df(), adds typed companion columns (facility code,
    weight, dates, death time), runs data_qa_m187s(), then replaces the table in the blob container.
    Returns the final dataframe.
    '''
    file = "M187_DOCCS_extracted_fields_table.csv"
    blob_client = BlobClient(account_url=AZURE_BLOB_ACCOUNT_URL, container_name="m187-doccs-forms-blob", blob_name=file, credential=AZURE_BLOB_ACCOUNT_KEY)
    blob_content = blob_client.download_blob().readall().decode('UTF-8')

    # one record per line, fields separated by '|'. Blank lines are skipped: the text after the file's
    # final newline would otherwise become an empty record and fail the "never missing" QA checks
    df = pd.DataFrame([line.split('|') for line in blob_content.split('\n') if line.strip()])

    # make top row header
    df = df.replace({'\r': ''}, regex=True)
    header = df.iloc[0]
    df = df[1:]
    df.columns = header

    # final clean of dataframe
    df = clean_df(df)

    # create cleaned versions of the following: facility code 2d, weight, date and datetime columns
    df = create_facility_code_2d(df)
    # kept numeric (NaN for unparseable weights) until QA has run; blanked just before saving
    df["WEIGHT_LBS_F"] = pd.to_numeric(df["WEIGHT"], errors='coerce')
    df["DATE_OF_BIRTH_D"] = pd.to_datetime(df["BIRTH_DATE"], errors='coerce')
    df["DATE_ARREST_D"] = pd.to_datetime(df["DATE_ARREST"], errors='coerce')
    df["DATE_CONVICTION_D"] = pd.to_datetime(df["DATE_CONVICTION"], errors='coerce')
    df["SENTENCE_DATE_D"] = pd.to_datetime(df["SENTENCE_DATE"], errors='coerce')
    df["AUTOPSY_DATE_D"] = pd.to_datetime(df["AUTOPSY_DATE"], errors='coerce')
    df["DATETIME_ADMITTED_DT"] = pd.to_datetime(df["DATETIME_ADMITTED"], errors='coerce')
    df["DATE_OF_LAST_ADMISSION_D"] = pd.to_datetime(df["DATE_OF_LAST_ADMISSION"], errors='coerce')

    # create cleaned version of death time column: blank out ambiguous time stamps (purely numeric times from
    # 1:00 to 12:59, i.e. no AM/PM indicator); every other value is kept exactly as entered
    # NOTE(review): no conversion to 24 hr format is performed here -- confirm whether one is still wanted
    death_time_digits = pd.to_numeric(df["DEATH_TIME"].str.replace(":", "", regex=False), errors='coerce')
    is_ambiguous = (death_time_digits > 99) & (death_time_digits < 1300)
    df["DEATH_TIME_T"] = df["DEATH_TIME"].mask(is_ambiguous, "")

    # perform quality assurance
    df = data_qa_m187s(df)

    # show missing weights as blanks in the saved table
    df["WEIGHT_LBS_F"] = df["WEIGHT_LBS_F"].replace(np.nan, '')

    # delete the old blob from the container so we can upload the new dataframe
    delete_blob_from_blob_container(file)

    # save cleaned dataframe to container
    save_csv_to_blob_container(df, file)

    return df

In [None]:
# main function execution
def main(process_data, only_process_new_files, run_post_processing):
    '''
    Runs the requested pipeline stages.

    process_data: when true, runs process_raw_forms() (the stage before the manual review).
    only_process_new_files: passed through to process_raw_forms().
    run_post_processing: when true, runs clean_data_post_processing() (the stage after the manual review).

    Returns None; the stages upload their results themselves.
    '''
    if process_data:
        process_raw_forms(only_process_new_files)

    if run_post_processing:
        clean_data_post_processing()

In [None]:
# run settings
'''
First set process_data to True and run_post_manual_review to False. Set only_process_new_files to either True or False depending on
whether to process only new files, or to reprocess all files.

Then, perform the MANUAL REVIEW STEP: Manually clean the output data to correct Document Intelligence errors. Flag redactions and keep only one
duplicate for each set of duplicates.

Finally, set process_data to False and run_post_manual_review to True. This will process the manually reviewed data to standardize the formatting.
'''
process_data = True # read and transform raw data files

only_process_new_files = True # True: only process M187s that haven't been processed yet; False: reprocess all forms

run_post_manual_review = False # run data processing after manual review process has been completed

# the third argument is the post-manual-review switch defined above
main(process_data, only_process_new_files, run_post_manual_review)