In [4]:
# !pip install pdfplumber
# !pip install pytesseract

In [12]:
import difflib
import io
import re

import fitz
import pandas as pd
import pdfplumber
import pytesseract
from PIL import Image

# Read Excel file and extract relevant data
def read_excel(file_path):
    """Flatten the "Demographics" sheet of a workbook into one dictionary.

    The sheet is read without a header row. Every row is scanned left to
    right: the first non-empty cell is taken as a label, the next non-empty
    cell as its value, after which the scan looks for another label in the
    same row. A value whose stripped, upper-cased text is 'PLEASE SPECIFY'
    is discarded together with its label, and a label that never receives a
    value is dropped.

    Pairs are collected per row (keyed by the raw label cell) and then merged
    row by row into the result. While merging, each label is converted to
    ``str`` and trailing ':' characters are removed; a label that shows up
    again overwrites the value stored earlier.

    Parameters
    ----------
    file_path
        Workbook location, handed unchanged to ``pd.read_excel``.

    Returns
    -------
    dict
        Cleaned label strings mapped to the raw cell values.
    """
    sheet = pd.read_excel(file_path, "Demographics", header=None)
    flattened = {}

    for _, cells in sheet.iterrows():
        row_pairs = {}
        pending_label = None

        for position in range(len(cells)):
            cell = cells[position]
            if not pd.notna(cell):
                # Empty cells neither start nor complete a pair.
                continue
            if pending_label is None:
                pending_label = cell
                continue
            # A placeholder answer is ignored, but it still consumes the label.
            if str(cell).strip().upper() != 'PLEASE SPECIFY':
                row_pairs[pending_label] = cell
            pending_label = None

        for label, value in row_pairs.items():
            flattened[str(label).rstrip(':')] = value

    return flattened
# Workbook whose "Demographics" sheet supplies the answers checked further down.
excel_path = r'/home/jellyfish/Simalarity/Pearson, Jack-SDBUDGET-12.1.22.xlsx'
sample_dict = read_excel(excel_path)



  warn(msg)


In [13]:
# OCR PDF to extract text
def ocr_pdf(pdf_path):
    """Run OCR over every image embedded in a PDF and return the combined text.

    Each page's embedded images are extracted, decoded with PIL and passed to
    Tesseract (English). The recognised text of each image is followed by a
    newline. Pages without embedded images contribute nothing, so a document
    with no images yields an empty string.

    Parameters
    ----------
    pdf_path
        Location of the PDF, passed unchanged to ``fitz.open``.

    Returns
    -------
    str
        Concatenated OCR output in page order, then image order.
    """
    chunks = []
    with fitz.open(pdf_path) as doc:
        for page in doc:
            # Ask for the page's image list once; the first element of each
            # entry is the reference handed to extract_image().
            for img in page.get_images(full=True):
                base_image = doc.extract_image(img[0])
                # The context manager releases the decoded image as soon as
                # Tesseract has read it.
                with Image.open(io.BytesIO(base_image["image"])) as image:
                    text = pytesseract.image_to_string(image, lang='eng')
                chunks.append(text + '\n')
    return ''.join(chunks)

# Extract text from PDFs
pdf_path_sap = r"/home/jellyfish/Simalarity/Pearson, Jack SAP 8.29.22.pdf"
pdf_path_lp = r"/home/jellyfish/Simalarity/Pearson, Jack LP Addend 8.29.22.pdf"

# The SAP document goes through OCR; the LP document is read through its
# text layer with pdfplumber.
sap_text = ocr_pdf(pdf_path_sap)

with pdfplumber.open(pdf_path_lp) as pdf:
    # extract_text() can come back as None for a page without text (seen in
    # older pdfplumber releases); substituting '' keeps the join from raising
    # TypeError. Pages are separated by a newline and the outer whitespace is
    # trimmed, as before.
    lp_text = '\n'.join(page.extract_text() or '' for page in pdf.pages).strip()

# Define rules for compliance check
def _lp_rule(*keywords):
    """Build a rule tuple: (keyword variants, should_exist=True, doc_type="LP")."""
    return (keywords, True, "LP")


# Maps the message to report when a rule fails to its rule tuple. A rule is
# satisfied when at least one of its keyword variants is found in the text of
# the document named by the tuple's last element.
rules = {
    "Requesting Start-up Broker Services should be present in section 1 and 4 in LP but is missing": _lp_rule(
        "Start-up Support Brokerage",
        "start-up support brokerage",
    ),
    "Self-Hired Respite should be present in section 1 and 4 in LP but is missing": _lp_rule(
        "Self-Hired Respite",
        "Respite",
        "respite",
    ),
    "Self-Hired Community Habilitation should be present in section 1 and 4 in LP but is missing": _lp_rule(
        "Self-Hired Community Habilitation",
        "ComHab",
        "Comhab",
        "comhab",
        "CH",
        "ch",
    ),
    "Support Brokerage should be present in section 1 and 4 in LP but is missing": _lp_rule(
        "Support Brokerage",
        "Broker",
        "broker",
        "support brokerage",
    ),
    "Self-Hired Supported Employment should be present in section 1 and 4 in LP but is missing": _lp_rule(
        "Supported Employment (SEMP)",
        "Supported Employment",
        "Supported Employment",
        "supported employment",
        "SEMP",
    ),
    "Individual Directed Goods and Services should be present in section 1 and 4 in LP but is missing": _lp_rule(
        "Individual Directed Goods and Services(IDGS)",
        "individual directed goods and services",
        "Individual directed goods and services",
        "individual directed goods and services(IDGS)",
        "IDGS",
        "Individual Directed Goods and Services(IDGS)",
        "idgs",
        "(IDGS)",
    ),
    "Live-In Caregiver should be present in section 1 and 4 in LP but is missing ": _lp_rule(
        "Live-In Care giver",
        "Live-In Caregiver (LIC)",
        "live-in caregiver(LIC)",
        "LIC",
        "(LIC)",
        "Live-in Caregiver",
    ),
    "Housing Subsidy should be present in section 1 and 5 in LP but is missing": _lp_rule(
        "Housing Subsidy",
        "housing subsidy",
        "Housing",
        "housing",
    ),
    "Family Reimbursed Respite should be present in section 1 and 5 in LP but is missing": _lp_rule(
        "Family Reimbursed Respite",
        "Family Reimbursed Respite(FRR)",
        "Family Reimbursed Respite (FRR)",
        "family reimbursed respite",
        "FRR",
    ),
    "Other Than Personal Service Items should be present in section 1 and 5 in LP but is missing": _lp_rule(
        "Other Than Personal Service Items",
        "other than personal service items",
        "other than personal service items(OTPS)",
        "Other Than Personal Service Items(OTPS)",
        "OTPS",
        "otps",
        "(OTPS)",
        "(otps)",
        "Other Than Personal Service Items (OTPS)",
    ),
    "Family Support Services should be present in section 1 and 5 in LP but is missing ": _lp_rule(
        "Family Support Services",
        "family support services",
        "Family Support Services(FSS)",
        "Family Support Services (FSS)",
        "family support services(FSS)",
        "family support services (FSS)",
    ),
}

# Check compliance of rules
results = {}

# def check_rule_compliance(rule, document_text):
#     keyword, should_exist, doc_type = rule
#     exists = keyword in document_text
#     return exists == should_exist
def _keyword_in_text(keyword, document_text):
    """Return True when `keyword` occurs in `document_text` as a whole token."""
    # A boundary is only demanded on a side where the keyword itself starts or
    # ends with a word character, so variants such as "(IDGS)" still match
    # when glued to the preceding word ("Services(IDGS)").
    lead = r"(?<!\w)" if re.match(r"\w", keyword) else ""
    trail = r"(?!\w)" if re.search(r"\w\Z", keyword) else ""
    return re.search(lead + re.escape(keyword) + trail, document_text) is not None


def check_rule_compliance(rule, document_text):
    """Tell whether `document_text` satisfies a single rule.

    Parameters
    ----------
    rule
        A 3-tuple ``(keyword, should_exist, doc_type)``. ``keyword`` is either
        one string or a tuple of alternative strings; ``doc_type`` is not used
        here.
    document_text
        Text to search.

    Returns
    -------
    bool
        True when the presence of any keyword variant equals ``should_exist``.

    Notes
    -----
    Matching is case-sensitive and requires the keyword to stand on its own:
    it must not be directly preceded or followed by a letter, digit or
    underscore. Plain substring matching let short variants such as "ch" or
    "LIC" match inside unrelated words ("each", "PUBLIC"), which made the
    corresponding rules pass on almost any text.
    """
    keyword, should_exist, _doc_type = rule
    variants = keyword if isinstance(keyword, tuple) else (keyword,)
    exists = any(_keyword_in_text(variant, document_text) for variant in variants)
    return exists == should_exist


# Evaluate each rule against the text of the document it targets ("LP" rules
# use the LP text, anything else the SAP text) and store the outcome as the
# string "True" or "False" under the rule's description.
for rule_description, rule_details in rules.items():
    if rule_details[2] == "LP":
        doc_text = lp_text
    else:
        doc_text = sap_text

    compliance = check_rule_compliance(rule_details, doc_text)

    if compliance:
        results[rule_description] = "True"
    else:
        results[rule_description] = "False"

# for rule, status in results.items():
#     if status == "False":
#         print(f"{rule}")



# (answer key in sample_dict, phrase that identifies the matching rule description)
# NOTE(review): the "Family Reimbursed Respite" rule has no entry here, so a
# failure of that rule is never printed — confirm whether that is intended.
SERVICE_RULE_PHRASES = (
    ('Requesting Start-up Broker Services?', 'Requesting Start-up Broker Services'),
    ('Self-Hired Respite', 'Self-Hired Respite'),
    ('Self-Hired Community Habilitation', 'Self-Hired Community Habilitation'),
    ('Support Brokerage', 'Support Brokerage'),
    ('Self-Hired Supported Employment (SEMP)', 'Self-Hired Supported Employment'),
    ('Individual Directed Goods & Services', 'Individual Directed Goods and Services'),
    ('Live-In Caregiver', 'Live-In Caregiver'),
    ('Housing Subsidy', 'Housing Subsidy'),
    ('Other Than Personal Service Items', 'Other Than Personal Service Items'),
    ('Family Support Services and Assistive Supports', 'Family Support Services'),
)

# Phrases of the services answered "yes". A missing answer or a non-string
# answer (numbers do occur as values) counts as "not requested" instead of
# raising KeyError / AttributeError and aborting the whole report.
requested_phrases = [
    phrase.lower()
    for answer_key, phrase in SERVICE_RULE_PHRASES
    if str(sample_dict.get(answer_key, '')).strip().lower() == 'yes'
]

# Print each failed rule once, and only when the service it concerns was requested.
for rule, status in results.items():
    if status == "False" and any(phrase in rule.lower() for phrase in requested_phrases):
        print(rule)



In [9]:
# Hard-coded label -> value mapping bound to the name `sample_dict`.
# Running this cell replaces whatever `sample_dict` held before, including the
# value returned by read_excel() earlier in this file.
# NOTE(review): this cell carries execution count 9 while the cells above carry
# 12 and 13, so it was executed before them — presumably a pasted snapshot of an
# earlier read_excel() result kept for reference; confirm before using
# "Run All", because in file order it overwrites the freshly parsed data.
# NOTE(review): several pairs look like unrelated neighbouring sheet cells
# (e.g. 'Female': 'Allegany') rather than question/answer pairs — verify
# against the workbook.
# NOTE(review): the entries include names, e-mail addresses and phone numbers;
# confirm they are synthetic before sharing or committing this notebook.
sample_dict= { 
 'Development Plan Project Number': '022713623',
 "Individual's Information": 'Version:',
 'Last Name': 'Pearson',
 'First Name': 'Jack',
 'Middle Initial': 'B',
 'Sex': 'Male',
 'Street Address (Line 1)': '123 Main Street',
 'City': 'Commack',
 'State': 'New York',
 'Zip Code': 11725,
 'County': 'Suffolk',
 'Phone Number': 5161231234,
 'E-mail': 'jackpearson@yahoo.com',
 'Medicaid ID Number': 'AB12345C',
 'TABS ID Number': '123456',

 'Marital Status': 'Single',
 'Start-Up Broker Services': 'SNF',
 '0': 0,
 'Requesting Start-up Broker Services?': 'No',
 'Broker Authorization Number': '20952',
 'Both Specialized': 0,
 'Both Highly Complex': 0,
 'Day Specialized': 0,
 'DDP Adaptive': 198.37,
 'DDP Health': 7,
 'DDP Behavior': 198,
 '0.0': 'Self-Hired Supported Employment',
 'Day Highly Complex': 0,
 'Is the individual: 1) less than 22 years old and 2) residing with parents or legal guardian and 3) receiving services through the school district?     Select "Yes" ONLY if all three criteria are met.': 'No',
 'Brokerage': 'Residential Specialized',
 'Individual Directed Goods & Services': 'Yes',
 'PRA Residential': 0,
 'PRA OTR': 0,
 'Both PRA': 129818,
 'Live-In Caregiver': 'Yes',
 'Please Specify': 'Columbia',
 'Initial Plan or Amendment?': 'Amended Budget',
 'Template Funding Approved?': 'No',
 'N': 'N',
 'Brooklyn': 'Capital District, Taconic & Hudson Valley',
 'DDRO': 'Long Island',
 'Regional Office': ' Long Island',
 'Agency Supported Self-Directed Respite': 1,
 'Female': 'Allegany',
 'Broome': 'Central New York, Broome & Sunmount',
 '2.0': 'Yes',
 'DDRO Contact': 'Piedad Ruiz, MSW',
 'DDRO Contact Phone': 6314163899,
 'Agency Supported Self-Directed SEMP': 'No',
 'Male': 'Bronx',
 'Capital District': 'Long Island',
 '5.0': 'No',
 'DDRO Contact E-mail': 'piedad.d.ruizgarcia@opwdd.ny.gov',
 'What type of budget is described in this document?': 'Both',
 'Housing Subsidy': 'Yes',
 'X': 'Broome',
 'Central': 'Metro, Brooklyn, Staten Island & Bernard Fineson',
 'CSS Participant Prior to 10/01/14': 'Yes',
 'Finger Lakes': 'Western New York & Finger Lakes',
 'Family Reimbursed Respite': 'Specialized',
 'Highly Complex': 'Married',
 'Chautauqua': 'Long Island',
 'Provide a response for each service. ': 'Other Than Personal Services ',
 'SNF': 'Single',
 'Chemung': 'Metro Bronx',
 'Medicaid Funded Self-Directed Services': 'Agency Supported Self-Directed Services',
 'Chenango': 'Metro Manhattan',
 'Self-Hired Community Habilitation': 'Yes',
 'Community Habilitation': 'No',
 "Total of NA's": 2.0,
 'Direct Provider Purchased Day Habilitation 1': 'Initial Budget',
 'Clinton': 'Staten Island',
 'Self-Hired Respite': 'No',
 'Respite': 'Yes',
 'Direct Provider Purchased Day Habilitation 2': 'Amended Budget',
 'Self-Hired Supported Employment (SEMP)': 'Yes',
 'Supported Employment (SEMP) ': 'No',
 'Direct Provider Purchased Pathway to Employment': 'Residential Only',
 'Cortland': 'Taconic',
 'Employment Outcome Payment Phase 1': 'Monroe',
 'Outcome Payment Phase 1': 'Yes',
 'Direct Provider Purchased Pre-Vocational Services': 'Other Than Residential Only',
 'Delaware ': 'Western',
 'Extended Employment Supports Phase 2': 'Montgomery',
 'Direct Provider Purchased Respite': 'Both',
 'Support Brokerage': 'Yes',
 'Direct Provider Purchased SEMP': 'Franklin',
 '100% State Funded Self Directed Services': 'Direct Provider Purchased Respite',
 'Number of Live-In Caregivers': 'Family Reimbursed Respite',
 'No': 'Fulton',
 'Other Than Personal Service Items': 'Yes',
 'Direct Provider Purchased Services': 'Greene',
 'Contracted Services:  State Paid Contracted Services including': 'Hamilton ',
 'Day Habilitation': 'No',
 'Family Support Services and Assistive Supports': 'No',
 'Pathway to Employment': 'No',
 'Pre-Vocational Services': 'No',
 'Supported Employment (SEMP)': 'No',
 'Fiscal Intermediary   ': 'New York City',
 'Agency Name': 'Resource Center for Independent Living (RCIL)',
 'Contact: Name': 'Amanda Tuthill',
 'Address': '456 Church Street',
 'City/State/Zip': 'Central Islip, NY 11722',
 'Contact Email': 'mrivas@gmail.com',
 'Contact Phone': 6319999999,
 'Corp ID': '20400',
 'Broker   ': 'Oswego',
 'Broker Name': 'Gloria Ollerenshaw',
 'Agency Affiliation': 'Care Design NY LLC CCO',
 'Care Manager': 'Rockland ',
 'Name': 'Miguel Rivas',
 'CSS Monthly Summary Note': 'Suffolk',
 'Employee Time Sheets/Daily Service Records': 'Sullivan ',
 'Invoices/Service Records for Contracted/Vendor Services': 'Tioga ',
 'Individualized Services Plan & Budget Reviews & Amendments': 'Tompkins',
 'Mileage Logs': 'Ulster ',
 'Other': 'Warren',
 'To Be Completed By Central Office': 'Washington',
}