In [30]:
import requests
import pytesseract
from PIL import Image
from io import BytesIO
from PyPDF2 import PdfReader

# Function to extract text from an image using OCR
def extract_text_from_image(image):
    """Run Tesseract OCR on ``image`` and return the recognised text.

    The image is handed directly to ``pytesseract.image_to_string`` with the
    English (``'eng'``) language model selected; whatever that call returns
    is passed back to the caller unchanged.
    """
    recognised_text = pytesseract.image_to_string(image, lang='eng')
    return recognised_text

# Function to extract text from a readable PDF
def extract_text_from_pdf(pdf_path):
    """Return the text layer of every page of the PDF stored at ``pdf_path``.

    Args:
        pdf_path: Filesystem path of the PDF to read; opened in binary mode.

    Returns:
        A single string with the text of each page, in page order, joined
        by newline characters.

    Raises:
        OSError: If the file cannot be opened.
        Any exception raised by ``PdfReader`` or ``page.extract_text()``
        propagates to the caller.
    """
    # The context manager releases the file handle even when PdfReader or
    # extract_text() raises (e.g. on a malformed document); the previous
    # explicit close() was skipped on that path.
    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PdfReader(pdf_file)
        # ``or ''`` keeps '\n'.join() from failing should a page ever yield
        # None instead of a string.
        text_data = [page.extract_text() or '' for page in pdf_reader.pages]

    return '\n'.join(text_data)

def extract_financial_data(text):
    """Pull the value that follows each known financial label out of ``text``.

    For every label the *first* case-sensitive occurrence in ``text`` is
    located with ``str.find``. One separator character after the label is
    skipped and everything up to the end of that line (or the end of the
    text when there is no further newline) is taken, stripped of
    surrounding whitespace, as the value. Because one character is always
    skipped, a label that is immediately followed by a newline yields the
    content of the *next* line.

    Args:
        text: The full text extracted from a financial statement.

    Returns:
        A dict with one entry per label, in the order listed below. The
        value is the extracted string (possibly empty) or ``None`` when the
        label does not occur in ``text``.
    """
    # Labels to search for. These strings are matched verbatim against the
    # document and are also the keys of the returned dict, so their exact
    # spelling is part of the function's contract.
    # NOTE(review): 'Related Part Loans' and 'Indigenous and Norther Affairs
    # Canada (ISC)' look like typos; correcting the former would collide
    # with the existing 'Related Party Loans' key - confirm before changing.
    labels = (
        'cash',
        'Other revenue',
        'Cash and Equivalents',
        'Restricted Cash',
        'Accounts Receivables (AR)',
        'Inventory',
        'Long-term Investments',
        'Investment in GBEs',
        'Related Party Loans',
        'Investment in Trust Funds',
        'Other Assets',
        'Total Financial Assets',
        'Bank Indebtedness',
        'Accounts Payable',
        'Related Part Loans',
        'Deferred Revenue',
        'Short-term Debt',
        'Long-term Debt',
        'Other Liabilities',
        'Total Financial Liabilities',
        'Net Financial Assets',
        'Total Net Financial Assets',
        'Non-Financial Assets',
        'Tangible Capital Assets',
        'Prepaid Expenses',
        'Other non-Financial Assets',
        'Total non-Financial Assets',
        'Accumulated Surplus',
        'Income Statement',
        'Revenue',
        'Indigenous and Norther Affairs Canada (ISC)',
        'First Nations Health Authority',
        'Canadian Mortgage and Housing Corporation (CMHC)',
        'Health Canada',
        'Other Federal Transfer Revenue',
        'Provincial Transfer Revenue',
        'Tribal Government Transfer Revenue',
        'Other First Nation Entity Transfer Revenue',
        'Other Grants',
        'Royalties',
        'Rent/Lease Revenue',
        'Interest',
        'Property Tax',
        'Sales Tax',
        'Net GBE Income',
        'Gross Business Sales (non-GBE)',
        'User Fees',
        'Trust Fund',
        'Other Revenue',
        'Total Revenue',
        'Expenses',
        'Administration General',
        'Education',
        'Social Services and Development',
        'Capital and Maintenance',
        'Community Services',
        'Housing',
        'Land Resource Management',
        'Economic Development',
        'Business Expenses (Businesses directly controlled by the Nation)',
        'Health',
        'Combine Departments',
        'Other Expenses',
        'Total Expenses',
        'Surplus/Deficit',
        'Surplus in year',
        'Cash Flows',
        'Net Operating Cash Flows',
        'Gross Investment Cash Inflows',
        'Gross Investment Cash Outflows',
        'Net Investment Cash Flows',
        'Gross capital Cash inflows',
        'Net capital cash flows',
        'Gross financial cash inflows',
        'Gross financial cash outflows',
        'Net financial cash flows',
        'Total change in cash during year',
        'Cash balance beginning of year',
        'Cash balance end of year',
        'GBE Financial Information',
        'GBE Assets',
        'GBE Liabilities',
        'GBE Equity',
        'GBE Revenues',
        'GBE Expenses',
        'GBE Net Income',
    )

    # Every label starts out as "not found".
    keywords = dict.fromkeys(labels)

    # Plain substring search over the already-extracted text.
    for keyword in labels:
        index = text.find(keyword)
        if index == -1:
            continue

        # Skip the label plus the single separator character after it.
        start_index = index + len(keyword) + 1
        end_index = text.find('\n', start_index)
        if end_index == -1:
            # No newline after the label: the value runs to the end of the
            # text. Using -1 directly as a slice bound would silently drop
            # the final character.
            end_index = len(text)

        keywords[keyword] = text[start_index:end_index].strip()

    return keywords

def fetch_financial_data(band_number, fiscal_year, timeout=30):
    """Download a band's audited financial statements and extract figures.

    Issues a GET request to the First Nation Profiles document endpoint for
    the given band and fiscal year, turns the response body into text and
    runs ``extract_financial_data`` over it.

    Args:
        band_number: Band identifier; converted with ``str()`` for the query.
        fiscal_year: Fiscal year string sent as the ``FY`` query parameter
            (the interactive prompt asks for the form ``yyyy-yyyy``).
        timeout: Seconds to wait for the server before ``requests`` gives
            up. Without a timeout the call could block indefinitely.

    Returns:
        The dict produced by ``extract_financial_data``, or ``None`` when
        the server answers with a status other than 200.

    Side effects:
        For PDF responses the body is written to ``financial_statements.pdf``
        in the current working directory (overwriting any existing file).
        On a non-200 status an error message is printed.
    """
    base_url = "https://fnp-ppn.aadnc-aandc.gc.ca/fnp/Main/Search/DisplayBinaryData.aspx"
    query_params = {
        "BAND_NUMBER_FF": str(band_number),
        "FY": fiscal_year,
        "DOC": "Audited consolidated financial statements",
        "lang": "eng"
    }

    response = requests.get(base_url, params=query_params, timeout=timeout)
    if response.status_code != 200:
        print("Error: Unable to fetch the financial data.")
        return None

    # The header may be absent; fall back to '' so the check below cannot
    # raise AttributeError on None. Media types are case-insensitive.
    content_type = response.headers.get('content-type') or ''
    if content_type.lower().startswith('application/pdf'):  # Readable PDF
        pdf_path = "financial_statements.pdf"
        with open(pdf_path, 'wb') as file:
            file.write(response.content)
        extracted_text = extract_text_from_pdf(pdf_path)
    else:
        # Anything else is treated as an image and sent through OCR.
        # NOTE(review): Pillow is not expected to decode PDF bytes, so a
        # scanned PDF served under a non-PDF content type would likely fail
        # in Image.open - confirm what the server actually returns here.
        pdf_image = Image.open(BytesIO(response.content))
        extracted_text = extract_text_from_image(pdf_image)

    return extract_financial_data(extracted_text)

# Example usage
# Only prompt and fetch when executed directly (script or notebook cell);
# importing this module for its functions must not block on input().
if __name__ == "__main__":
    band_number = input("Enter Band Number: ")
    fiscal_year = input("Enter Fiscal Year (yyyy-yyyy): ")

    financial_data = fetch_financial_data(band_number, fiscal_year)

    # fetch_financial_data returns None when the download failed; it has
    # already printed an error in that case.
    if financial_data is not None:
        print("Financial Data:")
        for key, value in financial_data.items():
            print(f"{key}: {value}")

Enter Band Number: 452
Enter Fiscal Year (yyyy-yyyy): 2014-2015
Financial Data:
cash: flows for the year then ended,
Other revenue: 1,678,680 1,722,614 1,667,459
Cash and Equivalents: None
Restricted Cash: None
Accounts Receivables (AR): None
Inventory: None
Long-term Investments: None
Investment in GBEs: None
Related Party Loans: None
Investment in Trust Funds: None
Other Assets: None
Total Financial Assets: None
Bank Indebtedness: None
Accounts Payable: None
Related Part Loans: None
Deferred Revenue: 
Short-term Debt: None
Long-term Debt: None
Other Liabilities: None
Total Financial Liabilities: None
Net Financial Assets: 
Total Net Financial Assets: None
Non-Financial Assets: None
Tangible Capital Assets: 
Prepaid Expenses: None
Other non-Financial Assets: None
Total non-Financial Assets: None
Accumulated Surplus: 
Income Statement: None
Revenue: 
Indigenous and Norther Affairs Canada (ISC): None
First Nations Health Authority: None
Canadian Mortgage and Housing Corporation (CMHC): 