<a href="https://colab.research.google.com/github/Yashbarod/PDFtoXML/blob/main/IY_PDF_XML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#@title IRONYASH BANK PDF to TALLY XML <br> <font size="2">Made by IronYash</font> { display-mode: "form" }
#@markdown <center><h2><font color=orange><b>⚡ IronYash | Generate Tally XML from Bank PDF 🔥</b></font></h2></center><br>

import os
bank_name = "STATE BANK OF INDIA - 0984 SA" #@param {type:"string"}
suspense_name = "Suspense" #@param {type:"string"}

# @markdown > <i>`by Suyash with ❤️`</i>

!pip install pdfplumber ipywidgets

import xml.etree.ElementTree as ET
import pdfplumber
from google.colab import files
import ipywidgets as widgets
from IPython.display import display,  clear_output
import re # Importing the 're' module for regex operations

clear_output()

# ASCII-art splash shown once after the setup output has been cleared.
# Kept as a raw string so the backslashes in the art are printed literally
# instead of being interpreted as escape sequences.
banner = r"""


 ___  ________  ________  ________       ___    ___ ________  ________  ___  ___
|\  \|\   __  \|\   __  \|\   ___  \    |\  \  /  /|\   __  \|\   ____\|\  \|\  \
\ \  \ \  \|\  \ \  \|\  \ \  \\ \  \   \ \  \/  / | \  \|\  \ \  \___|\ \  \\\  \
 \ \  \ \   _  _\ \  \\\  \ \  \\ \  \   \ \    / / \ \   __  \ \_____  \ \   __  \
  \ \  \ \  \\  \\ \  \\\  \ \  \\ \  \   \/  /  /   \ \  \ \  \|____|\  \ \  \ \  \
   \ \__\ \__\\ _\\ \_______\ \__\\ \__\__/  / /      \ \__\ \__\____\_\  \ \__\ \__\
    \|__|\|__|\|__|\|_______|\|__| \|__|\___/ /        \|__|\|__|\_________\|__|\|__|
                                       \|___|/                  \|_________|



     Created by IronYash | Google Colab Tally XML Utility
"""

print(banner)

# Column titles that must all appear on a line for it to count as the
# transaction-table header.
_HEADER_COLUMNS = ("Txn Date", "Description", "Debit", "Credit")

# One statement row: "<d Mon yyyy> <description> [amount] [amount]".
# The amount pattern accepts both western grouping (1,000,000.00) and Indian
# lakh/crore grouping (10,00,000.00) by allowing 2- or 3-digit comma groups.
_AMOUNT = r'\d{1,3}(?:,\d{2,3})*\.?\d*'
_TXN_LINE_RE = re.compile(
    r'^(\d{1,2} \w{3} \d{4})\s+(.+?)\s+(' + _AMOUNT + r')?\s+(' + _AMOUNT + r')?'
)


def _parse_amount(text):
    """Convert a comma-grouped amount string to float; missing text is 0.0."""
    return float(text.replace(",", "")) if text else 0.0


def extract_pdf_data(pdf_paths):
    """Extract transaction rows from the text layer of bank-statement PDFs.

    For each PDF, lines are ignored until a header line containing
    "Txn Date", "Description", "Debit" and "Credit" is seen. After that,
    every line starting with a date such as ``5 Jan 2024`` opens a new
    transaction, and any other line is appended to the description of the
    transaction currently being built (multi-line narrations).

    Args:
        pdf_paths: Iterable of paths to PDF files.

    Returns:
        A list of dicts with keys ``'date'`` (str, as printed in the PDF),
        ``'description'`` (str), ``'debit'`` (float) and ``'credit'`` (float).
        A PDF that fails to open or parse is reported via ``print`` and
        skipped; rows already collected from it are kept.
    """
    # NOTE(review): the first numeric token after the description is always
    # read as the debit and the second as the credit. If the statement has a
    # blank debit column followed by credit and balance, a receipt row may be
    # read as debit + credit -- confirm against the actual PDF layout.
    # NOTE(review): if a row carries a second date (e.g. a value-date
    # column), the lazy description match can read its year as an amount --
    # confirm against the actual PDF layout.
    transactions = []
    for pdf_path in pdf_paths:
        try:
            with pdfplumber.open(pdf_path) as pdf:
                capturing = False  # becomes True once the table header is seen
                pending = None     # transaction still collecting description lines

                for page in pdf.pages:
                    text = page.extract_text()
                    if not text:
                        continue

                    for line in text.split('\n'):
                        # The header line itself carries no data; it only
                        # switches capturing on (and is skipped when repeated).
                        if all(column in line for column in _HEADER_COLUMNS):
                            capturing = True
                            continue
                        if not capturing:
                            continue

                        match = _TXN_LINE_RE.match(line)
                        if match:
                            date, description, debit, credit = match.groups()
                            new_transaction = {
                                'date': date,
                                'description': description.strip(),
                                'debit': _parse_amount(debit),
                                'credit': _parse_amount(credit)
                            }
                            # A new dated row closes the previous transaction.
                            if pending:
                                transactions.append(pending)
                            pending = new_transaction
                        elif pending:
                            # Continuation line of a multi-line description.
                            pending['description'] += " " + line.strip()

                # Flush the last transaction of this PDF.
                if pending:
                    transactions.append(pending)

        except Exception as e:
            # Best-effort: report the failing file and carry on with the rest.
            print(f"Error processing {pdf_path}: {e}")

    return transactions

def _to_tally_date(date_text):
    """Convert a statement date like '5 Jan 2024' to Tally's YYYYMMDD form.

    Text that does not parse as day / abbreviated month / year is returned
    unchanged so that the voucher is still written.
    """
    from datetime import datetime

    try:
        return datetime.strptime(date_text.strip(), "%d %b %Y").strftime("%Y%m%d")
    except (ValueError, AttributeError):
        return date_text


def _add_ledger_entry(voucher, ledger_name, is_deemed_positive, amount_text):
    """Append one ALLLEDGERENTRIES.LIST element to the given voucher."""
    entry = ET.SubElement(voucher, "ALLLEDGERENTRIES.LIST")
    ET.SubElement(entry, "LEDGERNAME").text = ledger_name
    ET.SubElement(entry, "ISDEEMEDPOSITIVE").text = is_deemed_positive
    ET.SubElement(entry, "AMOUNT").text = amount_text


def create_tally_xml(transactions, bank_name, suspense_name):
    """Write the transactions as a Tally import file named tally_import.xml.

    A transaction with ``debit > 0`` becomes a Payment voucher; otherwise one
    with ``credit > 0`` becomes a Receipt voucher. Each voucher gets two
    ledger lines, one for the bank ledger and one for the suspense ledger.
    Transactions whose debit and credit are both zero are skipped, because
    they would produce a voucher with no ledger lines.

    Args:
        transactions: Iterable of dicts with keys 'date', 'description',
            'debit' and 'credit' (as produced by the PDF extraction step).
        bank_name: Ledger name used for the bank side of every voucher.
        suspense_name: Ledger name used for the contra side of every voucher.

    Returns:
        None. The file is written to the current working directory; any
        error is reported via ``print`` rather than raised.
    """
    try:
        envelope = ET.Element("ENVELOPE")
        header = ET.SubElement(envelope, "HEADER")
        ET.SubElement(header, "TALLYREQUEST").text = "Import Data"

        body = ET.SubElement(envelope, "BODY")
        import_data = ET.SubElement(body, "IMPORTDATA")
        request_data = ET.SubElement(import_data, "REQUESTDATA")

        for transaction in transactions:
            debit = transaction['debit']
            credit = transaction['credit']

            # The first ledger line is written with ISDEEMEDPOSITIVE=Yes and a
            # negative amount (Tally's debit form); the second with
            # ISDEEMEDPOSITIVE=No and a positive amount (credit form).
            if debit > 0:      # money out of the bank: debit suspense, credit bank
                voucher_type, amount = "Payment", debit
                first_ledger, second_ledger = suspense_name, bank_name
            elif credit > 0:   # money into the bank: debit bank, credit suspense
                voucher_type, amount = "Receipt", credit
                first_ledger, second_ledger = bank_name, suspense_name
            else:
                # Nothing to post; an empty voucher would not be importable.
                continue

            tally_message = ET.SubElement(request_data, "TALLYMESSAGE")
            voucher = ET.SubElement(tally_message, "VOUCHER", attrib={
                "VCHTYPE": voucher_type,
                "ACTION": "Create"
            })

            # Tally expects voucher dates as YYYYMMDD, not the printed form.
            ET.SubElement(voucher, "DATE").text = _to_tally_date(transaction['date'])
            ET.SubElement(voucher, "VOUCHERTYPENAME").text = voucher_type
            ET.SubElement(voucher, "NARRATION").text = transaction['description']

            _add_ledger_entry(voucher, first_ledger, "Yes", "-" + str(amount))
            _add_ledger_entry(voucher, second_ledger, "No", str(amount))

        tree = ET.ElementTree(envelope)
        tree.write("tally_import.xml", xml_declaration=True, encoding='utf-8')
        print("Tally XML file created successfully.")
    except Exception as e:
        print(f"Error creating XML: {e}")


# Upload PDF(s). files.upload() stores each chosen file in the working
# directory and returns a dict keyed by file name; it is empty when the
# upload dialog is cancelled.
print("Please upload your bank statement PDF file.")
uploaded = files.upload()
pdf_paths = list(uploaded.keys())
output_path = "tally_import.xml"

# Remove any XML left over from a previous run so that a failed run can never
# hand the user a stale file.
if os.path.exists(output_path):
    os.remove(output_path)

if not pdf_paths:
    print("No file was uploaded. Run the cell again and choose a PDF.")
else:
    # Process every uploaded statement, not just the first one.
    transactions = extract_pdf_data(pdf_paths)

    if transactions:
        create_tally_xml(transactions, bank_name, suspense_name)
    else:
        print("No transactions found in the PDF. Check the PDF's format and content.") # Added a message for better clarity

# Download XML only if this run actually produced it.
if os.path.exists(output_path):
    print("Downloading the generated XML file...")
    files.download(output_path)