In [1]:
import re
import pdfplumber

# Function to extract fields based on specific patterns
def extract_itr_fields(text):
    fields = {}
    
    # Define patterns for each required field
    patterns = {
        'ITR Assessment': r'ITR Assessment\s*=\s*(.*)',
        'Gross Salary': r'Gross Salary\s*=\s*([\d,\,\.]+)',
        'Net Salary': r'Net Salary\s*=\s*([\d,\,\.]+)',
        'Income from Other Sources': r'Income from Other Sources\s*=\s*([\d,\,\.]+)',
        'Income chargeable under Salaries': r'Income chargeable under Salaries\s*=\s*([\d,\,\.]+)',
        'Total Tax Deducted': r'Total Tax Deducted\s*=\s*([\d,\,\.]+)'
    }
    
    # Loop through the patterns and search in the text
    for field, pattern in patterns.items():
        match = re.search(pattern, text)
        if match:
            fields[field] = match.group(1)
        else:
            fields[field] = 'Not Found'
    
    return fields

# Function to extract text from the PDF
def extract_text_from_pdf(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        text = ''
        for page in pdf.pages:
            text += page.extract_text()
    return text

# Path to the PDF file
pdf_path = 'sampleitr.pdf'  # Replace with the actual PDF file path

# Extract text from the PDF
pdf_text = extract_text_from_pdf(pdf_path)

# Call the function to extract the fields from the extracted PDF text
extracted_fields = extract_itr_fields(pdf_text)

# Print the extracted fields
# for field, value in extracted_fields.items():
#     print(f"{field}: {value}")


In [2]:
print(pdf_text)

Acknowledgement Number : 324920610310722 Date of Filing : 31-Jul-2022
MROF
INDIAN INCOME TAX RETURN
[For individuals being a resident (other than not ordinarily resident)
having total income upto Rs.50 lakh, having Income from Salaries, one
house property, other sources (Interest etc.), and agricultural income upto Assessment
ITR-1
Year
Rs.5 thousand]
SAHAJ
[Not for an individual who is either Director in a company or has 2022-23
invested in unlisted equity shares or in cases where TDS has
been deducted u/s 194N or if income-tax is deferred on ESOP]
(Refer instructions for eligibility)
PART A GENERAL INFORMATION
(A1) PAN (A2) First Name (A2a) Middle Name (A3) Last Name (A4) Date of Birth (A5) Aadhaar Number (12 digits)/
03-Sep-1991 Aadhaar Enrolment Id (28 digits) (If
eligible for Aadhaar No.)
7xxxxxxx2294
(A6) Mobile No. (A7) Email Address (A8) Flat/Door/Block No. (A9) Name of Premises/ (A10) Road/Street/Post
maheshbhaygude@gmail .coGmAVTHAN MU PO TA Building/Village Office, Area/Loca

In [5]:
import re

def extract_tax_data(pdf_text):
    # Extract ITR Assessment
    itr_assessment = re.search(r"ITR Assessment\s*:\s*([\w\s\d]+)", text)
    
    # Extract Gross Salary
    gross_salary = re.search(r"Gross Salary\s*:\s*([\d,]+)", text)
    print(gross_salary)
    
    # Extract Net Salary
    net_salary = re.search(r"Net Salary\s*:\s*([\d,]+)", text)
    
    # Extract Income from Other Sources
    income_other_sources = re.search(r"Income from Other Sources\s*:\s*([\d,]+)", text)
    
    # Extract Income chargeable under Salaries
    income_chargeable_salaries = re.search(r"Income chargeable under Salaries\s*:\s*([\d,]+)", text)
    
    # Extract Total Tax Deducted
    total_tax_deducted = re.search(r"Total Tax Deducted\s*:\s*([\d,]+)", text)
    
    # Formatting and cleaning data
    tax_data = {
        "ITR Assessment": itr_assessment.group(1) if itr_assessment else "Not found",
        "Gross Salary": gross_salary.group(1).replace(",", "") if gross_salary else "Not found",
        "Net Salary": net_salary.group(1).replace(",", "") if net_salary else "Not found",
        "Income from Other Sources": income_other_sources.group(1).replace(",", "") if income_other_sources else "Not found",
        "Income chargeable under Salaries": income_chargeable_salaries.group(1).replace(",", "") if income_chargeable_salaries else "Not found",
        "Total Tax Deducted": total_tax_deducted.group(1).replace(",", "") if total_tax_deducted else "Not found"
    }
    
    return tax_data

# Example usage
text = """
... # Your extracted document text here
"""

tax_data = extract_tax_data(text)
print(tax_data)


None
{'ITR Assessment': 'Not found', 'Gross Salary': 'Not found', 'Net Salary': 'Not found', 'Income from Other Sources': 'Not found', 'Income chargeable under Salaries': 'Not found', 'Total Tax Deducted': 'Not found'}


In [56]:
import pdfplumber
import pandas as pd

pdf = pdfplumber.open("sampleitr.pdf")
page = pdf.pages[0]
df = pd.DataFrame(page.extract_table())
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,MROF\nITR-1\nSAHAJ,,,,INDIAN INCOME TAX RETURN\n[For individuals bei...,,,,,,,,,,,,Assessment\nYear\n2022-23
1,,,,,,,,,,,,,,,,,
2,PART A GENERAL INFORMATION,,,,,,,,,,,,,,,,
3,(A1) PAN,,,(A2) First Name,,,(A2a) Middle Name,(A3) Last Name,,(A4) Date of Birth\n03-Sep-1991,,(A5) Aadhaar Number (12 digits)/\nAadhaar Enro...,,,,,
4,(A6) Mobile No.,,,,,(A7) Email Address\nmaheshbhaygude@gmail .,,(A8) Flat/Door/Block No.\ncoGmAVTHAN MU PO TA,,(A9) Name of Premises/\nBuilding/Village,,,,,"(A10) Road/Street/Post\nOffice, Area/Locality",,
5,(A11) Town/City/District\nSATARA,,,,,,,(A12) State\n19 - Maharashtra,,(A13) Country/Region\n91 - India,,,,,(A14) PIN Code/ZIP Code\n415010,,
6,(A15) Filed u/s (Tick)\n[Please see instruction],,,,,139(1)-On or before due date\n139(4)-Belated 1...,,,,(A16) Nature of Employment -\nCentral Govt. St...,,,,,,,
7,(A17)Or Filed in\nresponse to notice u/s,,,,,139(9) 142(1) 148,,,,,,,,,,,
8,"(A18)If revised/defective, then enter Receipt ...",,,,,,,,,,,,,,,,
9,(A19) If filed in response to notice u/s 139(9...,,,,,,,,,,,,,,,,


In [61]:
import pdfplumber
import re

# Function to extract specific fields from the text
def extract_fields_from_pdf_text(text):
    extracted_data = {
        "Gross Salary": None,
        "Net Salary": None,
        "Income from Other Sources": None,
        "Income chargeable under Salaries": None,
        "Total Tax Deducted": None
    }

    lines = text.replace(",", "").lower().split("\n")

    for i, line in enumerate(lines):
        if "gross salary" in line:
            extracted_data["Gross Salary"] = find_numeric_value_in_line(line)
        elif "net salary" in line:
            extracted_data["Net Salary"] = find_numeric_value_in_line(line)
        elif "income from other sources" in line:
            extracted_data["Income from Other Sources"] = find_numeric_value_in_line(line)
        elif "income chargeable under the head ‘salaries’" in line:
            extracted_data["Income from Other Sources"] = find_numeric_value_in_line(line)
        elif "total tax deducted" in line or "total tax" in line:
            extracted_data["Total Tax Deducted"] = find_numeric_value_in_line(line)

    return extracted_data


def find_numeric_value_in_line(line):
    match = re.search(r"(\d+[\d\s]*\d+)", line)
    if match:
        return match.group(1).strip()
    return None

# Extract text from all pages of the PDF
def extract_text_from_pdf(pdf_path):
    full_text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                full_text += page_text + "\n"
    return full_text


pdf_path = 'sampleitr.pdf'

pdf_text = extract_text_from_pdf(pdf_path)

extracted_data = extract_fields_from_pdf_text(pdf_text)

for key, value in extracted_data.items():
    if value:
        print(f"{key}: {value}")
    else:
        print(f"{key}: Not found")

Gross Salary: 775311
Net Salary: 775311
Income from Other Sources: 3 0
Income chargeable under Salaries: Not found
Total Tax Deducted: Not found


In [52]:
import pytesseract
from pdf2image import convert_from_path
import re

# Function to extract the required fields from the extracted text
def extract_fields_from_text(text):
    extracted_data = {
        "Gross Salary": None,
        "Net Salary": None,
        "Income from Other Sources": None,
        "Total Tax Deducted": None
    }

    # Define patterns for each field
    patterns = {
        "Gross Salary": r"gross salary\s*[\w\W]*?\s*([\d,]+)",
        "Net Salary": r"net salary\s*[\w\W]*?\s*([\d,]+)",
        "Income from Other Sources": r"income from other sources\s*[\w\W]*?\s*([\d,]+)",
        "Total Tax Deducted": r"total tax deducted\s*[\w\W]*?\s*([\d,]+)"
    }

    # Normalize the text by lowering the case and removing commas
    text = text.lower().replace(",", "")

    # Apply patterns and extract the data
    for field, pattern in patterns.items():
        match = re.search(pattern, text)
        if match:
            extracted_data[field] = match.group(1)

    return extracted_data

# Convert PDF to images using pdf2image and then extract text using pytesseract
def extract_text_from_pdf(pdf_path):
    # Convert PDF pages to images
    pages = convert_from_path(pdf_path)
    
    # Extract text from each page using Tesseract OCR
    extracted_text = ""
    for page in pages:
        extracted_text += pytesseract.image_to_string(page)

    return extracted_text

# Path to your PDF file
pdf_path = 'sampleitr.pdf'

# Extract text from the PDF
pdf_text = extract_text_from_pdf(pdf_path)

# Extract the fields from the text
extracted_data = extract_fields_from_text(pdf_text)

# Print the extracted values
for key, value in extracted_data.items():
    if value:
        print(f"{key}: {value}")
    else:
        print(f"{key}: Not found")


PDFInfoNotInstalledError: Unable to get page count. Is poppler installed and in PATH?

In [53]:
import spacy
import fitz  # PyMuPDF

def extract_text_from_pdf(pdf_path):
    """Extract text from a PDF using PyMuPDF."""
    document = fitz.open(pdf_path)
    full_text = ""
    for page_num in range(len(document)):
        full_text += document[page_num].get_text("text") + "\n"
    document.close()
    return full_text

def extract_data_with_ner(text, nlp_model):
    """Use a pre-trained spaCy NER model to extract financial fields."""
    doc = nlp_model(text)
    extracted_data = {}

    # Define the target labels and their corresponding field names
    field_labels = {
        "GROSS_SALARY": "Gross Salary",
        "NET_SALARY": "Net Salary",
        "INCOME_OTHER_SOURCES": "Income from Other Sources",
        "ASSESSMENT_DATE": "ITR Assessment Date"
    }

    for ent in doc.ents:
        label = ent.label_
        if label in field_labels:
            extracted_data[field_labels[label]] = ent.text

    return extracted_data

# Load a pre-trained spaCy model (you can customize or fine-tune your own model)
# Install spaCy models with: python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")

# Example usage
pdf_path = "path_to_your_income_tax_return.pdf"
pdf_text = extract_text_from_pdf('sampleitr.pdf')
data = extract_data_with_ner(pdf_text, nlp)
print(data)


{}


In [55]:
import pytesseract
from PIL import Image
import pdfplumber

def extract_text_from_pdf(pdf_path):
    """Extract text and images from a PDF."""
    full_text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # Extract text
            full_text += page.extract_text() or ""
            
            # Extract images and apply OCR
            for img in page.images:
                # Get the image bounding box
                x0, top, x1, bottom = img["x0"], img["top"], img["x1"], img["bottom"]
                
                # Convert the PDF page to an image
                page_image = page.to_image()
                
                # Crop the image using the bounding box
                cropped_image = page_image.original.crop((x0, top, x1, bottom))
                
                # Apply OCR to the cropped image
                ocr_text = pytesseract.image_to_string(cropped_image)
                full_text += "\n" + ocr_text
    return full_text

# Example usage
pdf_path = "sampleitr.pdf"
extracted_text = extract_text_from_pdf(pdf_path)
print(extracted_text)


TesseractNotFoundError: tesseract is not installed or it's not in your PATH. See README file for more information.

In [59]:
import pdfplumber
import re

# Function to extract specific fields from the text
def extract_fields_from_pdf_text(text):
    extracted_data = {
        "Gross Salary": None,
        "Net Salary": None,
        "Income from Other Sources": None,
        "Income chargeable under Salaries": None,
        "Total Tax Deducted": None
    }

    lines = text.replace(",", "").lower().split("\n")

    for i, line in enumerate(lines):
        if "gross salary" in line:
            extracted_data["Gross Salary"] = find_numeric_value_in_line(line)
        elif "net salary" in line:
            extracted_data["Net Salary"] = find_numeric_value_in_line(line)
        elif "income from other sources" in line:
            extracted_data["Income from Other Sources"] = find_numeric_value_in_line(line)
        elif "Income chargeable under the head ‘Salaries’" in line:
            extracted_data["Income chargeable under Salaries"] = find_numeric_value_in_line(line)
        elif "total tax deducted" in line or "total tax" in line:
            extracted_data["Total Tax Deducted"] = find_numeric_value_in_line(line)

    return extracted_data


def find_numeric_value_in_line(line):
    match = re.search(r"(\d+[\d\s]*\d+)", line)
    if match:
        return match.group(1).strip()
    return None

# Extract text from all pages of the PDF
def extract_text_from_pdf(pdf_path):
    full_text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                full_text += page_text + "\n"
    return full_text


pdf_path = 'sampleitr.pdf'

pdf_text = extract_text_from_pdf(pdf_path)

extracted_data = extract_fields_from_pdf_text(pdf_text)

for key, value in extracted_data.items():
    if value:
        print(f"{key}: {value}")
    else:
        print(f"{key}: Not found")

Gross Salary: 775311
Net Salary: 775311
Income from Other Sources: 3 0
Income chargeable under Salaries: Not found
Total Tax Deducted: Not found
