In [20]:
import os
import pytesseract
from PIL import Image
from pdf2image import convert_from_path
import json
from lxml import etree

# Tell pytesseract which Tesseract executable to run. The path is
# machine-specific: change it if Tesseract is installed somewhere else.
pytesseract.pytesseract.tesseract_cmd = '/opt/homebrew/bin/tesseract'  

def summarize_text(text):
    """Return a placeholder summary of *text*: its first 200 characters.

    This is a stand-in for a real summarization model. Input shorter than
    200 characters is returned unchanged.
    """
    summary_length = 200  # dummy cut-off used in place of real summarization
    return text[:summary_length]

# Function to process TXT files
def process_txt(file_path):
    """Read a plain-text file and return its whole content as one string.

    The file is decoded explicitly as UTF-8 so the result does not depend on
    the locale encoding of the machine running the code.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The decoded file content.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

# Function to process JSON files
def process_json(file_path):
    """Extract every string value from a JSON file and save it as text.

    The document is walked depth-first in document order. Only string
    *values* are collected; dictionary keys, numbers, booleans and nulls are
    skipped. Each collected string is followed by a single space. The result
    is also written next to the input file, with the extension replaced by
    ``.txt``.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The concatenated string values (each followed by one space).

    Raises:
        OSError: If the input cannot be read or the output cannot be written.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # JSON text is UTF-8; do not rely on the platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    def iter_strings(node):
        """Yield the string values found under *node*, in document order."""
        if isinstance(node, dict):
            for value in node.values():
                yield from iter_strings(value)
        elif isinstance(node, list):
            for item in node:
                yield from iter_strings(item)
        elif isinstance(node, str):
            yield node

    # Build the text in one pass instead of growing a string with "+=".
    extracted_text = "".join(f"{value} " for value in iter_strings(data))

    # Save to .txt file (explicit UTF-8 so non-ASCII text can always be written)
    txt_output = os.path.splitext(file_path)[0] + ".txt"
    with open(txt_output, 'w', encoding='utf-8') as f:
        f.write(extracted_text)

    return extracted_text

# Function to process PDF files by treating each page as an image
def process_pdf_with_images(file_path):
    """OCR a PDF by rendering each page to an image and running Tesseract.

    The text recognized on each page is concatenated in page order, with no
    extra separator. The result is also written next to the input file, with
    the extension replaced by ``.txt``.

    Args:
        file_path: Path of the PDF file to process.

    Returns:
        The concatenated OCR text of all pages.
    """
    # Convert PDF pages to images
    pages = convert_from_path(file_path)

    # Use Tesseract to extract text from each image (PDF page)
    extracted_text = "".join(
        pytesseract.image_to_string(page) for page in pages
    )

    # Save to .txt file. OCR output often contains non-ASCII characters, so
    # the encoding is pinned to UTF-8 instead of the locale default.
    txt_output = os.path.splitext(file_path)[0] + ".txt"
    with open(txt_output, 'w', encoding='utf-8') as f:
        f.write(extracted_text)

    return extracted_text

# Function to process Handwritten Images (Future custom model placeholder)
def process_handwritten_image(file_path):
    """Extract text from an image of handwriting and save it as text.

    A dedicated handwriting model is not wired in yet, so plain Tesseract OCR
    is used for now. The result is also written next to the input file, with
    the extension replaced by ``.txt``.

    Args:
        file_path: Path of the image file to process.

    Returns:
        The text recognized in the image.
    """
    # TODO: Replace with your custom handwriting recognition model
    # (planned model file: 'handwritten.traindata').
    # For now, using Tesseract for OCR. The context manager closes the image
    # file handle as soon as OCR is done.
    with Image.open(file_path) as image:
        extracted_text = pytesseract.image_to_string(image)

    # Save to .txt file (explicit UTF-8 so non-ASCII OCR output can be written)
    txt_output = os.path.splitext(file_path)[0] + ".txt"
    with open(txt_output, 'w', encoding='utf-8') as f:
        f.write(extracted_text)

    return extracted_text

# Function to process Digitally Written Images (Using trained font detection model + Tesseract)
def process_digital_image(file_path):
    """Extract text from an image of digitally rendered text and save it.

    A custom font-detection model is not wired in yet, so plain Tesseract OCR
    is used for now. The result is also written next to the input file, with
    the extension replaced by ``.txt``.

    Args:
        file_path: Path of the image file to process.

    Returns:
        The text recognized in the image.
    """
    # Placeholder for custom font detection model
    # (planned model file: 'digitalfonts.traindata').
    # Extract text using Tesseract for now. The context manager closes the
    # image file handle as soon as OCR is done.
    with Image.open(file_path) as image:
        extracted_text = pytesseract.image_to_string(image)

    # Save to .txt file (explicit UTF-8 so non-ASCII OCR output can be written)
    txt_output = os.path.splitext(file_path)[0] + ".txt"
    with open(txt_output, 'w', encoding='utf-8') as f:
        f.write(extracted_text)

    return extracted_text

# Function to process XML files
def process_xml(file_path):
    """Extract all text nodes from an XML file and save them as text.

    Every text node in the document is concatenated in document order with no
    separator. The result is also written next to the input file, with the
    extension replaced by ``.txt``.

    Args:
        file_path: Path of the XML file to read.

    Returns:
        The concatenated text content of the document.
    """
    # Open in binary mode so the XML parser decodes the bytes itself and
    # honors the document's own encoding declaration, instead of Python
    # pre-decoding the file with the platform's locale encoding.
    with open(file_path, 'rb') as file:
        tree = etree.parse(file)
    extracted_text = ''.join(tree.xpath('//text()'))  # Extract all text nodes

    # Save to .txt file (explicit UTF-8 so any extracted character can be written)
    txt_output = os.path.splitext(file_path)[0] + ".txt"
    with open(txt_output, 'w', encoding='utf-8') as f:
        f.write(extracted_text)

    return extracted_text

# Function to detect file type and run the appropriate processing
def process_file(file_path):
    """Detect a file's type from its extension, extract its text, summarize it.

    Supported extensions (case-insensitive): ``.txt``, ``.json``, ``.pdf``,
    ``.xml`` and the image types ``.jpg``, ``.jpeg``, ``.png``, ``.bmp``,
    ``.tiff``. For images the user is asked on stdin whether the content is
    handwritten or a digital font.

    The summary is written to ``output_<type>.txt`` (relative path, so it
    lands in the current working directory) and returned.

    Args:
        file_path: Path of the file to process.

    Returns:
        The summary produced by ``summarize_text`` for the extracted text.

    Raises:
        ValueError: If the extension is not supported, or the answer to the
            image-type prompt is neither 'Handwritten' nor 'Digital'.
    """
    # Get the file extension and auto-detect the type
    ext = os.path.splitext(file_path)[1].lower()

    if ext == '.txt':
        text = process_txt(file_path)
        file_type = 'TXT'
    elif ext == '.json':
        text = process_json(file_path)
        file_type = 'JSON'
    elif ext == '.pdf':
        text = process_pdf_with_images(file_path)
        file_type = 'PDF'
    elif ext in ('.jpg', '.jpeg', '.png', '.bmp', '.tiff'):
        # Prompt for image type (handwritten or digital)
        image_type = input("Is the image handwritten or digital font? (Enter 'Handwritten' or 'Digital'): ").strip().lower()
        if image_type == 'handwritten':
            text = process_handwritten_image(file_path)
            file_type = 'Handwritten Image'
        elif image_type == 'digital':
            text = process_digital_image(file_path)
            file_type = 'Digital Image'
        else:
            raise ValueError("Invalid image type entered. Please enter 'Handwritten' or 'Digital'.")
    elif ext == '.xml':
        text = process_xml(file_path)
        file_type = 'XML'
    else:
        raise ValueError("Unsupported file type")

    # Pass the extracted text to the summarization model
    summary = summarize_text(text)

    # Save summary to an output file. This previously wrote the full
    # extracted text, so the "summary" file never contained the summary.
    output_file = f"output_{file_type.replace(' ', '_')}.txt"
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(summary)

    return summary

# Example Usage
if __name__ == "__main__":
    # Interactive entry point: ask for a path, process that file, and print
    # the resulting summary under a heading.
    file_path = input("Enter the file path: ")
    summary = process_file(file_path)

    print("Summary of the text:", summary, sep="\n")


Summary of the text:
How +0 Use This Handwwx4iting Change?
Heste 25 how JOU can convent text +0
handwriting with this +00:

Type 0% copy-paste youu content in the Input
field.

Or, Jou can dinectty uptoad a Fite fom your

