In [31]:
import os
import cv2
from PIL import Image
import pytesseract
from pytesseract import Output

# Set Tesseract path and environment variables
TESSERACT_PATH = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
TESSDATA_PREFIX = r'C:\Program Files\Tesseract-OCR\tessdata'
os.environ['TESSDATA_PREFIX'] = TESSDATA_PREFIX
pytesseract.pytesseract.tesseract_cmd = TESSERACT_PATH

def detect_black_boxes(image_path):
    image = cv2.imread(image_path)
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    blurred = cv2.GaussianBlur(gray, (5, 5), 0)
    _, binary = cv2.threshold(blurred, 15, 255, cv2.THRESH_BINARY_INV)
    contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # Filter contours based on area and retrieve bounding boxes
    black_boxes = []
    for cnt in contours:
        if cv2.contourArea(cnt) > 500:
            x, y, w, h = cv2.boundingRect(cnt)
            black_boxes.append((x, y, w, h))
    return black_boxes

def get_ocr_data_with_position(image_path):
    img = Image.open(image_path)
    return pytesseract.image_to_data(img, output_type=Output.DICT)

def read_formulas_from_tex(folder_path):
    formulas = []
    for filename in os.listdir(folder_path):
        if filename.endswith('.tex'):
            file_path = os.path.join(folder_path, filename)
            with open(file_path, 'r') as file:
                formulas.append((file_path, file.read().strip()))
    return formulas

def calculate_distances(ocr_data, box):
    box_center_y = box[1] + box[3] // 2
    distances = []
    for i, top in enumerate(ocr_data['top']):
        if int(ocr_data['conf'][i]) > 60:  # High confidence
            text_center_y = top + ocr_data['height'][i] // 2
            distance = abs(text_center_y - box_center_y)
            distances.append((i, distance))
    return distances

def assign_formulas_to_boxes(ocr_data, black_boxes, formulas):
    assignments = {}
    for box in black_boxes:
        distances = calculate_distances(ocr_data, box)
        if distances:
            nearest = min(distances, key=lambda x: x[1])
            if formulas:
                file_path, formula = formulas.pop(0)
                assignments[nearest[0]] = formula
    return assignments

def generate_latex_document(ocr_data, formula_assignments, output_path):
    document_content = ""
    for i, (text, conf, left, top, width, height) in enumerate(zip(ocr_data['text'], ocr_data['conf'], ocr_data['left'], ocr_data['top'], ocr_data['width'], ocr_data['height'])):
        if int(conf) > 60:  # Confidence threshold
            if i in formula_assignments:
                document_content += "$$" + formula_assignments[i] + "$$" + "\n\n"  # Insert formula with extra line break
            else:
                document_content += text + " "

    latex_document = f"""\\documentclass{{article}}
\\usepackage{{amsmath}}
\\usepackage[utf8]{{inputenc}}
\\usepackage[letterpaper, margin=1in]{{geometry}}  % Set margin to 1 inch
\\begin{{document}}
{document_content}
\\end{{document}}"""

    with open(output_path, 'w') as file:
        file.write(latex_document)

if __name__ == "__main__":
    input_images_folder = 'C:/Users/Acer/Desktop/without_cropped'
    output_latex_folder = 'C:/Users/Acer/Desktop/full_latex' # The code creates the folder itself
    formulas_tex_folder = 'C:/Users/Acer/Desktop/math_latex'  # Update this path

    black_boxes = detect_black_boxes(input_images_folder)
    ocr_data = get_ocr_data_with_position(input_images_folder)
    formulas = read_formulas_from_tex(formulas_tex_folder)
    formula_assignments = assign_formulas_to_boxes(ocr_data, black_boxes, formulas)
    generate_latex_document(ocr_data, formula_assignments, output_latex_folder)

    print(f"LaTeX document generated at {output_latex_folder}.")


error: OpenCV(4.9.0) D:\a\opencv-python\opencv-python\opencv\modules\imgproc\src\color.cpp:196: error: (-215:Assertion failed) !_src.empty() in function 'cv::cvtColor'


In [32]:
import os
import pytesseract
from pytesseract import Output
from PIL import Image
import cv2
from tqdm import tqdm

# Set the TESSDATA_PREFIX environment variable
os.environ['TESSDATA_PREFIX'] = r'C:\Program Files\Tesseract-OCR\tessdata'

# Set Tesseract executable path
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

def detect_black_boxes(image_path):
    try:
        image = cv2.imread(image_path)
        if image is None:
            raise ValueError("Unable to read image.")
        
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        blurred = cv2.GaussianBlur(gray, (5, 5), 0)
        _, binary = cv2.threshold(blurred, 15, 255, cv2.THRESH_BINARY_INV)
        contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        return [cv2.boundingRect(cnt) for cnt in contours if cv2.contourArea(cnt) > 100]
    except Exception as e:
        print(f"Error detecting black boxes in image {image_path}: {e}")
        return []

def get_ocr_data_with_position(image_path):
    try:
        img = Image.open(image_path)
        return pytesseract.image_to_data(img, output_type=Output.DICT)
    except Exception as e:
        print(f"Error extracting OCR data from image {image_path}: {e}")
        return {'text': [], 'conf': [], 'top': [], 'height': []}

def read_formulas_from_tex(folder_path):
    try:
        formulas = []
        for filename in os.listdir(folder_path):
            if filename.endswith('.tex'):
                formulas.append(os.path.join(folder_path, filename))
        return sorted(formulas, reverse=True)  # Sort the formulas in reverse order
    except Exception as e:
        print(f"Error reading formulas from folder {folder_path}: {e}")
        return []

# Other functions remain unchanged

if __name__ == "__main__":
    input_images_folder = 'C:/Users/Acer/Desktop/without_cropped'
    output_latex_folder = 'C:/Users/Acer/Desktop/full_latex' # The code creates the folder itself
    formulas_tex_folder = 'C:/Users/Acer/Desktop/math_latex'  # Update this path

    if not os.path.exists(output_latex_folder):
        os.makedirs(output_latex_folder)

    image_files = [f for f in os.listdir(input_images_folder) if f.endswith('.png') or f.endswith('.jpg')]
    
    with tqdm(total=len(image_files), desc="Processing images", unit=" image", dynamic_ncols=True) as pbar:
        for image_file in image_files:
            image_path = os.path.join(input_images_folder, image_file)
            formulas = read_formulas_from_tex(formulas_tex_folder)
            black_boxes = detect_black_boxes(image_path)
            ocr_data = get_ocr_data_with_position(image_path)
            formula_assignments = assign_formulas_to_boxes(ocr_data, black_boxes, formulas)
            output_latex_path = os.path.join(output_latex_folder, os.path.splitext(image_file)[0] + '.tex')
            generate_latex_document(ocr_data, formula_assignments, output_latex_path)
            pbar.update(1)
            pbar.set_postfix({"Current file": image_file})


Processing images:   0%|                                                                     | 0/2 [00:02<?, ? image/s]


ValueError: too many values to unpack (expected 2)

In [33]:
import os
import pytesseract
from pytesseract import Output
from PIL import Image
import cv2
from tqdm import tqdm

# Set the TESSDATA_PREFIX environment variable
os.environ['TESSDATA_PREFIX'] = r'C:\Program Files\Tesseract-OCR\tessdata'

# Set Tesseract executable path
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

def detect_black_boxes(image_path):
    try:
        image = cv2.imread(image_path)
        if image is None:
            raise ValueError("Unable to read image.")
        
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        blurred = cv2.GaussianBlur(gray, (5, 5), 0)
        _, binary = cv2.threshold(blurred, 15, 255, cv2.THRESH_BINARY_INV)
        contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        return [cv2.boundingRect(cnt) for cnt in contours if cv2.contourArea(cnt) > 100]
    except Exception as e:
        print(f"Error detecting black boxes in image {image_path}: {e}")
        return []

def get_ocr_data_with_position(image_path):
    try:
        img = Image.open(image_path)
        return pytesseract.image_to_data(img, output_type=Output.DICT)
    except Exception as e:
        print(f"Error extracting OCR data from image {image_path}: {e}")
        return {'text': [], 'conf': [], 'top': [], 'height': []}

def read_formulas_from_tex(folder_path):
    try:
        formulas = []
        for filename in os.listdir(folder_path):
            if filename.endswith('.tex'):
                formulas.append(os.path.join(folder_path, filename))
        return sorted(formulas, reverse=True)  # Sort the formulas in reverse order
    except Exception as e:
        print(f"Error reading formulas from folder {folder_path}: {e}")
        return []

def calculate_distances(ocr_data, box):
    box_center_y = box[1] + box[3] // 2
    distances = []
    for i, top in enumerate(ocr_data['top']):
        if int(ocr_data['conf'][i]) > 60:  # High confidence
            text_center_y = top + ocr_data['height'][i] // 2
            distance = abs(text_center_y - box_center_y)
            distances.append((i, distance))
    return distances

def assign_formulas_to_boxes(ocr_data, black_boxes, formulas):
    assignments = {}
    for i, box in enumerate(black_boxes):
        distances = calculate_distances(ocr_data, box)
        if distances:
            nearest = min(distances, key=lambda x: x[1])
            if formulas:
                file_path = formulas.pop()  # Retrieve the last formula path
                with open(file_path, 'r') as file:
                    formula = ' '.join([line.strip() for line in file.readlines() if line.strip()])
                assignments[nearest[0]] = formula
            else:
                assignments[nearest[0]] = ""
    return assignments

def generate_latex_document(ocr_data, formula_assignments, output_path):
    document_content = ""
    for i, text in enumerate(ocr_data['text']):
        if int(ocr_data['conf'][i]) > 60:  # Confidence threshold
            if i in formula_assignments:
                document_content += formula_assignments[i] + "\n"  # Insert formula
            document_content += text + " "
    latex_document = f"""\\documentclass{{article}}
\\usepackage{{amsmath}}
\\begin{{document}}
{document_content}
\\end{{document}}"""
    
    with open(output_path, 'w') as file:
        file.write(latex_document)

if __name__ == "__main__":
    input_images_folder = 'C:/Users/Acer/Desktop/without_cropped'
    output_latex_folder = 'C:/Users/Acer/Desktop/full_latex' # The code creates the folder itself
    formulas_tex_folder = 'C:/Users/Acer/Desktop/math_latex'  # Update this path

    if not os.path.exists(output_latex_folder):
        os.makedirs(output_latex_folder)

    image_files = [f for f in os.listdir(input_images_folder) if f.endswith('.png') or f.endswith('.jpg')]
    
    with tqdm(total=len(image_files), desc="Processing images", unit=" image", dynamic_ncols=True) as pbar:
        for image_file in image_files:
            image_path = os.path.join(input_images_folder, image_file)
            formulas = read_formulas_from_tex(formulas_tex_folder)
            black_boxes = detect_black_boxes(image_path)
            ocr_data = get_ocr_data_with_position(image_path)
            formula_assignments = assign_formulas_to_boxes(ocr_data, black_boxes, formulas)
            output_latex_path = os.path.join(output_latex_folder, os.path.splitext(image_file)[0] + '.tex')
            generate_latex_document(ocr_data, formula_assignments, output_latex_path)
            pbar.update(1)
            pbar.set_postfix({"Current file": image_file})


Processing images: 100%|██████| 2/2 [00:05<00:00,  2.80s/ image, Current file=analysis-and_2_input_without_cropped.png]
