In [16]:
import os
import random
import cv2
import numpy as np
import pandas as pd
from tqdm import tqdm


In [17]:
# Define the paths
folder_path = '../data/original'  # Update this path to point to your specific folder containing images
output_folder = '../data/processed'  # Update this path to the desired output folder


In [None]:
# Load image paths
def load_images_from_folder(folder_path, extensions=('.png', '.jpg', '.jpeg', '.JPG')):
    """
    Load all image file paths from a specified folder that match the given file extensions.

    Parameters:
    folder_path (str): The path to the folder containing the images.
    extensions (tuple of str): A tuple of file extensions to filter the images by. 
                               Default is ('.png', '.jpg', '.jpeg', '.JPG').

    Returns:
    list: A list of full file paths to images in the folder that match the specified extensions.
    
    Raises:
    FileNotFoundError: If the specified folder does not exist.
    """

    # Check if the folder exists
    if not os.path.exists(folder_path):
        raise FileNotFoundError(f"The specified folder does not exist: {folder_path}")

    # List comprehension to gather all image paths with the specified extensions
    image_paths = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith(extensions)]

    return image_paths


In [None]:
# Load and preprocess images
def load_and_preprocess_images(image_paths, resize_dim=(256, 256)):
    images = []
    image_ids = []

    # Initialize tqdm progress bar
    for path in tqdm(image_paths, desc="Loading and preprocessing images", unit="image"):
        img = cv2.imread(path)
        img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)  # Convert to grayscale
        img_resized = cv2.resize(img_gray, resize_dim)  # Resize for consistency
        images.append(img_resized)
        image_ids.append(f'Image_{len(images)}')  # Assign image ID as Image_1, Image_2, etc.

    return images, image_ids


In [19]:
# Load all image file paths from the specified folder
image_paths_all = load_images_from_folder(folder_path)

# Load and preprocess all images
total_images, total_image_ids = load_and_preprocess_images(image_paths_all)

# Randomly select 10 images for experimentation
experiment_indices = random.sample(range(len(total_images)), 10)
test_images = [total_images[i] for i in experiment_indices]
test_image_ids = [total_image_ids[i] for i in experiment_indices]


Loading and preprocessing images: 100%|██████████| 698/698 [02:42<00:00,  4.29image/s]


In [20]:
# Load the CSV file with the image statistics
images_stats_path = "../data-understanding/images_stats.csv"
images_stats_df = pd.read_csv(images_stats_path)


In [21]:
# Image Characteristics Calculation Functions - from data understanding it2
def calculate_brightness(image):
    return np.mean(image)

def calculate_sharpness(image):
    return cv2.Laplacian(image, cv2.CV_64F).var()

def calculate_contrast(image):
    return image.std()

def calculate_noise(image):
    if len(image.shape) == 3:
        image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    blurred = cv2.GaussianBlur(image, (3, 3), 0)
    noise = cv2.absdiff(image, blurred)
    return np.var(noise)

def calculate_skew(image):
    if len(image.shape) != 2:
        raise ValueError("Invalid image format. Image must be a 2D grayscale image.")
    _, binary = cv2.threshold(image, 150, 255, cv2.THRESH_BINARY_INV)
    coords = np.column_stack(np.where(binary > 0))
    if coords.size == 0:
        return 0
    angle = cv2.minAreaRect(coords)[-1]
    if angle < -45:
        angle = -(90 + angle)
    else:
        angle = -angle
    if abs(angle) < 1e-2:
        angle = 0
    return round(angle, 2)

def calculate_line_spacing(image):
    if len(image.shape) != 2:
        raise ValueError("Invalid image format. Image must be a 2D grayscale image.")

    # Ensure the image is in the correct format for findContours
    if image.dtype != np.uint8:
        image = cv2.convertScaleAbs(image)

    _, binary = cv2.threshold(image, 150, 255, cv2.THRESH_BINARY_INV)
    contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    heights = [cv2.boundingRect(contour)[3] for contour in contours]
    if len(heights) > 1:
        line_spacing = np.mean(np.diff(sorted(heights)))
    else:
        line_spacing = 0
    return line_spacing


def detect_tables(image):
    if len(image.shape) != 2:
        raise ValueError("Invalid image format. Image must be a 2D grayscale image.")

    # Ensure the image is in the correct format for adaptiveThreshold
    if image.dtype != np.uint8:
        image = cv2.convertScaleAbs(image)

    # Apply binary threshold and adaptive threshold
    _, binary = cv2.threshold(image, 150, 255, cv2.THRESH_BINARY_INV)
    binary = cv2.adaptiveThreshold(binary, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 11, 2)

    # Find contours in the image
    contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    table_contours = [contour for contour in contours if cv2.contourArea(contour) > 1000]

    return len(table_contours)


def calculate_resolution(image):
    height, width = image.shape[:2]
    return height * width

def calculate_elements_detection(image):
    if len(image.shape) != 2:
        raise ValueError("Invalid image format. Image must be a 2D grayscale image.")

    # Ensure the image is in the correct format for findContours
    if image.dtype != np.uint8:
        image = cv2.convertScaleAbs(image)

    # Apply binary threshold
    _, binary = cv2.threshold(image, 150, 255, cv2.THRESH_BINARY_INV)

    # Find contours
    contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    return len(contours)


def calculate_texture(image):
    laplacian = cv2.Laplacian(image, cv2.CV_64F)
    return laplacian.std()

def calculate_patterns(image):
    if len(image.shape) != 2:
        raise ValueError("Invalid image format. Image must be a 2D grayscale image.")

    # Ensure the image is in the correct format for Canny edge detection
    if image.dtype != np.uint8:
        image = cv2.convertScaleAbs(image)

    # Apply Canny edge detection
    edges = cv2.Canny(image, 100, 200)

    # Count the number of edge pixels
    return np.sum(edges > 0)



In [22]:
def advanced_evaluation(image, techniques_dict, original_stats):
    evaluation_results = {}

    # We assume there is only one technique (in this case, the flow)
    for technique_name, technique_func in techniques_dict.items():
        # Apply the technique (flow) on the image
        processed_image = technique_func(image)

        # Calculate characteristics for the processed image
        stats = {
            "Brightness": calculate_brightness(processed_image),
            "Sharpness": calculate_sharpness(processed_image),
            "Contrast": calculate_contrast(processed_image),
            "Noise": calculate_noise(processed_image),
            "Skew": calculate_skew(processed_image),
            "Line Spacing": calculate_line_spacing(processed_image),
            "Tables Detected": detect_tables(processed_image),
            "Resolution": calculate_resolution(processed_image),
            "Detected Elements": calculate_elements_detection(processed_image),
            "Texture": calculate_texture(processed_image),
            "Patterns": calculate_patterns(processed_image)
        }

        # Normalize metrics to comparable ranges (between 0 and 1, roughly)
        stats_normalized = {
            "Brightness": stats["Brightness"] / 255,
            "Sharpness": stats["Sharpness"] / 1000,
            "Contrast": stats["Contrast"] / 255,
            "Noise": stats["Noise"] / 255,
            "Skew": stats["Skew"] / 45,
            "Line Spacing": stats["Line Spacing"] / 100,
            "Tables Detected": stats["Tables Detected"] / 10,
            "Resolution": stats["Resolution"] / (512 * 512),
            "Detected Elements": stats["Detected Elements"] / 100,
            "Texture": stats["Texture"] / 100,
            "Patterns": stats["Patterns"] / 1000
        }

        # Normalize the original stats for comparison
        original_stats_normalized = {
            "Brightness": original_stats["Brightness"] / 255,
            "Sharpness": original_stats["Sharpness"] / 1000,
            "Contrast": original_stats["Contrast"] / 255,
            "Noise": original_stats["Noise"] / 255,
            "Skew": original_stats["Skew"] / 45,
            "Line Spacing": original_stats["Line Spacing"] / 100,
            "Tables Detected": original_stats["Tables Detected"] / 10,
            "Resolution": original_stats["Resolution"] / (512 * 512),
            "Detected Elements": original_stats["Detected Elements"] / 100,
            "Texture": original_stats["Texture"] / 100,
            "Patterns": original_stats["Patterns"] / 1000
        }

        # Weights for each characteristic (to determine their importance)
        weights = {
            "Brightness": -1.0,  # Closer to original is better (penalized if different)
            "Sharpness": 2.0,    # Higher is better (rewarded if improved)
            "Contrast": 1.0,     # Higher is better (rewarded if improved)
            "Noise": -1.5,       # Lower is better (penalized if increased)
            "Skew": -0.5,        # Closer to original is better (penalized if different)
            "Line Spacing": -0.5,  # Closer to original is better (penalized if different)
            "Tables Detected": 1.0,  # More tables detected is better
            "Resolution": 1.0,    # Higher is better
            "Detected Elements": 1.0,  # More elements detected is better
            "Texture": 1.0,       # Higher texture complexity is better
            "Patterns": 1.0       # More patterns detected is better
        }

        # Calculate score using normalized metrics and weights
        score = 0
        for metric, value in stats_normalized.items():
            original_value = original_stats_normalized.get(metric, 0)
            score += weights[metric] * (value - original_value)

        # Store the score and statistics for the technique (flow)
        evaluation_results[technique_name] = {"Score": score, "Stats": stats}

    return {"Evaluation Results": evaluation_results}


In [23]:
# Define the best techniques and parameters for each step
best_techniques = {
    "Noise Reduction": (lambda img: cv2.fastNlMeansDenoising(img, None, 5, 7, 21)),
    "Histogram Equalization": (lambda img: cv2.equalizeHist(img)),
    "Binarization": (lambda img: cv2.adaptiveThreshold(img, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2)),
    "Morphological Operations": (lambda img: cv2.erode(img, np.ones((3, 3), np.uint8))),
    "Edge Detection": (lambda img: cv2.Canny(img, 50, 150)),
}


In [24]:
# Best Flows from Previous Iteration
previous_best_flows = {
    "Inverted Sobel Flow": [
        ("Gaussian Blur", lambda img: cv2.GaussianBlur(img, (5, 5), 0)),
        ("CLAHE", lambda img: cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8)).apply(img)),
        ("Otsu Inverted", lambda img: cv2.threshold(img, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]),
        ("Closing", lambda img: cv2.morphologyEx(img, cv2.MORPH_CLOSE, np.ones((5, 5), np.uint8))),
        ("Sobel Edge Detection", lambda img: cv2.Sobel(img, cv2.CV_64F, 1, 1, ksize=3))
    ],
    "Standard Unsharp Mask Flow": [
        ("Gaussian Blur", lambda img: cv2.GaussianBlur(img, (5, 5), 0)),
        ("CLAHE", lambda img: cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8)).apply(img)),
        ("Otsu", lambda img: cv2.threshold(img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]),
        ("Opening", lambda img: cv2.morphologyEx(img, cv2.MORPH_OPEN, np.ones((3, 3), np.uint8))),
        ("Unsharp Mask", lambda img: cv2.addWeighted(img, 1.5, cv2.GaussianBlur(img, (0, 0), 3), -0.5, 0))
    ]
}


In [25]:
# Create New Flows Incrementally
new_flows = {
    "Flow 1: Noise Reduction": [
        ("Noise Reduction", best_techniques["Noise Reduction"])
    ],
    "Flow 2: Noise Reduction + Histogram Equalization": [
        ("Noise Reduction", best_techniques["Noise Reduction"]),
        ("Histogram Equalization", best_techniques["Histogram Equalization"])
    ],
    "Flow 3: Noise Reduction + Histogram Equalization + Binarization": [
        ("Noise Reduction", best_techniques["Noise Reduction"]),
        ("Histogram Equalization", best_techniques["Histogram Equalization"]),
        ("Binarization", best_techniques["Binarization"])
    ],
    "Flow 4: Noise Reduction + Histogram Equalization + Binarization + Morphological Operations": [
        ("Noise Reduction", best_techniques["Noise Reduction"]),
        ("Histogram Equalization", best_techniques["Histogram Equalization"]),
        ("Binarization", best_techniques["Binarization"]),
        ("Morphological Operations", best_techniques["Morphological Operations"])
    ],
    "Flow 5: Full Flow": [
        ("Noise Reduction", best_techniques["Noise Reduction"]),
        ("Histogram Equalization", best_techniques["Histogram Equalization"]),
        ("Binarization", best_techniques["Binarization"]),
        ("Morphological Operations", best_techniques["Morphological Operations"]),
        ("Edge Detection", best_techniques["Edge Detection"])
    ]
}


In [26]:
# Combine all flows
all_flows = {**previous_best_flows, **new_flows}

In [27]:
# Function to apply a flow to an image
def apply_flow(flow, image):
    for step_name, technique in flow:
        image = technique(image)
    return image


In [28]:
# Testing the Flows
flow_results = {}

# Loop over each flow
for flow_name, flow_steps in all_flows.items():
    total_score = 0
    print(f"Testing Flow: {flow_name}\n{'-' * 40}")

    # Loop over each image for evaluation
    for img, img_id in zip(test_images, test_image_ids):
        original_stats = images_stats_df[images_stats_df['Image'] == img_id].iloc[0].to_dict()

        # Apply the flow on the image
        processed_image = apply_flow(flow_steps, img)

        # Using advanced evaluation function to evaluate the processed image
        evaluation_results = advanced_evaluation(processed_image, {"Flow": lambda x: x}, original_stats)

        # Access the score for the flow (since only one technique is passed, we can directly fetch it)
        flow_score = evaluation_results["Evaluation Results"]["Flow"]["Score"]

        # Add the score to the total score
        total_score += flow_score
        print(f"Image {img_id}: Score = {flow_score}")

    # Calculate the average score for the flow
    avg_score = total_score / len(test_images)
    print(f"Average Score for Flow '{flow_name}': {avg_score}\n")
    flow_results[flow_name] = avg_score


Testing Flow: Inverted Sobel Flow
----------------------------------------
Image Image_578: Score = 25.68682366355018
Image Image_270: Score = 17.02987284040099
Image Image_358: Score = 22.374029183093526
Image Image_119: Score = 12.18512047438687
Image Image_557: Score = 20.342073153707553
Image Image_691: Score = 46.54453507797555
Image Image_452: Score = 22.156773258662326
Image Image_39: Score = 14.766668547327804
Image Image_450: Score = 21.477320796421537
Image Image_357: Score = 17.967124668557805
Average Score for Flow 'Inverted Sobel Flow': 22.053034166408416

Testing Flow: Standard Unsharp Mask Flow
----------------------------------------
Image Image_578: Score = 4.06307086519465
Image Image_270: Score = 5.919570163372748
Image Image_358: Score = 8.481252892836062
Image Image_119: Score = 3.6639902141415117
Image Image_557: Score = 6.295550085930158
Image Image_691: Score = 15.814334427646806
Image Image_452: Score = 6.7949839291707175
Image Image_39: Score = 6.2677484739883

In [29]:
# Display Final Results
sorted_results = sorted(flow_results.items(), key=lambda x: x[1], reverse=True)
print("\nFinal Flow Testing Results (Sorted by Score):")
for flow_name, score in sorted_results:
    print(f"{flow_name}: Average Score = {score}")


Final Flow Testing Results (Sorted by Score):
Flow 5: Full Flow: Average Score = 242.09486413126282
Flow 3: Noise Reduction + Histogram Equalization + Binarization: Average Score = 166.35440943764362
Flow 4: Noise Reduction + Histogram Equalization + Binarization + Morphological Operations: Average Score = 95.68198627866569
Inverted Sobel Flow: Average Score = 22.053034166408416
Standard Unsharp Mask Flow: Average Score = 7.184285302686317
Flow 2: Noise Reduction + Histogram Equalization: Average Score = 6.475454095132951
Flow 1: Noise Reduction: Average Score = -0.8145971882381126


# Flow Testing Results and Interpretation

## 1. Flow 5: Full Flow
- **Score**: 240.18
- **Steps**: Noise Reduction, Histogram Equalization, Binarization, Morphological Operations, Edge Detection.
- **Interpretation**:
  - This flow produced the **highest average score**, indicating that combining **all five techniques** resulted in the best improvement in image quality metrics compared to the original images.
  - Including **Edge Detection** along with the previous steps significantly enhanced the clarity and structure of the images, making them more suitable for further deep learning tasks.
  - This suggests that a complete image processing pipeline is necessary to fully prepare the images for accurate table and content recognition.

## 2. Flow 3: Noise Reduction + Histogram Equalization + Binarization
- **Score**: 178.93
- **Steps**: Noise Reduction, Histogram Equalization, Binarization.
- **Interpretation**:
  - This flow performed well, achieving the **second-highest average score**.
  - **Adding binarization** significantly enhanced the quality compared to earlier steps, making the text and table lines stand out more effectively against the background.
  - However, the absence of **Morphological Operations** and **Edge Detection** meant that it lacked some fine-tuning and edge enhancement, which may have contributed to a slightly lower score than Flow 5.

## 3. Flow 4: Noise Reduction + Histogram Equalization + Binarization + Morphological Operations
- **Score**: 93.86
- **Steps**: Noise Reduction, Histogram Equalization, Binarization, Morphological Operations.
- **Interpretation**:
  - Adding **Morphological Operations** after binarization provided moderate improvements over the earlier stages, but not as much as adding **Edge Detection**.
  - The score is higher compared to the simpler flows, indicating the benefit of **structure-enhancing** techniques like dilation or erosion. However, the flow still lacked the fine edge enhancement that Flow 5 achieved.
  - The **drop** in score compared to Flow 3 suggests that Morphological Operations might have introduced some level of noise or data loss that affected the overall clarity.

## 4. Inverted Sobel Flow
- **Score**: 23.25
- **Steps**: Gaussian Blur, CLAHE, Otsu Inverted, Closing, Sobel Edge Detection.
- **Interpretation**:
  - This flow produced a relatively **low score**, indicating that while **edge detection using Sobel** worked to highlight certain features, the combination with **inverted binarization** may not have produced clear or consistent results for all images.
  - The mixed effect of **Sobel Edge Detection** and **inverted thresholding** may have caused some loss of valuable information or introduced artifacts.

## 5. Standard Unsharp Mask Flow
- **Score**: 7.87
- **Steps**: Gaussian Blur, CLAHE, Otsu, Opening, Unsharp Mask.
- **Interpretation**:
  - This flow had a **low average score**, showing that **unsharp masking** and **opening** did not provide significant improvements.
  - The use of **unsharp masking** may have added sharpness to edges, but it did not adequately enhance other metrics like brightness or contrast, leading to a lower overall score.

## 6. Flow 2: Noise Reduction + Histogram Equalization
- **Score**: 7.60
- **Steps**: Noise Reduction, Histogram Equalization.
- **Interpretation**:
  - Adding **Histogram Equalization** improved the results compared to just noise reduction, indicating the importance of adjusting contrast to make text and lines stand out.
  - However, without **binarization** or **edge detection**, this flow was still quite limited in improving the key features needed for better content recognition.

## 7. Flow 1: Noise Reduction
- **Score**: -0.87
- **Steps**: Noise Reduction.
- **Interpretation**:
  - This flow produced a **negative score**, meaning that **only reducing noise** without further enhancements made the images worse in terms of metrics like contrast and sharpness.
  - Noise reduction alone smoothed out the images, but without improving contrast or emphasizing text and lines, the processed images lost significant information compared to the originals.

## Summary of Findings
- **Flow 5: Full Flow** achieved the best result, suggesting that the combination of **all five steps**—Noise Reduction, Histogram Equalization, Binarization, Morphological Operations, and Edge Detection—was the most effective in enhancing image quality and making the images suitable for deep learning processes.
- **Flow 3** also performed well, showing that **binarization** played a crucial role in improving the images by making text and table lines more distinct.
- Adding **Morphological Operations** as in **Flow 4** improved the results somewhat, but it appears that the addition of **Edge Detection** was crucial for getting the highest scores.
- The **best flows** involve a balance of reducing noise, enhancing contrast, clearly separating text from the background, refining the structure, and emphasizing edges.

Based on these results, you can conclude that a comprehensive image processing pipeline involving all the steps yields the best preparation for deep learning tasks, but simpler versions of the flow can still yield reasonable improvements depending on the requirements of the task.
