# Data Preparation - IT2 - Choosing best flow 

In [20]:
import os
import random
import warnings

import cv2
import numpy as np
import pandas as pd
from tqdm import tqdm


In [21]:
# Folder holding the image subset evaluated in this notebook
# (a relative path, resolved against the notebook's working directory).
folder_path = '../data/subset'

## Loading Images and stats

In [22]:
# Load image paths
def load_images_from_folder(folder_path, extensions=('.png', '.jpg', '.jpeg', '.JPG')):
    """
    Collect the paths of all image files in a folder that match the given extensions.

    Parameters:
    folder_path (str): The path to the folder containing the images.
    extensions (str or iterable of str): File-name suffixes to keep (case-sensitive).
                               Default is ('.png', '.jpg', '.jpeg', '.JPG').

    Returns:
    list: Full file paths of the matching entries, sorted by file name so that the
          result (and any positional IDs derived from it) is the same on every run.

    Raises:
    FileNotFoundError: If the specified folder does not exist.
    """

    # Check if the folder exists
    if not os.path.exists(folder_path):
        raise FileNotFoundError(f"The specified folder does not exist: {folder_path}")

    # str.endswith only accepts a str or a tuple, so normalise lists/sets passed by callers
    suffixes = extensions if isinstance(extensions, str) else tuple(extensions)

    # os.listdir returns entries in arbitrary order; sort them for reproducibility
    return [
        os.path.join(folder_path, file_name)
        for file_name in sorted(os.listdir(folder_path))
        if file_name.endswith(suffixes)
    ]


In [23]:
# Load and preprocess images
def load_and_preprocess_images(image_paths, resize_dim=(256, 256)):
    """
    Read images from disk and convert each one to grayscale.

    Parameters:
    image_paths (iterable of str): Paths of the image files to load.
    resize_dim (tuple of int): Currently unused - resizing is disabled, so images keep
                               their original resolution. Kept so existing callers
                               that pass it continue to work.

    Returns:
    tuple: (images, image_ids) where images is a list of grayscale images and image_ids
           is the matching list of IDs 'Image_1', 'Image_2', ... numbered in load order.
           Files that cannot be read are skipped (with a warning) and receive no ID.
    """
    images = []
    image_ids = []

    # tqdm wraps the iterable to show a progress bar while loading
    for path in tqdm(image_paths, desc="Loading and preprocessing images", unit="image"):
        img = cv2.imread(path)
        if img is None:
            # cv2.imread reports an unreadable file by returning None instead of raising;
            # passing that to cvtColor would fail with an opaque OpenCV assertion error.
            warnings.warn(f"Skipping unreadable image: {path}")
            continue

        img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)  # Convert to grayscale
        images.append(img_gray)
        image_ids.append(f'Image_{len(images)}')  # Assign image ID as Image_1, Image_2, etc.

    return images, image_ids


In [24]:
# Collect the path of every supported image file in the subset folder
image_paths_all = load_images_from_folder(folder_path)

# Read each image as grayscale and give it a positional ID (Image_1, Image_2, ...)
total_images, total_image_ids = load_and_preprocess_images(image_paths_all)

# Randomly select 10 images for experimentation
# experiment_indices = random.sample(range(len(total_images)), 10)
# test_images = [total_images[i] for i in experiment_indices]
# test_image_ids = [total_image_ids[i] for i in experiment_indices]


Loading and preprocessing images: 100%|██████████| 12/12 [00:02<00:00,  4.99image/s]


## Functions of Characteristics

In [26]:
# Image Characteristics Calculation Functions - from data understanding it2
def calculate_brightness(image):
    """Return the brightness of ``image``, measured as its mean pixel value."""
    mean_intensity = np.mean(image)
    return mean_intensity

def calculate_sharpness(image):
    """Return the sharpness of ``image``: the variance of its Laplacian (64-bit float output)."""
    laplacian_response = cv2.Laplacian(image, cv2.CV_64F)
    return laplacian_response.var()

def calculate_contrast(image):
    """Return the contrast of ``image``, measured as the standard deviation of its pixel values."""
    pixel_spread = image.std()
    return pixel_spread

def calculate_noise(image):
    """
    Estimate the noise level of ``image``.

    A 3-dimensional input is first converted from BGR to grayscale. The estimate is the
    variance of the absolute difference between the image and a copy of it smoothed
    with a 3x3 Gaussian blur.
    """
    has_channel_axis = len(image.shape) == 3
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) if has_channel_axis else image

    smoothed = cv2.GaussianBlur(gray, (3, 3), 0)
    residual = cv2.absdiff(gray, smoothed)
    return np.var(residual)


## Calculating Original Stats

### If from the original dataframe created

In [27]:
# Load the CSV file with the image statistics
# images_stats_path = "../data-understanding/images_stats.csv"  
# images_stats_df = pd.read_csv(images_stats_path)

In [28]:
# images_stats_df.columns

In [29]:
# images_stats_df.drop(['Skew','Line Spacing', 'Tables Detected', 'Resolution', 'Detected Elements','Texture', 'Patterns'],axis=1, inplace=True)

### From the subset 

In [30]:
def image_statistics_table(images, imagesId):
    """
    Build a per-image statistics table.

    Parameters:
    images (sequence): Images to measure.
    imagesId (sequence): ID for each image, matched to ``images`` by position.

    Returns:
    pandas.DataFrame: One row per image with the columns 'Image', 'Brightness',
                      'Sharpness', 'Contrast' and 'Noise'.
    """
    # Column label -> function that measures it, in output-column order
    measurements = (
        ('Brightness', calculate_brightness),
        ('Sharpness', calculate_sharpness),
        ('Contrast', calculate_contrast),
        ('Noise', calculate_noise),
    )

    columns = {'Image': []}
    columns.update((label, []) for label, _ in measurements)

    for position, picture in enumerate(images):
        columns['Image'].append(imagesId[position])
        for label, measure in measurements:
            columns[label].append(measure(picture))

    return pd.DataFrame(columns)

In [31]:
# Measure every loaded image before any processing; these baseline statistics are
# the reference values each flow's output is compared against later in the notebook.
images_stats_df = image_statistics_table(total_images, total_image_ids)
print("Image Statistics Table:")
images_stats_df

Image Statistics Table:


Unnamed: 0,Image,Brightness,Sharpness,Contrast,Noise
0,Image_1,100.038606,19.325682,49.248767,0.611743
1,Image_2,98.179438,25.128856,45.808319,0.793025
2,Image_3,102.128122,20.47635,44.327512,0.670412
3,Image_4,103.643973,26.898489,40.046309,0.955973
4,Image_5,109.058805,24.906567,42.498071,0.730254
5,Image_6,111.614181,24.875,37.462779,0.757413
6,Image_7,106.277182,19.551871,36.462511,0.69744
7,Image_8,102.769224,67.817982,38.631583,1.579164
8,Image_9,95.099688,87.266055,51.179098,2.935492
9,Image_10,95.865772,62.674985,50.360914,2.025381


## Evaluation Function

In [32]:
def advanced_evaluation(image, techniques_dict, original_stats, weights=None, normalizers=None):
    """
    Score image-processing techniques by how they change four image characteristics.

    Each technique is applied to ``image``; Brightness, Sharpness, Contrast and Noise are
    measured on the result, divided by a per-metric scale factor, and compared with the
    equally scaled ``original_stats``. The score is the weighted sum of those differences.

    Note: the scale factors are fixed divisors, not bounds. Nothing clamps the scaled
    values, so metrics that are variances (Sharpness, Noise) can exceed 1 and dominate
    the score.

    Parameters:
    image: Input image passed to every technique.
    techniques_dict (dict): Maps technique name -> callable taking and returning an image.
    original_stats (mapping): Must provide 'Brightness', 'Sharpness', 'Contrast' and
                              'Noise' for the unprocessed image; extra keys are ignored.
    weights (dict, optional): Per-metric weight overrides. Defaults: Brightness 1.0,
                              Sharpness 1.0, Contrast 2.0, Noise -1.5 (negative because
                              an increase in noise is penalised).
    normalizers (dict, optional): Per-metric divisor overrides. Defaults: Brightness 255,
                                  Sharpness 1000, Contrast 255, Noise 255.

    Returns:
    dict: {"Best Technique": name of the highest-scoring technique,
           "Evaluation Results": {name: {"Score": float, "Stats": raw metric values}}}

    Raises:
    ValueError: If ``techniques_dict`` is empty.
    KeyError: If ``original_stats`` lacks one of the four metrics.
    """
    if not techniques_dict:
        raise ValueError("techniques_dict must contain at least one technique to evaluate.")

    # Weights for each characteristic (to determine their importance)
    metric_weights = {
        "Brightness": 1.0,   # Higher is better (rewarded if improved)
        "Sharpness": 1.0,    # Higher is better (rewarded if improved) but images were generally sharp already
        "Contrast": 2.0,     # Higher is better (rewarded if improved) the levels of contrast were lower and obstructed details
        "Noise": -1.5,       # Lower is better (penalized if increased)
    }
    metric_weights.update(weights or {})

    # Divisors that bring the metrics to roughly comparable magnitudes
    metric_scales = {"Brightness": 255, "Sharpness": 1000, "Contrast": 255, "Noise": 255}
    metric_scales.update(normalizers or {})

    # Metric name -> measuring function; the dict order is also the summation order
    metric_functions = {
        "Brightness": calculate_brightness,
        "Sharpness": calculate_sharpness,
        "Contrast": calculate_contrast,
        "Noise": calculate_noise,
    }

    # The baseline does not depend on the technique, so scale it once up front
    original_scaled = {
        metric: original_stats[metric] / metric_scales[metric] for metric in metric_functions
    }

    evaluation_results = {}
    for technique_name, technique_func in techniques_dict.items():
        # Apply the technique
        processed_image = technique_func(image)

        # Calculate characteristics for the processed image
        stats = {metric: measure(processed_image) for metric, measure in metric_functions.items()}

        # Weighted sum of (scaled processed value - scaled original value)
        score = 0
        for metric, value in stats.items():
            scaled_value = value / metric_scales[metric]
            score += metric_weights[metric] * (scaled_value - original_scaled[metric])

        evaluation_results[technique_name] = {"Score": score, "Stats": stats}

    # Determine the best technique based on the highest score
    best_technique = max(evaluation_results, key=lambda name: evaluation_results[name]["Score"])
    return {"Best Technique": best_technique, "Evaluation Results": evaluation_results}


## Setting the best techniques based on step 1.1

In [33]:
# Define the best techniques and parameters for each step
# Best-performing technique (with its chosen parameters) for each processing step.
# Every entry is a callable that takes an image and returns the processed image.
best_techniques = {}

# Median filter with a 3x3 aperture
best_techniques["Noise Reduction"] = lambda img: cv2.medianBlur(img, ksize=3)

# Global histogram equalization - no parameters were found to be beneficial
best_techniques["Histogram Equalization"] = lambda img: cv2.equalizeHist(img)

# Gaussian-weighted adaptive threshold over an 11x11 neighbourhood, offset C=2
best_techniques["Binarization"] = lambda img: cv2.adaptiveThreshold(
    img,
    255,
    cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
    cv2.THRESH_BINARY,
    blockSize=11,
    C=2,
)

# Dilation with a 9x9 all-ones structuring element
best_techniques["Morphological Operations"] = lambda img: cv2.dilate(
    img, np.ones((9, 9), np.uint8)
)

# Canny edge detector with hysteresis thresholds 50 / 150
best_techniques["Edge Detection"] = lambda img: cv2.Canny(img, threshold1=50, threshold2=150)


## Using the previous flows found to be most suitable

In [34]:
# Best Flows from Previous Iteration
# Steps of the flow carried over from the previous iteration:
# a 5x5 Gaussian blur followed by CLAHE (clip limit 2.0, 8x8 tile grid).
clahe_flow_steps = [
    (
        "Gaussian Blur",
        lambda img: cv2.GaussianBlur(img, (5, 5), 0),
    ),
    (
        "CLAHE",
        lambda img: cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8)).apply(img),
    ),
]

previous_best_flows = {"Flow - CLAHE": clahe_flow_steps}


## Setting flows based on best techniques

In [35]:
# Create New Flows Incrementally
# Each new flow is a prefix of the same ordered pipeline, one step longer than the last.
pipeline_step_names = [
    "Noise Reduction",
    "Histogram Equalization",
    "Binarization",
    "Morphological Operations",
    "Edge Detection",
]
pipeline_steps = [(step_name, best_techniques[step_name]) for step_name in pipeline_step_names]

# Slicing creates a separate list per flow, so the flows do not share list objects.
new_flows = {
    "Flow 1 Noise Reduction": pipeline_steps[:1],
    "Flow 2 Noise Reduction + Histogram Equalization": pipeline_steps[:2],
    "Flow 3 Noise Reduction + Histogram Equalization + Binarization": pipeline_steps[:3],
    "Flow 4 Noise Reduction + Histogram Equalization + Binarization + Morphological Operations": pipeline_steps[:4],
    "Flow 5 Full Flow": pipeline_steps[:5],
}


## Preparing and running the flows

In [36]:
# Combine all flows
# Start from the previous iteration's flows, then add the new ones
# (a new flow would replace a previous flow that has the same name).
all_flows = dict(previous_best_flows)
all_flows.update(new_flows)

In [37]:
# Function to apply a flow to an image
def apply_flow(flow, image):
    """
    Run an image through every step of a flow, in order.

    Parameters:
    flow (iterable): (step_name, technique) pairs; each technique takes an image and
                     returns the processed image. The step name is not used here.
    image: The input image.

    Returns:
    The output of the last step, or ``image`` unchanged when the flow has no steps.
    """
    result = image
    for _label, transform in flow:
        result = transform(result)
    return result


In [38]:
# Testing the Flows
# Every flow is applied to every image and scored against that image's original statistics;
# the per-flow result is the average score over all images.
if not total_images:
    raise ValueError("No images were loaded; cannot evaluate the flows.")

flow_results = {}

# One fixed image is written to disk for every flow so the flows can be compared visually
image_to_save_id = total_image_ids[0]  # Choose the first image (or any specific image)

# Look up each image's original statistics once, instead of filtering the DataFrame
# again for every (flow, image) pair. setdefault keeps the first row for a given ID.
original_stats_by_id = {}
for stats_row in images_stats_df.to_dict('records'):
    original_stats_by_id.setdefault(stats_row['Image'], stats_row)

# Loop over each flow
for flow_name, flow_steps in all_flows.items():
    total_score = 0
    print(f"Testing Flow: {flow_name}\n{'-' * 40}")

    # Loop over each image for evaluation
    for img, img_id in zip(total_images, total_image_ids):
        original_stats = original_stats_by_id[img_id]

        # Apply the flow on the image
        processed_image = apply_flow(flow_steps, img)

        # Save the processed image only for the selected image ID
        if img_id == image_to_save_id:
            # Specify the folder to save the processed images for the flows
            output_folder_flow = f"./Data/It2/Flows/{flow_name}"
            os.makedirs(output_folder_flow, exist_ok=True)

            # Save the processed image with the flow name and image ID.
            # cv2.imwrite returns False on failure rather than raising, so check it.
            output_path = f"{output_folder_flow}/{flow_name}_Image_{img_id}.jpg"
            if not cv2.imwrite(output_path, processed_image):
                print(f"Warning: could not save processed image to {output_path}")

        # The flow has already been applied, so an identity "technique" is passed and
        # advanced_evaluation only measures and scores the processed image.
        evaluation_results = advanced_evaluation(processed_image, {"Flow": lambda x: x}, original_stats)

        # Access the score for the flow (since only one technique is passed, we can directly fetch it)
        flow_score = evaluation_results["Evaluation Results"]["Flow"]["Score"]

        # Add the score to the total score
        total_score += flow_score
        print(f"Image {img_id}: Score = {flow_score}")

    # Calculate the average score for the flow
    avg_score = total_score / len(total_images)
    print(f"Average Score for Flow '{flow_name}': {avg_score}\n")
    flow_results[flow_name] = avg_score


Testing Flow: Flow - CLAHE
----------------------------------------
Image Image_1: Score = 0.06169536056771593
Image Image_2: Score = 0.07898004046606698
Image Image_3: Score = 0.06065598633179331
Image Image_4: Score = 0.0676560965565005
Image Image_5: Score = 0.09260820738941113
Image Image_6: Score = 0.09955002190101941
Image Image_7: Score = 0.09406687210972073
Image Image_8: Score = 0.08532877794197584
Image Image_9: Score = 0.05379625775123446
Image Image_10: Score = 0.04384334413520083
Image Image_11: Score = 0.0896755389754957
Image Image_12: Score = 0.08952760079399172
Average Score for Flow 'Flow - CLAHE': 0.07644867541001053

Testing Flow: Flow 1 Noise Reduction
----------------------------------------
Image Image_1: Score = -0.010987027282247222
Image Image_2: Score = -0.013700469270125814
Image Image_3: Score = -0.011114293069925214
Image Image_4: Score = -0.01204227570192167
Image Image_5: Score = -0.014686089631650454
Image Image_6: Score = -0.013961430638968984
Image Im

In [39]:
# Display Final Results
# Rank the flows from highest to lowest average score and list them
ranked_flow_names = sorted(flow_results, key=flow_results.get, reverse=True)
sorted_results = [(name, flow_results[name]) for name in ranked_flow_names]

print("\nFinal Flow Testing Results (Sorted by Score):")
for flow_name, score in sorted_results:
    print(f"{flow_name}: Average Score = {score}")


Final Flow Testing Results (Sorted by Score):
Flow 3 Noise Reduction + Histogram Equalization + Binarization: Average Score = 83.55246100901141
Flow 2 Noise Reduction + Histogram Equalization: Average Score = 0.5065538714568281
Flow 4 Noise Reduction + Histogram Equalization + Binarization + Morphological Operations: Average Score = 0.2746051967581929
Flow - CLAHE: Average Score = 0.07644867541001053
Flow 1 Noise Reduction: Average Score = -0.02226497058333796
Flow 5 Full Flow: Average Score = -0.6915651590714366


Final Flow Testing Results (Sorted by Score):
Flow 3: Noise Reduction + Histogram Equalization + Binarization: Average Score = 83.55
Flow 2: Noise Reduction + Histogram Equalization: Average Score = 0.51
Flow 4: Noise Reduction + Histogram Equalization + Binarization + Morphological Operations: Average Score = 0.27
Flow - CLAHE: Average Score = 0.08
Flow 1: Noise Reduction: Average Score = -0.02
Flow 5: Full Flow: Average Score = -0.69

# Flow Testing Results and Interpretation

## 1. Flow 3: Noise Reduction + Histogram Equalization + Binarization
- **Score**: 83.55
- **Steps**: Noise Reduction, Histogram Equalization, Binarization.
- **Interpretation**:
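  - This flow produced the **highest average score**, indicating that combining these three techniques—**Noise Reduction**, **Histogram Equalization**, and **Binarization**—resulted in a significant improvement in image quality metrics compared to the original images.
  - **Caveat**: the score is two orders of magnitude above every other flow. After normalization the Brightness and Contrast terms can each contribute at most about 1, and the Noise weight is negative, so a value of 83.55 is driven almost entirely by the Sharpness term (Laplacian variance / 1000), which is not bounded. The score therefore mainly reflects the hard black/white edges created by binarization rather than a proportionate gain across all metrics.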
  - **Binarization** helped make text and table lines more distinct, providing better separation of content from the background.
  - This flow contains no **Morphological Operations** or **Edge Detection**. Adding them with the current parameters (Flows 4 and 5) lowered the score, so any further refinement or edge enhancement would need different parameters to yield better results.

## 2. Flow 2: Noise Reduction + Histogram Equalization
- **Score**: 0.51
- **Steps**: Noise Reduction, Histogram Equalization.
- **Interpretation**:
  - Adding **Histogram Equalization** improved the results compared to just using noise reduction, suggesting that enhancing contrast played a crucial role in making text and lines stand out more.
  - The relatively low positive score indicates that although the images were improved, without **Binarization** or more advanced techniques like **Morphological Operations** or **Edge Detection**, the enhancement was limited.

## 3. Flow 4: Noise Reduction + Histogram Equalization + Binarization + Morphological Operations
- **Score**: 0.27
- **Steps**: Noise Reduction, Histogram Equalization, Binarization, Morphological Operations.
- **Interpretation**:
  - Adding **Morphological Operations** after the binarization step still produced a positive score relative to the original images, but a far lower one than Flow 3 (0.27 vs. 83.55).
  - The low score indicates that while **Morphological Operations** added some structure to the images, they also introduced some level of distortion or data loss that negatively impacted the overall quality.

## 4. Flow - CLAHE
- **Score**: 0.08
- **Steps**: Gaussian Blur, CLAHE.
- **Interpretation**:
  - This flow, which applies a **Gaussian Blur** followed by **CLAHE** (Contrast Limited Adaptive Histogram Equalization), yielded a very small positive score.
  - The limited improvement suggests that smoothing plus local **contrast enhancement**, without binarization or edge enhancement, was not sufficient to achieve significant quality gains.

## 5. Flow 1: Noise Reduction
- **Score**: -0.02
- **Steps**: Noise Reduction.
- **Interpretation**:
  - This flow produced a **negative score**, meaning that **Noise Reduction** alone did not improve image quality and instead slightly degraded it.
  - The smoothed images lost some contrast and detail, which led to reduced image quality in terms of the evaluation metrics.

## 6. Flow 5: Full Flow
- **Score**: -0.69
- **Steps**: Noise Reduction, Histogram Equalization, Binarization, Morphological Operations, Edge Detection.
- **Interpretation**:
  - Surprisingly, the **Full Flow**—which combined all the techniques—produced the **lowest average score**.
  - This suggests that adding all five techniques together introduced too many transformations, which may have led to over-processing, resulting in the loss of important image characteristics.
  - The **Edge Detection** and **Morphological Operations** steps might have contributed to unwanted noise or artifacts, negatively affecting the overall score.

## Summary of Findings
- **Flow 3** achieved the best result, showing that a combination of **Noise Reduction**, **Histogram Equalization**, and **Binarization** is effective in enhancing the images while keeping transformations minimal.
- **Flow 2** also provided some improvement, indicating that **Histogram Equalization** is crucial for enhancing contrast and making features more prominent.
- Adding **Morphological Operations**, as in **Flow 4**, did not significantly enhance the results and might have introduced unnecessary noise or detail loss.
- The **Full Flow** (Flow 5) produced a negative score, suggesting that combining all techniques together without careful parameter tuning can lead to over-processing.
- The **best flows** balance noise reduction, contrast enhancement, and binarization to make content more distinct without over-processing the images.

Based on these results, a simplified image processing pipeline involving **Noise Reduction**, **Histogram Equalization**, and **Binarization** yields the best preparation for further tasks. More complex flows, such as adding **Morphological Operations** or **Edge Detection**, may require additional careful parameter tuning to prevent degradation in quality.
