# Iteration 2 - Data Understanding before Data Preparation

> **Creator**: Ryo

In [None]:
import os
import itertools
import warnings

import random
import numpy as np
import pandas as pd
import cv2
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from tqdm import tqdm

### Variables

In [None]:
# Define the path to the folder containing the images to be processed.
# This is a relative path, so it is resolved against the current working
# directory of the notebook kernel.
folder_path = '../data/original'

---
## Load Images Data

### Load Image Paths From Source Folder

#### *Function*

In [None]:
def load_images_from_folder(folder_path, extensions=('.png', '.jpg', '.jpeg', '.JPG')):
    """
    Collect the paths of all image files located directly inside a folder.

    Extension matching is case-insensitive, so '.PNG' or '.Jpeg' files are found
    even though only one spelling is listed in `extensions`. Sub-folders are not
    searched, and directory entries are never returned even if their name looks
    like an image file.

    Parameters:
    folder_path (str): The path to the folder containing the images.
    extensions (tuple of str or str): File extension(s) to filter the images by.
                                Default is ('.png', '.jpg', '.jpeg', '.JPG').

    Returns:
    list: Full file paths (folder_path joined with the file name) of the matching
          files, sorted by file name so the order is reproducible between runs.

    Raises:
    FileNotFoundError: If the specified folder does not exist.
    """
    # Check if the folder exists
    if not os.path.exists(folder_path):
        raise FileNotFoundError(f"The specified folder does not exist: {folder_path}")

    # Accept a single extension string as well as a tuple/list of them
    if isinstance(extensions, str):
        extensions = (extensions,)

    # Lower-case once so the comparison below ignores the case of the suffix
    wanted_suffixes = tuple(ext.lower() for ext in extensions)

    image_paths = []
    # os.listdir returns entries in arbitrary order; sort for determinism
    for entry in sorted(os.listdir(folder_path)):
        if not entry.lower().endswith(wanted_suffixes):
            continue
        full_path = os.path.join(folder_path, entry)
        # Skip directories that merely carry an image-like name
        if os.path.isfile(full_path):
            image_paths.append(full_path)

    return image_paths

### Load & Pre-Process Image

#### *Function*

In [None]:
def load_and_preprocess_images(image_paths, resize_dim=(256, 256)):
    """
    Load and preprocess a list of images by converting to grayscale and resizing.

    A file that OpenCV cannot decode yields a `None` entry instead of aborting the
    whole run. This keeps the returned list aligned index-by-index with
    `image_paths`, and `image_statistics_table` already reports `None` images as
    invalid.

    Parameters:
    image_paths (list): List of paths to image files.
    resize_dim (tuple): Target dimensions to resize images (width, height).

    Returns:
    list: One entry per input path, in the same order: the preprocessed image as
          a 2D numpy array, or None when the file could not be read.
    """
    images = []  # Initialize list to store preprocessed images

    # tqdm progress bar tracks loading and processing of images
    for path in tqdm(image_paths, desc="Loading and preprocessing images", unit="image"):
        img = cv2.imread(path)  # Load image from file (BGR)

        # cv2.imread returns None rather than raising when a file is missing,
        # corrupt or in an unsupported format; cvtColor would crash on that.
        if img is None:
            images.append(None)
            continue

        img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)  # Convert image to grayscale
        images.append(cv2.resize(img_gray, resize_dim))  # Resize and store

    return images  # Return list of preprocessed images

In [None]:
def load_images(image_paths):
    """
    Read every image in `image_paths` exactly as stored, with no preprocessing.

    Parameters:
    image_paths (list): List of paths to image files.

    Returns:
    list: The result of `cv2.imread` for each path, in input order.
    """
    # One cv2.imread call per path; results are kept in the order given
    return [cv2.imread(image_path) for image_path in image_paths]

### Implementation

In [None]:
# Collect the paths of all image files found directly inside `folder_path`
image_paths = load_images_from_folder(folder_path)

In [None]:
# Load every image as grayscale, resized to the function's default of 256x256
images = load_and_preprocess_images(image_paths)

---
## Statistical Analysis

### Brightness

In [None]:
def calculate_brightness(image):
    """
    Measure image brightness as the arithmetic mean of all pixel values.

    Parameters:
    image (numpy array): Input image as a numpy array. Every element is averaged,
                        so a grayscale / single-channel image is expected for the
                        value to be meaningful as brightness.

    Returns:
    float: The mean pixel intensity of the image.
    """
    mean_intensity = np.mean(image)  # Average over every element of the array
    return mean_intensity

### Sharpness

In [None]:
def calculate_sharpness(image):
    """
    Score image sharpness with the variance-of-Laplacian method.

    Parameters:
    image (numpy array): Input image as a numpy array, preferably in grayscale
                        for accurate sharpness calculation.

    Returns:
    float: Variance of the Laplacian response.
            Higher values indicate sharper images.
    """
    # Laplacian is computed in 64-bit float so negative responses are preserved
    laplacian_response = cv2.Laplacian(image, cv2.CV_64F)
    return laplacian_response.var()


### Contrast

In [None]:
def calculate_contrast(image):
    """
    Score image contrast as the standard deviation of its pixel values.

    Parameters:
    image (numpy array): Input image as a numpy array, preferably in grayscale.

    Returns:
    float: Standard deviation computed over all pixel values.
    """
    pixel_spread = image.std()  # Spread of intensities around their mean
    return pixel_spread

### Noise

In [None]:
def calculate_noise(image):
    """
    Estimate image noise from the residual left after a light Gaussian blur.

    Parameters:
    image (numpy array): Input image as a numpy array. A 3-dimensional array is
                        treated as BGR and converted to grayscale first.

    Returns:
    float: Variance of the absolute difference between the (grayscale) image
            and its 3x3 Gaussian-blurred version.
    """
    # Work on a single channel: 3-D input is converted, anything else is used as-is
    has_channel_axis = len(image.shape) == 3
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) if has_channel_axis else image

    # The blur removes fine detail; what differs from the original is the residual
    smoothed = cv2.GaussianBlur(gray, (3, 3), 0)
    residual = cv2.absdiff(gray, smoothed)

    return np.var(residual)  # Variance of the residual image

### Skew

In [None]:
def calculate_skew(image):
    """
    Estimate the skew angle of an image from the minimum-area rectangle that
    encloses its dark (below-threshold) pixels.

    Parameters:
    image (numpy array): Input grayscale image as a numpy array.

    Returns:
    float: The skew angle in degrees, rounded to two decimal places
            (0 when no pixel survives the threshold).

    Raises:
    ValueError: If the image is not a 2D array.
    """
    # Only single-channel (2D) input is supported
    if image.ndim != 2:
        raise ValueError("Invalid image format. Image must be a 2D grayscale image.")

    # Inverse binary threshold: pixels at or below 150 become foreground (255)
    _, mask = cv2.threshold(image, 150, 255, cv2.THRESH_BINARY_INV)

    # Stack the (row, column) indices of every foreground pixel into an N x 2 array
    row_idx, col_idx = np.where(mask > 0)
    points = np.column_stack((row_idx, col_idx))
    if points.size == 0:
        return 0  # Nothing to fit a rectangle to, so skew is zero

    # Last element of the minAreaRect result is the rectangle's rotation angle.
    # NOTE(review): the `< -45` correction presumes angles are reported in
    # [-90, 0); confirm this against the installed OpenCV version.
    raw_angle = cv2.minAreaRect(points)[-1]
    corrected = -(90 + raw_angle) if raw_angle < -45 else -raw_angle

    return round(corrected, 2)

### Line Spacing

In [None]:
def calculate_line_spacing(image):
    """
    Derive a line-spacing figure for a grayscale image from the bounding-box
    heights of its external contours.

    Parameters:
    image (numpy array): Input grayscale image as a numpy array.

    Returns:
    float: Mean of the differences between consecutive bounding-box heights once
            the heights are sorted; 0 when fewer than two contours are found.

    Raises:
    ValueError: If the image is not a 2D array.
    """
    # Only single-channel (2D) input is supported
    if image.ndim != 2:
        raise ValueError("Invalid image format. Image must be a 2D grayscale image.")

    # Inverse binary threshold, then outer contours of the foreground regions
    _, mask = cv2.threshold(image, 150, 255, cv2.THRESH_BINARY_INV)
    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # Index 3 of a bounding rect (x, y, w, h) is its height
    ordered_heights = sorted(cv2.boundingRect(outline)[3] for outline in contours)
    if len(ordered_heights) <= 1:
        return 0  # Not enough boxes to take a difference

    return np.mean(np.diff(ordered_heights))  # Average gap between sorted heights

### Tables Detected

In [None]:
def detect_tables(image, min_area=1000):
    """
    Detect potential tables in a grayscale image by analyzing large contours.

    The image is binarized with an adaptive Gaussian threshold (block size 11,
    constant 2, inverted), and every external contour whose area exceeds
    `min_area` is counted as a table candidate.

    Parameters:
    image (numpy array): Input grayscale image as a numpy array.
    min_area (float): Contour area (in pixels) a contour must exceed to be
                      counted. Default is 1000.

    Returns:
    tuple: A tuple containing:
        - int: Number of detected tables.
        - list: List of contours that represent detected tables.

    Raises:
    ValueError: If the image is not a 2D array.
    """
    if len(image.shape) != 2:
        raise ValueError("Invalid image format. Image must be a 2D grayscale image.")

    # Apply adaptive thresholding. (A global threshold used to be computed here
    # first, but its result was immediately overwritten, so it has been dropped.)
    binary = cv2.adaptiveThreshold(image, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                    cv2.THRESH_BINARY_INV, 11, 2)

    # Detect outer contours and keep only those that are large enough
    contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    table_contours = [contour for contour in contours if cv2.contourArea(contour) > min_area]

    # Return both the count and the list of table contours
    return len(table_contours), table_contours

### Resolution

In [None]:
def calculate_resolution(image):
    """
    Report the pixel count of an image (height multiplied by width).

    Parameters:
    image (numpy array): Input image as a numpy array.

    Returns:
    int: Product of the first two dimensions of the array.
    """
    leading_dims = image.shape[:2]  # Any channel axis is ignored
    n_rows, n_cols = leading_dims
    return n_rows * n_cols

### Elements Detected

In [None]:
def calculate_elements_detection(image):
    """
    Count the external contours found in a thresholded grayscale image.

    Parameters:
    image (numpy array): Input grayscale image as a numpy array.

    Returns:
    tuple: A tuple containing:
        - int: Number of detected elements (contours).
        - list: The contours themselves.

    Raises:
    ValueError: If the image is not a 2D array.
    """
    # Only single-channel (2D) input is supported
    if image.ndim != 2:
        raise ValueError("Invalid image format. Image must be a 2D grayscale image.")

    # Inverse binary threshold so dark content becomes the foreground
    _, mask = cv2.threshold(image, 150, 255, cv2.THRESH_BINARY_INV)

    # Outer contours only; nested shapes are not reported separately
    outlines, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    element_count = len(outlines)
    return element_count, outlines

### Texture

In [None]:
def calculate_texture(image):
    """
    Score texture complexity as the standard deviation of the Laplacian response.

    Parameters:
    image (numpy array): Input image as a numpy array.

    Returns:
    float: Standard deviation of the image's Laplacian (computed in 64-bit float).
    """
    # Filter and reduce in one expression
    return cv2.Laplacian(image, cv2.CV_64F).std()

### Patterns

In [None]:
def calculate_patterns(image):
    """
    Quantify pattern presence in a grayscale image as the number of Canny edge pixels.

    Parameters:
    image (numpy array): Input grayscale image as a numpy array.

    Returns:
    int: Number of pixels marked as edges by the Canny detector.

    Raises:
    ValueError: If the image is not a 2D array.
    """
    # Only single-channel (2D) input is supported
    if image.ndim != 2:
        raise ValueError("Invalid image format. Image must be a 2D grayscale image.")

    edge_map = cv2.Canny(image, 100, 200)  # Hysteresis thresholds 100 / 200
    is_edge = edge_map > 0
    return np.sum(is_edge)  # True counts as 1, so the sum is the edge-pixel count

---
## Statistical Summary

### Create Statistics Table

#### *Function*

In [None]:
def image_statistics_table(images, image_paths):
    """
    Generate a table of statistics for a list of images, with a progress bar.

    Parameters:
    images (list): List of image arrays. Entries that are not non-empty numpy
                   arrays (e.g. None) are reported and get None for every statistic.
    image_paths (list): One label per image, in the same order as `images`. Each
                        value is stored verbatim in the 'Image' column (paths or
                        bare file names both work).

    Returns:
    pd.DataFrame: One row per image with the columns 'Image', 'Brightness',
                  'Sharpness', 'Contrast', 'Noise', 'Skew', 'Line Spacing',
                  'Tables Detected', 'Resolution', 'Detected Elements',
                  'Texture' and 'Patterns'.
    """
    # Statistic columns, in the order they appear in the resulting table
    metric_columns = [
        'Brightness', 'Sharpness', 'Contrast', 'Noise', 'Skew', 'Line Spacing',
        'Tables Detected', 'Resolution', 'Detected Elements', 'Texture', 'Patterns',
    ]
    stats_data = {'Image': []}
    stats_data.update({column: [] for column in metric_columns})

    # zip() has no length, so tell tqdm how many pairs it will see; otherwise
    # the bar cannot show a total, percentage or ETA.
    pair_count = min(len(images), len(image_paths))
    progress = tqdm(zip(images, image_paths), total=pair_count,
                    desc="Processing Images", unit="image")

    for img, image_name in progress:
        # Use the label supplied by the caller in the 'Image' column
        stats_data['Image'].append(image_name)

        # Invalid or empty image: keep the row, fill every statistic with None
        if not isinstance(img, np.ndarray) or img.size == 0:
            for column in metric_columns:
                stats_data[column].append(None)
            print(f"Warning: {image_name} is invalid or empty, setting values to None.")
            continue

        # detect_tables / calculate_elements_detection also return the contours;
        # only the counts go into the table.
        table_count, _ = detect_tables(img)
        element_count, _ = calculate_elements_detection(img)

        row = {
            'Brightness': calculate_brightness(img),
            'Sharpness': calculate_sharpness(img),
            'Contrast': calculate_contrast(img),
            'Noise': calculate_noise(img),
            'Skew': calculate_skew(img),
            'Line Spacing': calculate_line_spacing(img),
            'Tables Detected': table_count,
            'Resolution': calculate_resolution(img),
            'Detected Elements': element_count,
            'Texture': calculate_texture(img),
            'Patterns': calculate_patterns(img),
        }
        for column in metric_columns:
            stats_data[column].append(row[column])

    # Create a DataFrame to store per-image statistics
    return pd.DataFrame(stats_data)

### Overall Statistical Summary

#### *Function*

In [None]:
def overall_statistical_summary(df):
    """
    Produce descriptive statistics for the image-characteristic columns of a DataFrame.

    Parameters:
    df (pd.DataFrame): DataFrame containing image statistics. It must provide the
                    columns 'Brightness', 'Sharpness', 'Contrast', 'Noise', 'Skew',
                    'Line Spacing', 'Tables Detected', 'Resolution',
                    'Detected Elements', 'Texture', and 'Patterns'.

    Returns:
    pd.DataFrame: The result of `describe()` restricted to those columns.
    """
    # Characteristics included in the summary, in output column order
    feature_names = (
        'Brightness', 'Sharpness', 'Contrast', 'Noise', 'Skew',
        'Line Spacing', 'Tables Detected', 'Resolution',
        'Detected Elements', 'Texture', 'Patterns',
    )

    selected = df.loc[:, list(feature_names)]
    return selected.describe()


### Implementation

#### *Create Data Table*

In [None]:
# Generate a statistics table; the image paths are used as the row labels
# stored in the 'Image' column.
# NOTE(review): `images` comes from load_and_preprocess_images, which resizes
# everything to 256x256 by default, so 'Resolution' is the same value for every
# valid image here — confirm whether the original sizes were intended instead.
df_stats = image_statistics_table(images, image_paths)

# Display the full statistics table for all images
# (the bare expression on the last line is rendered as the cell output)
print("Image Data Table:")
df_stats

#### *Overall Statistics Table*

In [None]:
# Calculate the overall statistical summary (describe() over the
# characteristic columns of df_stats)
summary_df = overall_statistical_summary(df_stats)

# The bare expression on the last line is rendered as the cell output
print("\nOverall Statistical Summary:")
summary_df

---
## Visualizations

### Characteristics Units

In [None]:
# Define characteristics and their units in a single dictionary.
# Keys must match the column names of the statistics table; values are only
# used as axis/label text in the plots below.
characteristics_units = {
    'Brightness': 'Pixel Value (0-255)',
    'Contrast': 'Pixel Value (0-255) Std Dev',
    'Sharpness': 'Pixel Intensity Squared Variability',
    # calculate_noise returns a variance (np.var), not a standard deviation
    'Noise': 'Pixel Value (0-255) Variance',
    'Skew': 'Degrees (°)',
    'Line Spacing': 'Pixel Count Avg',
    'Tables Detected': 'Count',
    'Resolution': 'Pixel Count',
    'Detected Elements': 'Count',
    'Texture': 'Pixel Intensity Variability',
    'Patterns': 'Count'
}

### General Characteristics Distribution

#### *Function*

In [None]:
def plot_characteristic_distribution(df, characteristic, unit, plot_type='hist'):
    """
    Visualize the distribution of a specific characteristic in a DataFrame using
    a histogram (with KDE overlay) or a box plot.

    Parameters:
    df (pd.DataFrame): DataFrame containing the data.
    characteristic (str): The column name of the characteristic to plot.
    unit (str): The unit of the characteristic, for labeling purposes.
    plot_type (str): Type of plot to generate ('hist' for histogram, 'box' for box plot).
                    Default is 'hist'.

    Returns:
    None

    Raises:
    ValueError: If `plot_type` is neither 'hist' nor 'box'. The check runs before
                any figure is created, so no empty figure is left behind.
    """
    # Validate first so a bad argument does not leak an empty figure
    if plot_type not in ('hist', 'box'):
        raise ValueError("Invalid plot_type. Choose 'hist' for histogram or 'box' for box plot.")

    # Set up figure size for consistent visualization
    plt.figure(figsize=(10, 6))

    if plot_type == 'hist':
        sns.histplot(df[characteristic], kde=True, bins=30, color='blue')
        y_label = "Frequency"
    else:
        sns.boxplot(x=df[characteristic], color='green')
        # A horizontal box plot has no frequency axis, so leave the y label empty
        y_label = ""

    # Customize the plot with titles and labels
    plt.title(f"Distribution of {characteristic}")
    plt.xlabel(f"{characteristic} ({unit})")
    plt.ylabel(y_label)
    plt.grid(True)  # Add grid for better readability
    plt.show()  # Display the plot

#### *Implementation*

In [None]:
# Temporarily suppress FutureWarnings; the filter is restored automatically
# when the `with` block exits
with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=FutureWarning)
    
    # Loop through each characteristic and plot type
    # (two figures per characteristic: a histogram, then a box plot)
    for characteristic, unit in characteristics_units.items():
        for plot_type in ['hist', 'box']:
            # Plot the distribution for each characteristic using the specified plot type
            plot_characteristic_distribution(df_stats, characteristic, unit, plot_type)

### General Characteristics Relationship

#### *Function*

In [None]:
def plot_characteristic_relationship(df, characteristic_1, characteristic_2, unit_1, unit_2, use_color=False):
    """
    Draw a scatter plot of one characteristic against another.

    Parameters:
    df (pd.DataFrame): DataFrame containing the data.
    characteristic_1 (str): The column name of the first characteristic (x-axis).
    characteristic_2 (str): The column name of the second characteristic (y-axis).
    unit_1 (str): The unit of the first characteristic, for x-axis labeling.
    unit_2 (str): The unit of the second characteristic, for y-axis labeling.
    use_color (bool): If True, points are additionally colored by the value of
                    `characteristic_2` using the 'coolwarm' palette.
                    Default is False.

    Returns:
    None
    """
    x_values = df[characteristic_1]
    y_values = df[characteristic_2]

    # Arguments shared by both variants of the scatter plot
    scatter_options = {'x': x_values, 'y': y_values, 's': 50}
    if use_color:
        # Color encoding is driven by the y-axis characteristic
        scatter_options['hue'] = y_values
        scatter_options['palette'] = 'coolwarm'

    # Fixed figure size keeps all relationship plots comparable
    plt.figure(figsize=(10, 6))
    sns.scatterplot(**scatter_options)

    # Title, axis labels (name plus unit) and grid
    plt.title(f"Relationship between {characteristic_1} and {characteristic_2}")
    plt.xlabel(f"{characteristic_1} ({unit_1})")
    plt.ylabel(f"{characteristic_2} ({unit_2})")
    plt.grid(True)
    plt.show()

#### *Implementation*

In [None]:
# Generate all possible unordered pairs of characteristics for plotting
# relationships (each pair appears once; no characteristic is paired with itself)
pairs = itertools.combinations(characteristics_units.keys(), 2)

# Loop through each pair of characteristics and create relationship plots
# (one figure per pair, using the default use_color=False)
for char1, char2 in pairs:
    plot_characteristic_relationship(
        df_stats,
        char1,
        char2,
        characteristics_units[char1],  # Retrieve unit for the first characteristic
        characteristics_units[char2]   # Retrieve unit for the second characteristic
    )

### Correlation Heatmap

#### *Function*

In [None]:
def plot_correlation_heatmap(df, characteristics, characteristic_units):
    """
    Create a heatmap to visualize correlations between different characteristics.

    Each cell is annotated with the correlation rounded to two decimals, followed
    by the unit of the cell's column characteristic. Tick labels show each
    characteristic together with its unit.

    Parameters:
    df (pd.DataFrame): DataFrame containing the data.
    characteristics (list of str): List of column names representing characteristics
                                   to include in the heatmap.
    characteristic_units (list of str or dict): Units corresponding to each
                                   characteristic, either as a list in the same order
                                   as `characteristics` or as a dict keyed by
                                   characteristic name.

    Returns:
    None
    """
    characteristics = list(characteristics)

    # Accept a name -> unit mapping as well as a positional list of units
    if isinstance(characteristic_units, dict):
        units = [characteristic_units[name] for name in characteristics]
    else:
        units = list(characteristic_units)

    # Set figure size for visibility
    plt.figure(figsize=(10, 8))

    # Calculate the correlation matrix for the specified characteristics
    correlation_matrix = df[characteristics].corr()

    # Build the annotation text: rounded value plus the unit of the column
    annot = correlation_matrix.round(2).astype(str)
    for column_index in range(len(characteristics)):
        unit_suffix = f"\n({units[column_index]})"
        annot.iloc[:, column_index] = annot.iloc[:, column_index] + unit_suffix

    # Generate the heatmap with annotations and a color map for visual clarity
    sns.heatmap(correlation_matrix, annot=annot, cmap='coolwarm', linewidths=0.5, fmt='',
                cbar_kws={'shrink': 0.8}, annot_kws={"size": 10})

    # Add plot titles and labels
    plt.title("Correlation Heatmap Between Characteristics", fontsize=16)
    plt.xlabel("Characteristics (Unit)", fontsize=14)
    plt.ylabel("Characteristics (Unit)", fontsize=14)

    # Heatmap cell i spans [i, i + 1], so its centre is at i + 0.5. Putting the
    # ticks at whole numbers would attach every label to a cell border.
    tick_positions = [index + 0.5 for index in range(len(characteristics))]
    tick_labels = [f"{char} ({unit})" for char, unit in zip(characteristics, units)]
    plt.xticks(ticks=tick_positions, labels=tick_labels, rotation=45, fontsize=12)
    plt.yticks(ticks=tick_positions, labels=tick_labels, rotation=0, fontsize=12)

    # Adjust layout to prevent clipping of tick labels
    plt.tight_layout()
    plt.show()

#### *Implementation*

In [None]:
# Filter only numeric columns for correlation matrix and statistical tests
# (`numeric_df` is reused by the statistical-tests cell further down)
numeric_df = df_stats.select_dtypes(include=[np.number])

# Calculate correlation matrix and draw it inline with seaborn
# (plot_correlation_heatmap defined above is not used here; no units are shown)
correlation_matrix = numeric_df.corr()
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
plt.title('Correlation Heatmap')
plt.show()

### Statistical Tests

#### *Function*

In [None]:
def overall_statistical_tests(df):
    """
    Perform statistical tests on characteristics within a DataFrame: normality
    tests on 'Brightness', 'Sharpness' and 'Contrast', and Pearson correlation
    tests between every pair of the eleven characteristics.

    Missing values are dropped before each test (per column for normality,
    per pair of columns for correlation), so rows for invalid images do not turn
    the results into NaN. A test that cannot be computed is recorded with a
    '(Skipped)' label and None for statistic and p-value:
      - normality: fewer than 8 non-missing values;
      - correlation: either characteristic is constant (or empty) after dropping
        missing values.

    Parameters:
    df (pd.DataFrame): DataFrame containing characteristics for statistical testing.

    Returns:
    pd.DataFrame: One row per test with the columns 'Test', 'Characteristic_1',
                  'Characteristic_2', 'Statistic' and 'P-Value'.
    """
    # The normality test is built on a skewness test that needs at least 8 samples
    min_normality_samples = 8

    # Initialize a dictionary to store the test results
    test_results = {
        'Test': [],
        'Characteristic_1': [],
        'Characteristic_2': [],
        'Statistic': [],
        'P-Value': []
    }

    def record(test_name, char1, char2, statistic, p_value):
        """Append one result row to `test_results`."""
        test_results['Test'].append(test_name)
        test_results['Characteristic_1'].append(char1)
        test_results['Characteristic_2'].append(char2)
        test_results['Statistic'].append(statistic)
        test_results['P-Value'].append(p_value)

    # Perform normality tests for selected characteristics
    for col in ['Brightness', 'Sharpness', 'Contrast']:
        values = df[col].dropna().to_numpy(dtype=float)
        if len(values) < min_normality_samples:
            # No second characteristic for univariate tests
            record('Normality (Skipped)', col, None, None, None)
            continue
        stat, p_value = stats.normaltest(values)
        record('Normality', col, None, stat, p_value)

    # Perform Pearson correlation tests between each pair of characteristics
    pairs = itertools.combinations(
        ['Brightness', 'Sharpness', 'Contrast', 'Noise', 'Skew', 'Line Spacing',
            'Tables Detected', 'Resolution', 'Detected Elements', 'Texture', 'Patterns'], 2)

    for char1, char2 in pairs:
        # Keep only rows where both characteristics are present
        complete = df[[char1, char2]].dropna()
        first, second = complete[char1], complete[char2]

        # Correlation is undefined when either side is constant (or empty)
        if first.nunique() <= 1 or second.nunique() <= 1:
            record('Pearson Correlation (Skipped)', char1, char2, None, None)
            continue

        corr_stat, corr_p_val = stats.pearsonr(first, second)
        record('Pearson Correlation', char1, char2, corr_stat, corr_p_val)

    # Convert the results dictionary into a DataFrame
    return pd.DataFrame(test_results)

#### *Implementation*

In [None]:
# Perform overall statistical tests on the numeric-only view of the statistics
# table (the non-numeric 'Image' column was dropped by select_dtypes)
test_results_table = overall_statistical_tests(numeric_df)
# The bare expression on the last line is rendered as the cell output
test_results_table

---
## Special Visualization per Characteristics

In [None]:
# Pick a random image from the list of preprocessed images (assuming `images` is a list of grayscale images).
# No seed is set in this cell, so a different image may be picked on every run.
random_image = random.choice(images)

### Brightness

In [None]:
def plot_brightness_overall(df):
    """
    Show how brightness is distributed over all images in the dataset.

    Parameters:
    df (pd.DataFrame): DataFrame with a 'Brightness' column holding one value per image.

    Returns:
    None: Displays a 30-bin histogram with a KDE overlay.
    """
    brightness_values = df['Brightness']

    plt.figure(figsize=(10, 6))
    sns.histplot(brightness_values, kde=True, bins=30, color='skyblue')

    # Labels are set after plotting so they override seaborn's defaults
    plt.xlabel("Brightness (Pixel Value 0-255)")
    plt.ylabel("Frequency")
    plt.title("Distribution of Brightness Across Dataset")
    plt.grid(True)
    plt.show()

In [None]:
def plot_brightness_single(image):
    """
    Draw the pixel-intensity histogram of one image, with its mean brightness
    reported in the title.

    Parameters:
    image (numpy array): Grayscale image to analyze brightness distribution.

    Returns:
    None: Displays the brightness histogram for the image.
    """
    # Mean brightness comes from the shared statistic function
    mean_brightness = calculate_brightness(image)

    # Flatten so every pixel contributes one sample to the histogram
    pixel_values = image.ravel()
    histogram_style = {'bins': 256, 'range': (0, 255), 'color': 'skyblue', 'alpha': 0.7}

    plt.figure(figsize=(10, 6))
    plt.hist(pixel_values, **histogram_style)
    plt.title(f"Brightness Histogram of Single Image (Mean Brightness: {mean_brightness:.2f})")
    plt.xlabel("Brightness (Pixel Value)")
    plt.ylabel("Frequency")
    plt.grid(True)
    plt.show()

In [None]:
def plot_brightness_heatmap(image):
    """
    Show a Gaussian-smoothed, min-max normalized brightness map of one image,
    with its mean brightness reported in the title.

    Parameters:
    image (numpy array): Grayscale image to visualize brightness intensity.

    Returns:
    None: Displays a blurred heatmap visualization of the image's brightness.
    """
    # Mean brightness of the unsmoothed image, for the title
    mean_brightness = calculate_brightness(image)

    # Smooth in float64, then stretch the result to the 0-255 display range
    smoothed = cv2.GaussianBlur(image.astype(np.float64), (5, 5), 0)
    brightness_heatmap = cv2.normalize(smoothed, None, 0, 255, cv2.NORM_MINMAX)

    plt.figure(figsize=(12, 12))
    plt.imshow(brightness_heatmap, cmap='gray', vmin=0, vmax=255)
    plt.colorbar(label="Brightness Intensity (Normalized 0-255)")
    plt.title(f"Brightness Heatmap of Image (Mean Brightness: {mean_brightness:.2f})")
    plt.axis('off')  # Pixel coordinates add nothing to this view
    plt.show()

In [None]:
# Dataset-wide brightness distribution
plot_brightness_overall(df_stats)

In [None]:
# Pixel-intensity histogram of the randomly chosen image
plot_brightness_single(random_image)

In [None]:
# Smoothed brightness map of the randomly chosen image
plot_brightness_heatmap(random_image)

### Contrast

In [None]:
def plot_contrast_overall(df):
    """
    Show how contrast is distributed over all images in the dataset.

    Parameters:
    df (pd.DataFrame): DataFrame with a 'Contrast' column holding one value per image.

    Returns:
    None: Displays a 30-bin histogram with a KDE overlay.
    """
    contrast_values = df['Contrast']

    plt.figure(figsize=(10, 6))
    sns.histplot(contrast_values, kde=True, bins=30, color='purple')

    # Labels are set after plotting so they override seaborn's defaults
    plt.xlabel("Contrast (Pixel Value Std Dev)")
    plt.ylabel("Frequency")
    plt.title("Distribution of Contrast Across Dataset")
    plt.grid(True)
    plt.show()

In [None]:
def plot_contrast_single(image):
    """
    Draw the pixel-intensity histogram of one image, with its contrast score
    (standard deviation of pixel values) reported in the title.

    Parameters:
    image (numpy array): Grayscale image to analyze contrast distribution.

    Returns:
    None: Displays the contrast histogram for the image.
    """
    # Contrast score comes from the shared statistic function
    mean_contrast = calculate_contrast(image)

    # Flatten so every pixel contributes one sample to the histogram
    pixel_values = image.ravel()
    histogram_style = {'bins': 256, 'range': (0, 255), 'color': 'purple', 'alpha': 0.7}

    plt.figure(figsize=(10, 6))
    plt.hist(pixel_values, **histogram_style)
    plt.title(f"Contrast Histogram of Single Image (Mean Contrast: {mean_contrast:.2f})")
    plt.xlabel("Pixel Intensity")
    plt.ylabel("Frequency")
    plt.grid(True)
    plt.show()

In [None]:
def plot_contrast_map(image):
    """
    Show one image stretched to the full 0-255 range (min-max normalization),
    with its contrast score reported in the title.

    Parameters:
    image (numpy array): Grayscale image to visualize contrast intensity.

    Returns:
    None: Displays a contrast map visualization of the image.
    """
    # Contrast score of the original (un-normalized) image, for the title
    mean_contrast = calculate_contrast(image)

    # Min-max stretch in float64 so the darkest pixel maps to 0 and the brightest to 255
    as_float = image.astype(np.float64)
    contrast_map_normalized = cv2.normalize(as_float, None, 0, 255, cv2.NORM_MINMAX)

    plt.figure(figsize=(6, 6))
    plt.imshow(contrast_map_normalized, cmap='gray', vmin=0, vmax=255)
    plt.colorbar(label="Contrast Intensity (Normalized 0-255)")
    plt.title(f"Contrast Map of Image (Mean Contrast: {mean_contrast:.2f})")
    plt.axis('off')  # Pixel coordinates add nothing to this view
    plt.show()

In [None]:
# Dataset-wide contrast distribution
plot_contrast_overall(df_stats)

In [None]:
# Pixel-intensity histogram (titled with the contrast score) of the randomly chosen image
plot_contrast_single(random_image)

In [None]:
# Min-max normalized view of the randomly chosen image
plot_contrast_map(random_image)

### Sharpness

In [None]:
def plot_sharpness_overall(df):
    """
    Show how sharpness is distributed over all images in the dataset.

    Parameters:
    df (pd.DataFrame): DataFrame with a 'Sharpness' column holding one value per image.

    Returns:
    None: Displays a 30-bin histogram with a KDE overlay.
    """
    sharpness_values = df['Sharpness']

    plt.figure(figsize=(10, 6))
    sns.histplot(sharpness_values, kde=True, bins=30, color='orange')

    # Labels are set after plotting so they override seaborn's defaults
    plt.xlabel("Sharpness (Variance of Laplacian)")
    plt.ylabel("Frequency")
    plt.title("Distribution of Sharpness Across Dataset")
    plt.grid(True)
    plt.show()

In [None]:
def plot_sharpness_single(image):
    """
    Plot a histogram of absolute Laplacian responses for a single image,
    and display the sharpness score in the title.

    Parameters:
    image (numpy array): Grayscale image to analyze sharpness distribution.

    Returns:
    None: Displays the sharpness histogram for the image.
    """
    # Score shown in the title comes from the shared metric helper
    mean_sharpness = calculate_sharpness(image)

    # The Laplacian response is signed; the histogram is built on its magnitude
    edge_magnitude = np.abs(cv2.Laplacian(image, cv2.CV_64F))

    # The upper bound must come from the magnitudes, not from the signed response:
    # if the strongest response is negative, the signed maximum is smaller than the
    # largest magnitude and plt.hist would silently drop everything above it.
    upper_bound = edge_magnitude.max()

    plt.figure(figsize=(10, 6))
    plt.hist(edge_magnitude.ravel(), bins=256, range=(0, upper_bound), color='orange', alpha=0.7)
    plt.title(f"Sharpness Histogram of Single Image (Mean Sharpness: {mean_sharpness:.2f})")
    plt.xlabel("Edge Intensity")
    plt.ylabel("Frequency")
    plt.grid(True)
    plt.show()

In [None]:
def plot_sharpness_edge_map(image):
    """
    Show the absolute Laplacian response of one image as a grayscale map,
    with the sharpness score in the title.

    Parameters:
    image (numpy array): Grayscale image whose edge response is visualized.

    Returns:
    None: Shows the normalized edge map with a colorbar.
    """
    sharpness_score = calculate_sharpness(image)

    # Signed Laplacian response -> magnitude -> min-max stretch to 0-255 for display
    laplacian = cv2.Laplacian(image, cv2.CV_64F)
    edge_display = cv2.normalize(np.abs(laplacian), None, 0, 255, cv2.NORM_MINMAX)

    fig, ax = plt.subplots(figsize=(12, 12))
    shown = ax.imshow(edge_display, cmap='gray', vmin=0, vmax=255)
    fig.colorbar(shown, ax=ax, label="Edge Intensity (Normalized 0-255)")
    ax.set_title(f"Edge Map for Sharpness Visualization (Mean Sharpness: {sharpness_score:.2f})")
    ax.axis('off')
    plt.show()

In [None]:
# Dataset-level sharpness view; df_stats must provide a 'Sharpness' column.
plot_sharpness_overall(df_stats)

In [None]:
# Edge-magnitude histogram for the sample held in random_image.
plot_sharpness_single(random_image)

In [None]:
# Same sample image, shown as a normalized Laplacian edge map.
plot_sharpness_edge_map(random_image)

### Noise

In [None]:
def plot_noise_overall(df):
    """
    Draw a histogram (with KDE overlay) of the per-image noise values.

    Parameters:
    df (pd.DataFrame): DataFrame with a 'Noise' column, one row per image.

    Returns:
    None: Shows the noise distribution for the dataset.
    """
    _, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(df['Noise'], kde=True, bins=30, color='red', ax=ax)

    # One call sets all three labels; it runs after seaborn so its defaults are replaced
    ax.set(
        title="Distribution of Noise Across Dataset",
        xlabel="Noise (Pixel Value Std Dev)",
        ylabel="Frequency",
    )
    ax.grid(True)
    plt.show()

In [None]:
def plot_noise_single(image):
    """
    Plot a histogram of the blur residual of one image, with its noise score in the title.

    Parameters:
    image (numpy array): Grayscale image whose noise distribution is analyzed.

    Returns:
    None: Shows the noise histogram for the image.
    """
    noise_score = calculate_noise(image)

    # Residual = |image - 3x3 Gaussian-smoothed image|; this is what the histogram shows
    smoothed = cv2.GaussianBlur(image, (3, 3), 0)
    residual = cv2.absdiff(image, smoothed)

    _, ax = plt.subplots(figsize=(10, 6))
    ax.hist(residual.ravel(), bins=256, range=(0, 255), color='red', alpha=0.7)
    ax.set_title(f"Noise Histogram of Single Image (Mean Noise: {noise_score:.2f})")
    ax.set_xlabel("Noise Intensity (Pixel Value)")
    ax.set_ylabel("Frequency")
    ax.grid(True)
    plt.show()

In [None]:
def plot_noise_map(image):
    """
    Show the blur residual of one image as a grayscale map, with its noise score in the title.

    Parameters:
    image (numpy array): Grayscale image whose noise intensity is visualized.

    Returns:
    None: Shows the normalized noise map with a colorbar.
    """
    noise_score = calculate_noise(image)

    # Residual between the image and its 3x3 Gaussian-smoothed copy
    residual = cv2.absdiff(image, cv2.GaussianBlur(image, (3, 3), 0))

    # Min-max stretch on a float copy so faint residuals remain visible
    residual_display = cv2.normalize(residual.astype(np.float64), None, 0, 255, cv2.NORM_MINMAX)

    fig, ax = plt.subplots(figsize=(6, 6))
    shown = ax.imshow(residual_display, cmap='gray', vmin=0, vmax=255)
    fig.colorbar(shown, ax=ax, label="Noise Intensity (Normalized 0-255)")
    ax.set_title(f"Noise Map of Image (Mean Noise: {noise_score:.2f})")
    ax.axis('off')
    plt.show()

In [None]:
# Dataset-level noise view; df_stats must provide a 'Noise' column.
plot_noise_overall(df_stats)

In [None]:
# Blur-residual histogram for the sample held in random_image.
plot_noise_single(random_image)

In [None]:
# Same sample image, shown as a normalized blur-residual map.
plot_noise_map(random_image)

### Skew

In [None]:
def plot_skew_overall(df):
    """
    Draw a histogram (with KDE overlay) of the per-image skew values.

    Parameters:
    df (pd.DataFrame): DataFrame with a 'Skew' column, one row per image.

    Returns:
    None: Shows the skew distribution for the dataset.
    """
    _, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(df['Skew'], kde=True, bins=30, color='green', ax=ax)

    # Applied after seaborn draws so these labels replace its defaults
    ax.set_title("Distribution of Skew Across Dataset")
    ax.set_xlabel("Skew (Degrees)")
    ax.set_ylabel("Frequency")
    ax.grid(True)
    plt.show()

In [None]:
def plot_skew_single(image):
    """
    Print the skew angle computed for one image.

    Parameters:
    image (numpy array): Grayscale image whose skew angle is computed.

    Returns:
    None: Writes the skew angle to standard output.
    """
    # The angle is printed unrounded, exactly as the metric helper returns it
    message = f"Skew Angle of the Image: {calculate_skew(image)} degrees"
    print(message)

In [None]:
def plot_skew_visualization(image, skew_angle):
    """
    Show an image with a dashed guide line whose slope corresponds to the given skew angle.

    Parameters:
    image (numpy array): Grayscale image to display.
    skew_angle (float): Skew angle in degrees; used for the title and the guide-line slope.

    Returns:
    None: Shows the image with the guide line overlaid.
    """
    _, ax = plt.subplots(figsize=(6, 6))
    ax.imshow(image, cmap='gray')
    ax.set_title(f"Skew Visualization (Angle: {skew_angle:.2f}°)")

    # Guide line passes through data point (0, 0); slope is tan of the angle in radians
    guide_slope = np.tan(np.deg2rad(skew_angle))
    ax.axline((0, 0), slope=guide_slope, color='red', linestyle='--', linewidth=1)

    ax.axis('off')
    plt.show()

In [None]:
# Dataset-level skew view; df_stats must provide a 'Skew' column.
plot_skew_overall(df_stats)

In [None]:
# Print the skew angle for the sample held in random_image.
plot_skew_single(random_image)

In [None]:
# Compute the angle once here because plot_skew_visualization takes it as an argument
# rather than deriving it from the image itself.
skew_angle = calculate_skew(random_image)
plot_skew_visualization(random_image, skew_angle)

### Line Spacing

In [None]:
def plot_line_spacing_overall(df):
    """
    Draw a histogram (with KDE overlay) of the per-image line spacing values.

    Parameters:
    df (pd.DataFrame): DataFrame with a 'Line Spacing' column, one row per image.

    Returns:
    None: Shows the line spacing distribution for the dataset.
    """
    _, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(df['Line Spacing'], kde=True, bins=30, color='blue', ax=ax)

    # One call sets all three labels; it runs after seaborn so its defaults are replaced
    ax.set(
        title="Distribution of Line Spacing Across Dataset",
        xlabel="Line Spacing (Average Pixel Count)",
        ylabel="Frequency",
    )
    ax.grid(True)
    plt.show()

In [None]:
def plot_line_spacing_single(image):
    """
    Print the average line spacing computed for one image.

    Parameters:
    image (numpy array): Grayscale image whose line spacing is computed.

    Returns:
    None: Writes the average line spacing (two decimals) to standard output.
    """
    spacing = calculate_line_spacing(image)
    message = f"Average Line Spacing of the Image: {spacing:.2f} pixels"
    print(message)

In [None]:
def plot_line_spacing_visualization(image):
    """
    Overlay bounding boxes of detected contours on a grayscale image, plus dashed
    horizontal lines midway between consecutive box tops.

    Parameters:
    image (numpy array): Grayscale image to annotate.

    Returns:
    None: Shows the image with boxes and midpoint lines drawn on it.
    """
    # Inverse binary threshold at 150: pixels at or below 150 become foreground (255)
    _, binary = cv2.threshold(image, 150, 255, cv2.THRESH_BINARY_INV)

    # Outer contours only; nested contours are not reported
    contours, _hierarchy = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    _, ax = plt.subplots(figsize=(12, 12))
    ax.imshow(image, cmap='gray')

    # One (x, y, w, h) box per contour, outlined in green
    boxes = [cv2.boundingRect(contour) for contour in contours]
    for left, top, box_w, box_h in boxes:
        outline = plt.Rectangle((left, top), box_w, box_h, linewidth=1, edgecolor='green', facecolor='none')
        ax.add_patch(outline)

    # Dashed red line halfway between each pair of neighbouring box tops (sorted top to bottom)
    tops = sorted(top for _, top, _, _ in boxes)
    for upper, lower in zip(tops, tops[1:]):
        ax.axhline(y=(upper + lower) / 2, color='red', linestyle='--', linewidth=0.5)

    ax.set_title("Line Spacing Visualization in Image")
    ax.axis('off')
    plt.show()

In [None]:
# Dataset-level line spacing view; df_stats must provide a 'Line Spacing' column.
plot_line_spacing_overall(df_stats)

In [None]:
# Print the average line spacing for the sample held in random_image.
plot_line_spacing_single(random_image)

In [None]:
# Same sample image, annotated with contour boxes and midpoint lines.
plot_line_spacing_visualization(random_image)

### Tables Detected

In [None]:
def plot_tables_detected_overall(df):
    """
    Draw a bar chart of how many images have each number of detected tables.

    Parameters:
    df (pd.DataFrame): DataFrame with a 'Tables Detected' column, one row per image.

    Returns:
    None: Shows the tables-detected bar chart for the dataset.
    """
    # Images per distinct table count, ordered by the table count itself
    images_per_count = df['Tables Detected'].value_counts().sort_index()

    _, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(x=images_per_count.index, y=images_per_count.values, color='purple', ax=ax)
    ax.set_title("Distribution of Tables Detected Across Dataset")
    ax.set_xlabel("Number of Tables Detected")
    ax.set_ylabel("Frequency")
    # Horizontal grid lines only; vertical ones add nothing on a categorical axis
    ax.grid(True, axis='y')
    plt.show()

In [None]:
def plot_tables_detected_single(image):
    """
    Print the number of tables detected in one image.

    Parameters:
    image (numpy array): Grayscale image passed to the table detector.

    Returns:
    None: Writes the detected table count to standard output.
    """
    # detect_tables returns a (count, contours) pair; only the count is needed here
    n_tables, _contours = detect_tables(image)
    message = f"Number of Tables Detected in the Image: {n_tables}"
    print(message)

In [None]:
def plot_tables_detected_visualization(image):
    """
    Overlay a blue bounding box for every table contour detected in a grayscale image.

    Parameters:
    image (numpy array): Grayscale image passed to the table detector and displayed.

    Returns:
    None: Shows the image with the table bounding boxes drawn on it.
    """
    # detect_tables returns a (count, contours) pair; only the contours are drawn
    _count, table_contours = detect_tables(image)

    _, ax = plt.subplots(figsize=(6, 8))
    ax.imshow(image, cmap='gray')

    for contour in table_contours:
        left, top, box_w, box_h = cv2.boundingRect(contour)
        outline = plt.Rectangle((left, top), box_w, box_h, linewidth=1, edgecolor='blue', facecolor='none')
        ax.add_patch(outline)

    ax.set_title("Detected Tables Visualization in Image")
    ax.axis('off')
    plt.show()

In [None]:
# Dataset-level table-count view; df_stats must provide a 'Tables Detected' column.
plot_tables_detected_overall(df_stats)

In [None]:
# Print the detected table count for the sample held in random_image.
plot_tables_detected_single(random_image)

In [None]:
# Same sample image, annotated with the detected table boxes.
plot_tables_detected_visualization(random_image)

### Resolution

In [None]:
def plot_resolution_overall(df):
    """
    Draw a histogram (with KDE overlay) of the per-image resolution values.

    Parameters:
    df (pd.DataFrame): DataFrame with a 'Resolution' column, one row per image.

    Returns:
    None: Shows the resolution distribution for the dataset.
    """
    _, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(df['Resolution'], kde=True, bins=30, color='brown', ax=ax)

    # Applied after seaborn draws so these labels replace its defaults
    ax.set_title("Distribution of Resolution Across Dataset")
    ax.set_xlabel("Resolution (Total Pixel Count)")
    ax.set_ylabel("Frequency")
    ax.grid(True)
    plt.show()

In [None]:
def plot_resolution_single(image):
    """
    Print the resolution computed for one image.

    Parameters:
    image (numpy array): Image passed to the resolution helper.

    Returns:
    None: Writes the resolution value to standard output.
    """
    message = f"Resolution of the Image: {calculate_resolution(image)} pixels"
    print(message)

In [None]:
def plot_resolution_visualization(image):
    """
    Show an image with its pixel dimensions in the title and the pixel ranges on the axes.

    Parameters:
    image (numpy array): Image to display; its first two shape entries are read as (height, width).

    Returns:
    None: Shows the image with visible, labelled axes.
    """
    n_rows, n_cols = image.shape[:2]
    pixel_total = calculate_resolution(image)

    _, ax = plt.subplots(figsize=(6, 8))
    ax.imshow(image, cmap='gray')
    ax.set_title(f"Image Resolution: {n_cols} x {n_rows} (Total: {pixel_total} pixels)")
    ax.set_xlabel(f"Width (pixels, 0 to {n_cols - 1})")
    ax.set_ylabel(f"Height (pixels, 0 to {n_rows - 1})")
    # Axes are kept visible on purpose: the tick values are the pixel coordinates
    ax.axis('on')
    plt.show()

In [None]:
# Dataset-level resolution view; df_stats must provide a 'Resolution' column.
plot_resolution_overall(df_stats)

In [None]:
# Print the resolution for the sample held in random_image.
plot_resolution_single(random_image)

In [None]:
# Same sample image, shown with pixel-coordinate axes.
plot_resolution_visualization(random_image)

### Detected Elements

In [None]:
def plot_detected_elements_overall(df):
    """
    Draw a histogram (no KDE overlay) of the per-image detected element counts.

    Parameters:
    df (pd.DataFrame): DataFrame with a 'Detected Elements' column, one row per image.

    Returns:
    None: Shows the detected elements distribution for the dataset.
    """
    _, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(df['Detected Elements'], kde=False, bins=20, color='teal', ax=ax)

    # One call sets all three labels; it runs after seaborn so its defaults are replaced
    ax.set(
        title="Distribution of Detected Elements Across Dataset",
        xlabel="Number of Detected Elements",
        ylabel="Frequency",
    )
    ax.grid(True)
    plt.show()

In [None]:
def plot_detected_elements_single(image):
    """
    Print the number of elements detected in one image.

    Parameters:
    image (numpy array): Grayscale image passed to the element detector.

    Returns:
    None: Writes the detected element count to standard output.
    """
    message = f"Number of Detected Elements in the Image: {calculate_elements_detection(image)}"
    print(message)

In [None]:
def plot_detected_elements_visualization(image):
    """
    Overlay a green bounding box for every outer contour found in a thresholded grayscale image.

    Parameters:
    image (numpy array): Grayscale image to threshold, analyze and display.

    Returns:
    None: Shows the image with the element bounding boxes drawn on it.
    """
    # Inverse binary threshold at 150: pixels at or below 150 become foreground (255)
    _, binary = cv2.threshold(image, 150, 255, cv2.THRESH_BINARY_INV)

    # Outer contours only; nested contours are not reported
    contours, _hierarchy = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    _, ax = plt.subplots(figsize=(12, 12))
    ax.imshow(image, cmap='gray')

    for contour in contours:
        left, top, box_w, box_h = cv2.boundingRect(contour)
        outline = plt.Rectangle((left, top), box_w, box_h, linewidth=1, edgecolor='green', facecolor='none')
        ax.add_patch(outline)

    ax.set_title("Detected Elements Visualization in Image")
    ax.axis('off')
    plt.show()

In [None]:
# Dataset-level element-count view; df_stats must provide a 'Detected Elements' column.
plot_detected_elements_overall(df_stats)

In [None]:
# Print the detected element count for the sample held in random_image.
plot_detected_elements_single(random_image)

In [None]:
# Same sample image, annotated with the contour bounding boxes.
plot_detected_elements_visualization(random_image)

### Texture

In [None]:
def plot_texture_overall(df):
    """
    Draw a histogram (with KDE overlay) of the per-image texture values.

    Parameters:
    df (pd.DataFrame): DataFrame with a 'Texture' column, one row per image.

    Returns:
    None: Shows the texture distribution for the dataset.
    """
    _, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(df['Texture'], kde=True, bins=30, color='orange', ax=ax)

    # Applied after seaborn draws so these labels replace its defaults
    ax.set_title("Distribution of Texture Across Dataset")
    ax.set_xlabel("Texture (Pixel Intensity Variability)")
    ax.set_ylabel("Frequency")
    ax.grid(True)
    plt.show()

In [None]:
def plot_texture_single(image):
    """
    Print the texture value computed for one image.

    Parameters:
    image (numpy array): Grayscale image passed to the texture helper.

    Returns:
    None: Writes the texture value (two decimals) to standard output.
    """
    texture_score = calculate_texture(image)
    message = f"Texture Value of the Image: {texture_score:.2f}"
    print(message)

In [None]:
def plot_texture_map(image):
    """
    Show the absolute Laplacian response of one image as a grayscale texture map.

    Parameters:
    image (numpy array): Grayscale image whose texture intensity is visualized.

    Returns:
    None: Shows the normalized texture map with a colorbar.
    """
    # Signed Laplacian response -> magnitude -> min-max stretch to 0-255 for display
    laplacian = cv2.Laplacian(image, cv2.CV_64F)
    texture_display = cv2.normalize(np.abs(laplacian), None, 0, 255, cv2.NORM_MINMAX)

    fig, ax = plt.subplots(figsize=(6, 6))
    shown = ax.imshow(texture_display, cmap='gray', vmin=0, vmax=255)
    fig.colorbar(shown, ax=ax, label="Texture Intensity (Normalized 0-255)")
    ax.set_title("Texture Map of Image")
    ax.axis('off')
    plt.show()

In [None]:
# Dataset-level texture view; df_stats must provide a 'Texture' column.
plot_texture_overall(df_stats)

In [None]:
# Print the texture value for the sample held in random_image.
plot_texture_single(random_image)

In [None]:
# Same sample image, shown as a normalized Laplacian texture map.
plot_texture_map(random_image)

### Patterns

In [None]:
def plot_patterns_overall(df):
    """
    Draw a histogram (no KDE overlay) of the per-image pattern counts.

    Parameters:
    df (pd.DataFrame): DataFrame with a 'Patterns' column, one row per image.

    Returns:
    None: Shows the patterns-detected distribution for the dataset.
    """
    _, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(df['Patterns'], kde=False, bins=20, color='darkblue', ax=ax)

    # One call sets all three labels; it runs after seaborn so its defaults are replaced
    ax.set(
        title="Distribution of Patterns Detected Across Dataset",
        xlabel="Number of Patterns Detected",
        ylabel="Frequency",
    )
    ax.grid(True)
    plt.show()

In [None]:
def plot_patterns_single(image):
    """
    Print the number of patterns detected in one image.

    Parameters:
    image (numpy array): Grayscale image passed to the pattern helper.

    Returns:
    None: Writes the detected pattern count to standard output.
    """
    message = f"Number of Patterns Detected in the Image: {calculate_patterns(image)}"
    print(message)

In [None]:
def plot_pattern_map(image):
    """
    Show the Canny edge map of one image as a grayscale pattern map.

    Parameters:
    image (numpy array): Grayscale image passed to the Canny edge detector.

    Returns:
    None: Shows the edge map with a colorbar.
    """
    # Canny with hysteresis thresholds 100 (low) and 200 (high)
    canny_edges = cv2.Canny(image, 100, 200)

    fig, ax = plt.subplots(figsize=(6, 6))
    shown = ax.imshow(canny_edges, cmap='gray')
    fig.colorbar(shown, ax=ax, label="Pattern Intensity")
    ax.set_title("Pattern Map of Image")
    ax.axis('off')
    plt.show()

In [None]:
# Dataset-level pattern-count view; df_stats must provide a 'Patterns' column.
plot_patterns_overall(df_stats)

In [None]:
# Print the detected pattern count for the sample held in random_image.
plot_patterns_single(random_image)

In [None]:
# Same sample image, shown as a Canny edge map.
plot_pattern_map(random_image)