Data Understanding (Naomi's Version)

In [1]:
#imports 
import os
import warnings

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

Global Variables 

In [2]:
# Root data directory. Set the CPS_DATA_DIR environment variable to run this notebook
# on another machine without editing the code; the default is the original location.
BASE_DIR = os.environ.get('CPS_DATA_DIR', '/Users/naominorris/cps')

# Define the path to the folder containing the images to be processed
folder_path = os.path.join(BASE_DIR, 'original-photos')

# Define the path to the folder where the processed images will be saved
output_folder = os.path.join(BASE_DIR, 'modified-photos')

Load Images

In [3]:
def load_images_from_folder(folder_path, extensions=('.png', '.jpg', '.jpeg', '.JPG')):
    """
    Collect the paths of all entries directly inside a folder whose names end
    with one of the given image extensions.

    Extension matching is case-insensitive, so the default tuple also picks up
    names such as ``photo.PNG`` or ``photo.Jpeg``. Sub-folders are not searched.

    Parameters:
    folder_path (str): The path to the folder containing the images.
    extensions (str or iterable of str): File extension(s) to filter the images by,
                               each including the leading dot.
                               Default is ('.png', '.jpg', '.jpeg', '.JPG').

    Returns:
    list: Full paths of the matching entries, sorted by name so the result does
          not depend on the order in which the operating system lists the folder.

    Raises:
    FileNotFoundError: If the specified folder does not exist.
    NotADirectoryError: If the specified path exists but is not a folder.
    """

    # Check if the folder exists
    if not os.path.exists(folder_path):
        raise FileNotFoundError(f"The specified folder does not exist: {folder_path}")
    if not os.path.isdir(folder_path):
        raise NotADirectoryError(f"The specified path is not a folder: {folder_path}")

    # str.endswith only accepts a str or a tuple, so normalise whatever the caller
    # passed (a single string, a list, a set, ...) into a lower-cased tuple.
    if isinstance(extensions, str):
        extensions = (extensions,)
    normalized_extensions = tuple(ext.lower() for ext in extensions)

    # os.listdir returns names in arbitrary order; sort them for reproducible output
    image_paths = [
        os.path.join(folder_path, f)
        for f in sorted(os.listdir(folder_path))
        if f.lower().endswith(normalized_extensions)
    ]

    return image_paths

In [4]:
def analyze_images(folder_path, extensions=('.png', '.jpg', '.jpeg', '.tiff')):
    """
    Build a table of basic metrics for every image directly inside a folder.

    Each file whose name ends with one of ``extensions`` (case-insensitive) is
    read with OpenCV and summarised in one row. Files that OpenCV cannot decode
    are skipped with a warning instead of aborting the whole analysis.

    Parameters:
    folder_path (str): The path to the folder containing the images.
    extensions (str or iterable of str): File extension(s) to analyze, each including
                               the leading dot. Default is ('.png', '.jpg', '.jpeg', '.tiff').

    Returns:
    pandas.DataFrame: One row per readable image, sorted by file name, with columns
        filename       - file name without the folder
        format         - lower-cased text after the last dot of the file name
        size_mb        - file size in bytes divided by 1024 * 1024
        width, height  - image dimensions in pixels
        aspect_ratio   - width / height
        avg_brightness - mean of the grayscale pixel values
        blur_score     - variance of the Laplacian of the grayscale image
    The columns are always present, even when no image was found.
    """
    columns = ['filename', 'format', 'size_mb', 'width', 'height',
               'aspect_ratio', 'avg_brightness', 'blur_score']

    # str.endswith only accepts a str or a tuple; normalise to a lower-cased tuple
    if isinstance(extensions, str):
        extensions = (extensions,)
    normalized_extensions = tuple(ext.lower() for ext in extensions)

    image_info = []

    # Sort the listing so the row order is reproducible across machines
    for filename in sorted(os.listdir(folder_path)):
        if not filename.lower().endswith(normalized_extensions):
            continue

        file_path = os.path.join(folder_path, filename)

        # cv2.imread reports failure by returning None rather than raising,
        # so check before touching img.shape
        img = cv2.imread(file_path)
        if img is None:
            warnings.warn(f"Skipping unreadable image: {file_path}")
            continue

        height, width = img.shape[:2]

        # Brightness and blur are both measured on the grayscale version
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

        image_info.append({
            'filename': filename,
            'format': filename.rsplit('.', 1)[-1].lower(),
            'size_mb': os.path.getsize(file_path) / (1024 * 1024),  # Size in MB
            'width': width,
            'height': height,
            'aspect_ratio': width / height,
            'avg_brightness': np.mean(gray),
            'blur_score': cv2.Laplacian(gray, cv2.CV_64F).var(),
        })

    # Passing columns explicitly keeps the schema stable when image_info is empty
    return pd.DataFrame(image_info, columns=columns)

In [5]:
def plot_analysis(df):
    """
    Show a histogram and a box plot for each image metric in ``df``.

    One figure is rendered per metric (average brightness first, then blur
    score); each figure stacks a histogram with a KDE overlay above a box plot.

    Parameters:
    df (pandas.DataFrame): Must contain 'avg_brightness' and 'blur_score' columns.

    Returns:
    None. The figures are displayed with plt.show().
    """
    # Set style for better-looking plots
    sns.set_style("whitegrid")

    # (DataFrame column, name used in the titles, axis label)
    metrics = (
        ('avg_brightness', 'Average Brightness', 'Brightness'),
        ('blur_score', 'Blur Score', 'Blur Score'),
    )

    for column, title_name, axis_label in metrics:
        fig, (hist_ax, box_ax) = plt.subplots(2, 1, figsize=(12, 10))

        # Top panel: distribution of the metric
        sns.histplot(data=df, x=column, kde=True, ax=hist_ax)
        hist_ax.set_title(f'Histogram of {title_name}')
        hist_ax.set_xlabel(axis_label)

        # Bottom panel: spread and outliers of the same metric
        sns.boxplot(y=column, data=df, ax=box_ax)
        box_ax.set_title(f'Box Plot of {title_name}')
        box_ax.set_ylabel(axis_label)

        plt.tight_layout()
        plt.show()

In [6]:
# Usage: analyze every image in the input folder configured in the Global Variables
# cell (folder_path is reused rather than re-declared so there is a single place to edit)
df_images = analyze_images(folder_path)

In [None]:
# Display the first few rows of the per-image metrics DataFrame as a quick sanity check
df_images.head()

In [None]:
# Count how many images share each aspect ratio
unique_aspect = df_images['aspect_ratio'].value_counts()
unique_aspect

In [None]:
# Step 1: Find the majority value (the mode) in the 'aspect_ratio' column
majority_value = df_images['aspect_ratio'].mode()[0]  # Get the most frequent (majority) value; [0] takes the first entry returned by mode()

# Step 2: Filter the DataFrame for rows where the 'aspect_ratio' column is NOT the majority value,
# i.e. the images whose aspect ratio differs from the most common one
filtered_df = df_images[df_images['aspect_ratio'] != majority_value]

# Display the filtered DataFrame
filtered_df

In [None]:
# Plot the brightness and blur-score distributions for all analyzed images
plot_analysis(df_images)

In [None]:
# Walk the input folder tree and report how many image files each directory holds.
# Only names with a known image extension are counted, so stray files
# (for example .DS_Store) do not inflate the totals.
data_dir = folder_path
image_extensions = ('.png', '.jpg', '.jpeg', '.tiff')
for dirpath, dirnames, filenames in os.walk(data_dir):
    image_count = sum(name.lower().endswith(image_extensions) for name in filenames)
    print(f'Found {image_count} images in {dirpath}')