**DATA CLEANING**

Import Packages

In [1]:
from PIL import Image
import os
import hashlib
import numpy as np
from google.colab import drive

In [3]:
# Mounting Google Drive in Colab
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


LIGHT SUBDIRECTORIES

Checking for Corrupted or Invalid Images

In [24]:
# The directory where the image is stored
image_directory = '/content/drive/My Drive/Capstone/data_skintone/light'

# Creating a list to save corrupted image paths
invalid_images = []

# Loop to check images one by one
for filename in os.listdir(image_directory):
    if filename.endswith('.jpg') or filename.endswith('.jpeg') or filename.endswith('.png'):
        image_path = os.path.join(image_directory, filename)
        try:
            img = Image.open(image_path)
            img.verify()  # Verifying images
        except (IOError, SyntaxError) as e:
            print(f"Corrupted or invalid images: {image_path}")
            invalid_images.append(image_path)

# Remove corrupted images from directories
for invalid_image in invalid_images:
    os.remove(invalid_image)
    print(f"File {invalid_image} has been deleted.")

Removing Duplicate Images

In [25]:
# Functions for calculating image hashes
def calculate_image_hash(image_path):
    with Image.open(image_path) as img:
        img = img.resize((100, 100)).convert("L")  # Resize and convert to grayscale for accuracy
        hash_val = hashlib.md5(img.tobytes()).hexdigest()  # Calculating the hash of an image
    return hash_val

# Detect and remove duplicate images
hashes = {}
duplicate_images = []

for filename in os.listdir(image_directory):
    if filename.endswith('.jpg') or filename.endswith('.jpeg') or filename.endswith('.png'):
        image_path = os.path.join(image_directory, filename)
        image_hash = calculate_image_hash(image_path)

        if image_hash in hashes:
            print(f"Duplicate image found: {image_path} duplicate of {hashes[image_hash]}")
            duplicate_images.append(image_path)
        else:
            hashes[image_hash] = image_path

# Remove duplicate images from directories
for duplicate_image in duplicate_images:
    os.remove(duplicate_image)
    print(f"Duplicate files {duplicate_image} has been deleted.")

Duplicate image found: /content/drive/My Drive/Capstone/data_skintone/light/88974939.jpg duplicate of /content/drive/My Drive/Capstone/data_skintone/light/87177968.jpg
Duplicate image found: /content/drive/My Drive/Capstone/data_skintone/light/48774375.jpg duplicate of /content/drive/My Drive/Capstone/data_skintone/light/54283817.jpg
Duplicate image found: /content/drive/My Drive/Capstone/data_skintone/light/40820641.jpg duplicate of /content/drive/My Drive/Capstone/data_skintone/light/40710202.jpg
Duplicate image found: /content/drive/My Drive/Capstone/data_skintone/light/42377315.jpg duplicate of /content/drive/My Drive/Capstone/data_skintone/light/58632360.jpg
Duplicate image found: /content/drive/My Drive/Capstone/data_skintone/light/42377341.jpg duplicate of /content/drive/My Drive/Capstone/data_skintone/light/58632343.jpg
Duplicate files /content/drive/My Drive/Capstone/data_skintone/light/88974939.jpg has been deleted.
Duplicate files /content/drive/My Drive/Capstone/data_skinto

Checking Image Size and Resolution

In [26]:
# Check the size of images and make a list of images that are too small
min_size = (100, 100)  # Minimum allowable size

small_images = []

for filename in os.listdir(image_directory):
    if filename.endswith('.jpg') or filename.endswith('.jpeg') or filename.endswith('.png'):
        image_path = os.path.join(image_directory, filename)
        with Image.open(image_path) as img:
            if img.size[0] < min_size[0] or img.size[1] < min_size[1]:
                print(f"Image too small: {image_path}, size: {img.size}")
                small_images.append(image_path)

# Remove images that are too small
for small_image in small_images:
    os.remove(small_image)
    print(f"File {small_image} has been removed because it is too small in size.")

Normalizing Images

In [28]:
# Function to normalize images
def normalize_image(image_path, target_size=(224, 224)):
    img = Image.open(image_path).convert('RGB')
    img = img.resize(target_size)
    img_array = np.array(img) / 255.0  # Normalization
    return img_array

# Example normalization for a single image
sample_image = normalize_image(os.path.join(image_directory, '358609.jpg'))
print("Sample image shape:", sample_image.shape)

Sample image shape: (224, 224, 3)


MID-LIGHT SUBDIRECTIORIES

Checking for Corrupted or Invalid Images

In [29]:
# The directory where the image is stored
image_directory = '/content/drive/My Drive/Capstone/data_skintone/mid-light'

# Creating a list to save corrupted image paths
invalid_images = []

# Loop to check images one by one
for filename in os.listdir(image_directory):
    if filename.endswith('.jpg') or filename.endswith('.jpeg') or filename.endswith('.png'):
        image_path = os.path.join(image_directory, filename)
        try:
            img = Image.open(image_path)
            img.verify()  # Verifying images
        except (IOError, SyntaxError) as e:
            print(f"Corrupted or invalid images: {image_path}")
            invalid_images.append(image_path)

# Remove corrupted images from directories
for invalid_image in invalid_images:
    os.remove(invalid_image)
    print(f"File {invalid_image} has been deleted.")

Removing Duplicate Images

In [30]:
# Functions for calculating image hashes
def calculate_image_hash(image_path):
    with Image.open(image_path) as img:
        img = img.resize((100, 100)).convert("L")  # Resize and convert to grayscale for accuracy
        hash_val = hashlib.md5(img.tobytes()).hexdigest()  # Calculating the hash of an image
    return hash_val

# Detect and remove duplicate images
hashes = {}
duplicate_images = []

for filename in os.listdir(image_directory):
    if filename.endswith('.jpg') or filename.endswith('.jpeg') or filename.endswith('.png'):
        image_path = os.path.join(image_directory, filename)
        image_hash = calculate_image_hash(image_path)

        if image_hash in hashes:
            print(f"Duplicate image found: {image_path} duplicate of {hashes[image_hash]}")
            duplicate_images.append(image_path)
        else:
            hashes[image_hash] = image_path

# Remove duplicate images from directories
for duplicate_image in duplicate_images:
    os.remove(duplicate_image)
    print(f"Duplicate files {duplicate_image} has been deleted.")

Duplicate image found: /content/drive/My Drive/Capstone/data_skintone/mid-light/image_1005.jpg duplicate of /content/drive/My Drive/Capstone/data_skintone/mid-light/augmented_image_1005.jpg
Duplicate image found: /content/drive/My Drive/Capstone/data_skintone/mid-light/image_10157.jpg duplicate of /content/drive/My Drive/Capstone/data_skintone/mid-light/augmented_image_10157.jpg
Duplicate image found: /content/drive/My Drive/Capstone/data_skintone/mid-light/image_10209.jpg duplicate of /content/drive/My Drive/Capstone/data_skintone/mid-light/augmented_image_10209.jpg
Duplicate image found: /content/drive/My Drive/Capstone/data_skintone/mid-light/image_10383.jpg duplicate of /content/drive/My Drive/Capstone/data_skintone/mid-light/augmented_image_10383.jpg
Duplicate image found: /content/drive/My Drive/Capstone/data_skintone/mid-light/image_10395.jpg duplicate of /content/drive/My Drive/Capstone/data_skintone/mid-light/augmented_image_10395.jpg
Duplicate image found: /content/drive/My D

Checking Image Size and Resolution

In [31]:
# Check the size of images and make a list of images that are too small
min_size = (100, 100)  # Minimum allowable size

small_images = []

for filename in os.listdir(image_directory):
    if filename.endswith('.jpg') or filename.endswith('.jpeg') or filename.endswith('.png'):
        image_path = os.path.join(image_directory, filename)
        with Image.open(image_path) as img:
            if img.size[0] < min_size[0] or img.size[1] < min_size[1]:
                print(f"Image too small: {image_path}, size: {img.size}")
                small_images.append(image_path)

# Remove images that are too small
for small_image in small_images:
    os.remove(small_image)
    print(f"File {small_image} has been removed because it is too small in size.")

Normalizing Images

In [33]:
# Function to normalize images
def normalize_image(image_path, target_size=(224, 224)):
    img = Image.open(image_path).convert('RGB')
    img = img.resize(target_size)
    img_array = np.array(img) / 255.0  # Normalization
    return img_array

# Example normalization for a single image
sample_image = normalize_image(os.path.join(image_directory, '85407.jpg'))
print("Sample image shape:", sample_image.shape)

Sample image shape: (224, 224, 3)
