## Part 1 - Data Preprocessing


In [None]:
# Import necessary libraries
import numpy as np  # For numerical operations and array handling
import cv2 as cv  # OpenCV for image processing
import random  # For shuffling data
import os  # For file and directory operations
import matplotlib.pyplot as plt  # For data visualization
import pickle  # For saving processed data to disk
%matplotlib inline

In [None]:
# Root folder of the dataset; every class lives in its own sub-folder here.
# The path is a raw string (r'...') so each backslash (\) is kept literally
# rather than being interpreted as the start of an escape sequence.
DIRECTORY = r'C:\Users\Arjun M S\Desktop\build-from-home\dataset'

# The three classes to tell apart. Each name doubles as the sub-folder name
# under DIRECTORY, and its position in this list is used as the numeric label.
CATEGORIES = [
    'mammooty',
    'mohanlal',
    'random',
]

In [None]:
# Side length, in pixels, of the square images fed to the model
IMG_SIZE = 224

# Collects one [image_array, label] pair per successfully loaded image
data = []

# Process and label all images from the dataset.
# A category's position in CATEGORIES is its numeric label
# (0: mammooty, 1: mohanlal, 2: random).
for label, category in enumerate(CATEGORIES):
    # Build path to category folder
    folder = os.path.join(DIRECTORY, category)

    # Iterate through every entry in the category folder
    for img in os.listdir(folder):
        # Build full path to image file
        img_path = os.path.join(folder, img)

        # Read image in grayscale mode
        img_arr = cv.imread(img_path, cv.IMREAD_GRAYSCALE)

        # cv.imread does not raise on failure: it returns None when the entry
        # cannot be decoded as an image (corrupt file, sub-folder, Thumbs.db,
        # ...). Passing None to cv.resize would abort the whole load, so
        # report the entry and move on instead.
        if img_arr is None:
            print('Skipping unreadable file:', img_path)
            continue

        # Resize image to the standard size (IMG_SIZE x IMG_SIZE)
        img_arr = cv.resize(img_arr, (IMG_SIZE, IMG_SIZE))

        # Append [image, label] pair to data list
        data.append([img_arr, label])

In [None]:
# Shuffle the [image, label] pairs in place to randomize their order
# This ensures the model doesn't learn based on the order of data
# (the loading loop appends the images one category at a time)
# NOTE(review): no random.seed() call is visible in this section, so the
# order presumably differs between runs - confirm if reproducibility matters
random.shuffle(data)

In [None]:
# Split the shuffled [image, label] pairs into two parallel lists:
#   X - the image arrays (features)
#   y - the numeric label that belongs to each image
# Both lists follow the order of `data`, so X[i] and y[i] stay paired.
X = [image for image, _ in data]
y = [target for _, target in data]

In [None]:
# Turn both Python lists into NumPy arrays for efficient computation:
# X becomes the array of all image data, y the array of matching labels
X, y = np.array(X), np.array(y)

In [None]:
# Display the features array (a bare expression on the last line of a
# notebook cell is echoed as that cell's output)
# NOTE(review): expected shape is (num_images, 224, 224) given the grayscale
# read and the resize to IMG_SIZE in the loading cell - confirm
X

In [None]:
# Display the labels array (a bare expression on the last line of a
# notebook cell is echoed as that cell's output)
# Each entry is the CATEGORIES index assigned to the image at the same
# position in X
y

In [None]:
# Save the preprocessed data to pickle files for later use
# Pickle files preserve the NumPy array format and can be loaded quickly
#
# Each file is opened in a `with` block so it is flushed and closed as soon
# as the dump finishes. The return value of pickle.dump (always None) is
# deliberately not assigned back: doing so would replace X and y with None
# and make the arrays unusable for the rest of the session.
with open('X.pkl', 'wb') as features_file:
    pickle.dump(X, features_file)  # Save features

with open('y.pkl', 'wb') as labels_file:
    pickle.dump(y, labels_file)  # Save labels