## **Image Data Preprocessing for AlexNet Paper**

This script performs the following tasks:
1. Take the Data folder named my_dataset/train which contains 50000 images under 100 classes.
2. Read the image files from disk and create a DataFrame that stores the path, class name and file name of all 50000 images.
3. Split the DataFrame and create test, train and validation dataframe.
4. Prepare images of size 256*256 as described in Section 2 ("The Dataset") of the AlexNet paper, because the source image files have variable sizes.
5. Transfer the preprocessed images into the "alexnet_dataset" under train, test and val local directory

In [21]:
# Importing Libraries
import os
import pandas as pd
from PIL import Image
from sklearn.model_selection import train_test_split

**The split data are stored separately in folders. A directory is created for each set.**

In [2]:
# Location of the raw dataset and root folder for the preprocessed output

source_folder = 'my_dataset/train'
output_folder = 'alexnet_dataset'

# One sub-directory per split, located under the output root
train_dir, test_dir, val_dir = (
    os.path.join(output_folder, split) for split in ('train', 'test', 'val')
)

# exist_ok=True makes re-running this cell harmless
for directory in (train_dir, test_dir, val_dir):
    os.makedirs(directory, exist_ok=True)

**Data Frame is created to store the Image File name, Image Class name and path**

In [11]:
# Navigating the source folder directories (one sub-directory per class) to
# gather information about every image file.
# Directory listings are sorted because os.listdir returns entries in
# arbitrary order; a stable row order keeps the later random_state-seeded
# splits reproducible across machines and file systems.

image_data = []

for class_dir in sorted(os.listdir(source_folder)):
    class_path = os.path.join(source_folder, class_dir)
    if not os.path.isdir(class_path):
        continue  # ignore stray files sitting next to the class folders
    for img_file in sorted(os.listdir(class_path)):
        # only taking .jpeg images (case-insensitive) as the source contains only jpeg files
        if img_file.lower().endswith('.jpeg'):
            image_data.append({
                'path': os.path.join(class_path, img_file),
                'className': class_dir,
                'fileName': img_file,
            })

# Converting to DataFrame; explicit columns keep the schema even if no image was found
image_df = pd.DataFrame(image_data, columns=['path', 'className', 'fileName'])
print(f"Total images found: {len(image_df)}")

Total images found: 50000


In [12]:
# Preview the first five rows to sanity-check the collected paths and class names
image_df.head(5)

Unnamed: 0,path,className,fileName
0,my_dataset/train\n01530575\n01530575_10018.JPEG,n01530575,n01530575_10018.JPEG
1,my_dataset/train\n01530575\n01530575_10021.JPEG,n01530575,n01530575_10021.JPEG
2,my_dataset/train\n01530575\n01530575_10023.JPEG,n01530575,n01530575_10023.JPEG
3,my_dataset/train\n01530575\n01530575_10024.JPEG,n01530575,n01530575_10024.JPEG
4,my_dataset/train\n01530575\n01530575_10039.JPEG,n01530575,n01530575_10039.JPEG


In [14]:
# Splitting the 50000 images into train, test and validation sets

total_images = 50000
train_size, test_size, val_size = 30000, 10000, 10000

# First pass: carve out the training set; the remaining rows are held back in temp_df
train_df, temp_df = train_test_split(
    image_df,
    stratify=image_df['className'],    # stratified sampling on the class name
    train_size=train_size,
    test_size=total_images - train_size,
    random_state=42,
)

In [15]:
# Second pass: divide the held-back rows of temp_df into the test and validation sets
test_df, val_df = train_test_split(
    temp_df,
    stratify=temp_df['className'],    # stratified sampling on the class name, within temp_df
    train_size=test_size,
    test_size=val_size,
    random_state=42,
)

In [16]:
# Report how many images ended up in each split, one line per split
print(
    f"Training set: {len(train_df)} images",
    f"Testing set: {len(test_df)} images",
    f"Validation set: {len(val_df)} images",
    sep="\n",
)

Training set: 30000 images
Testing set: 10000 images
Validation set: 10000 images


**Prepare data for model as described in section 2 (i.e The Dataset) of AlexNet paper**

As per the paper: "we first rescaled the image such that the shorter side was of length 256, and then
 cropped out the central 256×256 patch from the resulting image."
1. Keep short side to 256.
2. Crop resulting image.


In [None]:
# Image processing function
# Takes image path and output path as input; the target size defaults to 256
def process_image_alexnet(image_path, output_path, size=256):
    """
    Preprocess one image according to Section 2 of the AlexNet paper.

    1. Resize the image such that the shortest side is `size` pixels,
       keeping the aspect ratio.
    2. Center crop to `size` x `size` (if needed).

    Args:
        image_path: Path of the source image file.
        output_path: Path the processed image is saved to. This function
            does not create missing parent directories.
        size: Side length in pixels of the square output (default 256).

    Returns:
        True if the image was processed and saved, False if any step
        failed (the error is printed, not re-raised).
    """
    try:
        # The context manager releases the source file handle even when a
        # later step fails, so a long batch does not leak open files
        with Image.open(image_path) as source:
            # Converting non-RGB inputs (e.g. grayscale) to RGB
            img = source if source.mode == 'RGB' else source.convert('RGB')

            # Resizing image such that shortest side is `size` pixels.
            # Exact integer floor division is used instead of
            # int(dim * (size / other)): float rounding there can land just
            # below a whole number (49 * (256 / 49) -> 255.99999999999997)
            # and truncate the scaled side to size - 1.
            width, height = img.size
            if width < height:
                new_width = size
                new_height = height * size // width
            else:
                new_height = size
                new_width = width * size // height

            # Image.LANCZOS is used to maintain the image quality
            img = img.resize((new_width, new_height), Image.LANCZOS)

        # Center crop to size x size; skipped when the image is already square
        if new_width > size or new_height > size:
            left = (new_width - size) // 2
            top = (new_height - size) // 2
            img = img.crop((left, top, left + size, top + size))

        # Save the processed image
        img.save(output_path)
        return True
    except Exception as e:
        # Deliberately best-effort: one unreadable file must not abort a
        # whole batch, so the failure is reported through the return value
        print(f"Error processing {image_path}: {e}")
        return False

**Iterating through every row from every dataframe** to process the image individually. 


In [None]:
# Process the images of every split and write them to <output_folder>/<split>/<class>/
dataset_splits = {
    'train': train_df,
    'test': test_df,
    'val': val_df
}

for split_name, split_df in dataset_splits.items():
    print(f"Processing {split_name} set...")
    output_dir = os.path.join(output_folder, split_name)
    failed_paths = []  # source images of this split that could not be processed

    # Process each image
    for i, row in enumerate(split_df.itertuples()):
        # Make sure the per-class directory exists before saving into it
        class_dir = os.path.join(output_dir, row.className)
        os.makedirs(class_dir, exist_ok=True)

        # The output keeps the original file name inside the class directory
        output_path = os.path.join(class_dir, row.fileName)

        # Process and save image; remember failures instead of dropping them silently
        if not process_image_alexnet(row.path, output_path):
            failed_paths.append(row.path)

        # Print progress
        if i % 500 == 0:
            print(f"  Processed {i}/{len(split_df)} images in {split_name} set")

    # Only printed when something went wrong, so a clean run logs exactly as before
    if failed_paths:
        print(f"  WARNING: {len(failed_paths)} images in {split_name} set could not be processed")

# Label mapping: class names of the training set in sorted order -> consecutive index
classes = sorted(train_df['className'].unique())
class_to_idx = dict(zip(classes, range(len(classes))))

# Persist the mapping as one "<class>,<index>" line per class
mapping_path = os.path.join(output_folder, 'class_mapping.txt')
with open(mapping_path, 'w') as f:
    f.writelines(f"{cls},{idx}\n" for cls, idx in class_to_idx.items())

# Final summary of the prepared dataset
print("\nDataset preparation complete!")
print(f"Dataset saved to: {os.path.abspath(output_folder)}")
print(f"Total classes: {len(classes)}")
print(f"Training images: {len(train_df)}")
print(f"Testing images: {len(test_df)}")
print(f"Validation images: {len(val_df)}")

Processing train set...
  Processed 0/30000 images in train set
  Processed 500/30000 images in train set
  Processed 1000/30000 images in train set
  Processed 1500/30000 images in train set
  Processed 2000/30000 images in train set
  Processed 2500/30000 images in train set
  Processed 3000/30000 images in train set
  Processed 3500/30000 images in train set
  Processed 4000/30000 images in train set
  Processed 4500/30000 images in train set
  Processed 5000/30000 images in train set
  Processed 5500/30000 images in train set
  Processed 6000/30000 images in train set
  Processed 6500/30000 images in train set
  Processed 7000/30000 images in train set
  Processed 7500/30000 images in train set
  Processed 8000/30000 images in train set
  Processed 8500/30000 images in train set
  Processed 9000/30000 images in train set
  Processed 9500/30000 images in train set
  Processed 10000/30000 images in train set
  Processed 10500/30000 images in train set
  Processed 11000/30000 images i