Introduction

In [3]:
import os
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

Data preparation

In [4]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Paths
data_dir = '../dataset/asl_alphabet_train'

# Data generators
train_datagen = ImageDataGenerator(
    rescale=1./255,
    validation_split=0.2,  # 20% for validation
    rotation_range=15,
    width_shift_range=0.1,
    height_shift_range=0.1,
    zoom_range=0.1,
    horizontal_flip=True
)

val_datagen = ImageDataGenerator(
    rescale=1./255,
    validation_split=0.2
)

# Training generator
train_generator = train_datagen.flow_from_directory(
    data_dir,
    target_size=(200, 200),
    batch_size=32,
    class_mode='categorical',
    subset='training',
    shuffle=True
)

# Validation generator
val_generator = val_datagen.flow_from_directory(
    data_dir,
    target_size=(200, 200),
    batch_size=32,
    class_mode='categorical',
    subset='validation',
    shuffle=False
)


Found 69600 images belonging to 29 classes.
Found 17400 images belonging to 29 classes.


Building model

In [3]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras import Input

model = Sequential([
    # Block 1
    Input(shape=(200, 200, 3)),  # Explicit Input layer
    Conv2D(32, (5, 5), activation='relu'),
    MaxPooling2D(pool_size=(2, 2), strides=2),

    # Block 2
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2), strides=2),

    # Block 3
    Conv2D(128, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2), strides=2),

    # Flatten and Dense layers
    Flatten(),
    Dense(256, activation='relu'),
    Dropout(0.5),

    # Output layer
    Dense(29, activation='softmax')
])

model.summary()

Compile the model

In [None]:
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

Define Early Stopping Callback

In [3]:
early_stopping = EarlyStopping(
    monitor='val_accuracy',  # Monitor validation accuracy
    patience=3,              # Stop after 3 epochs without improvement
    restore_best_weights=True  # Keep the best model weights
)

Training the model:

1. Fit the model

Initial run:

In [11]:
# Fit the model to the training data with validation and early stopping
history = model.fit(
    train_generator,
    validation_data=val_generator,
    epochs=30,
    batch_size=32,
    callbacks=[early_stopping],
    verbose=1
)


Epoch 1/30
[1m2175/2175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9s/step - accuracy: 0.2288 - loss: 2.6480

[1m2175/2175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19101s[0m 9s/step - accuracy: 0.2289 - loss: 2.6477 - val_accuracy: 0.5495 - val_loss: 1.3226
Epoch 2/30
[1m2175/2175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m762s[0m 350ms/step - accuracy: 0.6369 - loss: 1.0819 - val_accuracy: 0.6314 - val_loss: 1.0694
Epoch 3/30
[1m2175/2175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m760s[0m 350ms/step - accuracy: 0.7580 - loss: 0.7090 - val_accuracy: 0.7266 - val_loss: 0.8369
Epoch 4/30
[1m2175/2175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1251s[0m 575ms/step - accuracy: 0.8114 - loss: 0.5464 - val_accuracy: 0.7637 - val_loss: 0.7704
Epoch 5/30
[1m2175/2175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3264s[0m 2s/step - accuracy: 0.8499 - loss: 0.4441 - val_accuracy: 0.7723 - val_loss: 0.7978
Epoch 6/30
[1m2175/2175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3729s[0m 2s/step - accuracy: 0.8664 - loss: 0.3907 - val_accuracy: 0.7832 - val_loss: 0.7248
Epoch 7

Added BatchNormalization after each convolutional and dense layer to stabilize learning and improve validation accuracy.

Added L2 Regularization (0.001) to prevent overfitting by penalizing large weights.

Replaced Flatten with GlobalAveragePooling2D to drastically reduce parameters (will drop from 17M to under 500K).

Enhanced data augmentation with wider rotation range and brightness variations to improve generalization.

Added learning rate scheduler (ReduceLROnPlateau) to automatically reduce learning rate when validation loss plateaus.

Slightly reduced dropout from 0.5 to 0.4 as we're now using multiple regularization techniques.



In [4]:
from tensorflow.keras import Input, Model
from tensorflow.keras.layers import Conv2D, MaxPooling2D, BatchNormalization, GlobalAveragePooling2D
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.regularizers import l2

# Input layer
inputs = Input(shape=(200, 200, 3))

# Block 1
x = Conv2D(32, (5, 5), activation='relu', kernel_regularizer=l2(0.001))(inputs)
x = BatchNormalization()(x)
x = MaxPooling2D(pool_size=(2, 2), strides=2)(x)

# Block 2
x = Conv2D(64, (3, 3), activation='relu', kernel_regularizer=l2(0.001))(x)
x = BatchNormalization()(x)
x = MaxPooling2D(pool_size=(2, 2), strides=2)(x)

# Block 3
x = Conv2D(128, (3, 3), activation='relu', kernel_regularizer=l2(0.001))(x)
x = BatchNormalization()(x)
x = MaxPooling2D(pool_size=(2, 2), strides=2)(x)

# Global Average Pooling instead of Flatten
x = GlobalAveragePooling2D()(x)

# Dense layer with regularization
x = Dense(256, activation='relu', kernel_regularizer=l2(0.001))(x)
x = BatchNormalization()(x)
x = Dropout(0.4)(x)  # Slightly reduced dropout

# Output layer
outputs = Dense(29, activation='softmax')(x)

# Create model
model = Model(inputs, outputs)


In [5]:
# Enhanced data augmentation
train_datagen = ImageDataGenerator(
    rescale=1./255,
    validation_split=0.2,
    rotation_range=20,       # Increased rotation range
    width_shift_range=0.2,   # Increased shift range
    height_shift_range=0.2,
    zoom_range=0.2,          # Increased zoom range
    horizontal_flip=True,
    brightness_range=[0.8, 1.2]  # Add brightness variation
)

# Learning rate scheduler
from tensorflow.keras.callbacks import ReduceLROnPlateau
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.2,
    patience=2,
    min_lr=0.00001
)
model.summary()

In [6]:
# Compile with the same optimizer but add metrics
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

# Train with both callbacks
history = model.fit(
    train_generator,
    validation_data=val_generator,
    epochs=50,  # Increased epochs since we have early stopping
    callbacks=[
        early_stopping,
        reduce_lr
    ],
    verbose=1
)

Epoch 1/50
[1m2175/2175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4s/step - accuracy: 0.3273 - loss: 2.6823

[1m2175/2175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8726s[0m 4s/step - accuracy: 0.3274 - loss: 2.6819 - val_accuracy: 0.4804 - val_loss: 2.0926 - learning_rate: 0.0010
Epoch 2/50
[1m2175/2175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2700s[0m 1s/step - accuracy: 0.8114 - loss: 0.8406 - val_accuracy: 0.5552 - val_loss: 2.1284 - learning_rate: 0.0010
Epoch 3/50
[1m2175/2175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2429s[0m 1s/step - accuracy: 0.8923 - loss: 0.5699 - val_accuracy: 0.5525 - val_loss: 2.1868 - learning_rate: 0.0010
Epoch 4/50
[1m2175/2175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1441s[0m 662ms/step - accuracy: 0.9605 - loss: 0.3628 - val_accuracy: 0.8349 - val_loss: 0.8209 - learning_rate: 2.0000e-04
Epoch 5/50
[1m2175/2175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2707s[0m 1s/step - accuracy: 0.9744 - loss: 0.2758 - val_accuracy: 0.8337 - val_loss: 0.8827 - learning_rate: 2.0000e-04
Epoch 6/50
[1m2175/2175[0m [32m━━━━━━━━━━━━━━━━

Three convolutional blocks with increasing filter complexity (32→64→128)

Spatial dropout (dropping entire feature maps) after each conv block to reduce overfitting

L2 regularization (0.002) on all convolutional and dense layers

BatchNormalization after each convolution for faster, more stable training

GlobalAveragePooling2D instead of Flatten to drastically reduce parameters

Enhanced training process with learning rate scheduling and strong data augmentation

In [7]:
# Import required libraries
from tensorflow.keras import Input, Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, BatchNormalization, GlobalAveragePooling2D
from tensorflow.keras.layers import Dense, Dropout, SpatialDropout2D
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Enhanced data augmentation
train_datagen = ImageDataGenerator(
    rescale=1./255,
    validation_split=0.2,
    rotation_range=20,       # Increased rotation range
    width_shift_range=0.2,   # Increased shift range
    height_shift_range=0.2,
    zoom_range=0.2,          # Increased zoom range
    horizontal_flip=True,
    brightness_range=[0.8, 1.2],  # Brightness variation
    fill_mode='nearest'
)

val_datagen = ImageDataGenerator(
    rescale=1./255,
    validation_split=0.2
)

# Define paths and image dimensions
data_dir = '../dataset/asl_alphabet_train'  # Adjust if needed
img_height, img_width = 200, 200
batch_size = 32
num_classes = 29  # A-Z plus SPACE, DELETE, NOTHING

# Data generators
train_generator = train_datagen.flow_from_directory(
    data_dir,
    target_size=(img_height, img_width),
    batch_size=batch_size,
    class_mode='categorical',
    subset='training',
    shuffle=True
)

val_generator = val_datagen.flow_from_directory(
    data_dir,
    target_size=(img_height, img_width),
    batch_size=batch_size,
    class_mode='categorical',
    subset='validation',
    shuffle=False
)

# Build the improved model with Functional API
inputs = Input(shape=(img_height, img_width, 3))

# Block 1
x = Conv2D(32, (3, 3), padding='same', activation='relu', kernel_regularizer=l2(0.002))(inputs)
x = BatchNormalization()(x)
x = MaxPooling2D(2, 2)(x)
x = SpatialDropout2D(0.1)(x)

# Block 2
x = Conv2D(64, (3, 3), padding='same', activation='relu', kernel_regularizer=l2(0.002))(x)
x = BatchNormalization()(x)
x = MaxPooling2D(2, 2)(x)
x = SpatialDropout2D(0.1)(x)

# Block 3
x = Conv2D(128, (3, 3), padding='same', activation='relu', kernel_regularizer=l2(0.002))(x)
x = BatchNormalization()(x)
x = MaxPooling2D(2, 2)(x)
x = SpatialDropout2D(0.1)(x)

# Global Average Pooling instead of Flatten (drastically reduces parameters)
x = GlobalAveragePooling2D()(x)

# Dense layers with regularization
x = Dense(128, activation='relu', kernel_regularizer=l2(0.002))(x)
x = BatchNormalization()(x)
x = Dropout(0.5)(x)

# Output layer
outputs = Dense(num_classes, activation='softmax')(x)

model = Model(inputs, outputs)

# Compile the model
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

# Display model summary
model.summary()

# Callbacks
early_stopping = EarlyStopping(
    monitor='val_accuracy',
    patience=5,  # Slightly increased patience
    restore_best_weights=True
)

reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.2,
    patience=3,
    min_lr=0.00001
)

# Training parameters
epochs = 50


Found 69600 images belonging to 29 classes.
Found 17400 images belonging to 29 classes.


In [8]:
# To train the model, run:
history = model.fit(
    train_generator,
    validation_data=val_generator,
    epochs=epochs,
    callbacks=[early_stopping, reduce_lr]
)

Epoch 1/50


I0000 00:00:1745671177.977774   47704 service.cc:152] XLA service 0x7fb8040076e0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1745671177.977940   47704 service.cc:160]   StreamExecutor device (0): NVIDIA GeForce RTX 4050 Laptop GPU, Compute Capability 8.9
2025-04-26 16:39:38.109647: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1745671178.742850   47704 cuda_dnn.cc:529] Loaded cuDNN version 90300
I0000 00:00:1745671193.889364   47704 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m2175/2175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m409s[0m 179ms/step - accuracy: 0.1088 - loss: 3.8525 - val_accuracy: 0.1821 - val_loss: 3.1428 - learning_rate: 0.0010
Epoch 2/50
[1m2175/2175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m341s[0m 157ms/step - accuracy: 0.3759 - loss: 2.2342 - val_accuracy: 0.5389 - val_loss: 1.6321 - learning_rate: 0.0010
Epoch 3/50
[1m2175/2175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m379s[0m 174ms/step - accuracy: 0.5659 - loss: 1.6142 - val_accuracy: 0.6530 - val_loss: 1.2374 - learning_rate: 0.0010
Epoch 4/50
[1m2175/2175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m348s[0m 160ms/step - accuracy: 0.6586 - loss: 1.3460 - val_accuracy: 0.7352 - val_loss: 1.0995 - learning_rate: 0.0010
Epoch 5/50
[1m2175/2175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m331s[0m 152ms/step - accuracy: 0.7015 - loss: 1.2375 - val_accuracy: 0.6926 - val_loss: 1.2445 - learning_rate: 0.0010
Epoch 6/50
[1m2175/2175[0m [32m━━━━━━━━━━━━━━━━━

Double Convolution in Each Block:

Added two consecutive 3×3 convolutions in each block instead of a single layer

This creates a deeper feature hierarchy while keeping the parameter count low

Residual Connection in Block 3:

Added a skip connection that helps with gradient flow during backpropagation

Improves learning of complex features without increasing depth excessively

Increased Regularization:

L2 regularization increased to 0.0015 from 0.001

Progressive SpatialDropout2D (0.1→0.15→0.2) to increasingly regularize deeper features

BatchNormalization After Every Conv Layer:

Stabilizes training and accelerates convergence

Reduces internal covariate shift between layers

Smaller Initial Filter Size:

Using 3×3 filters throughout (instead of 5×5) reduces parameters while maintaining receptive field through stacking

GlobalAveragePooling2D:

Retained from your current model to drastically reduce parameters compared to Flatten

Provides some built-in regularization by averaging features

In [9]:
from tensorflow.keras import Input, Model
from tensorflow.keras.layers import Conv2D, MaxPooling2D, BatchNormalization, SpatialDropout2D
from tensorflow.keras.layers import Dense, Dropout, GlobalAveragePooling2D, Add
from tensorflow.keras.regularizers import l2

# Input layer
inputs = Input(shape=(200, 200, 3))

# Block 1 - Initial Feature Extraction
x = Conv2D(32, (3, 3), padding='same', activation='relu', kernel_regularizer=l2(0.0015))(inputs)
x = BatchNormalization()(x)
x = Conv2D(32, (3, 3), padding='same', activation='relu', kernel_regularizer=l2(0.0015))(x)
x = BatchNormalization()(x)
x = MaxPooling2D(2, 2)(x)
x = SpatialDropout2D(0.1)(x)

# Block 2 - Intermediate Features
x = Conv2D(64, (3, 3), padding='same', activation='relu', kernel_regularizer=l2(0.0015))(x)
x = BatchNormalization()(x)
x = Conv2D(64, (3, 3), padding='same', activation='relu', kernel_regularizer=l2(0.0015))(x)
x = BatchNormalization()(x)
x = MaxPooling2D(2, 2)(x)
x = SpatialDropout2D(0.15)(x)

# Block 3 - Advanced Features with Residual Connection
x_shortcut = Conv2D(128, (1, 1), padding='same')(x)  # Shortcut connection
x = Conv2D(128, (3, 3), padding='same', activation='relu', kernel_regularizer=l2(0.0015))(x)
x = BatchNormalization()(x)
x = Conv2D(128, (3, 3), padding='same', activation='relu', kernel_regularizer=l2(0.0015))(x)
x = BatchNormalization()(x)
x = Add()([x, x_shortcut])  # Add residual connection
x = MaxPooling2D(2, 2)(x)
x = SpatialDropout2D(0.2)(x)

# Global Average Pooling instead of Flatten
x = GlobalAveragePooling2D()(x)

# Dense layers with regularization
x = Dense(128, activation='relu', kernel_regularizer=l2(0.002))(x)
x = BatchNormalization()(x)
x = Dropout(0.4)(x)

# Output layer
outputs = Dense(29, activation='softmax')(x)

model = Model(inputs, outputs)


In [10]:
# Enhanced data augmentation
train_datagen = ImageDataGenerator(
    rescale=1./255,
    validation_split=0.2,
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    brightness_range=[0.8, 1.2],
    fill_mode='nearest'
)

# Callbacks
early_stopping = EarlyStopping(
    monitor='val_accuracy',
    patience=6,  # Slightly increased patience
    restore_best_weights=True
)

reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.2,
    patience=3,
    min_lr=0.00001
)

# Compile with Adam optimizer
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)
model.summary()

In [11]:
# Train the model
history = model.fit(
    train_generator,
    validation_data=val_generator,
    epochs=50,
    callbacks=[early_stopping, reduce_lr],
    verbose=1
)

Epoch 1/50
[1m2175/2175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m424s[0m 187ms/step - accuracy: 0.1309 - loss: 3.9067 - val_accuracy: 0.3075 - val_loss: 2.4240 - learning_rate: 0.0010
Epoch 2/50
[1m2175/2175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m384s[0m 176ms/step - accuracy: 0.5154 - loss: 1.8418 - val_accuracy: 0.6754 - val_loss: 1.5014 - learning_rate: 0.0010
Epoch 3/50
[1m2175/2175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m386s[0m 177ms/step - accuracy: 0.7417 - loss: 1.1988 - val_accuracy: 0.8504 - val_loss: 0.8814 - learning_rate: 0.0010
Epoch 4/50
[1m2175/2175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m384s[0m 176ms/step - accuracy: 0.8054 - loss: 1.0085 - val_accuracy: 0.8254 - val_loss: 0.9695 - learning_rate: 0.0010
Epoch 5/50
[1m2175/2175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m383s[0m 176ms/step - accuracy: 0.8255 - loss: 0.9417 - val_accuracy: 0.6887 - val_loss: 1.4186 - learning_rate: 0.0010
Epoch 6/50
[1m2175/2175[0m [32m━━━━━━

Self-Attention Mechanism: Added lightweight spatial attention to focus on the most informative regions of hand gestures.

Residual Blocks Throughout: Now using residual connections in all blocks to improve gradient flow, enabling deeper feature learning without vanishing gradients.

Enhanced Data Augmentation: Added shear transforms and channel shifts to make the model more robust to variations in lighting and hand positioning.

Layer Normalization: Added after global pooling to stabilize feature distributions before classification.

Cyclic Learning Rate Option: Alternative learning rate scheduler that can help escape local minima.

Stronger Regularization: Slightly increased L2 regularization (0.0018 vs 0.0015) to further prevent overfitting.

Increased Training Duration: More epochs (75) with longer early stopping patience (8) gives the model time to find more optimal weights.