In [1]:
!pip install kaggle datasets transformers torch torchvision



In [12]:
# Import necessary libraries
import os
import zipfile
import shutil
from PIL import Image
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from torch.optim import AdamW
from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm
from transformers import AutoProcessor, CLIPVisionModel

In [3]:
# Early preprocessing of the data which includes signs other than the alphabet and a test dataset which is not usable due to its small size.

# Download the dataset from Kaggle
!kaggle datasets download -d debashishsau/aslamerican-sign-language-aplhabet-dataset

# Unzip the file
with zipfile.ZipFile('/content/aslamerican-sign-language-aplhabet-dataset.zip', 'r') as zip_ref:
    zip_ref.extractall('/content/aslamerican-sign-language-aplhabet-dataset')

# Remove images which are not relevant
dir_to_remove = ['/content/aslamerican-sign-language-aplhabet-dataset/ASL_Alphabet_Dataset/asl_alphabet_train/del',
                 '/content/aslamerican-sign-language-aplhabet-dataset/ASL_Alphabet_Dataset/asl_alphabet_train/nothing',
                 '/content/aslamerican-sign-language-aplhabet-dataset/ASL_Alphabet_Dataset/asl_alphabet_train/space'] # list of unnecessary directories

# Go through the list and remove
for path in dir_to_remove:
    if os.path.exists(path):
        shutil.rmtree(path)

Dataset URL: https://www.kaggle.com/datasets/debashishsau/aslamerican-sign-language-aplhabet-dataset
License(s): CC0-1.0
Downloading aslamerican-sign-language-aplhabet-dataset.zip to /content
100% 4.19G/4.20G [00:55<00:00, 165MB/s]
100% 4.20G/4.20G [00:55<00:00, 80.8MB/s]


In [7]:
# Set path to the dataset directory
dataset_dir = "/content/aslamerican-sign-language-aplhabet-dataset/ASL_Alphabet_Dataset/asl_alphabet_train"

# Retrieve image paths and corresponding labels from the subdirectory names
image_paths = []
labels = []

# Go through subdirectories
for label in os.listdir(dataset_dir):
    class_dir = os.path.join(dataset_dir, label) # create path to subdirectory
    if os.path.isdir(class_dir): # check if it's a directory, not a file
        print(f"Processing class: {label}") # see processed classes for debugging
        for image_name in os.listdir(class_dir):
            if image_name.endswith((".jpg", ".jpeg", ".png")): # go through only files that are images
                image_path = os.path.join(class_dir, image_name) # create path to file
                image_paths.append(image_path)
                labels.append(label)

# Create a mapping of class labels to numeric indices
class_names = sorted(os.listdir(dataset_dir))  # Get all class names
label2index = {label: idx for idx, label in enumerate(class_names)}  # Map labels to indices

Processing class: V
Processing class: U
Processing class: E
Processing class: R
Processing class: A
Processing class: D
Processing class: F
Processing class: J
Processing class: G
Processing class: C
Processing class: B
Processing class: P
Processing class: X
Processing class: H
Processing class: L
Processing class: Z
Processing class: Q
Processing class: M
Processing class: W
Processing class: S
Processing class: K
Processing class: T
Processing class: O
Processing class: N
Processing class: Y
Processing class: I


In [9]:
# Prepare dataset class for preprocessing
class ASLDataset(Dataset):
    def __init__(self, image_paths, labels, processor, label2index):
        self.image_paths = image_paths
        self.labels = labels
        self.processor = processor
        self.label2index = label2index

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        # Load the image
        image = Image.open(self.image_paths[idx]).convert("RGB")

        # Convert label (text) to numeric index
        text_label = self.labels[idx]
        numeric_label = self.label2index[text_label]

        # Preprocess the image using the processor
        inputs = self.processor(images=image, return_tensors="pt", padding=True)

        # Return pixel values and label
        return {
            "pixel_values": inputs["pixel_values"].squeeze(0),
            "label": numeric_label
        }

# Initialize CLIP processor
processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")
dataset = ASLDataset(image_paths=image_paths, labels=labels, processor=processor, label2index=label2index)

# # Split the dataset 70/20/10%
# train_size = int(0.7 * len(dataset))
# val_size = int(0.2 * len(dataset))
# test_size = len(dataset) - train_size - val_size
# train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size])

# # Check sizes of the split dataset
# print(f"Train size: {train_size}")
# print(f"Validation size: {val_size}")
# print(f"Test size: {test_size}")

# Create Dataloaders for train, validation, and test sets
#batch_size = 32

#train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
#val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
#test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [25]:
# Create and split a small subset to test
subset, _ = random_split(dataset, [10000, len(dataset) - 10000]) # random_split to get instances of several letters
sub_val_size = int(0.2 * len(subset))
sub_train_size = len(subset) - sub_val_size
sub_train_dataset, sub_val_dataset = random_split(subset, [sub_train_size, sub_val_size]) #split into 80/20 train and validation sets

# Set batch size and create Dataloaders for the subset
batch_size = 32

sub_train_loader = DataLoader(sub_train_dataset, batch_size=batch_size, shuffle=True)
sub_val_loader = DataLoader(sub_val_dataset, batch_size=batch_size, shuffle=True)

In [26]:
# Check the shape of a batch
batch = next(iter(sub_train_loader))
print(batch)

{'pixel_values': tensor([[[[-1.7923, -1.7923, -1.7485,  ..., -1.6609, -1.7923, -1.7631],
          [-1.7923, -1.7777, -1.7631,  ..., -1.6463, -1.6609, -1.7631],
          [-1.7485, -1.7631, -0.8872,  ..., -0.6244, -0.8580, -1.5733],
          ...,
          [-1.7631, -1.7047, -1.5295,  ...,  0.3391, -0.0113, -1.6025],
          [-1.7923, -1.7631, -1.6171,  ..., -0.0550, -0.4054, -1.6609],
          [-1.7923, -1.7923, -1.6171,  ..., -1.5441, -1.5879, -1.6755]],

         [[-1.7071, -1.7521, -1.7521,  ..., -1.7521, -1.7521, -1.6921],
          [-1.7221, -1.6921, -1.7521,  ..., -1.7521, -1.7071, -1.7521],
          [-1.7521, -1.7371, -0.8216,  ..., -0.8516, -0.9867, -1.6771],
          ...,
          [-1.7371, -1.6771, -1.4970,  ...,  0.2589, -0.0562, -1.6470],
          [-1.7371, -1.7071, -1.6020,  ..., -0.0862, -0.3864, -1.6470],
          [-1.7221, -1.7221, -1.6020,  ..., -1.6020, -1.6170, -1.6320]],

         [[ 2.1459,  2.0748,  2.0464,  ...,  2.0606,  2.0179,  2.0464],
          [ 2

In [30]:
# Initialize CLIPVision
clip_model = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")

# Define module with a classifier head that maps image features from CLIPVision to the number of classes
class CLIPVisionClassifier(nn.Module):
    def __init__(self, clip_model, num_classes):
        super().__init__()
        self.model = clip_model
        self.classifier = nn.Linear(self.model.config.hidden_size, num_classes)

    def forward(self, pixel_values):
        outputs = self.model(pixel_values=pixel_values)  # Forward pass to get image features
        image_features = outputs.pooler_output  # Get the pooled feature representation (aggregated image features)

        # Pass the feature vector through the classifier
        logits = self.classifier(image_features)
        return logits


# Initialize model with classifier
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CLIPVisionClassifier(clip_model=clip_model, num_classes=26).to(device)

# Assign optimizer, scheduler and loss function
criterion = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=1e-5)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)

In [31]:
# Training loop
def train(model, dataloader, criterion, optimizer, scheduler, device):
    model.train()
    running_loss = 0.0
    total_batches = len(dataloader)

    for batch_idx, batch in enumerate(dataloader):
        pixel_values = batch["pixel_values"].to(device)
        labels = batch["label"].to(device)  # Numeric labels

        optimizer.zero_grad()

        # Forward pass
        logits = model(pixel_values=pixel_values)  # returns logits from the classifier head

        # Compute loss
        loss = criterion(logits, labels)
        loss.backward()

        # Gradient clipping and optimization
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5)
        optimizer.step()

        running_loss += loss.item()

    scheduler.step()
    epoch_loss = running_loss / total_batches
    print(f"Loss: {epoch_loss:.4f}")

    return epoch_loss

def evaluate(model, dataloader, criterion, device):
    model.eval()
    total_correct = 0
    total_samples = 0
    val_loss = 0.0

    with torch.no_grad():
        for batch in dataloader:
            pixel_values = batch["pixel_values"].to(device)
            labels = batch["label"].to(device)

            # Forward pass
            logits = model(pixel_values=pixel_values)

            # Predicted labels
            predicted_indices = torch.argmax(logits, dim=1)

            # Calculate metrics
            correct_predictions = (predicted_indices == labels).sum().item()
            total_correct += correct_predictions
            total_samples += len(labels)

            # Compute validation loss
            loss = criterion(logits, labels)
            val_loss += loss.item()

    # Compute overall metrics
    accuracy = total_correct / total_samples
    avg_val_loss = val_loss / len(dataloader)

    print(f"Validation Loss: {avg_val_loss:.4f}")
    print(f"Validation Accuracy: {accuracy:.4f}")

    return avg_val_loss, accuracy

In [32]:
# Set number of epochs
num_epochs = 10

# Training and evaluation
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}")
    train_loss = train(
        model=model,
        dataloader=tqdm(sub_train_loader, desc=f"Epoch {epoch+1}"),
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        device=device
    )

    val_loss, val_accuracy = evaluate(
    model=model,
    dataloader=sub_val_loader,
    criterion=criterion,
    device=device
    )

Epoch 1/10


Epoch 1: 100%|██████████| 250/250 [01:44<00:00,  2.39it/s]


Loss: 0.6637
Validation Loss: 0.1736
Validation Accuracy: 0.9510
Epoch 2/10


Epoch 2: 100%|██████████| 250/250 [01:45<00:00,  2.38it/s]


Loss: 0.0986
Validation Loss: 0.1285
Validation Accuracy: 0.9625
Epoch 3/10


Epoch 3: 100%|██████████| 250/250 [01:44<00:00,  2.38it/s]


Loss: 0.0443
Validation Loss: 0.1410
Validation Accuracy: 0.9635
Epoch 4/10


Epoch 4: 100%|██████████| 250/250 [01:44<00:00,  2.38it/s]


Loss: 0.0288
Validation Loss: 0.1129
Validation Accuracy: 0.9690
Epoch 5/10


Epoch 5: 100%|██████████| 250/250 [01:44<00:00,  2.38it/s]


Loss: 0.0164
Validation Loss: 0.1196
Validation Accuracy: 0.9735
Epoch 6/10


Epoch 6: 100%|██████████| 250/250 [01:45<00:00,  2.38it/s]


Loss: 0.0039
Validation Loss: 0.0675
Validation Accuracy: 0.9825
Epoch 7/10


Epoch 7: 100%|██████████| 250/250 [01:45<00:00,  2.36it/s]


Loss: 0.0009
Validation Loss: 0.0636
Validation Accuracy: 0.9835
Epoch 8/10


Epoch 8: 100%|██████████| 250/250 [01:45<00:00,  2.37it/s]


Loss: 0.0004
Validation Loss: 0.0614
Validation Accuracy: 0.9845
Epoch 9/10


Epoch 9: 100%|██████████| 250/250 [01:45<00:00,  2.37it/s]


Loss: 0.0004
Validation Loss: 0.0597
Validation Accuracy: 0.9835
Epoch 10/10


Epoch 10: 100%|██████████| 250/250 [01:45<00:00,  2.37it/s]


Loss: 0.0004
Validation Loss: 0.0598
Validation Accuracy: 0.9840
