In [None]:
# Dataset check: you do NOT create the CSVs — download Sign Language MNIST and place the two files here.
# The file names below are relative, so they are resolved against the kernel's
# current working directory at the moment this cell runs. Note that the `%cd`
# in the following cell only takes effect after this check has already run.
import os
required = ["sign_mnist_train.csv", "sign_mnist_test.csv"]
# Collect every required file that is not present as a regular file.
missing = [f for f in required if not os.path.isfile(f)]
if missing:
    print("Missing CSV(s):", missing)
    print("Download from: https://www.kaggle.com/datasets/datamunge/sign-language-mnist")
    print("Then place sign_mnist_train.csv and sign_mnist_test.csv in this project's folder.")
else:
    print("✓ Found", required)

In [None]:
%cd '/home/dev/Project/'

In [None]:
import pandas as pd

train_df = pd.read_csv('sign_mnist_train.csv')
test_df = pd.read_csv('sign_mnist_test.csv')

print("--- train_df (first 5 rows) ---")
print(train_df.head())
print("\n--- train_df (info) ---")
train_df.info()

print("\n--- test_df (first 5 rows) ---")
print(test_df.head())
print("\n--- test_df (info) ---")
test_df.info()


In [None]:
import numpy as np

train_labels = train_df['label'].values
train_pixels = train_df.drop('label', axis=1).values

test_labels = test_df['label'].values
test_pixels = test_df.drop('label', axis=1).values

print("Shape of train_labels:", train_labels.shape)
print("Shape of train_pixels:", train_pixels.shape)
print("Shape of test_labels:", test_labels.shape)
print("Shape of test_pixels:", test_pixels.shape)

In [None]:
image_size = 28

train_images = train_pixels.reshape(-1, image_size, image_size).astype('float32') / 255.0
test_images = test_pixels.reshape(-1, image_size, image_size).astype('float32') / 255.0

print("Shape of train_images after reshape and normalization:", train_images.shape)
print("Shape of test_images after reshape and normalization:", test_images.shape)
print("Min pixel value in train_images:", train_images.min())
print("Max pixel value in train_images:", train_images.max())

In [None]:
# grad-cam package not used; custom Grad-CAM is implemented in the Evaluation section below

In [None]:
!pip install torch torchvision transformers huggingface_hub

## Loading the Sapiens model



In [None]:
import torch
from huggingface_hub import hf_hub_download
import torchvision.transforms as transforms
from PIL import Image
import numpy as np

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print("Loading Sapiens pretrained model...")

model_path = hf_hub_download(
    repo_id="facebook/sapiens-pretrain-1b-torchscript",
    filename="sapiens_1b_epoch_173_torchscript.pt2",
    local_dir="./models"
)

model_sapiens = torch.jit.load(model_path, map_location=device)
model_sapiens = model_sapiens.eval()

print("✓ Sapiens-1B pretrained model loaded successfully")

transform_sapiens = transforms.Compose([
    transforms.Resize((1024, 1024)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

def extract_sapiens_features(image_input):
    """
    Extract a 1-D Sapiens feature vector from a single image.

    Args:
        image_input (np.ndarray | PIL.Image.Image): Image to embed. A 2-D
            (grayscale) array is replicated to three channels. Non-uint8 arrays
            whose maximum is <= 1.0 are treated as normalized and scaled by 255;
            every non-uint8 array is then clipped to 0-255 and cast to uint8.
            PIL images in a mode other than RGB are converted to RGB.
    Returns:
        np.ndarray: Feature vector for the image. A 3-D model output
            contributes its first token; outputs with more than three
            dimensions are averaged over every dimension after the second.
    Raises:
        RuntimeError: Re-raised (after printing the message) if the forward
            pass fails.
    """
    if isinstance(image_input, np.ndarray):
        if image_input.ndim == 2:
            # The transform normalizes three channels, so replicate the gray plane.
            image_input = np.stack([image_input] * 3, axis=-1)
        if image_input.dtype != np.uint8:
            # PIL cannot build an RGB image from float data, so always end up
            # with uint8 — rescaling only when the data looks normalized.
            if image_input.max() <= 1.0:
                image_input = image_input * 255
            image_input = np.clip(image_input, 0, 255).astype(np.uint8)
        image_input = Image.fromarray(image_input)

    if isinstance(image_input, Image.Image) and image_input.mode != "RGB":
        # Normalize() below is configured with three-channel statistics.
        image_input = image_input.convert("RGB")

    image_tensor = transform_sapiens(image_input).unsqueeze(0).to(device)

    with torch.no_grad():
        try:
            features = model_sapiens(image_tensor)

            if isinstance(features, tuple):
                features = features[0]

            if features.dim() == 3:
                # (batch, tokens, dim): keep the first token.
                features = features[:, 0, :]
            elif features.dim() > 2:
                # e.g. (batch, channels, H, W): average over the trailing dims.
                features = features.mean(dim=list(range(2, features.dim())))

            features = features.squeeze(0)

        except RuntimeError as e:
            print(f"Error during forward pass: {e}")
            raise

    return features.cpu().numpy()

print("✓ Sapiens feature extractor ready (1024x1024 resolution)")

## Extract Features with I-JEPA


In [None]:
import torch
import torchvision.transforms as transforms
from PIL import Image
import numpy as np

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

transform_ijepa = transforms.Compose([
    transforms.Resize(224),
    transforms.Grayscale(num_output_channels=3),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

print("Loading I-JEPA model...")
from transformers import AutoModel

model_ijepa = AutoModel.from_pretrained(
    "facebook/ijepa_vith14_1k",
    trust_remote_code=True
)
model_ijepa = model_ijepa.to(device)
model_ijepa.eval()

print("✓ I-JEPA model loaded successfully")

def extract_ijepa_features(image_input):
    """Extract a pooled I-JEPA embedding from a single image.

    Args:
        image_input (np.ndarray | PIL.Image.Image): Image to embed. Arrays are
            multiplied by 255 and cast to uint8 before being turned into a PIL
            image, i.e. they are treated as [0, 1] data.
    Returns:
        np.ndarray: 1-D feature vector (mean of the encoder's output tokens).
    """
    if isinstance(image_input, np.ndarray):
        image_input = Image.fromarray((image_input * 255).astype(np.uint8))

    image_tensor = transform_ijepa(image_input).unsqueeze(0).to(device)

    with torch.no_grad():
        outputs = model_ijepa(image_tensor)
        # The Hugging Face I-JEPA encoder emits patch tokens only (there is no
        # [CLS] token), so index 0 would just be the first image patch.
        # Average all patch tokens to get an image-level embedding instead.
        pooled = outputs.last_hidden_state.mean(dim=1)

    return pooled.cpu().numpy().squeeze(0)

print("✓ I-JEPA feature extractor ready")

In [None]:
from transformers import AutoModel

print("Loading DINOv2 model...")

model_dinov2 = AutoModel.from_pretrained("facebook/dinov2-base", trust_remote_code=True)
model_dinov2 = model_dinov2.to(device)
model_dinov2.eval()

print("✓ DINOv2 loaded successfully")
    
transform_dinov2 = transform_ijepa

def extract_dinov2_features(image_input):
    """Return the DINOv2 token-0 (CLS) embedding of one image as a 1-D array.

    Array inputs are multiplied by 255 and cast to uint8 before conversion to
    a PIL image; any other input is handed to the transform unchanged.
    """
    pil_image = image_input
    if isinstance(pil_image, np.ndarray):
        as_bytes = (pil_image * 255).astype(np.uint8)
        pil_image = Image.fromarray(as_bytes)

    batch = transform_dinov2(pil_image)
    batch = batch.unsqueeze(0).to(device)

    with torch.no_grad():
        hidden_states = model_dinov2(batch).last_hidden_state

    # First token of the single batch item, moved to host memory.
    first_token = hidden_states[:, 0, :]
    return first_token.cpu().numpy().squeeze(0)

print("✓ DINOv2 feature extractor ready")

In [None]:
from transformers import AutoModel

print("Loading DINOv3 model...")

model_dinov3 = AutoModel.from_pretrained(
    "facebook/dinov3-vitb16-pretrain-lvd1689m",
    trust_remote_code=True
)
model_dinov3 = model_dinov3.to(device)
model_dinov3.eval()

print("✓ DINOv3 loaded successfully")

transform_dinov3 = transform_ijepa

def extract_dinov3_features(image_input):
    """Return the DINOv3 token-0 (CLS) embedding of one image as a 1-D array.

    Array inputs are multiplied by 255 and cast to uint8 before conversion to
    a PIL image; any other input is handed to the transform unchanged.
    """
    pil_image = image_input
    if isinstance(pil_image, np.ndarray):
        as_bytes = (pil_image * 255).astype(np.uint8)
        pil_image = Image.fromarray(as_bytes)

    batch = transform_dinov3(pil_image)
    batch = batch.unsqueeze(0).to(device)

    with torch.no_grad():
        hidden_states = model_dinov3(batch).last_hidden_state

    # First token of the single batch item, moved to host memory.
    first_token = hidden_states[:, 0, :]
    return first_token.cpu().numpy().squeeze(0)

print("✓ DINOv3 feature extractor ready")

In [None]:
dinov3_train_features = []
dinov3_train_labels = []

print("Extracting DINOv3 features for training set...")

for i, image in enumerate(train_images):
    feature_vector = extract_dinov3_features(image)
    dinov3_train_features.append(feature_vector)
    dinov3_train_labels.append(train_labels[i])

    if (i + 1) % 1000 == 0:
        print(f"Processed {i + 1}/{len(train_images)} training images.")

dinov3_train_features = np.array(dinov3_train_features)
dinov3_train_labels = np.array(dinov3_train_labels)

print(f"Finished. Shape: {dinov3_train_features.shape}")

dinov3_test_features = []
dinov3_test_labels = []

print("\nExtracting DINOv3 features for testing set...")

for i, image in enumerate(test_images):
    feature_vector = extract_dinov3_features(image)
    dinov3_test_features.append(feature_vector)
    dinov3_test_labels.append(test_labels[i])

    if (i + 1) % 1000 == 0:
        print(f"Processed {i + 1}/{len(test_images)} testing images.")

dinov3_test_features = np.array(dinov3_test_features)
dinov3_test_labels = np.array(dinov3_test_labels)

print(f"Finished. Shape: {dinov3_test_features.shape}")


In [None]:
import numpy as np
import torch

ijepa_train_features = []
ijepa_train_labels = []

print("Extracting I-JEPA features for training set...")

for i, image in enumerate(train_images):
    feature_vector = extract_ijepa_features(image)

    ijepa_train_features.append(feature_vector)
    ijepa_train_labels.append(train_labels[i])

    if (i + 1) % 1000 == 0:
        print(f"Processed {i + 1}/{len(train_images)} training images.")

ijepa_train_features = np.array(ijepa_train_features)
ijepa_train_labels = np.array(ijepa_train_labels)

print(f"Finished extracting I-JEPA features for training set. Shape: {ijepa_train_features.shape}")
print(f"Training labels shape: {ijepa_train_labels.shape}")

ijepa_test_features = []
ijepa_test_labels = []

print("\nExtracting I-JEPA features for testing set...")

for i, image in enumerate(test_images):
    feature_vector = extract_ijepa_features(image)

    ijepa_test_features.append(feature_vector)
    ijepa_test_labels.append(test_labels[i])

    if (i + 1) % 1000 == 0:
        print(f"Processed {i + 1}/{len(test_images)} testing images.")

ijepa_test_features = np.array(ijepa_test_features)
ijepa_test_labels = np.array(ijepa_test_labels)

print(f"Finished extracting I-JEPA features for testing set. Shape: {ijepa_test_features.shape}")
print(f"Testing labels shape: {ijepa_test_labels.shape}")

## Extract Features with Sapiens


In [None]:
import numpy as np
import torch

sapiens_train_features = []
sapiens_train_labels = []

print("Extracting Sapiens features for training set...")

for i, image in enumerate(train_images):
    feature_vector = extract_sapiens_features(image)

    sapiens_train_features.append(feature_vector)
    sapiens_train_labels.append(train_labels[i])

    if (i + 1) % 1000 == 0:
        print(f"Processed {i + 1}/{len(train_images)} training images.")

sapiens_train_features = np.array(sapiens_train_features)
sapiens_train_labels = np.array(sapiens_train_labels)

print(f"Finished extracting Sapiens features for training set. Shape: {sapiens_train_features.shape}")
print(f"Training labels shape: {sapiens_train_labels.shape}")

sapiens_test_features = []
sapiens_test_labels = []

print("\nExtracting Sapiens features for testing set...")

for i, image in enumerate(test_images):
    feature_vector = extract_sapiens_features(image)

    sapiens_test_features.append(feature_vector)
    sapiens_test_labels.append(test_labels[i])

    if (i + 1) % 1000 == 0:
        print(f"Processed {i + 1}/{len(test_images)} testing images.")

sapiens_test_features = np.array(sapiens_test_features)
sapiens_test_labels = np.array(sapiens_test_labels)

print(f"Finished extracting Sapiens features for testing set. Shape: {sapiens_test_features.shape}")
print(f"Testing labels shape: {sapiens_test_labels.shape}")

In [None]:
import numpy as np
import torch

dinov2_train_features = []
dinov2_train_labels = []

print("Extracting dinov2 features for training set...")

for i, image in enumerate(train_images):
    feature_vector = extract_dinov2_features(image)

    dinov2_train_features.append(feature_vector)
    dinov2_train_labels.append(train_labels[i])

    if (i + 1) % 1000 == 0:
        print(f"Processed {i + 1}/{len(train_images)} training images.")

dinov2_train_features = np.array(dinov2_train_features)
dinov2_train_labels = np.array(dinov2_train_labels)

print(f"Finished extracting DINOv2 features for training set. Shape: {dinov2_train_features.shape}")
print(f"Training labels shape: {dinov2_train_labels.shape}")

dinov2_test_features = []
dinov2_test_labels = []

print("\nExtracting DINOv2 features for testing set...")

for i, image in enumerate(test_images):
    feature_vector = extract_dinov2_features(image)

    dinov2_test_features.append(feature_vector)
    dinov2_test_labels.append(test_labels[i])

    if (i + 1) % 1000 == 0:
        print(f"Processed {i + 1}/{len(test_images)} testing images.")

dinov2_test_features = np.array(dinov2_test_features)
dinov2_test_labels = np.array(dinov2_test_labels)

print(f"Finished extracting DINOv2 features for testing set. Shape: {dinov2_test_features.shape}")
print(f"Testing labels shape: {dinov2_test_labels.shape}")

In [None]:
# Imported here because this cell executes before the notebook's dedicated
# sklearn import cell; without it a fresh "Restart & Run All" stops with a
# NameError on LogisticRegression.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

print("Starting DINOv2 classifier training and evaluation...")

# Linear probe: multinomial logistic regression on the frozen DINOv2 features.
logistic_regression_model_dinov2 = LogisticRegression(max_iter=1000, solver='lbfgs', random_state=42)
logistic_regression_model_dinov2.fit(dinov2_train_features, dinov2_train_labels)

dinov2_predictions = logistic_regression_model_dinov2.predict(dinov2_test_features)

# Weighted averages account for class imbalance; zero_division=0 silences
# warnings for classes that were never predicted.
accuracy_dinov2 = accuracy_score(dinov2_test_labels, dinov2_predictions)
precision_dinov2 = precision_score(dinov2_test_labels, dinov2_predictions, average='weighted', zero_division=0)
recall_dinov2 = recall_score(dinov2_test_labels, dinov2_predictions, average='weighted', zero_division=0)
f1_dinov2 = f1_score(dinov2_test_labels, dinov2_predictions, average='weighted', zero_division=0)

print("\nDINOv2 Classifier Performance:")
print(f"Accuracy: {accuracy_dinov2:.4f}")
print(f"Precision (weighted): {precision_dinov2:.4f}")
print(f"Recall (weighted): {recall_dinov2:.4f}")
print(f"F1-Score (weighted): {f1_dinov2:.4f}")

## Train and Evaluate I-JEPA


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

print("Imported LogisticRegression and evaluation metrics from sklearn.")

In [None]:
print("Starting I-JEPA classifier training and evaluation...")

logistic_regression_model_ijepa = LogisticRegression(max_iter=1000, solver='lbfgs', random_state=42)

logistic_regression_model_ijepa.fit(ijepa_train_features, ijepa_train_labels)
print("I-JEPA Logistic Regression model trained.")

ijepa_predictions = logistic_regression_model_ijepa.predict(ijepa_test_features)
print("Predictions made on I-JEPA test features.")

accuracy_ijepa = accuracy_score(ijepa_test_labels, ijepa_predictions)
precision_ijepa = precision_score(ijepa_test_labels, ijepa_predictions, average='weighted', zero_division=0)
recall_ijepa = recall_score(ijepa_test_labels, ijepa_predictions, average='weighted', zero_division=0)
f1_ijepa = f1_score(ijepa_test_labels, ijepa_predictions, average='weighted', zero_division=0)

print("\nI-JEPA Classifier Performance:")
print(f"Accuracy: {accuracy_ijepa:.4f}")
print(f"Precision (weighted): {precision_ijepa:.4f}")
print(f"Recall (weighted): {recall_ijepa:.4f}")
print(f"F1-Score (weighted): {f1_ijepa:.4f}")

## Train and Evaluate Sapiens Classifier


In [None]:
print("Starting Sapiens classifier training and evaluation...")

logistic_regression_model_sapiens = LogisticRegression(max_iter=1000, solver='lbfgs', random_state=42)

logistic_regression_model_sapiens.fit(sapiens_train_features, sapiens_train_labels)
print("Sapiens Logistic Regression model trained.")

sapiens_predictions = logistic_regression_model_sapiens.predict(sapiens_test_features)
print("Predictions made on Sapiens test features.")

accuracy_sapiens = accuracy_score(sapiens_test_labels, sapiens_predictions)
precision_sapiens = precision_score(sapiens_test_labels, sapiens_predictions, average='weighted', zero_division=0)
recall_sapiens = recall_score(sapiens_test_labels, sapiens_predictions, average='weighted', zero_division=0)
f1_sapiens = f1_score(sapiens_test_labels, sapiens_predictions, average='weighted', zero_division=0)

print("\nSapiens Classifier Performance:")
print(f"Accuracy: {accuracy_sapiens:.4f}")
print(f"Precision (weighted): {precision_sapiens:.4f}")
print(f"Recall (weighted): {recall_sapiens:.4f}")
print(f"F1-Score (weighted): {f1_sapiens:.4f}")

In [None]:
print("Starting DINOv3 classifier training and evaluation...")

logistic_regression_model_dinov3 = LogisticRegression(max_iter=1000, solver='lbfgs', random_state=42)
logistic_regression_model_dinov3.fit(dinov3_train_features, dinov3_train_labels)

dinov3_predictions = logistic_regression_model_dinov3.predict(dinov3_test_features)

accuracy_dinov3 = accuracy_score(dinov3_test_labels, dinov3_predictions)
precision_dinov3 = precision_score(dinov3_test_labels, dinov3_predictions, average='weighted', zero_division=0)
recall_dinov3 = recall_score(dinov3_test_labels, dinov3_predictions, average='weighted', zero_division=0)
f1_dinov3 = f1_score(dinov3_test_labels, dinov3_predictions, average='weighted', zero_division=0)

print("\nDINOv3 Classifier Performance:")
print(f"Accuracy: {accuracy_dinov3:.4f}")
print(f"Precision (weighted): {precision_dinov3:.4f}")
print(f"Recall (weighted): {recall_dinov3:.4f}")
print(f"F1-Score (weighted): {f1_dinov3:.4f}")


## Compare Model Performance


In [None]:
import pandas as pd

metrics_data = {
    'Model': ['I-JEPA', 'Sapiens', 'DINOv2', 'DINOv3'],
    'Accuracy': [accuracy_ijepa, accuracy_sapiens, accuracy_dinov2, accuracy_dinov3],
    'Precision': [precision_ijepa, precision_sapiens, precision_dinov2, precision_dinov3],
    'Recall': [recall_ijepa, recall_sapiens, recall_dinov2, recall_dinov3],
    'F1-Score': [f1_ijepa, f1_sapiens, f1_dinov2, f1_dinov3]
}

performance_df = pd.DataFrame(metrics_data)

print("Performance metrics stored in DataFrame:")
print(performance_df)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style("whitegrid")

plt.figure(figsize=(8, 6))
sns.barplot(x='Model', y='Accuracy', hue='Model', data=performance_df, palette='viridis', legend=False)
plt.title('Model Accuracy Comparison: I-JEPA, Sapiens, DINOv2, DINOv3')
plt.xlabel('Model')
plt.ylabel('Accuracy')
plt.ylim(0.90, 1.0)
plt.show()

print("Accuracy comparison chart displayed.")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style("whitegrid")

fig, axes = plt.subplots(1, 3, figsize=(24, 6))

sns.barplot(x='Model', y='Precision', hue='Model', data=performance_df, palette='viridis', ax=axes[0], legend=False)
axes[0].set_title('Model Precision Comparison')
axes[0].set_xlabel('Model')
axes[0].set_ylabel('Precision')
axes[0].set_ylim(0.90, 1.0)

sns.barplot(x='Model', y='Recall', hue='Model', data=performance_df, palette='viridis', ax=axes[1], legend=False)
axes[1].set_title('Model Recall Comparison')
axes[1].set_xlabel('Model')
axes[1].set_ylabel('Recall')
axes[1].set_ylim(0.90, 1.0)

sns.barplot(x='Model', y='F1-Score', hue='Model', data=performance_df, palette='viridis', ax=axes[2], legend=False)
axes[2].set_title('Model F1-Score Comparison')
axes[2].set_xlabel('Model')
axes[2].set_ylabel('F1-Score')
axes[2].set_ylim(0.90, 1.0)

plt.tight_layout()
plt.show()

print("Precision, Recall, and F1-Score comparison charts displayed.")

# LoRA Adapters

In [None]:
!pip install peft transformers accelerate timm

In [None]:
!pip install peft accelerate bitsandbytes timm -q

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, TaskType
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np
from PIL import Image
from tqdm import tqdm
import timm
import os

In [None]:
# Print every sub-module name of the DINOv3 backbone so the LoRA
# `target_modules` can be chosen. `base_model` is only created in a later
# cell, so on a fresh kernel this cell must inspect the DINOv3 model that the
# linear-probe section has already loaded.
for name, _module in model_dinov3.named_modules():
    print(name)


In [None]:
import pandas as pd

df = pd.read_csv("sign_mnist_train.csv")
print(sorted(df["label"].unique()))
print("Num classes =", len(df["label"].unique()))


In [None]:
import pandas as pd
import numpy as np

df_train = pd.read_csv("sign_mnist_train.csv")
df_test = pd.read_csv("sign_mnist_test.csv")

unique_labels = sorted(df_train["label"].unique())
label_map = {old: new for new, old in enumerate(unique_labels)}

print("Label map:", label_map)

# Use separate variables for remapped labels so linear-probe comparison (train_labels/test_labels) is unchanged
train_labels_remapped = np.array([label_map[l] for l in train_labels])
test_labels_remapped = np.array([label_map[l] for l in test_labels])

print("New train label range:", train_labels_remapped.min(), train_labels_remapped.max())
print("New test  label range:", test_labels_remapped.min(), test_labels_remapped.max())
print("Unique:", np.unique(train_labels_remapped))



In [None]:
from huggingface_hub import login
login(new_session=False)

In [None]:
import torch
from transformers import AutoModel
from peft import LoraConfig, get_peft_model

device = "cuda" if torch.cuda.is_available() else "cpu"

print("Loading DinoV3 backbone...")
base_model = AutoModel.from_pretrained(
    "facebook/dinov3-vitb16-pretrain-lvd1689m",
    trust_remote_code=True
)

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],  
    task_type="FEATURE_EXTRACTION",
)

print("Injecting LoRA...")
model_dinov3 = get_peft_model(base_model, lora_config)
model_dinov3.to(device)
model_dinov3.print_trainable_parameters()


In [None]:
import torch.nn as nn

num_classes = 24 

class DinoV3Classifier(nn.Module):
    """Backbone followed by a linear head applied to the first output token.

    Args:
        backbone: Module called as ``backbone(pixel_values=x)`` whose result
            exposes ``last_hidden_state``; its ``config.hidden_size`` sets the
            head's input width.
        num_classes: Number of output logits.
    """

    def __init__(self, backbone, num_classes):
        super().__init__()
        self.backbone = backbone
        self.classifier = nn.Linear(backbone.config.hidden_size, num_classes)

    def forward(self, x):
        """Return class logits of shape (batch, num_classes) for input ``x``."""
        hidden_states = self.backbone(pixel_values=x).last_hidden_state
        # Token 0 is used as the image-level representation.
        return self.classifier(hidden_states[:, 0, :])



model = DinoV3Classifier(model_dinov3, num_classes).to(device)


In [None]:
# (DinoV3Classifier is built from model_dinov3 + classifier head in the cell above)

In [None]:
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image

# ImageNet normalization to match DINO/I-JEPA backbone preprocessing
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])
class SignDataset(Dataset):
    """Dataset over flattened 28x28 pixel rows and their integer labels.

    Each item is the row reshaped to 28x28, cast to uint8, converted to an
    RGB PIL image and passed through the module-level ``transform``; the label
    is returned as a plain ``int``.
    """

    def __init__(self, pixels, labels):
        self.pixels, self.labels = pixels, labels

    def __len__(self):
        return len(self.pixels)

    def __getitem__(self, idx):
        gray = self.pixels[idx].reshape(28, 28).astype(np.uint8)
        rgb = Image.fromarray(gray).convert("RGB")
        return transform(rgb), int(self.labels[idx])

train_ds = SignDataset(train_pixels, train_labels_remapped)
test_ds  = SignDataset(test_pixels, test_labels_remapped)

train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
test_loader  = DataLoader(test_ds, batch_size=64)


In [None]:
train_ds = SignDataset(train_pixels, train_labels_remapped)
test_ds  = SignDataset(test_pixels, test_labels_remapped)

train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
test_loader  = DataLoader(test_ds, batch_size=64)


In [None]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)


In [None]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

for epoch in range(50):
    model.train()
    total, correct = 0, 0

    for X, y in train_loader:
        X, y = X.to(device), y.to(device)

        optimizer.zero_grad()
        preds = model(X)
        loss = criterion(preds, y)
        loss.backward()
        optimizer.step()

        _, predicted = preds.max(1)
        correct += predicted.eq(y).sum().item()
        total += y.size(0)

    print(f"Epoch {epoch+1} | Train Accuracy = {correct/total:.4f}")


In [None]:
model.eval()
correct = total = 0

with torch.no_grad():
    for X, y in test_loader:
        X, y = X.to(device), y.to(device)
        preds = model(X)
        _, predicted = preds.max(1)
        correct += predicted.eq(y).sum().item()
        total += y.size(0)

print(f"Test Accuracy = {correct/total:.4f}")


In [None]:
def extract_embedding_array(img_array, model, transform, device=None):
    """Return the first-token embedding of one image as a 1-D tensor.

    Args:
        img_array: Pixel data. A 1-D ndarray is reshaped to 28x28. Values are
            cast straight to uint8 with no rescaling, so 0-255 data is expected.
        model: Module called as ``model(pixel_values=...)`` whose output
            exposes ``last_hidden_state``.
        transform: Callable mapping an RGB PIL image to a (C, H, W) tensor.
        device: Device for the input tensor. When ``None`` (default) the
            device of the model's first parameter is used, falling back to CPU
            for parameterless models, so the call also works without CUDA.
    Returns:
        torch.Tensor: Embedding of shape (hidden_dim,), on ``device``.
    """
    if device is None:
        # Follow the model rather than assuming a GPU is present.
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else "cpu"

    if isinstance(img_array, np.ndarray) and img_array.ndim == 1:
        img_array = img_array.reshape(28, 28)

    img = Image.fromarray(img_array.astype(np.uint8)).convert("RGB")
    tensor = transform(img).unsqueeze(0).to(device)

    with torch.no_grad():
        out = model(pixel_values=tensor)
        cls = out.last_hidden_state[:, 0, :]
    return cls.squeeze(0)


# Retrain

In [None]:
train_losses = []
train_accuracies = []

for epoch in range(50):
    model.train()
    total, correct = 0, 0
    epoch_loss = 0.0

    for X, y in train_loader:
        X, y = X.to(device), y.to(device)

        optimizer.zero_grad()
        preds = model(X)
        loss = criterion(preds, y)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

        _, predicted = preds.max(1)
        correct += predicted.eq(y).sum().item()
        total += y.size(0)

    avg_loss = epoch_loss / len(train_loader)
    acc = correct / total

    train_losses.append(avg_loss)
    train_accuracies.append(acc)

    print(f"Epoch {epoch+1} | Loss: {avg_loss:.4f} | Train Acc: {acc:.4f}")


In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(12,5))

plt.subplot(1, 2, 1)
plt.plot(train_accuracies, marker='o')
plt.title("Training Accuracy")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")


plt.subplot(1, 2, 2)
plt.plot(train_losses, marker='o', color='red')
plt.title("Training Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")

plt.show()


In [None]:
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for X, y in test_loader:
        X, y = X.to(device), y.to(device)
        preds = model(X)
        _, predicted = preds.max(1)

        all_preds.extend(predicted.cpu().numpy())
        all_labels.extend(y.cpu().numpy())

test_accuracy = sum(np.array(all_preds) == np.array(all_labels)) / len(all_labels)
print("Test Accuracy:", test_accuracy)


In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

print("\nClassification Report:")
print(classification_report(all_labels, all_preds))

cm = confusion_matrix(all_labels, all_preds)

plt.figure(figsize=(12,10))
sns.heatmap(cm, annot=False, cmap="Blues")
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()


In [None]:
# Save LoRA + classifier checkpoint for Evaluation section (final_visualization)
torch.save({"model_state_dict": model.state_dict()}, "dinov3_lora_gesture.pth")
pth_file = "dinov3_lora_gesture.pth"
print("Checkpoint saved to", pth_file)

In [None]:
import torch
import numpy as np
from tqdm import tqdm

model.eval()

all_embeddings = []
all_tsne_labels = []

with torch.no_grad():
    for X, y in tqdm(test_loader, desc="Extracting embeddings"):
        X = X.to(device)

        out = model.backbone(pixel_values=X)
        cls_token = out.last_hidden_state[:, 0, :]    

        all_embeddings.append(cls_token.cpu().numpy())
        all_tsne_labels.append(y.numpy())

all_embeddings = np.vstack(all_embeddings)
all_tsne_labels = np.concatenate(all_tsne_labels)

print("Embeddings shape:", all_embeddings.shape)


In [None]:
from sklearn.manifold import TSNE

tsne = TSNE(
    n_components=2,
    learning_rate='auto',
    init='pca',
    perplexity=30,
    random_state=42
)

emb_2d = tsne.fit_transform(all_embeddings)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(12, 10))

palette = sns.color_palette("hls", num_classes)

sns.scatterplot(
    x=emb_2d[:, 0],
    y=emb_2d[:, 1],
    hue=all_tsne_labels,
    palette=palette,
    legend="full",
    s=12,
)

plt.title("t-SNE Visualization of DINOv3 + LoRA Gesture Embeddings", fontsize=16)
plt.xlabel("TSNE-1")
plt.ylabel("TSNE-2")
plt.legend(title="Class", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()


# LoRA towards the end

In [None]:
import torch
from transformers import AutoModel
from peft import LoraConfig, get_peft_model

In [None]:
import torch
from transformers import AutoModel
from peft import LoraConfig, get_peft_model

device = "cuda" if torch.cuda.is_available() else "cpu"

print("Loading DinoV3 backbone...")
base_model = AutoModel.from_pretrained(
    "facebook/dinov3-vitb16-pretrain-lvd1689m",
    trust_remote_code=True
)

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=[
        "layer.11.attention.q_proj",
        "layer.11.attention.k_proj",
        "layer.11.attention.v_proj",
        "layer.11.attention.o_proj",
    ],
    task_type="FEATURE_EXTRACTION",
)


print("Injecting LoRA...")
model_dinov3 = get_peft_model(base_model, lora_config)
model_dinov3.to(device)
model_dinov3.print_trainable_parameters()


In [None]:
# Rebuild the classifier around the backbone that was just given last-layer
# LoRA adapters. Without this, `model` still wraps the backbone from the
# earlier all-layer LoRA run, so this loop would keep training the old model
# and the new `model_dinov3` would never be updated.
model = DinoV3Classifier(model_dinov3, num_classes).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

for epoch in range(20):
    model.train()
    total, correct = 0, 0

    for X, y in train_loader:
        X, y = X.to(device), y.to(device)

        optimizer.zero_grad()
        preds = model(X)
        loss = criterion(preds, y)
        loss.backward()
        optimizer.step()

        # Running training accuracy over the epoch.
        _, predicted = preds.max(1)
        correct += predicted.eq(y).sum().item()
        total += y.size(0)

    print(f"Epoch {epoch+1} | Train Accuracy = {correct/total:.4f}")


In [None]:
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for X, y in test_loader:
        X, y = X.to(device), y.to(device)
        preds = model(X)
        _, predicted = preds.max(1)

        all_preds.extend(predicted.cpu().numpy())
        all_labels.extend(y.cpu().numpy())

test_accuracy = sum(np.array(all_preds) == np.array(all_labels)) / len(all_labels)
print("Final Test Accuracy:", test_accuracy)


In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt


print("\nClassification Report:")
print(classification_report(all_labels, all_preds))

cm = confusion_matrix(all_labels, all_preds)

plt.figure(figsize=(12,10))
sns.heatmap(cm, annot=False, cmap="Blues")
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()


In [None]:
import torch
import cv2
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from transformers import AutoModel

def compare_adapter_effect(peft_model, image_input, transform, device, patch_size=16):
    """Plot an image next to base-model and LoRA-model attention overlays.

    Two fresh copies of the backbone are loaded from the checkpoint name stored
    in ``peft_model``'s config: one is used as-is, the other is wrapped with
    ``peft_model``'s 'default' LoRA config and loaded with its weights. For
    each, the attention returned by the model is reduced to a 2-D map by
    ``process_attention`` and drawn over the image.

    Args:
        peft_model: LoRA-wrapped model (or plain model) whose config and
            state dict are copied.
        image_input: PIL image, or ndarray that is multiplied by 255 and cast
            to uint8 (i.e. treated as [0, 1] data).
        transform: Callable turning the PIL image into a (C, H, W) tensor.
        device: Device the temporary models and the input are moved to.
        patch_size: Patch edge length forwarded to ``process_attention``.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    print("DEBUG: Starting comparison (Base vs. Adapter)...")

    if isinstance(image_input, np.ndarray):
        img_pil = Image.fromarray((image_input * 255).astype(np.uint8))
    else:
        img_pil = image_input
    img_tensor = transform(img_pil).unsqueeze(0).to(device)

    # Locate the underlying model config whether or not the model is PEFT-wrapped.
    if hasattr(peft_model, 'base_model') and hasattr(peft_model.base_model, 'model'):
        base_config = peft_model.base_model.model.config
    else:
        base_config = peft_model.config
    model_id = base_config._name_or_path

    print("DEBUG: Loading Base Model (Frozen)...")
    # NOTE(review): "eager" attention is presumably requested so that attention
    # weights are actually returned with output_attentions=True — confirm.
    base_model = AutoModel.from_pretrained(
        model_id,
        attn_implementation="eager", 
        trust_remote_code=True
    ).to(device)
    base_model.eval()

    with torch.no_grad():
        out_base = base_model(img_tensor, output_attentions=True)
        attn_base = process_attention(out_base, img_tensor, patch_size)

    # Drop the local reference to the base copy before loading the second model.
    del base_model 


    print("DEBUG: Loading Adapter Model (LoRA)...")
    
    lora_shadow = AutoModel.from_pretrained(
        model_id,
        attn_implementation="eager", 
        trust_remote_code=True
    ).to(device)

    try:
       
        from peft import PeftModel

        lora_shadow = PeftModel(lora_shadow, peft_model.peft_config['default'])

        # strict=False: keys that do not match between the two models are
        # skipped silently rather than raising.
        lora_shadow.load_state_dict(peft_model.state_dict(), strict=False)
        lora_shadow.eval()

        with torch.no_grad():
            out_lora = lora_shadow.base_model.model(img_tensor, output_attentions=True)
            attn_lora = process_attention(out_lora, img_tensor, patch_size)

    except Exception as e:
        # Best-effort fallback: the third panel then repeats the base attention
        # map even though its title still says "Adapter (LoRA)".
        print(f"Warning: Could not fully merge LoRA weights ({e}). Showing Base only.")
        attn_lora = attn_base 

    fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 5))
    ax1.imshow(img_pil)
    ax1.set_title("Original Image")
    ax1.axis('off')

    # Attention maps are resized to the PIL image size and drawn half-transparent.
    ax2.imshow(img_pil)
    ax2.imshow(resize_map(attn_base, img_pil.size), cmap='jet', alpha=0.5)
    ax2.set_title("DINOv3 (Base/Frozen)")
    ax2.axis('off')

    ax3.imshow(img_pil)
    ax3.imshow(resize_map(attn_lora, img_pil.size), cmap='jet', alpha=0.5)
    ax3.set_title("DINOv3 + Adapter (LoRA)")
    ax3.axis('off')

    plt.show()


    # Release the temporary LoRA copy and return cached GPU memory.
    del lora_shadow
    torch.cuda.empty_cache()


def process_attention(outputs, img_tensor, patch_size):
    """Convert the last layer's CLS-token attention into a 2-D patch heat map.

    Args:
        outputs: Model output exposing ``attentions``; only the last entry is
            used. It is indexed as ``(batch, heads, tokens, tokens)`` and the
            batch axis is squeezed away, so a batch size of 1 is assumed.
        img_tensor: Model input indexed as ``(batch, channels, height, width)``;
            only the two spatial sizes are read.
        patch_size: Side length of one patch, in pixels.

    Returns:
        A ``float32`` NumPy array. Its shape is
        ``(height // patch_size, width // patch_size)`` when the number of
        patch tokens matches that grid; otherwise the tokens are reshaped to a
        square grid, as before.
    """
    attentions = outputs.attentions[-1]
    # Average over heads, drop the batch axis, then keep the first token's row
    # while skipping its attention to itself.
    attn_mean = attentions.mean(dim=1).squeeze(0)
    cls_attn = attn_mean[0, 1:]

    # shape[2] is the height axis and shape[3] the width axis.
    rows = img_tensor.shape[2] // patch_size
    cols = img_tensor.shape[3] // patch_size
    num_expected = rows * cols

    # Extra leading tokens (e.g. register tokens) are dropped by keeping only
    # the trailing ``num_expected`` entries.
    if cls_attn.shape[0] > num_expected:
        patch_attn = cls_attn[-num_expected:]
    else:
        patch_attn = cls_attn

    if patch_attn.shape[0] != num_expected:
        # Token count does not match the image-derived grid: fall back to the
        # square layout so the previous behaviour is preserved.
        rows = cols = int(np.sqrt(patch_attn.shape[0]))

    # Using (rows, cols) rather than a forced square keeps non-square inputs
    # from failing at reshape time.
    return patch_attn.reshape(rows, cols).detach().cpu().float().numpy()

def resize_map(attn_map, size):
    """Resize ``attn_map`` to ``size`` with OpenCV and min-max scale the result.

    ``size`` is passed straight to ``cv2.resize`` as its target size. A small
    epsilon in the denominator avoids dividing by zero on a constant map.
    """
    resized = cv2.resize(attn_map, size)
    lowest = resized.min()
    span = resized.max() - resized.min() + 1e-8
    return (resized - lowest) / span
# Pick one random test sample and plot its base vs. LoRA attention maps.
# NOTE(review): no random seed is set in this cell, so the chosen index
# changes from run to run.
idx = np.random.randint(0, len(test_images))
print(f"Visualizing Index: {idx}")
compare_adapter_effect(model_dinov3, test_images[idx], transform_dinov3, device)

# Evaluation below

In [None]:
import torch
import torch.nn as nn
from transformers import AutoModel
import math


class LoRALayer(nn.Module):
    """Low-rank residual branch computing ``(x @ A^T @ B^T) * (alpha / rank)``.

    ``lora_A`` has shape ``(rank, in_features)`` and is Kaiming-uniform
    initialised. ``lora_B`` has shape ``(out_features, rank)`` and starts at
    zero, so a freshly built layer outputs zeros.
    """

    def __init__(self, in_features, out_features, rank=4, alpha=16):
        super().__init__()
        self.rank, self.alpha = rank, alpha
        self.scaling = alpha / rank

        down_proj = torch.zeros(rank, in_features)
        up_proj = torch.zeros(out_features, rank)
        self.lora_A = nn.Parameter(down_proj)
        self.lora_B = nn.Parameter(up_proj)

        # Random down-projection, zero up-projection.
        nn.init.kaiming_uniform_(self.lora_A, a=math.sqrt(5))
        nn.init.zeros_(self.lora_B)

    def forward(self, x):
        """Project ``x`` down to ``rank`` features, back up, then scale."""
        reduced = torch.matmul(x, self.lora_A.t())
        restored = torch.matmul(reduced, self.lora_B.t())
        return restored * self.scaling


class _LoRALinear(nn.Module):
    """Add a trainable low-rank correction to the output of an existing ``nn.Linear``."""

    def __init__(self, base, rank):
        super().__init__()
        self.base = base
        self.lora = LoRALayer(base.in_features, base.out_features, rank=rank)

    def forward(self, x):
        return self.base(x) + self.lora(x)


class Dinov3WithCustomLoRA(nn.Module):
    """Pretrained backbone with hand-rolled LoRA adapters and a linear classifier.

    Previously ``lora_rank`` was ignored and the only loop over the backbone
    was a no-op, so no adapter was ever applied. Adapters are now attached to
    every ``nn.Linear`` in the backbone whose attribute name is listed in
    ``target_modules``.

    Args:
        base_model_name: Identifier passed to ``AutoModel.from_pretrained``.
        num_classes: Number of outputs of the classification head.
        lora_rank: Rank of each injected ``LoRALayer``.
        target_modules: Attribute names of the linear layers to adapt. A single
            string is treated as one name.

    Notes:
        When at least one adapter is injected, all pre-existing backbone
        parameters are frozen, leaving the adapters and the classifier
        trainable. When no layer matches, a warning is issued and the backbone
        is left untouched and fully trainable.
    """

    def __init__(self, base_model_name, num_classes, lora_rank=4,
                 target_modules=("q_proj", "v_proj")):
        import warnings  # local import: keeps this block self-contained

        super().__init__()
        self.dinov3 = AutoModel.from_pretrained(base_model_name, trust_remote_code=True)

        wanted = (target_modules,) if isinstance(target_modules, str) else tuple(target_modules)

        # Collect the names first: replacing modules while iterating over
        # ``named_modules()`` would mutate the structure being walked.
        target_names = [
            name
            for name, module in self.dinov3.named_modules()
            if isinstance(module, nn.Linear) and name.rsplit(".", 1)[-1] in wanted
        ]

        if target_names:
            # Freeze before injecting so the new LoRA parameters stay trainable.
            for param in self.dinov3.parameters():
                param.requires_grad = False

            for name in target_names:
                parent_name, _, child_name = name.rpartition(".")
                parent = self.dinov3.get_submodule(parent_name) if parent_name else self.dinov3
                setattr(parent, child_name, _LoRALinear(getattr(parent, child_name), lora_rank))
        else:
            warnings.warn(
                f"No nn.Linear layers named {wanted} were found in '{base_model_name}'; "
                "no LoRA adapters were injected and the backbone is left fully trainable."
            )

        self.classifier = nn.Linear(self.dinov3.config.hidden_size, num_classes)

    def forward(self, x):
        """Classify from the first token of the backbone's last hidden state."""
        outputs = self.dinov3(x)
        cls_token = outputs.last_hidden_state[:, 0, :]
        return self.classifier(cls_token)

In [None]:
import torch
import numpy as np
import cv2
import matplotlib.pyplot as plt
from PIL import Image
from transformers import AutoModel
from peft import LoraConfig, get_peft_model

# Checkpoint to visualize: reuse `pth_file` if an earlier cell (e.g. the save
# cell) already defined it; otherwise fall back to the default file name.
if "pth_file" not in globals():
    pth_file = "dinov3_lora_gesture.pth"

def final_visualization(checkpoint_path, image, transform, device):
    """Plot attention and Grad-CAM maps for the base model and the LoRA model.

    Loads the pretrained backbone twice. One copy is used as-is; the other is
    wrapped with PEFT LoRA and receives every checkpoint entry whose key
    contains ``"lora"``. A 2x3 figure is shown: input, base map and LoRA map,
    for attention (top row) and Grad-CAM (bottom row).

    Args:
        checkpoint_path: Path given to ``torch.load``. Either a state dict or a
            dict holding one under ``"model_state_dict"``.
        image: NumPy array shown with ``imshow`` and converted to a PIL image
            via ``(image * 255).astype(np.uint8)`` -- assumes values in [0, 1].
        transform: Callable applied to the PIL image; its result is unsqueezed
            to a batch of one -- presumably it returns a ``(C, H, W)`` tensor.
        device: Device on which the models and the input are placed.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    print("=== FINAL VISUALIZATION RUN ===")

    model_id = "facebook/dinov3-vitb16-pretrain-lvd1689m"

    print("1. Loading Base Model...")
    base_model = AutoModel.from_pretrained(
        model_id,
        trust_remote_code=True,
        attn_implementation="eager"
    ).to(device)
    base_model.eval()

    print("2. Preparing LoRA Model Shell...")
    lora_base = AutoModel.from_pretrained(
        model_id,
        trust_remote_code=True,
        attn_implementation="eager"
    )

    # NOTE(review): these hyper-parameters must match the ones used when the
    # checkpoint was trained, otherwise the LoRA tensors will not load.
    config = LoraConfig(
        r=16,
        lora_alpha=16,
        target_modules=["q_proj", "v_proj", "k_proj"],
        lora_dropout=0.1,
        bias="none"
    )
    lora_model = get_peft_model(lora_base, config)

    print("3. Injecting Weights Manually...")
    # SECURITY: weights_only=False unpickles arbitrary Python objects. Only use
    # this with checkpoints you created yourself or otherwise trust.
    raw_state_dict = torch.load(checkpoint_path, map_location=device, weights_only=False)
    if "model_state_dict" in raw_state_dict:
        raw_state_dict = raw_state_dict["model_state_dict"]

    # Keep only LoRA tensors and strip the "backbone." prefix from their keys.
    clean_dict = {
        key.replace("backbone.", ""): value
        for key, value in raw_state_dict.items()
        if "lora" in key
    }
    lora_count = len(clean_dict)

    print(f"   Found {lora_count} LoRA keys to inject.")

    incompatible = lora_model.load_state_dict(clean_dict, strict=False)
    if incompatible.unexpected_keys:
        # strict=False silently drops these; surface them so a key-mapping
        # problem is visible instead of producing a base-model-like plot.
        print(f"   ⚠️ WARNING: {len(incompatible.unexpected_keys)} checkpoint keys did not "
              f"match the LoRA model (e.g. '{incompatible.unexpected_keys[0]}').")

    # lora_B starts at zero, so a non-zero lora_B means weights were loaded.
    active_params = sum(
        1
        for name, param in lora_model.named_parameters()
        if "lora_B" in name and param.abs().sum() > 0
    )

    if active_params > 0:
        print(f"   ✓ SUCCESS! {active_params} LoRA layers are active/loaded.")
    else:
        print("   ⚠️ WARNING: Weights still seem zero. Check key mapping manually.")

    lora_model.to(device)
    lora_model.eval()

    img_tensor = transform(Image.fromarray((image * 255).astype(np.uint8))).unsqueeze(0).to(device)

    # Derive the patch grid from the actual input instead of hard-coding 196.
    patch_size = getattr(base_model.config, "patch_size", 16)
    grid_rows = img_tensor.shape[2] // patch_size
    grid_cols = img_tensor.shape[3] // patch_size
    num_patches = grid_rows * grid_cols

    # Heat maps are resized to the displayed image so the overlays line up
    # with it; cv2.resize expects (width, height).
    img_h, img_w = image.shape[0], image.shape[1]
    overlay_size = (img_w, img_h)

    def to_patch_grid(token_scores):
        """Drop leading non-patch tokens and reshape to a 2-D CPU float tensor."""
        if 0 < num_patches < token_scores.shape[0]:
            token_scores = token_scores[-num_patches:]
        if num_patches and token_scores.shape[0] == num_patches:
            rows, cols = grid_rows, grid_cols
        else:
            rows = cols = int(np.sqrt(token_scores.shape[0]))
        return token_scores.reshape(rows, cols).detach().cpu().float()

    def to_overlay(grid):
        """Resize a 2-D grid tensor to the displayed image size."""
        return cv2.resize(grid.numpy(), overlay_size)

    print("4. Generating Maps...")

    def get_attn(m):
        """Head-averaged last-layer attention from the first token to the patches."""
        # Unwrap PEFT containers so the underlying model is called directly.
        if hasattr(m, 'base_model'):
            m = m.base_model

        if hasattr(m, 'model'):
            m = m.model

        with torch.no_grad():
            out = m(img_tensor, output_attentions=True)

        att = out.attentions[-1].mean(1).squeeze(0)[0, 1:]
        return to_overlay(to_patch_grid(att))

    def get_gradcam(m):
        """Grad-CAM over the final norm layer, scored by the mean first-token feature."""
        blank = np.zeros((img_h, img_w))
        grads, acts = [], []

        target = None
        if hasattr(m, 'norm'):
            target = m.norm
        elif hasattr(m, 'base_model') and hasattr(m.base_model, 'norm'):
            target = m.base_model.norm
        elif hasattr(m, 'base_model') and hasattr(m.base_model, 'model'):
            target = m.base_model.model.norm

        if target is None:
            return blank

        def backward_hook(mod, grad_in, grad_out):
            grads.append(grad_out[0])

        def forward_hook(mod, inputs, output):
            acts.append(output)

        bwd_handle = target.register_full_backward_hook(backward_hook)
        fwd_handle = target.register_forward_hook(forward_hook)
        try:
            m.zero_grad()
            out = m(img_tensor)

            hid = out.last_hidden_state if hasattr(out, 'last_hidden_state') else out[0]
            score = hid[:, 0, :].mean()
            score.backward()
        finally:
            # Always detach the hooks, including on early exit or error, so
            # they do not pile up on the module across calls.
            bwd_handle.remove()
            fwd_handle.remove()
            m.zero_grad()

        if not grads or not acts:
            return blank

        g = grads[0].detach()
        a = acts[0].detach()

        # Channel weights = gradient averaged over tokens; the map is the
        # weighted sum of activations over channels.
        cam = (g.mean(1, keepdim=True) * a).sum(2).squeeze(0)
        cam = torch.relu(to_patch_grid(cam))

        # Min-max normalise. The divisor was previously cam.max(), which does
        # not map the maximum to 1 whenever the minimum is above zero.
        lo, hi = cam.min(), cam.max()
        if hi > 0:
            cam = (cam - lo) / (hi - lo + 1e-8)

        return to_overlay(cam)

    att_base = get_attn(base_model)
    att_lora = get_attn(lora_model)
    gcam_base = get_gradcam(base_model)
    gcam_lora = get_gradcam(lora_model)

    panels = [
        [("Input", None), ("Base Attention", att_base), ("LoRA Attention", att_lora)],
        [("Input", None), ("Base Grad-CAM", gcam_base), ("LoRA Grad-CAM", gcam_lora)],
    ]

    fig, ax = plt.subplots(2, 3, figsize=(12, 8))
    for row_idx, row in enumerate(panels):
        for col_idx, (title, overlay) in enumerate(row):
            ax[row_idx, col_idx].imshow(image)
            if overlay is not None:
                ax[row_idx, col_idx].imshow(overlay, cmap='jet', alpha=0.5)
            ax[row_idx, col_idx].set_title(title)

    plt.tight_layout()
    plt.show()

# Run the final base-vs-LoRA visualization on a randomly chosen test image.
# NOTE(review): no random seed is set in this cell, so the chosen index
# changes from run to run.
idx = np.random.randint(0, len(test_images))
final_visualization(pth_file, test_images[idx], transform_dinov3, device)

