In [2]:
# train.py
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms, models
from torch.utils.data import DataLoader, Subset
from sklearn.model_selection import train_test_split

# ---------- Config ----------
DATA_DIR = "/Users/bhara-zstch1566/CNN/Eval/final dataset"  # root containing subfolders per class
BATCH_SIZE = 32
NUM_EPOCHS = 5
LR = 1e-3
DEVICE = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
MODEL_SAVE_PATH = "Model/model_state.pth"
NUM_CLASSES = 3

# ---------- Transforms ----------
# Augment only for training. Val/test gets only resize+norm.
train_transform = transforms.Compose([
    transforms.Resize((128, 128)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(15),
    transforms.ColorJitter(brightness=0.3, contrast=0.3, saturation=0.2),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
])

val_transform = transforms.Compose([
    transforms.Resize((128, 128)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
])

# ---------- Dataset and split ----------
full_dataset = datasets.ImageFolder(DATA_DIR, transform=train_transform)  # transform used only for train subset
class_names = full_dataset.classes
print("Classes (detected):", class_names)

# create train/val split indices
indices = list(range(len(full_dataset)))
train_idx, val_idx = train_test_split(indices, test_size=0.15, random_state=42, stratify=[full_dataset.targets[i] for i in indices])

# Subsets: train uses train_transform; val we override transform
train_subset = Subset(full_dataset, train_idx)
val_subset = Subset(full_dataset, val_idx)
# Override validation transform by directly modifying dataset.dataset.transform for val subset
# (simpler approach: make a second ImageFolder and slice, but this works)
full_dataset_for_val = datasets.ImageFolder(DATA_DIR, transform=val_transform)
val_subset = Subset(full_dataset_for_val, val_idx)

train_loader = DataLoader(train_subset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2, pin_memory=True)
val_loader = DataLoader(val_subset, batch_size=BATCH_SIZE, shuffle=False, num_workers=2, pin_memory=True)

# ---------- Model ----------
class AlexNetSmall(nn.Module):
    def __init__(self, num_classes=3):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3,64,11,stride=2,padding=2), nn.ReLU(), nn.MaxPool2d(2,2),
            nn.Conv2d(64,128,5,stride=1,padding=2), nn.ReLU(), nn.MaxPool2d(2,2),
            nn.Conv2d(128,256,3,stride=1,padding=1), nn.ReLU(), nn.MaxPool2d(2,2)
        )
        self.classifier = nn.Sequential(
            nn.Linear(256*7*7, 4096), nn.ReLU(), nn.Dropout(0.5),
            nn.Linear(4096, 4096), nn.ReLU(), nn.Dropout(0.5),
            nn.Linear(4096, num_classes)
        )
    def forward(self,x):
        x = self.features(x)
        x = x.view(x.size(0), -1)
        x = self.classifier(x)
        return x

model = AlexNetSmall(num_classes=NUM_CLASSES).to(DEVICE)

# --- Option: use MobileNetV2 (recommended)
# model = models.mobilenet_v2(pretrained=True)
# for p in model.features.parameters(): p.requires_grad = False
# model.classifier[1] = nn.Linear(model.last_channel, NUM_CLASSES)
# model = model.to(DEVICE)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LR)

# ---------- Training loop ----------
best_val_acc = 0.0
for epoch in range(NUM_EPOCHS):
    model.train()
    running_loss = 0.0
    correct, total = 0, 0
    for imgs, labels in train_loader:
        imgs, labels = imgs.to(DEVICE), labels.to(DEVICE)
        optimizer.zero_grad()
        outputs = model(imgs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        _, preds = torch.max(outputs, 1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)
    train_acc = 100 * correct / total

    # validation
    model.eval()
    v_corr, v_total = 0, 0
    with torch.no_grad():
        for imgs, labels in val_loader:
            imgs, labels = imgs.to(DEVICE), labels.to(DEVICE)
            outputs = model(imgs)
            _, preds = torch.max(outputs, 1)
            v_corr += (preds == labels).sum().item()
            v_total += labels.size(0)
    val_acc = 100 * v_corr / v_total
    print(f"Epoch {epoch+1}/{NUM_EPOCHS} | Train Loss: {running_loss/len(train_loader):.4f} | Train Acc: {train_acc:.2f}% | Val Acc: {val_acc:.2f}%")

    # save best
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        os.makedirs(os.path.dirname(MODEL_SAVE_PATH), exist_ok=True)
        torch.save(model.state_dict(), MODEL_SAVE_PATH)
        print(f"Saved best model (val_acc={val_acc:.2f}%) to {MODEL_SAVE_PATH}")

print("Training finished. Best val acc: %.2f%%" % best_val_acc)
print("Class order:", class_names)


Classes (detected): ['Mobile', 'Nothing', 'water bottle']




Epoch 1/5 | Train Loss: 0.7679 | Train Acc: 67.55% | Val Acc: 89.79%
Saved best model (val_acc=89.79%) to Model/model_state.pth
Epoch 2/5 | Train Loss: 0.3271 | Train Acc: 87.59% | Val Acc: 91.42%
Saved best model (val_acc=91.42%) to Model/model_state.pth
Epoch 3/5 | Train Loss: 0.2280 | Train Acc: 92.05% | Val Acc: 93.74%
Saved best model (val_acc=93.74%) to Model/model_state.pth
Epoch 4/5 | Train Loss: 0.2300 | Train Acc: 92.67% | Val Acc: 95.59%
Saved best model (val_acc=95.59%) to Model/model_state.pth
Epoch 5/5 | Train Loss: 0.2027 | Train Acc: 93.45% | Val Acc: 94.66%
Training finished. Best val acc: 95.59%
Class order: ['Mobile', 'Nothing', 'water bottle']


In [5]:
# inference.py
import cv2
import torch
import torch.nn.functional as F
import torchvision.transforms as transforms
from collections import deque
# from train import AlexNetSmall, class_names  # or manually set class_names = [...]

MODEL_PATH = "Model/model1.pth"
DEVICE = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

# same transform as validation (no augmentation)
transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((128, 128)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
])

# load model architecture and weights
model = AlexNetSmall(num_classes=len(class_names))
state = torch.load(MODEL_PATH, map_location=DEVICE)
model.load_state_dict(state)
model.to(DEVICE)
model.eval()

# smoothing / confidence
SMOOTH_K = 7
CONF_THRESH = 0.6
recent = deque(maxlen=SMOOTH_K)

cap = cv2.VideoCapture(0)
if not cap.isOpened():
    raise RuntimeError("Cannot open webcam")

print("Starting webcam. Press 'q' to quit.")

while True:
    ret, frame = cap.read()
    if not ret:
        break

    # Optionally crop a center square for better focus:
    h, w = frame.shape[:2]
    side = min(h, w)
    cx, cy = w//2, h//2
    x1, y1 = cx-side//2, cy-side//2
    roi = frame[y1:y1+side, x1:x1+side]

    img = cv2.cvtColor(roi, cv2.COLOR_BGR2RGB)
    img_t = transform(img).unsqueeze(0).to(DEVICE)

    with torch.no_grad():
        logits = model(img_t)
        probs = F.softmax(logits, dim=1)
        conf, pred_idx = torch.max(probs, dim=1)
        conf_value = conf.item()
        pred_idx = pred_idx.item()

    # apply thresholding: if low confidence, return "Unknown/nothing"
    if conf_value < CONF_THRESH:
        recent.append("Unknown")
    else:
        recent.append(class_names[pred_idx])

    # majority vote
    vote = max(set(recent), key=recent.count) if len(recent) > 0 else "Unknown"

    # draw on frame
    display_text = f"{vote} ({conf_value:.2f})" if vote != "Unknown" else "Unknown"
    cv2.putText(frame, display_text, (20,40), cv2.FONT_HERSHEY_SIMPLEX, 1.1, (0,255,0), 2)
    # optional draw ROI box
    cv2.rectangle(frame, (x1,y1), (x1+side, y1+side), (255,0,0), 2)

    cv2.imshow("Live", frame)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()


RuntimeError: Error(s) in loading state_dict for AlexNetSmall:
	Missing key(s) in state_dict: "features.0.weight", "features.0.bias", "features.3.weight", "features.3.bias", "features.6.weight", "features.6.bias", "classifier.0.weight", "classifier.0.bias", "classifier.3.weight", "classifier.3.bias", "classifier.6.weight", "classifier.6.bias". 
	Unexpected key(s) in state_dict: "Convlayer.0.weight", "Convlayer.0.bias", "Convlayer.3.weight", "Convlayer.3.bias", "Convlayer.6.weight", "Convlayer.6.bias", "net.0.weight", "net.0.bias", "net.3.weight", "net.3.bias", "net.6.weight", "net.6.bias". 