<a href="https://colab.research.google.com/github/ccf23/FMO-Tracking/blob/main/assignment2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# Part A: Loading and Using a Pretrained Network as a Feature Extractor

1. Import the required modules and libraries, and mount Google Drive to access the data.

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
import time
import os
import copy
from sklearn import svm
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

In [None]:
# Colab-only setup: attach Google Drive, then work from the assignment folder
# so the relative data paths used in later cells resolve inside it.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)
os.chdir('/content/drive/MyDrive/cs2770_hw2')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


2. Preprocess data

In [None]:
# Every split gets the same deterministic preprocessing: resize to 224x224,
# convert to a tensor, then normalize each RGB channel with
# transforms.Normalize(mean, std).
#   [0.485, 0.456, 0.406] are the means for the RGB channels
#   [0.229, 0.224, 0.225] are the standard deviations for the RGB channels

def _make_transform():
  """Return a fresh resize -> tensor -> normalize pipeline."""
  return transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
  ])

# One independent pipeline object per split, keyed by split name.
data_transforms = {split: _make_transform() for split in ('train', 'val', 'test')}

3. Create data loader

In [None]:
# Build, for each split, an ImageFolder dataset (one sub-folder per class),
# a shuffled DataLoader with batches of 8, and the dataset's image count.
data_dir = 'hw2_data'

image_datasets = {}
dataloaders = {}
dataset_sizes = {}
for split in ('train', 'val', 'test'):
  split_dataset = datasets.ImageFolder(os.path.join(data_dir, split), data_transforms[split])
  image_datasets[split] = split_dataset
  dataloaders[split] = torch.utils.data.DataLoader(split_dataset, batch_size=8, shuffle=True, num_workers=4)
  dataset_sizes[split] = len(split_dataset)

# Class names come from the training split's folder structure.
class_names = image_datasets['train'].classes

  cpuset_checked))


4. Load pretrained CNN model and use `VGG16_Feature_Extraction` model to extract features of images

In [None]:
class VGG16_Feature_Extraction(torch.nn.Module):
  """Pretrained VGG16 truncated before its last classifier layer.

  The forward pass reuses the pretrained convolutional features and average
  pooling, flattens the result, and applies classifier layers 0-5 only, so the
  output is the activation that would otherwise feed classifier layer 6.
  """

  def __init__(self):
    super().__init__()
    pretrained_vgg = models.vgg16(pretrained=True)
    self.features = pretrained_vgg.features
    self.avgpool = pretrained_vgg.avgpool
    # Slicing an nn.Sequential yields a new nn.Sequential holding layers 0..5.
    self.feature_extractor = pretrained_vgg.classifier[:6]

  def forward(self, x):
    """Return the truncated-classifier activations for a batch of images."""
    pooled = self.avgpool(self.features(x))
    return self.feature_extractor(torch.flatten(pooled, 1))

In [None]:
# Build the feature extractor and move it to the GPU when one is available;
# fall back to the CPU so the notebook still runs without CUDA.
model = VGG16_Feature_Extraction()
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
# Modules start in training mode. torchvision's VGG16 classifier contains
# dropout layers inside the slice we kept, so switch to eval mode to make the
# extracted features deterministic.
model = model.to(device).eval()

Downloading: "https://download.pytorch.org/models/vgg16-397923af.pth" to /root/.cache/torch/hub/checkpoints/vgg16-397923af.pth


HBox(children=(FloatProgress(value=0.0, max=553433881.0), HTML(value='')))




5. Use model to extract features of images

In [None]:
# Run every train/test image through the feature extractor and collect the
# resulting feature vectors and labels as NumPy arrays, keyed by split.
image_features = {}
image_labels = {}

# Inference only: eval mode disables dropout so features are deterministic,
# and no_grad avoids building an autograd graph for every batch.
model.eval()
with torch.no_grad():
  for phase in ['train', 'test']:
    feature_batches = []
    label_batches = []
    for inputs, labels in dataloaders[phase]:
      batch_features = model(inputs.to(device))
      feature_batches.append(batch_features.cpu().numpy())
      label_batches.append(labels.numpy())
    # Concatenate once per split instead of re-copying the growing array
    # after every batch.
    image_features[phase] = np.concatenate(feature_batches, axis=0)
    image_labels[phase] = np.concatenate(label_batches, axis=0)

  cpuset_checked))


6. Train a linear SVM classifier on the extracted training features after standardizing them

In [None]:
# Standardize each feature, then fit a linear SVM on the training features.
feature_scaler = StandardScaler()
linear_svm = svm.LinearSVC(tol=1e-5, random_state=0)
clf = make_pipeline(feature_scaler, linear_svm)
clf.fit(image_features['train'], image_labels['train'])

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('linearsvc',
                 LinearSVC(C=1.0, class_weight=None, dual=True,
                           fit_intercept=True, intercept_scaling=1,
                           loss='squared_hinge', max_iter=1000,
                           multi_class='ovr', penalty='l2', random_state=0,
                           tol=1e-05, verbose=0))],
         verbose=False)

7. Test the SVM classifier on the test set and report its accuracy. Display the confusion matrix to see which classes are most often misclassified.

  Some of the most often mispredicted labels are:
*   Cars are being labeled as horses.
*   Tables are being labeled as TV monitors, sofas, or potted plants.
*   People are being labeled as buses and sofas.
*   Sofas are being labeled as people.



In [None]:
# Predict once and reuse the result for both the accuracy and the matrix.
test_predictions = clf.predict(image_features['test'])
test_accuracy = float(accuracy_score(image_labels['test'], test_predictions))

print('Accuracy of SVM: ' + str(round(100 * test_accuracy, 2)) + '%.')
# confusion_matrix expects (y_true, y_pred): each row is a true class and
# each column is a predicted class, matching the layout used in Part B.
print(confusion_matrix(image_labels['test'], test_predictions))

Accuracy of SVM: 53.57%.
[[22  0  0  1  0  2  1  1  0  0  0  0  0  0  0  0  0  0  1  1]
 [ 0 13  0  0  0  0  0  0  2  0  1  2  0  1  0  2  1  0  2  0]
 [ 0  0 20  0  0  0  0  1  0  1  0  0  0  0  0  0  2  0  0  0]
 [ 1  1  0 21  0  1  0  0  0  1  0  2  2  0  1  0  1  0  1  0]
 [ 0  0  0  1  5  0  0  1  1  0  3  1  0  0  3  2  0  0  1  1]
 [ 0  0  1  0  0 14  0  0  0  0  0  0  0  0  1  0  0  0  0  0]
 [ 0  1  0  0  0  3 14  0  1  1  1  0  0  2  3  0  1  0  1  0]
 [ 0  0  2  0  0  0  0 18  0  0  0  1  0  0  0  1  0  1  0  1]
 [ 1  0  0  0  2  0  1  0  4  2  3  1  0  0  2  1  0  8  0  3]
 [ 0  0  0  0  0  0  0  0  0  5  0  1  2  0  0  0  1  0  0  0]
 [ 1  1  0  0  5  0  0  0  5  0 11  1  0  0  1  4  0  2  0  2]
 [ 0  0  0  0  1  0  0  1  0  1  0 11  0  0  1  2  0  1  0  0]
 [ 0  0  0  0  1  0  0  0  0  4  0  0 15  0  1  1  0  0  0  0]
 [ 0  1  1  0  0  0  1  0  0  0  1  0  1 21  2  0  0  0  0  0]
 [ 0  3  1  0  6  0  5  0  2  0  2  3  2  1  8  1  0  2  1  1]
 [ 0  2  0  1  1  0  0  1  0  

# Part B: Training and Testing the CNN on our Dataset

**Preparing the network**

1. Load VGG16 with pretrained weights from ImageNet.
2. Extract the number of input features for the last fully connected layer of the model.
3. Replace the last fully connected layer with a new layer.

In [None]:
# Start from pretrained VGG16 and swap classifier layer 6 for a new linear
# layer with one output per class in our dataset.
model = models.vgg16(pretrained=True)
final_layer_index = 6
num_ftrs = model.classifier[final_layer_index].in_features
model.classifier[final_layer_index] = nn.Linear(num_ftrs, len(class_names))

**Steps before starting training**

4. Set number of epochs to 25.
5. Send the model to the CUDA device.
6. Specify the criterion for evaluating the trained model.
7. Set the optimizer, learning rate, and momentum.
8. Create a scheduler to control the way that the learning rate changes during the training process.

**Training**

9. Save the initial model weight as the best model weight and set the best accuracy as zero.
10. Iterate over the epochs.
11. Iterate over the train and validation set.
12. Use the dataloader from previous steps to get a minibatch of images and their corresponding labels.
13. Initialize the gradient vector to all zeroes.
14. Use the current model weights to make predictions, then backpropagate the prediction loss.
15. Update the scheduler status.
16. Compute loss and accuracy of epoch.
17. Check whether the accuracy of classification is better than the best accuracy so far to save the best model parameters.

In [None]:
# Fine-tune the VGG16 classifier: train for `epochs` epochs, evaluate on the
# validation split after each one, and checkpoint the best validation weights.
model = model.to(device)
criterion = nn.CrossEntropyLoss()

epochs = 25
learning_rate = 0.001
optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9)
# Multiply the learning rate by 0.1 every 7 epochs.
scheduler = lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)

best_model_wts = copy.deepcopy(model.state_dict())
best_acc = 0.0

# Iterate over `epochs` (the name defined above); the previous `num_epochs`
# was undefined at this point on a fresh kernel.
for epoch in range(epochs):
  for phase in ['train', 'val']:
    if phase == 'train':
      model.train()
    else:
      model.eval()

    # Reset the running totals for every phase so that validation metrics
    # are not polluted by the training batches of the same epoch.
    all_batchs_loss = 0.0
    all_batchs_corrects = 0.0

    for inputs, labels in dataloaders[phase]: # iterating over batches
      inputs = inputs.to(device)
      labels = labels.to(device)

      optimizer.zero_grad()

      # Track gradients only while training.
      with torch.set_grad_enabled(phase == 'train'):
        outputs = model(inputs)
        _, preds = torch.max(outputs, 1)
        loss = criterion(outputs, labels)
        if phase == 'train':
          loss.backward()
          optimizer.step()

      # criterion returns the batch mean, so weight it by the batch size.
      all_batchs_loss += loss.item() * inputs.size(0)
      all_batchs_corrects += torch.sum(preds == labels.data)

    if phase == 'train':
      scheduler.step()

    epoch_loss = all_batchs_loss / dataset_sizes[phase]
    epoch_acc = all_batchs_corrects.double() / dataset_sizes[phase]

    # Keep (and persist) the weights with the best validation accuracy.
    if phase == 'val' and epoch_acc > best_acc:
      best_acc = epoch_acc
      best_model_wts = copy.deepcopy(model.state_dict())
      torch.save(best_model_wts, 'best_model_weight.pth')

  cpuset_checked))


**Testing**

18.   Prepare the model in the same way it was prepared for training and load the best model weight saved in training.
19.   Set model to `eval` and phase to `'test'`. 
20. Go through the test set, predict the category of images, and compute the number of correctly classified images.
21. Compute accuracy over all data.


In [None]:
# Rebuild the architecture used for training and load the best checkpoint.
model = models.vgg16()
num_ftrs = model.classifier[6].in_features
# Size the head from the dataset, as the training cell does, instead of
# hard-coding the class count.
model.classifier[6] = nn.Linear(num_ftrs, len(class_names))
# map_location lets the checkpoint load on whichever device is in use.
model.load_state_dict(torch.load('best_model_weight.pth', map_location=device))
model = model.to(device)

model.eval()
phase = 'test'

# Start from zero: reusing the counter left over from training would add the
# training-cell counts to the test total.
all_batchs_corrects = 0
all_labels = []
all_preds = []
# Inference only, so skip autograd bookkeeping.
with torch.no_grad():
  for inputs, labels in dataloaders[phase]:
    inputs = inputs.to(device)
    labels = labels.to(device)
    outputs = model(inputs)
    _, preds = torch.max(outputs, 1)
    all_batchs_corrects += torch.sum(preds == labels).item()
    all_preds.append(preds.cpu().numpy())
    all_labels.append(labels.cpu().numpy())

all_preds = np.concatenate(all_preds)
all_labels = np.concatenate(all_labels)

test_acc = all_batchs_corrects / dataset_sizes[phase]

# Rows are true classes, columns are predicted classes.
conf = confusion_matrix(all_labels, all_preds)

print('Confusion matrix: ')
print(conf)
print(f"Testing accuracy from given formula: {test_acc}")
print(f"Testing accuracy: {np.trace(conf)/np.sum(conf)}")

  cpuset_checked))


Confusion matrix: 
[[25  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0 12  0  0  0  0  4  0  2  0  0  0  0  5  1  0  0  0  0  1]
 [ 1  1 17  1  1  0  0  0  0  1  0  1  0  0  0  1  0  0  1  0]
 [ 1  0  0 17  1  0  2  0  0  0  0  0  0  1  1  1  0  0  1  0]
 [ 0  0  0  0 13  0  0  0  2  0  4  1  0  0  3  0  0  1  0  1]
 [ 1  0  0  0  0 17  2  0  0  0  0  0  0  0  0  0  0  0  1  0]
 [ 1  0  0  0  0  0 19  0  1  0  0  0  0  2  1  0  0  0  1  0]
 [ 0  0  0  0  1  0  0 22  0  0  0  1  0  0  0  1  0  0  0  0]
 [ 0  0  0  0  3  0  1  1 10  0  5  1  0  1  1  0  0  1  0  1]
 [ 0  0  0  0  0  0  0  0  1 11  0  0  1  1  1  0  0  0  0  0]
 [ 0  0  0  0 11  0  0  0  2  0 10  0  0  0  1  1  0  0  0  0]
 [ 0  0  1  0  1  0  1  0  1  0  0 18  0  1  1  0  1  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  2  0  0 20  1  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  2  0  0  0  0  0  0 22  0  0  0  0  1  0]
 [ 2  0  0  0  6  0  2  0  1  0  0  1  1  0 11  0  0  1  0  0]
 [ 0  1  0  0  1  0  1  0  4  0  3  

22. Repeating with Different Hyperparameters

Accuracy with following variations of hyperparameters **(Base case is 25 epochs, learning rate of 0.001, and SGD optimizer)**:
*   Epochs = 20: 0.6659
*   Epochs = 25: 0.6197
*   Epochs = 30: 0.6554


*   Learning Rate = 0.001: 0.6197 
*   Learning Rate = 0.0005: 0.6870


*   Optimizer = SGD: 0.6197
*   Optimizer = Adam: 0.0525

# Part C: Object Detection Training and Evaluation

In [None]:
import os

# Colab-only setup for Part C: attach Google Drive, then work from the
# assignment folder so the relative dataset paths below resolve inside it.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)
os.chdir('/content/drive/MyDrive/cs2770_hw2')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import torch
import torchvision
import Required_Files.utils as utils

from Required_Files.coco_utils import get_coco_api_from_dataset
from Required_Files.coco_eval import CocoEvaluator
import copy
import torch.optim as optim
from torch.optim import lr_scheduler
from Required_Files.PennFudanDataset import PennFudanDataset
from Required_Files.pascal_dataset import PASCALDataset
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor

In [None]:
def _detection_loader(dataset):
  """Wrap a detection dataset in a shuffled DataLoader with batches of 4."""
  return torch.utils.data.DataLoader(
    dataset,
    batch_size=4,
    shuffle=True,
    num_workers=4,
    collate_fn=utils.collate_fn,
  )

# Datasets: one train/test/val triple per source.
pascal_train = PASCALDataset('PASCAL/train')
pascal_test = PASCALDataset('PASCAL/test')
pascal_val = PASCALDataset('PASCAL/val')

penn_fudan_train = PennFudanDataset('PennFudanPed_hw3/train')
penn_fudan_test = PennFudanDataset('PennFudanPed_hw3/test')
penn_fudan_val = PennFudanDataset('PennFudanPed_hw3/val')

# Loaders: identical settings for every dataset.
data_loader_pascal_train = _detection_loader(pascal_train)
data_loader_pascal_test = _detection_loader(pascal_test)
data_loader_pascal_val = _detection_loader(pascal_val)

data_loader_penn_fudan_train = _detection_loader(penn_fudan_train)
data_loader_penn_fudan_test = _detection_loader(penn_fudan_test)
data_loader_penn_fudan_val = _detection_loader(penn_fudan_val)

  cpuset_checked))


In [None]:
# Load the pretrained Faster R-CNN (ResNet-50 FPN) detector and move it to the
# GPU when one is available; fall back to the CPU otherwise.
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
model = model.to(device)

In [None]:
# Fine-tune the detector on PASCAL: one training pass and one COCO-style
# validation pass per epoch, checkpointing the weights with the best mAP.
num_epochs = 5
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
# NOTE(review): step_size=7 is larger than num_epochs=5, so the learning rate
# never decays during this run -- confirm that this is intended.
scheduler = lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)

best_mAP = 0.0
cpu_device = torch.device('cpu')

for epoch in range(num_epochs):
  # ---- training pass ----
  # In training mode the model takes (images, targets) and returns a dict of
  # losses.
  model.train()
  for images, targets in data_loader_pascal_train: # one mini-batch of images per iteration
    optimizer.zero_grad()

    images = list(image.to(device) for image in images)
    targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
    loss_dict = model(images, targets)

    # Total loss is the sum of the individual loss terms.
    loss = sum(loss_dict.values())
    loss.backward()
    optimizer.step()

  scheduler.step()

  # ---- validation pass ----
  # In eval mode the model takes images only and returns detections.
  model.eval()
  coco = get_coco_api_from_dataset(data_loader_pascal_val.dataset)
  iou_types = ["bbox"]
  coco_evaluator = CocoEvaluator(coco, iou_types)

  # No gradients are needed for evaluation; skipping them saves memory.
  with torch.no_grad():
    for images, targets in data_loader_pascal_val:
      images = list(image.to(device) for image in images)
      outputs = model(images)
      # Move detections off the GPU before handing them to the evaluator.
      outputs = [{k: v.to(cpu_device) for k, v in out.items()} for out in outputs]

      res = {target["image_id"].item(): output for target, output in zip(targets, outputs)}
      coco_evaluator.update(res) # accumulate this batch's detections

  coco_evaluator.synchronize_between_processes()
  coco_evaluator.accumulate()
  coco_evaluator.summarize()
  # stats[0] is the first summary line: AP @ IoU=0.50:0.95, all areas.
  mAP = coco_evaluator.coco_eval['bbox'].stats[0]

  if mAP > best_mAP:
    best_mAP = mAP
    best_model_wts = copy.deepcopy(model.state_dict())
    # NOTE(review): this is the same filename the Part B classifier
    # checkpoint uses, so it overwrites that file.
    torch.save(best_model_wts, 'best_model_weight.pth')

  cpuset_checked))


creating index...
index created!
Accumulating evaluation results...
DONE (t=0.08s).
IoU metric: bbox
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.350
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.535
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.389
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.187
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.377
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.396
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.314
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.510
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.518
 Average Recall     (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.286
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.550
 Average Recall    

In [None]:
# Placeholder for calculating per-image mAP scores and drawing bounding boxes.
def update_mAP(res):
  """Accept one batch of detection results; not implemented yet.

  The argument is currently ignored and the function always returns None.
  """
  return

In [None]:
# Testing: evaluate the best checkpoint on the PASCAL test split.
# TODO: keep a running total of the top 20 per-image mAP scores.

# Load the model with the best weights; map_location lets the checkpoint load
# on whichever device is in use.
model = model.to(device)
model.load_state_dict(torch.load('best_model_weight.pth', map_location=device))
# Set eval mode explicitly rather than relying on the training cell having
# ended on its validation phase; the loop below calls the model with images
# only.
model.eval()

coco = get_coco_api_from_dataset(data_loader_pascal_test.dataset)
iou_types = ["bbox"]
coco_evaluator = CocoEvaluator(coco, iou_types)
cpu_device = torch.device('cpu')

# No gradients are needed for evaluation; skipping them saves memory.
with torch.no_grad():
  for images, targets in data_loader_pascal_test:
    images = list(image.to(device) for image in images)
    outputs = model(images)
    # Move detections off the GPU before handing them to the evaluator.
    outputs = [{k: v.to(cpu_device) for k, v in out.items()} for out in outputs]

    res = {target["image_id"].item(): output for target, output in zip(targets, outputs)}

    # Hook for tracking the 20 images with the largest mAP and their
    # bounding boxes (update_mAP is currently a stub).
    update_mAP(res)
    coco_evaluator.update(res)

coco_evaluator.synchronize_between_processes()
coco_evaluator.accumulate()
coco_evaluator.summarize()
# stats[0] is the first summary line: AP @ IoU=0.50:0.95, all areas.
mAP = coco_evaluator.coco_eval['bbox'].stats[0]
print('Overall mAP: ' + str(mAP))

creating index...
index created!


  cpuset_checked))


Accumulating evaluation results...
DONE (t=0.07s).
IoU metric: bbox
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.593
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.881
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.659
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.214
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.537
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.684
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.472
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.663
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.667
 Average Recall     (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.299
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.606
 Average Recall     (AR) @[ IoU=0.50:0.95 | area= la

In [None]:
# call function to draw the bounding boxes here