In [1]:
import os
import cv2
import torch
import numpy as np
from tqdm import tqdm

from PIL import Image
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report, confusion_matrix


import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader

import torchvision
from torchvision import datasets, models, transforms

In [4]:
# Colab-only step: attach Google Drive so files under /content/drive become readable.
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


## Device

In [2]:
# Choose the compute backend in order of preference: CUDA, then MPS, then CPU.
if torch.cuda.is_available():
    device_name = 'cuda'
elif torch.backends.mps.is_available():
    device_name = 'mps'
else:
    device_name = 'cpu'
device = torch.device(device_name)

device

device(type='cpu')

In [3]:
# Fetch the 'dinov2_vits14' entry point from the facebookresearch/dinov2 hub repo
# and move the resulting module onto the selected device in one step.
dinov2_vits14 = torch.hub.load('facebookresearch/dinov2', 'dinov2_vits14').to(device)

Downloading: "https://github.com/facebookresearch/dinov2/zipball/main" to /root/.cache/torch/hub/main.zip
Downloading: "https://dl.fbaipublicfiles.com/dinov2/dinov2_vits14/dinov2_vits14_pretrain.pth" to /root/.cache/torch/hub/checkpoints/dinov2_vits14_pretrain.pth
100%|██████████| 84.2M/84.2M [00:00<00:00, 99.0MB/s]


## Prepare Dataset

In [8]:
# Per-channel mean/std used by both pipelines (the commonly used ImageNet statistics).
NORM_MEAN = [0.485, 0.456, 0.406]
NORM_STD = [0.229, 0.224, 0.225]

# Training pipeline: random crop/flip augmentation, then tensor conversion + normalization.
train_pipeline = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(NORM_MEAN, NORM_STD),
])

# Evaluation pipeline: deterministic resize + center crop, same normalization.
eval_pipeline = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(NORM_MEAN, NORM_STD),
])

# Keys are the folder names (relative to the dataset root) each pipeline applies to.
data_transforms = {
    '7-Detection data': train_pipeline,
    'test': eval_pipeline,
}

In [9]:
ROOT_PATH = "/content/drive/MyDrive/project"

# The training folder is mandatory: if it is missing, ImageFolder raises right here.
# Any other split declared in `data_transforms` (e.g. 'test') is loaded only when its
# folder exists under ROOT_PATH, so the evaluation cell can find a test split when the
# data is available without breaking setups that only ship the training folder.
TRAIN_SPLIT = '7-Detection data'
available_splits = [TRAIN_SPLIT] + [
    split
    for split in data_transforms
    if split != TRAIN_SPLIT and os.path.isdir(os.path.join(ROOT_PATH, split))
]

image_datasets = {
    split: datasets.ImageFolder(os.path.join(ROOT_PATH, split), data_transforms[split])
    for split in available_splits
}
image_datasets

{'7-Detection data': Dataset ImageFolder
     Number of datapoints: 8062
     Root location: /content/drive/MyDrive/project/7-Detection data
     StandardTransform
 Transform: Compose(
                RandomResizedCrop(size=(224, 224), scale=(0.08, 1.0), ratio=(0.75, 1.3333), interpolation=bilinear, antialias=True)
                RandomHorizontalFlip(p=0.5)
                ToTensor()
                Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
            )}

In [10]:
batch_size = 8
num_workers = 4

# One DataLoader per dataset that was actually built, keyed by the same split name.
# - `num_workers` is passed through instead of repeating the literal, so changing the
#   setting above really takes effect.
# - Only non-'test' splits are shuffled; evaluation order does not need to be random.
data_loaders = {
    split: DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=(split != 'test'),
        num_workers=num_workers,
    )
    for split, dataset in image_datasets.items()
}



In [11]:
# Class labels as reported by the training ImageFolder; their order defines the label indices.
train_dataset = image_datasets['7-Detection data']
class_names = train_dataset.classes
class_names

['Mobile_crane', 'Tower_crane', 'low_Mobile_crane', 'low_Tower_crane']

## Model

In [12]:
class DinoVisionTransformerClassifier(nn.Module):
    """Backbone feature extractor followed by a two-layer MLP classification head.

    Args:
        backbone: Module used as feature extractor. It must be callable on an image
            batch and expose a ``norm`` sub-module. Defaults to the notebook-level
            ``dinov2_vits14`` when omitted.
        num_classes: Number of output logits. Defaults to ``len(class_names)``.
        hidden_dim: Width of the hidden layer of the head (256 by default).

    Calling the class with no arguments builds exactly the original model.
    """

    def __init__(self, backbone=None, num_classes=None, hidden_dim=256):
        super().__init__()
        self.transformer = dinov2_vits14 if backbone is None else backbone
        if num_classes is None:
            num_classes = len(class_names)
        # Read the feature width from the backbone when it advertises one, so that
        # other backbone sizes work; 384 is the width the original head assumed.
        embed_dim = getattr(self.transformer, "embed_dim", 384)
        self.classifier = nn.Sequential(
            nn.Linear(embed_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, num_classes),
        )

    def forward(self, x):
        """Return class logits for an image batch ``x``."""
        x = self.transformer(x)
        # NOTE(review): the DINOv2 backbone appears to apply `norm` internally before
        # returning its features, so this may normalize twice. Kept as-is to preserve
        # the behavior of already trained weights; confirm before removing.
        x = self.transformer.norm(x)
        return self.classifier(x)

# Instantiate the classifier and place all of its parameters on the selected device.
model = DinoVisionTransformerClassifier().to(device)

In [13]:
# Cross-entropy over the class logits; Adam updates every model parameter
# (backbone and head alike) with a very small step size.
LEARNING_RATE = 1e-6

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

## Train

In [None]:
num_epoch = 10
for epoch in range(num_epoch):
    # Make sure train-mode layers are active even if an evaluation cell ran before.
    model.train()

    loss_sum = 0.0      # sum of per-sample losses seen so far in this epoch
    correct_sum = 0     # number of correct predictions so far in this epoch
    seen = 0            # number of samples processed so far in this epoch

    loop = tqdm(data_loaders['7-Detection data'])
    loop.set_description(f"Epoch [{epoch}/{num_epoch}]")
    for features, labels in loop:
        features = features.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(features)
        loss = criterion(outputs, labels)

        # Use the real batch size: the last batch of an epoch can be smaller than
        # `batch_size`, which previously under-reported its accuracy.
        n_batch = labels.size(0)
        predictions = outputs.argmax(dim=1)
        correct = (predictions == labels).sum().item()
        accuracy = correct / n_batch

        loss.backward()
        optimizer.step()

        # Running epoch statistics (the criterion returns the batch mean, so weight
        # it by the batch size before summing).
        loss_sum += loss.item() * n_batch
        correct_sum += correct
        seen += n_batch

        loop.set_postfix(
            loss=loss.item(),
            acc=accuracy,
            avg_loss=loss_sum / seen,
            avg_acc=correct_sum / seen,
        )

    # Epoch-level averages, available for inspection after the loop.
    train_loss = loss_sum / seen if seen else 0.0
    train_acc = correct_sum / seen if seen else 0.0

Epoch [0/10]: 100%|██████████| 1008/1008 [1:17:03<00:00,  4.59s/it, acc=0.5, loss=0.745]
Epoch [1/10]: 100%|██████████| 1008/1008 [1:16:31<00:00,  4.55s/it, acc=0.5, loss=0.504]
Epoch [2/10]: 100%|██████████| 1008/1008 [1:18:29<00:00,  4.67s/it, acc=0.5, loss=0.541]
Epoch [3/10]: 100%|██████████| 1008/1008 [1:16:45<00:00,  4.57s/it, acc=0.5, loss=0.535]
Epoch [4/10]: 100%|██████████| 1008/1008 [1:24:35<00:00,  5.03s/it, acc=0.375, loss=0.847]
Epoch [5/10]: 100%|██████████| 1008/1008 [1:20:21<00:00,  4.78s/it, acc=0.25, loss=1.02]
Epoch [6/10]:  83%|████████▎ | 834/1008 [1:08:09<16:16,  5.61s/it, acc=0.625, loss=0.568]

In [None]:
correct = 0
total = 0
test_predicted = []
test_labels = []

# Fail early with an actionable message instead of a bare KeyError from the loop.
if "test" not in data_loaders:
    raise KeyError(
        'data_loaders has no "test" split; build a DataLoader for the test set '
        'before running the evaluation cell'
    )

# Switch train-only layers to inference behavior; gradients are not needed either.
model.eval()
with torch.no_grad():
    for features, labels in data_loaders["test"]:
        features = features.to(device)
        labels = labels.to(device)
        outputs = model(features)
        predicted = outputs.argmax(dim=1)  # index of the highest logit per sample
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

        # Keep plain Python ints for the report cells further down.
        test_labels.extend(labels.cpu().tolist())
        test_predicted.extend(predicted.cpu().tolist())

# Report the number of images actually evaluated and guard against an empty loader.
accuracy_pct = 100 * correct // total if total else 0
print(f'Accuracy of the network on the {total} test images: {accuracy_pct} %')

## Report

In [None]:
# Pass the full label set explicitly so the report still works (and rows stay aligned
# with `class_names`) when some class never occurs in the evaluated samples.
print(
    classification_report(
        test_labels,
        test_predicted,
        labels=list(range(len(class_names))),
        target_names=class_names,
    )
)

In [None]:
# Fix the label set so the matrix is always len(class_names) x len(class_names);
# otherwise a class missing from the evaluated samples shrinks the matrix and the
# DataFrame construction below fails on a shape mismatch.
cm = confusion_matrix(
    test_labels,
    test_predicted,
    labels=list(range(len(class_names))),
)
# Rows are the true classes, columns the predicted classes.
df_cm = pd.DataFrame(
    cm,
    index=class_names,
    columns=class_names,
)
df_cm

In [None]:
def show_confusion_matrix(confusion_matrix):
    """Draw an annotated heatmap of a confusion matrix on the current axes.

    Args:
        confusion_matrix: Matrix of integer counts (cells are formatted with "d").
            Note that inside this function the parameter name shadows the
            scikit-learn function of the same name.

    Returns:
        None. The plot is drawn as a side effect.
    """
    ax = sns.heatmap(confusion_matrix, annot=True, fmt="d", cmap="Blues")
    # Neutral axis labels: the previous "Surface" wording did not match this dataset.
    ax.set_ylabel("Ground Truth")
    ax.set_xlabel("Predicted")
    # No legend call: the heatmap has no labelled artists, so a legend would only
    # emit a warning; the colorbar already explains the color scale.

show_confusion_matrix(df_cm)