## Imports

In [21]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px # this is another plotting library for interactive plot

from sklearn.model_selection import train_test_split
from sklearn import metrics, manifold # we will use the metrics and manifold learning modules from scikit-learn
from pathlib import Path # to interact with file paths
from PIL import Image # to interact with images
from tqdm import tqdm # progress bar
from pprint import pprint # pretty print (useful for a more readable print of objects like lists or dictionaries)

import torch
import torch.nn as nn
import torchvision
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
import cv2 as cv

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu")

## Load Pretrained Net and create Detector 

In [22]:
# Model
# model = torch.hub.load('ultralytics/yolov5', 'yolov5n', pretrained=True) #faster but less accurate
model = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True) 
# model = torch.hub.load('ultralytics/yolov5', 'yolov5n6', pretrained=True) 
# model = torch.hub.load('ultralytics/yolov3', 'yolov3') #bad 
model.to(device)

# Analyze network

# print(model)

# for param_name, param in model.named_parameters():
#     print(param_name)

# for i, (k, v) in enumerate(model.named_parameters()):
#     print(f'{i} - {k}')

#https://github.com/ultralytics/yolov5/issues/1314

#backnbone is layers 0->9

backbone_layers = [f'model.{x}' for x in range(9)]

backbone = nn.Sequential(
    model.model.model.model[0],
    model.model.model.model[1],
    model.model.model.model[2],
    model.model.model.model[3],
    model.model.model.model[4],
    model.model.model.model[5],
    model.model.model.model[6],
    model.model.model.model[7],
    model.model.model.model[8],
    model.model.model.model[9],
    model.model.model.model[10],
    )

# print(backbone)

class FeatureExtractor(nn.Module):
    def __init__(self, backbone): #(default for 640x320)
        super().__init__()

        ## Pretrained layers
        self.pretrained = backbone

        ### Flatten layer
        self.flatten = nn.Flatten(start_dim=1)

    def forward(self, x):
        # Apply convolutions
        x = self.pretrained(x)
        # Flatten
        x = self.flatten(x)
        return x

class Detector(nn.Module):
    def __init__(self, add_inputs=4, regr_out=22, class_out=13, features=76800): #(default for 640x320)
        super().__init__()
        ### Linear sections
        self.lin = nn.Sequential(
            # First linear layer
            nn.Linear(in_features=features+add_inputs, out_features=512),
            nn.ReLU(True),
            nn.Linear(in_features=512, out_features=regr_out+class_out),
        )
        # #regression output
        # self.lin_regr = nn.Sequential(
        #     # First linear layer
        #     nn.Linear(in_features=1024, out_features=512),
        #     nn.ReLU(True),
        #     # nn.Dropout(p=0.5),
        #     # Second linear
        #     nn.Linear(in_features=512, out_features=regr_out)
        # )
        # # classification output
        # self.lin_class = nn.Sequential(
        #     # First linear layer
        #     nn.Linear(in_features=1024, out_features=512),
        #     nn.ReLU(True),
        #     # nn.Dropout(p=0.5),
        #     # Second linear
        #     nn.Linear(in_features=512, out_features=class_out)
        # )
        
    def forward(self, x):
        x = self.lin(x)
        return x

feature_extractor=FeatureExtractor(backbone)
#define detector
detector = Detector(add_inputs=4, regr_out=22, class_out=13, features=20480)

#freeeze backbone
for param in feature_extractor.pretrained.parameters():
    param.requires_grad = False

feature_extractor.to(device)
detector.to(device)

# #check
# for param_name, param in detector.named_parameters():
#     print('%s \t- requires_grad=%s' % (param_name, param.requires_grad))

Using cache found in /home/irong/.cache/torch/hub/ultralytics_yolov5_master
YOLOv5 🚀 2022-2-17 torch 1.10.0+cu113 CUDA:0 (NVIDIA GeForce GTX 950M, 2004MiB)

Fusing layers... 
Model Summary: 213 layers, 7225885 parameters, 0 gradients
Adding AutoShape... 


Detector(
  (lin): Sequential(
    (0): Linear(in_features=20484, out_features=512, bias=True)
    (1): ReLU(inplace=True)
    (2): Linear(in_features=512, out_features=35, bias=True)
  )
)

In [23]:
# test backbone
#show the image with opencv
img = cv.imread('tests/test_img.jpg')
#resize to 480 x 640
img = cv.resize(img, (320, 240))
#convert to tensor
img = torch.from_numpy(img).float().permute(2, 0, 1)
#add dimension
img = img.unsqueeze(0).to(device)
print(img.shape)

detector.eval()

# Inference
with torch.no_grad():
    data = torch.zeros(1, 4).to(device)
    feat = feature_extractor(img)
    input = torch.cat((feat, data), dim=1)
    print(input.shape)
    out = detector(input) 
    print(out.shape) # (320, 240)->torch.Size([1, 20480])
                    # (640, 480)->torch.Size([1, 76800])





torch.Size([1, 3, 240, 320])
torch.Size([1, 20484])
torch.Size([1, 35])


## Loading images and Labels

In [24]:
#dataset
class CsvDataset(Dataset):
    def __init__(self, folder, transform=None):
        self.transform = transform
        self.data = []
        class_labels = []
        with open(folder+'/classification_labels.csv', 'r') as f:
            lines = f.read().split('\n')
            lines = lines[0:-1] #remove footer
            # Get x and y values from each line and append to self.data
            labels = []
            for i in tqdm(range(len(lines))):
                line = lines[i]
                sample = line.split(',')
                #convert to float
                label = np.array([float(s) for s in sample])
                #convert to tensor
                label = torch.from_numpy(label).float()
                # img = img.unsqueeze(0)
                class_labels.append(label)

        input_data = []
        with open(folder+'/input_data.csv', 'r') as f:
            lines = f.read().split('\n')
            lines = lines[0:-1] #remove footer
            # Get x and y values from each line and append to self.data
            labels = []
            for i in tqdm(range(len(lines))):
                line = lines[i]
                sample = line.split(',')
                #convert to float
                label = np.array([float(s) for s in sample])
                #convert to tensor
                label = torch.from_numpy(label).float()
                # img = img.unsqueeze(0)
                input_data.append(label)

        #load labels
        with open(folder+'/regression_labels.csv', 'r') as f:
            lines = f.read().split('\n')
            lines = lines[0:-1] #remove footer
            # Get x and y values from each line and append to self.data
            labels = []
            for i in tqdm(range(len(lines))):
                line = lines[i]
                sample = line.split(',')
                #convert to float
                label = np.array([float(s) for s in sample])
                #convert to tensor
                label = torch.from_numpy(label).float()
                #load img
                img = cv.imread(folder+f'/img_{i+1}.png')
                img = cv.resize(img, (320, 240))
                img = torch.from_numpy(img).float().permute(2, 0, 1)
                # img = img.unsqueeze(0)
                self.data.append((img, input_data[i], label, class_labels[i]))
                
    def __len__(self):
        # The length of the dataset is simply the length of the self.data list
        return len(self.data)

    def __getitem__(self, idx):
        # Our sample is the element idx of the list self.data
        sample = self.data[idx]
        if self.transform:
            sample = self.transform(sample)
        return sample

#create dataset
train_dataset = CsvDataset(folder='training_imgs')

train_dataloader = DataLoader(train_dataset, batch_size=100, shuffle=True)


100%|██████████| 2878/2878 [00:00<00:00, 48728.45it/s]
100%|██████████| 2878/2878 [00:00<00:00, 44083.01it/s]
100%|██████████| 2878/2878 [00:13<00:00, 212.13it/s]


## Training

In [25]:
# Training function
def train_epoch(ext, det, dataloader, class_loss_fn, regr_loss_fn, optimizer, device):
    # Set the model to training mode
    ext.eval() #dont train the extractor
    det.train() #train detector
    # Initialize the loss
    train_loss_class = []
    train_loss_regr = []
    # Loop over the training batches
    for (img, input_data, regr_label, class_label) in dataloader:
        # Move the input and target data to the selected device
        img, input_data, regr_label, class_label = img.to(device), input_data.to(device), regr_label.to(device), class_label.to(device)
        # Compute the features
        features = ext(img)
        #concatenate features and input_data
        input = torch.cat((features, input_data), dim=1)
        # Zero the gradients
        optimizer.zero_grad()
        # Compute the output
        output = det(input)

        #regression 22 values
        #classification: 3 states, 3 next states, 7 signs
        regr_out = output[:, :22]
        state_out = output[:, 22:25]
        next_out = output[:, 25:28]
        sign_out = output[:, 28:]
        
        state_label = class_label[:, :3]
        next_label = class_label[:, 3:6]
        sign_label = class_label[:, 6:]

        # Compute the losses
        regr_loss = 5.0*regr_loss_fn(regr_out, regr_label)
        state_loss = class_loss_fn(state_out, state_label)
        next_loss = class_loss_fn(next_out, next_label)
        sign_loss = class_loss_fn(sign_out, sign_label)
        loss = regr_loss + state_loss + next_loss + sign_loss

        
        # Compute the gradients
        loss.backward()
        # Update the weights
        optimizer.step()
        #batch loss
        c_loss = (state_loss + next_loss + sign_loss).detach().cpu().numpy()
        r_loss = regr_loss.detach().cpu().numpy()
        train_loss_class.append(c_loss)
        train_loss_regr.append(r_loss)
    # Return the average training loss
    train_loss_c = np.mean(train_loss_class)
    train_loss_r = np.mean(train_loss_regr)
    # print(f"Training loss: {train_loss}")
    return train_loss_c, train_loss_r

def get_avg_loss(ext, det, dataloader, class_loss_fn, regr_loss_fn, device):
    ext.eval()
    det.eval()
    class_losses = []
    regr_losses = []
    with torch.no_grad():
        for (img, input_data, regr_label, class_label) in dataloader:
            # Move the input and target data to the selected device
            img, input_data, regr_label, class_label = img.to(device), input_data.to(device), regr_label.to(device), class_label.to(device)
            # Compute the features
            features = ext(img)
            #concatenate features and input_data
            input = torch.cat((features, input_data), dim=1)
            # Compute the output
            output = det(input)
            
            #regression 22 values
            #classification: 3 states, 3 next states, 7 signs
            regr_out = output[:, :22]
            state_out = output[:, 22:25]
            next_out = output[:, 25:28]
            sign_out = output[:, 28:]
            
            state_label = class_label[:, :3]
            next_label = class_label[:, 3:6]
            sign_label = class_label[:, 6:]

            # Compute the losses
            regr_loss = regr_loss_fn(regr_out, regr_label)
            state_loss = class_loss_fn(state_out, state_label)
            next_loss = class_loss_fn(next_out, next_label)
            sign_loss = class_loss_fn(sign_out, sign_label)
            class_loss = state_loss + next_loss + sign_loss

            class_losses.append(class_loss.detach().cpu().numpy())
            regr_losses.append(regr_loss.detach().cpu().numpy())
    # Return the accuracy and test loss
    class_loss = np.mean(class_losses)
    regr_loss = np.mean(regr_losses)
    return class_loss, regr_loss

In [26]:
#load models
detector.load_state_dict(torch.load('detector.pt'))
feature_extractor.load_state_dict(torch.load('feature_extractor.pt'))

#parameters
lr = 0.001
epochs = 10
optimizer = torch.optim.Adam(detector.parameters(), lr=lr)
regr_loss_fn = nn.MSELoss()
class_loss_fn = nn.CrossEntropyLoss()

for epoch in range(epochs):
    class_loss, regr_loss = train_epoch(feature_extractor, detector, train_dataloader, class_loss_fn, regr_loss_fn, optimizer, device)
    print(f"Epoch {epoch+1}/{epochs}")
    print(f"Regression loss: {regr_loss}")
    print(f"Classification loss: {class_loss}")
    torch.save(detector.state_dict(), 'detector.pt')
    torch.save(feature_extractor.state_dict(), 'feature_extractor.pt')

Epoch 1/10
Regression loss: 1.383567452430725
Classification loss: 0.5319092273712158
Epoch 2/10
Regression loss: 0.5856467485427856
Classification loss: 0.15974725782871246
Epoch 3/10
Regression loss: 0.4529952108860016
Classification loss: 0.08969224244356155
Epoch 4/10
Regression loss: 0.32902899384498596
Classification loss: 0.052130065858364105
Epoch 5/10
Regression loss: 0.26006975769996643
Classification loss: 0.03691543638706207
Epoch 6/10
Regression loss: 0.21560432016849518
Classification loss: 0.02538921684026718
Epoch 7/10
Regression loss: 0.18115952610969543
Classification loss: 0.017120476812124252
Epoch 8/10
Regression loss: 0.16349801421165466
Classification loss: 0.013205996714532375
Epoch 9/10
Regression loss: 0.1333140879869461
Classification loss: 0.010985524393618107
Epoch 10/10
Regression loss: 0.1297859102487564
Classification loss: 0.00826309435069561


In [27]:
#testing
test_dataset = CsvDataset(folder='test_imgs')
test_dataloader = DataLoader(test_dataset, batch_size=100, shuffle=True)

#get accuracy
train_class_loss, train_regr_loss = get_avg_loss(feature_extractor, detector, train_dataloader, class_loss_fn, regr_loss_fn, device)
test_class_loss, test_regr_loss = get_avg_loss(feature_extractor, detector, test_dataloader, class_loss_fn, regr_loss_fn, device)

print(f"Training classification loss: {train_class_loss}")
print(f"Training regression loss: {train_regr_loss}\n")
print(f"Testing classification loss: {test_class_loss}")
print(f"Testing regression loss: {test_regr_loss}")

100%|██████████| 2878/2878 [00:00<00:00, 37794.45it/s]
100%|██████████| 2878/2878 [00:00<00:00, 64085.49it/s]
100%|██████████| 2878/2878 [00:16<00:00, 173.67it/s]


Training classification loss: 0.008194329217076302
Training regression loss: 0.028632866218686104

Testing classification loss: 0.008201380260288715
Testing regression loss: 0.028721364215016365


In [28]:
print(test_dataset.data[0][0].shape)

torch.Size([3, 240, 320])


In [29]:
#save pytorch model
torch.save(detector.state_dict(), 'detector.pt')
torch.save(feature_extractor.state_dict(), 'feature_extractor.pt')

#save the model so that opencv can load it
import torch
import torch.onnx
import torchvision
import torchvision.models as models
import sys

device = torch.device('cpu')
detector.to(device)
feature_extractor.to(device)
 
onnx_detector_path = "detector.onnx"
onnx_feature_extractor_path = "feature_extractor.onnx"

# set the model to inference mode
detector.eval()
feature_extractor.eval()
 
# Create some sample input in the shape this model expects 
# This is needed because the convertion forward pass the network once 
dummy_input = torch.randn(1, 3, 240, 320)
dummy_input2 = torch.randn(1, 20484)
torch.onnx.export(feature_extractor, dummy_input, onnx_feature_extractor_path, verbose=True)
torch.onnx.export(detector, dummy_input2, onnx_detector_path, verbose=True)


graph(%input.1 : Float(1, 3, 240, 320, strides=[230400, 76800, 320, 1], requires_grad=0, device=cpu),
      %pretrained.0.conv.weight : Float(32, 3, 6, 6, strides=[108, 36, 6, 1], requires_grad=0, device=cpu),
      %pretrained.0.conv.bias : Float(32, strides=[1], requires_grad=0, device=cpu),
      %pretrained.1.conv.weight : Float(64, 32, 3, 3, strides=[288, 9, 3, 1], requires_grad=0, device=cpu),
      %pretrained.1.conv.bias : Float(64, strides=[1], requires_grad=0, device=cpu),
      %pretrained.2.cv1.conv.weight : Float(32, 64, 1, 1, strides=[64, 1, 1, 1], requires_grad=0, device=cpu),
      %pretrained.2.cv1.conv.bias : Float(32, strides=[1], requires_grad=0, device=cpu),
      %pretrained.2.cv2.conv.weight : Float(32, 64, 1, 1, strides=[64, 1, 1, 1], requires_grad=0, device=cpu),
      %pretrained.2.cv2.conv.bias : Float(32, strides=[1], requires_grad=0, device=cpu),
      %pretrained.2.cv3.conv.weight : Float(64, 64, 1, 1, strides=[64, 1, 1, 1], requires_grad=0, device=cpu),
 

In [30]:
#test with opencv
sample_image = "training_imgs/img_1.png"
images = [cv.imread(f"training_imgs/img_{i+1}.png") for i in range(100)]
 
#The Magic:
det =  cv.dnn.readNetFromONNX(onnx_detector_path) 
ext = cv.dnn.readNetFromONNX(onnx_feature_extractor_path)

for i in tqdm(range(100)):
    image = images[i]
    blob = cv.dnn.blobFromImage(image, 1.0, (320, 240),(0, 0, 0), swapRB=True, crop=False)
    ext.setInput(blob)
    features = ext.forward()
    # print(features.shape)
    action_vec = np.ones((1,4))
    input = np.concatenate((features, action_vec), axis=1)
    # print(input.shape)
    det.setInput(input)
    preds = det.forward()

print(f"Predictions: {preds}")
print(f"Predictions shape: {preds.shape}")

100%|██████████| 100/100 [00:11<00:00,  8.69it/s]

Predictions: [[  -0.085596    0.058444      7.2527    0.071631    0.029917    -0.10753   -0.038321      0.0562     0.10364    0.083307      0.0144     0.18264   -0.030135    0.048837    0.068057    -0.05302  -0.0011943   -0.022427    0.043871    -0.10293    -0.08841  -0.0054706    -0.65493      3.1872     -7.2564      1.8444
      -1.5973     -4.7095      13.277     -12.225      -11.98     -13.961     -14.774     -12.789     -13.977]]
Predictions shape: (1, 35)



