In [None]:
import json
from dataset import Drive360Loader


# Read the run configuration. The context manager closes the file handle
# deterministically instead of leaving it to the garbage collector.
with open('./starter_kit/config.json') as config_file:
    config = json.load(config_file)

# One loader per dataset split; every loader is built from the same config.
train_loader = Drive360Loader(config, 'train')
validation_loader = Drive360Loader(config, 'validation')
test_loader = Drive360Loader(config, 'test')

print('Loaded train loader with the following data available as a dict.')
print(train_loader.drive360.dataframe.keys())

In [None]:
from collections import defaultdict
from collections import OrderedDict
from itertools import chain
from typing import Dict
from typing import List
from typing import Set

from torchvision import models
import torch.nn as nn
import torch


class ResidualFusion(nn.Module):
    """Residual wrapper that fuses a wide (channel-concatenated) input.

    The input is first projected to ``block_in_panels`` channels with a
    3x3 conv + batch norm. The projected tensor then goes through ``block``
    and, in parallel, through a 1x1 stride-2 conv + batch norm shortcut.
    Both branches are summed and passed through a ReLU.

    Args:
        in_panels: channel count of the incoming tensor.
        block_in_panels: channel count fed to ``block`` (projection output).
        block_out_panels: channel count produced by the shortcut; ``block``
            must produce a tensor that can be added to it.
        block: module applied to the projected tensor.
    """

    def __init__(self, in_panels: int, block_in_panels: int, block_out_panels: int, block: nn.Module):
        super().__init__()
        # Registration order (projection, block, downsample, relu) is kept so
        # state-dict key order and weight-initialisation order stay the same.
        self.projection = self._make_projection(in_panels, block_in_panels)
        self.block = block
        self.downsample = self._make_downsample(block_in_panels, block_out_panels)
        self.relu = nn.ReLU()

    def _make_projection(self, in_panels: int, out_panels: int) -> nn.Module:
        """Build the 3x3, stride-1, padding-1 conv (no bias) followed by batch norm."""
        layers = OrderedDict()
        layers['conv'] = nn.Conv2d(in_panels, out_panels, kernel_size=3, stride=1, padding=1, bias=False)
        layers['bn'] = nn.BatchNorm2d(out_panels)
        return nn.Sequential(layers)

    def _make_downsample(self, in_panels: int, out_panels: int) -> nn.Module:
        """Build the 1x1, stride-2 conv (no bias) followed by batch norm."""
        layers = OrderedDict()
        layers['conv'] = nn.Conv2d(in_panels, out_panels, kernel_size=1, stride=2, bias=False)
        layers['bn'] = nn.BatchNorm2d(out_panels)
        return nn.Sequential(layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``relu(block(p) + downsample(p))`` where ``p = projection(x)``."""
        projected = self.projection(x)
        main_branch = self.block(projected)
        shortcut = self.downsample(projected)
        return self.relu(main_branch + shortcut)


class SlowFusionModel(nn.Module):
    """Multi-camera, multi-frame regressor assembled from ResNet-34 stages.

    Pipeline (see ``forward``):
      1. every frame of every camera goes through a shared feature extractor
         (the first six children of a pretrained torchvision ResNet-34);
      2. per camera, the frame features are concatenated along the channel
         axis and fused by that camera's own early-fusion module;
      3. the per-camera volumes are concatenated along the channel axis and
         fused by a single late-fusion module;
      4. a "neck" (child 8 of a ResNet-34) reduces the volume, which is then
         flattened and fed to two independent regression heads.

    Args:
        cameras: names of the cameras to use; they are the keys of the input
            dict given to ``forward``. Names must be valid ``nn.ModuleDict``
            keys (strings without ``.``).
        num_frames_per_camera: frames read per camera, indexed
            ``0 .. num_frames_per_camera - 1`` in the input dict.

    Note:
        The early-fusion modules live in an ``nn.ModuleDict``. Their weights
        therefore appear in ``parameters()`` and ``state_dict()`` under
        ``early_fusions.<camera>.*``. Checkpoints written while they were kept
        in a plain ``dict`` do not contain those entries and will not load
        with ``strict=True``.
    """

    def __init__(self, cameras: Set[str], num_frames_per_camera: int):
        super().__init__()
        # Sets of strings have no stable iteration order across interpreter
        # runs, which would permute the channel order seen by the late fusion
        # between training and a later reload. Sort once and keep that order.
        self.cameras = tuple(sorted(cameras))
        self.num_frames_per_camera = num_frames_per_camera

        # Build towers to extract features from each frame from each camera
        resnet = models.resnet34(pretrained=True)
        self.feature_extractor = nn.Sequential(*list(resnet.children())[:6])

        # Fuse features of all frames for each camera. nn.ModuleDict (rather
        # than a plain dict) registers the sub-modules so that the optimizer,
        # state_dict(), train()/eval() and device moves all reach them.
        self.early_fusions = nn.ModuleDict(
            {camera: self._make_early_fusion() for camera in self.cameras}
        )

        # Fuse volume from each camera
        self.late_fusion = self._make_late_fusion()

        # Add more convolutional layers
        self.neck = self._make_neck()

        # Build two regression heads for each target
        self.speed_head = self._make_regression_head()
        self.angle_head = self._make_regression_head()

    def forward(self, x: Dict[str, Dict[int, torch.Tensor]]) -> Dict[str, torch.Tensor]:
        """Predict speed and steering for a batch.

        Args:
            x: ``x[camera][frame]`` is the image batch for that camera/frame.

        Returns:
            Dict with keys ``'canSpeed'`` and ``'canSteering'``; each value has
            the trailing size-1 head dimension removed, so a batch of one
            still yields a 1-element tensor rather than a 0-d scalar.
        """
        fused_per_camera = []
        for camera in self.cameras:
            # Extract features from each frame of this camera
            frame_features = [self.feature_extractor(x[camera][frame])
                              for frame in range(self.num_frames_per_camera)]
            # Fuse features of all frames for this camera (channel concat)
            stacked_frames = torch.cat(frame_features, dim=1)
            fused_per_camera.append(self.early_fusions[camera](stacked_frames))

        # Fuse volume from each camera (channel concat, fixed camera order)
        late_fusion_out = self.late_fusion(torch.cat(fused_per_camera, dim=1))

        # Add more convolutional layers
        neck_out = self.neck(late_fusion_out)

        # Perform prediction on both regression heads
        flat = neck_out.view(neck_out.size(0), -1)
        # squeeze(-1) drops only the head's output dimension; an unbounded
        # squeeze would also drop the batch dimension when the batch size is 1.
        return {'canSpeed': self.speed_head(flat).squeeze(-1),
                'canSteering': self.angle_head(flat).squeeze(-1)}

    def cuda(self, device: str = None):
        """Move the whole model to the GPU, honouring ``device`` when given."""
        return super().cuda(device)

    def _make_early_fusion(self) -> nn.Module:
        """Fusion of one camera's frames, wrapping ResNet-34 child 6 (untrained)."""
        resnet = models.resnet34(pretrained=False)
        return ResidualFusion(
            in_panels=128 * self.num_frames_per_camera,
            block_in_panels=128,
            block_out_panels=256,
            block=nn.Sequential(*list(resnet.children())[6:7]),
        )

    def _make_late_fusion(self) -> nn.Module:
        """Fusion across cameras, wrapping ResNet-34 child 7 (untrained)."""
        resnet = models.resnet34(pretrained=False)
        return ResidualFusion(
            in_panels=256 * len(self.cameras),
            block_in_panels=256,
            block_out_panels=512,
            block=nn.Sequential(*list(resnet.children())[7:8]),
        )

    def _make_neck(self) -> nn.Module:
        """ResNet-34 child 8, applied right before flattening."""
        resnet = models.resnet34(pretrained=False)
        return nn.Sequential(*list(resnet.children())[8:9])

    def _make_regression_head(self) -> nn.Module:
        """Three-layer MLP (512 -> 64 -> 32 -> 1) with ReLU between layers."""
        return nn.Sequential(OrderedDict({
            'fc1': nn.Linear(512, 64),
            'relu1': nn.ReLU(),
            'fc2': nn.Linear(64, 32),
            'relu2': nn.ReLU(),
            'fc3': nn.Linear(32, 1),
        }))

    def _make_block(self, in_channels: int, out_channels: int) -> nn.Module:
        """3x3 conv + batch norm + ReLU. Not used by the model itself."""
        return nn.Sequential(OrderedDict({
            'conv': nn.Conv2d(in_channels, out_channels, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False),
            'bn': nn.BatchNorm2d(out_channels),
            'relu': nn.ReLU(),
        }))


In [None]:
class SomeDrivingModel(nn.Module):
    """CNN + LSTM baseline on the front camera.

    Each frame of ``data['cameraFront']`` is encoded by a pretrained
    ResNet-34 (classifier layer removed) followed by a 128-unit linear layer.
    The per-frame encodings feed a 3-layer LSTM. The encoding of the frame
    stored under key ``0`` is concatenated with the LSTM output of the last
    frame in iteration order, and two MLP heads regress steering angle and
    speed from that vector.
    """

    def __init__(self):
        super().__init__()
        final_concat_size = 0

        # Main CNN
        cnn = models.resnet34(pretrained=True)
        self.features = nn.Sequential(*list(cnn.children())[:-1])
        self.intermediate = nn.Sequential(nn.Linear(
                          cnn.fc.in_features, 128),
                          nn.ReLU())
        final_concat_size += 128

        # Main LSTM (expects input laid out as (time, batch, features))
        self.lstm = nn.LSTM(input_size=128,
                            hidden_size=64,
                            num_layers=3,
                            batch_first=False)
        final_concat_size += 64

        # Angle Regressor
        self.control_angle = nn.Sequential(
            nn.Linear(final_concat_size, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1)
        )
        # Speed Regressor
        self.control_speed = nn.Sequential(
            nn.Linear(final_concat_size, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1)
        )

    def forward(self, data):
        """Predict steering and speed.

        Args:
            data: dict whose ``'cameraFront'`` entry maps a frame key to an
                image batch. Key ``0`` must be present (it is treated as the
                current frame). Frames are fed to the LSTM in the dict's
                iteration order.

        Returns:
            Dict with ``'canSteering'`` and ``'canSpeed'``; each value has the
            trailing size-1 head dimension removed, so a batch of one still
            yields a 1-element tensor rather than a 0-d scalar.
        """
        module_outputs = []
        lstm_i = []
        # Loop through temporal sequence of
        # front facing camera images and pass
        # through the cnn.
        for k, v in data['cameraFront'].items():
            x = self.features(v)
            x = x.view(x.size(0), -1)
            x = self.intermediate(x)
            lstm_i.append(x)
            # feed the current front facing camera
            # output directly into the
            # regression networks.
            if k == 0:
                module_outputs.append(x)

        # Feed temporal outputs of CNN into LSTM
        i_lstm, _ = self.lstm(torch.stack(lstm_i))
        module_outputs.append(i_lstm[-1])

        # Concatenate current image CNN output
        # and LSTM output.
        x_cat = torch.cat(module_outputs, dim=-1)

        # Feed concatenated outputs into the
        # regression networks. squeeze(-1) removes only the head's output
        # dimension; an unbounded squeeze would also drop the batch dimension
        # when the batch size is 1.
        prediction = {'canSteering': self.control_angle(x_cat).squeeze(-1),
                      'canSpeed': self.control_speed(x_cat).squeeze(-1)}
        return prediction

In [None]:
import numpy as np


# Front camera only, four frames per sample.
model = SlowFusionModel(cameras={'cameraFront'}, num_frames_per_camera=4)
# Move to the GPU only when the config asks for it.
if config['cuda']['use']:
    model = model.cuda()

In [None]:
import torch.optim as optim
import torch

# Loop settings: number of passes over the training set, and how many batches
# to accumulate between two training-loss printouts.
num_epochs, log_every = 40, 100

model.train()

# Objective and optimisation.
criterion = nn.MSELoss()  # swap in nn.SmoothL1Loss() here if desired
optimizer = optim.Adam(model.parameters(), lr=1e-4)
# The scheduler is stepped with the validation loss once per epoch.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, verbose=True, patience=1)


def validate(model, criterion, data_loader):
    """Return the mean per-batch loss of ``model`` over ``data_loader``.

    The model is switched to eval mode and autograd is disabled for the
    duration of the pass, so BatchNorm statistics are not updated with
    validation data and no graphs are kept in memory. The model's previous
    train/eval mode is restored before returning.

    Args:
        model: module called as ``model(data)``.
        criterion: loss passed through to ``compute_loss``.
        data_loader: sized iterable yielding ``(data, target)`` pairs.

    Returns:
        Sum of the batch losses divided by ``len(data_loader)``.
    """
    was_training = model.training
    model.eval()
    val_running_loss = 0.0
    try:
        with torch.no_grad():
            for data, target in data_loader:
                data, target = sent_to_device(data, target, config)
                pred = model(data)
                val_running_loss += compute_loss(pred, target, criterion).item()
    finally:
        # Hand the model back in the mode the caller left it in.
        model.train(was_training)
    # Average over the loader that was actually iterated, not a global one.
    return val_running_loss / len(data_loader)


def compute_loss(prediction, target, criterion, steering_weight=2):
    """Combine the speed and steering losses into one scalar.

    Args:
        prediction: dict with ``'canSpeed'`` and ``'canSteering'`` predictions.
        target: dict with the matching ground-truth entries.
        criterion: callable ``criterion(pred, target)`` returning a loss.
        steering_weight: multiplier applied to the steering term. Defaults
            to 2, the value that used to be hard-coded.

    Returns:
        ``criterion(speed) + steering_weight * criterion(steering)``.
    """
    speed_loss = criterion(prediction['canSpeed'], target['canSpeed'])
    steering_loss = criterion(prediction['canSteering'], target['canSteering'])
    return speed_loss + steering_weight * steering_loss


def sent_to_device(data, target, config):
    """Move a batch to the GPU when ``config['cuda']['use']`` is truthy.

    Args:
        data: dict keyed by input name. A value is either a dict of
            ``index -> frame`` (each frame is moved) or a single object
            exposing ``.cuda()``.
        target: dict of ``name -> labels``; every value is moved.
        config: configuration dict providing ``config['cuda']['use']``.

    Returns:
        ``(data, target)``: the original objects when CUDA is disabled,
        otherwise new dicts holding the moved values.
    """
    if not config['cuda']['use']:
        return data, target

    moved_data = {}
    for cam, frames in data.items():
        if isinstance(frames, dict):
            moved_data[cam] = {idx: frame.cuda() for idx, frame in frames.items()}
        else:
            moved_data[cam] = frames.cuda()

    moved_target = {name: labels.cuda() for name, labels in target.items()}
    return moved_data, moved_target


# Main loop: one training pass, one validation pass and one checkpoint per epoch.
use_cuda = config['cuda']['use']

for epoch in range(num_epochs):

    # Training
    # Re-assert train mode every epoch in case evaluation code switched it.
    model.train()
    train_running_loss = 0.0
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = sent_to_device(data, target, config)
        optimizer.zero_grad()
        pred = model(data)
        loss = compute_loss(pred, target, criterion)
        loss.backward()
        optimizer.step()

        train_running_loss += loss.item()
        # Report after every `log_every` batches, so each printed average
        # covers exactly `log_every` losses (including the first window).
        if (batch_idx + 1) % log_every == 0:
            train_loss = train_running_loss / log_every
            print('[epoch: %d, batch:  %5d] training loss: %.5f' % (epoch, batch_idx + 1, train_loss))
            train_running_loss = 0.0

    # The model already lives on its device, so it is not moved again here;
    # only the CUDA cache is released, and only when CUDA is in use.
    if use_cuda:
        torch.cuda.empty_cache()

    # Validation
    val_loss = validate(model, criterion, validation_loader)
    scheduler.step(val_loss)
    print('[epoch: %d] validation loss: %.5f' % (epoch, val_loss))

    if use_cuda:
        torch.cuda.empty_cache()

    # Save model
    torch.save(model.state_dict(), f"./model-{epoch}.torch")
            

In [None]:
import numpy as np


# Checkpoint to evaluate; the file is expected at ./<model_name>.torch.
model_name = "model-25-l2"
use_cuda = config['cuda']['use']

# map_location lets a checkpoint saved on a GPU load on a CPU-only machine.
# torch.load unpickles the file, so only load checkpoints you produced yourself.
state_dict = torch.load(f"./{model_name}.torch",
                        map_location='cuda' if use_cuda else 'cpu')
model.load_state_dict(state_dict)
# Follow the same device switch as sent_to_device so the model and the
# batches always end up on the same device.
if use_cuda:
    model.cuda()

model.eval()
running_mse = 0.0
with torch.no_grad():
    for batch_idx, (data, target) in enumerate(validation_loader):
        data, target = sent_to_device(data, target, config)
        prediction = model(data)
        # Unweighted sum of the two per-batch MSEs, accumulated as a float.
        running_mse += sum(
            torch.mean((prediction[key] - target[key]) ** 2).item()
            for key in ('canSpeed', 'canSteering')
        )
        if batch_idx % 100 == 0:
            print(batch_idx)
    print("MSE:", running_mse / len(validation_loader))


In [None]:
# Target post-processing settings read from the 'target' section of the
# config: whether predictions are de-normalised, and the per-target mean/std.
normalize_targets, target_mean, target_std = (
    config['target'][key] for key in ('normalize', 'mean', 'std')
)

def add_results(results, output):
    """Append one batch of predictions to ``results``.

    Reads the module-level ``normalize_targets``, ``target_std`` and
    ``target_mean``. When ``normalize_targets`` is truthy, each prediction is
    mapped back with ``value * std + mean`` before being stored.

    Args:
        results: dict with list values under ``'canSteering'`` and
            ``'canSpeed'``; extended in place.
        output: dict with tensors under the same two keys.
    """
    # atleast_1d keeps a single-sample batch iterable: np.squeeze returns a
    # 0-d array for it, np.isscalar is False for 0-d arrays, and extending a
    # list with a 0-d array raises TypeError.
    steering = np.atleast_1d(np.squeeze(output['canSteering'].detach().cpu().numpy()))
    speed = np.atleast_1d(np.squeeze(output['canSpeed'].detach().cpu().numpy()))
    if normalize_targets:
        steering = (steering * target_std['canSteering']) + target_mean['canSteering']
        speed = (speed * target_std['canSpeed']) + target_mean['canSpeed']
    results['canSteering'].extend(steering)
    results['canSpeed'].extend(speed)


In [None]:
from datetime import datetime

import pandas as pd


# Output file name: model name plus a timestamp.
label = datetime.now().strftime("%d-%m-%Y--%H:%M:%S")
file = f'./submission--{model_name}--{label}.csv'
results = {'canSteering': [],
           'canSpeed': []}

# NOTE(review): l1_model and l2_model are not defined in any visible cell of
# this notebook, so this cell raises NameError on a fresh kernel. TODO: add a
# cell that builds both models and loads their checkpoints before this one.
ensemble = (l1_model, l2_model)
for member in ensemble:
    # Inference mode: BatchNorm must use its running statistics here.
    member.eval()

with torch.no_grad():
    for data, target in test_loader:
        data, target = sent_to_device(data, target, config)
        member_predictions = [member(data) for member in ensemble]
        # Plain average of the members' outputs, per target.
        prediction = {
            key: sum(p[key] for p in member_predictions) / len(member_predictions)
            for key in ('canSpeed', 'canSteering')
        }
        add_results(results, prediction)

df = pd.DataFrame.from_dict(results)
df.to_csv(file, index=False)