In [16]:
# import packages
import os, gc
import zarr
import numpy as np 
import pandas as pd 
from tqdm import tqdm
from typing import Dict
from collections import Counter
from prettytable import PrettyTable

In [17]:
#level5 toolkit
#We are required to use the L5Kit toolkit provided by the competition host
#to prepare/preprocess data, train and evaluate the model
from l5kit.data import PERCEPTION_LABELS
from l5kit.dataset import EgoDataset, AgentDataset
from l5kit.data import ChunkedDataset, LocalDataManager

# level5 toolkit 
from l5kit.configs import load_config_data
from l5kit.geometry import transform_points
from l5kit.rasterization import build_rasterizer
from l5kit.visualization import draw_trajectory, draw_reference_trajectory, TARGET_POINTS_COLOR
from l5kit.evaluation import write_pred_csv, compute_metrics_csv, read_gt_csv, create_chopped_dataset
#L5Kit is a library which lets you:
#Load driving scenes from zarr files
#Read semantic maps
#Read aerial maps
#Create birds-eye-view (BEV) images which represent a scene around an AV or another vehicle
#Sample data
#Train neural networks
#Visualize results

In [18]:
# visualization
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import animation
from colorama import Fore, Back, Style

In [19]:
# deep learning
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
from torchvision.models.resnet import resnet18, resnet50, resnet34

In [20]:
# Use the first CUDA GPU when PyTorch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
device

device(type='cpu')

In [21]:
'''
batch size
The batch size defines the number of samples that will be propagated through the network.

For instance, let's say you have 1050 training samples and you want to set up a batch_size equal to 100.
The algorithm takes the first 100 samples (from 1st to 100th) from the training dataset and trains the network. 
Next, it takes the second 100 samples (from 101st to 200th) and trains the network again.
We can keep doing this procedure until we have propagated all samples through the network.
A problem might arise with the last set of samples.
In our example, we've used 1050 which is not divisible by 100 without remainder. 
The simplest solution is just to get the final 50 samples and train the network.

Advantages of using a batch size < number of all samples:

It requires less memory. Since you train the network using fewer samples, 
the overall training procedure requires less memory. 
That's especially important if you are not able to fit the whole dataset in your machine's memory.

Typically networks train faster with mini-batches. 
That's because we update the weights after each propagation. 
In our example we've propagated 11 batches (10 of them had 100 samples and 1 had 50 samples) 
and after each of them we've updated our network's parameters. 
If we used all samples during propagation we would make only 1 update for the network's parameter.

Disadvantages of using a batch size < number of all samples:

The smaller the batch the less accurate the estimate of the gradient will be. 
In the figure below, you can see that the direction of the mini-batch gradient (green color)
fluctuates much more in comparison to the direction of the full batch gradient (blue color).

shuffle
Shuffling data serves the purpose of reducing variance and making sure that models remain general and overfit less.

The obvious case where you'd shuffle your data is if your data is sorted by their class/target. 
Here, you will want to shuffle to make sure that your training/test/validation sets are representative of 
the overall distribution of the data.

For batch gradient descent, the same logic applies. The idea behind batch gradient descent is 
that by calculating the gradient on a single batch, you will usually get a fairly good estimate 
of the "true" gradient. That way, you save computation time by not having to calculate the "true"
gradient over the entire dataset every time.

You want to shuffle your data after each epoch because you will always have the risk 
to create batches that are not representative of the overall dataset, and therefore, 
your estimate of the gradient will be off. Shuffling your data after each epoch ensures 
that you will not be "stuck" with too many bad batches.

In regular stochastic gradient descent, when each batch has size 1, you still want to shuffle your data 
after each epoch to keep your learning general. Indeed, if data point 17 is always used after data point 16, 
its own gradient will be biased with whatever updates data point 16 is making on the model. By shuffling your data,
you ensure that each data point creates an "independent" change on the model, without being biased
by the same points before them.

num_workers
To speed up the training process, we will make use of the num_workers optional attribute of the DataLoader class.

The num_workers attribute tells the data loader instance how many sub-processes to use for data loading.
By default, the num_workers value is set to zero, and a value of zero tells the loader to load the data 
inside the main process.

This means that the training process will work sequentially inside the main process. 
After a batch is used during the training process and another one is needed, we read the batch data from disk.

Now, if we have a worker process, we can make use of the fact that our machine has multiple cores. 
This means that the next batch can already be loaded and ready to go by the time the main process 
is ready for another batch. 
This is where the speed up comes from. The batches are loaded using additional worker processes 
and are queued up in memory.

resnet 
Deeper neural networks are more difficult to train. Why? One big problem of a deep network is
the vanishing gradient problem. Basically, the deeper the harder to train.
To solve this problem, ResNet proposes to use a reference to the previous layer to 
compute the output at a given layer. In ResNet, the output from the previous layer, called the residual,
is added to the output of the current layer. 

A residual network, or ResNet for short, is an artificial neural network that helps 
to build deeper neural network by utilizing skip connections or shortcuts to jump over some layers.
skipping helps build deeper network layers without falling into the problem of vanishing gradients.
There are different versions of ResNet, including ResNet-18, ResNet-34, ResNet-50, 
and so on. The numbers denote layers, although the architecture is the same.

'''

'\nbatch size\nThe batch size defines the number of samples that will be propagated through the network.\n\nFor instance, let\'s say you have 1050 training samples and you want to set up a batch_size equal to 100.\nThe algorithm takes the first 100 samples (from 1st to 100th) from the training dataset and trains the network. \nNext, it takes the second 100 samples (from 101st to 200th) and trains the network again.\nWe can keep doing this procedure until we have propagated all samples through of the network.\nProblem might happen with the last set of samples.\nIn our example, we\'ve used 1050 which is not divisible by 100 without remainder. \nThe simplest solution is just to get the final 50 samples and train the network.\n\nAdvantages of using a batch size < number of all samples:\n\nIt requires less memory. Since you train the network using fewer samples, \nthe overall training procedure requires less memory. \nThat\'s especially important if you are not able to fit the whole dataset

In [22]:
# When True, training is cut down to 100 steps (see 'max_num_steps' below).
DEBUG = False


def _history_future_params():
    """Return a fresh dict with the history/future frame settings used by both configs."""
    return {
        'history_num_frames': 10,
        'history_step_size': 1,
        'history_delta_time': 0.1,
        'future_num_frames': 50,
        'future_step_size': 1,
        'future_delta_time': 0.1,
    }


def _raster_params():
    """Return a fresh dict with the raster settings; training and inference use the same values."""
    return {
        # set by yourself: 300*300, 350*350, 450*450, 600*600
        'raster_size': [300, 300],
        # raster's spatial resolution [meters per pixel]:
        # the size in the real world one pixel corresponds to.
        'pixel_size': [0.5, 0.5],
        # From 0 to 1 per axis, [0.5,0.5] would show the ego centered in the image.
        'ego_center': [0.25, 0.5],
        'map_type': "py_semantic",

        # the keys are relative to the dataset environment variable
        'satellite_map_key': "aerial_map/aerial_map.png",
        'semantic_map_key': "semantic_map/semantic_map.pb",
        'dataset_meta_key': "meta.json",

        # e.g. 0.0 include every obstacle, 0.5 show those obstacles with >0.5 probability of being
        # one of the classes we care about (cars, bikes, peds, etc.), >=1.0 filter all other agents.
        'filter_agents_threshold': 0.5,
    }


# training cfg
training_cfg = {
    'format_version': 4,

    ## Model options ('model_architecture' comes first, then the shared frame settings)
    'model_params': {'model_architecture': 'resnet34', **_history_future_params()},

    ## Input raster parameters
    'raster_params': _raster_params(),

    ## Data loader options
    'train_data_loader': {
        'key': "scenes/train.zarr",
        'batch_size': 16,  # set by yourself, 8, 16, 32
        'shuffle': True,
        'num_workers': 4,
    },

    ## Train params
    'train_params': {
        'checkpoint_every_n_steps': 5000,
        # set by yourself, 10000, 20000, 25000, 30000, 75000
        'max_num_steps': 100 if DEBUG else 20000,
    },
}

# inference cfg
inference_cfg = {
    'format_version': 4,
    'model_params': _history_future_params(),
    'raster_params': _raster_params(),  # same as train
    'test_data_loader': {
        'key': 'scenes/test.zarr',
        'batch_size': 16,
        'shuffle': False,
        'num_workers': 4,
    },
}

In [23]:
#Couple of things to note:

#model_architecture: you can put 'resnet18', 'resnet34' or 'resnet50'.
#raster_size: specify the size of the image, the default is [224,224].
#Increase raster_size can improve the score. However the training time will be significantly longer.
#batch_size: number of inputs for one forward pass, again one of the parameters to tune.
#max_num_steps: the number of training iterations (one batch / optimizer step each), not the number of epochs.
#checkpoint_every_n_steps: the model will be saved at every n steps,
#again change this number as to how you want to keep track of the model.

In [24]:
# Root directory of the competition data.
# An L5KIT_DATA_FOLDER value already present in the environment takes precedence,
# so the notebook can run on another machine without editing this cell; the
# original author's local path is kept as the fallback.
DIR_INPUT = os.environ.get(
    "L5KIT_DATA_FOLDER",
    "/Users/shuozhang/Downloads/lyft-motion-prediction-autonomous-vehicles/",
)

# Sample submission files shipped with the data.
# os.path.join avoids the doubled "//" that f-string concatenation produced
# when DIR_INPUT already ends with a path separator.
SINGLE_MODE_SUBMISSION = os.path.join(DIR_INPUT, "single_mode_sample_submission.csv")
MULTI_MODE_SUBMISSION = os.path.join(DIR_INPUT, "multi_mode_sample_submission.csv")

# set env variable for data, then build the data manager
# (presumably LocalDataManager(None) resolves dataset keys against this variable;
# see the L5Kit docs to confirm)
os.environ["L5KIT_DATA_FOLDER"] = DIR_INPUT
dm = LocalDataManager(None)
print(training_cfg)

{'format_version': 4, 'model_params': {'model_architecture': 'resnet34', 'history_num_frames': 10, 'history_step_size': 1, 'history_delta_time': 0.1, 'future_num_frames': 50, 'future_step_size': 1, 'future_delta_time': 0.1}, 'raster_params': {'raster_size': [300, 300], 'pixel_size': [0.5, 0.5], 'ego_center': [0.25, 0.5], 'map_type': 'py_semantic', 'satellite_map_key': 'aerial_map/aerial_map.png', 'semantic_map_key': 'semantic_map/semantic_map.pb', 'dataset_meta_key': 'meta.json', 'filter_agents_threshold': 0.5}, 'train_data_loader': {'key': 'scenes/train.zarr', 'batch_size': 16, 'shuffle': True, 'num_workers': 4}, 'train_params': {'checkpoint_every_n_steps': 5000, 'max_num_steps': 20000}}


In [25]:
# Loader settings for the training split (zarr key, batch size, shuffle flag, worker count).
train_cfg = training_cfg["train_data_loader"]

# Rasterizer built from the full training config and the data manager.
rasterizer = build_rasterizer(training_cfg, dm)

# Resolve the zarr location through the data manager, open it, and wrap it in an AgentDataset.
train_zarr_path = dm.require(train_cfg["key"])
train_zarr = ChunkedDataset(train_zarr_path).open()
train_dataset = AgentDataset(training_cfg, train_zarr, rasterizer)

# Batching, shuffling and worker count all come from the config above.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=train_cfg["batch_size"],
    shuffle=train_cfg["shuffle"],
    num_workers=train_cfg["num_workers"],
)
print(train_dataset)

+------------+------------+------------+-----------------+----------------------+----------------------+----------------------+---------------------+
| Num Scenes | Num Frames | Num Agents | Total Time (hr) | Avg Frames per Scene | Avg Agents per Frame | Avg Scene Time (sec) | Avg Frame frequency |
+------------+------------+------------+-----------------+----------------------+----------------------+----------------------+---------------------+
|   16265    |  4039527   | 320124624  |      112.19     |        248.36        |        79.25         |        24.83         |        10.00        |
+------------+------------+------------+-----------------+----------------------+----------------------+----------------------+---------------------+


In [26]:
class LyftModel(nn.Module):
    """ResNet backbone followed by two linear layers that regress future (x, y) positions.

    Args:
        cfg: configuration dict. Reads ``cfg["model_params"]["history_num_frames"]``
            and ``cfg["model_params"]["future_num_frames"]``; the optional
            ``cfg["model_params"]["model_architecture"]`` selects the backbone
            ('resnet18', 'resnet34' or 'resnet50') and defaults to 'resnet34'.

    Raises:
        ValueError: if ``model_architecture`` names an unsupported backbone.
    """

    def __init__(self, cfg):
        super().__init__()

        model_params = cfg["model_params"]

        # Honour the configured architecture instead of always building resnet34.
        # The default keeps configs without the key (e.g. the inference config) working.
        backbones = {"resnet18": resnet18, "resnet34": resnet34, "resnet50": resnet50}
        architecture = model_params.get("model_architecture", "resnet34")
        if architecture not in backbones:
            raise ValueError(
                f"Unsupported model_architecture {architecture!r}; "
                f"expected one of {sorted(backbones)}"
            )
        self.backbone = backbones[architecture](pretrained=True)

        # Two channels per frame for the current frame plus every history frame, and
        # three more on top (presumably the RGB map raster; confirm against the rasterizer).
        num_history_channels = (model_params["history_num_frames"] + 1) * 2
        num_in_channels = 3 + num_history_channels

        # Replace the stem convolution so it accepts num_in_channels inputs. All other
        # hyper-parameters are copied from the original layer; the new layer's weights
        # are freshly initialised rather than pretrained.
        self.backbone.conv1 = nn.Conv2d(
            num_in_channels,
            self.backbone.conv1.out_channels,
            kernel_size=self.backbone.conv1.kernel_size,
            stride=self.backbone.conv1.stride,
            padding=self.backbone.conv1.padding,
            bias=False,
        )

        # Width of the pooled feature vector, read from the backbone's own classifier
        # input so it is correct for every supported ResNet (512 for resnet18/34,
        # 2048 for resnet50).
        backbone_out_features = self.backbone.fc.in_features

        # X, Y coords for the future positions
        num_targets = 2 * model_params["future_num_frames"]

        # NOTE(review): head and logit are two stacked Linear layers with no activation
        # in between. Left unchanged so existing checkpoints still load and behave the same.
        self.head = nn.Sequential(
            nn.Linear(in_features=backbone_out_features, out_features=4096),
        )

        self.logit = nn.Linear(4096, out_features=num_targets)

    def forward(self, x):
        """Run the backbone (without its ``fc`` classifier) and the regression layers.

        Args:
            x: input batch with ``num_in_channels`` channels, as expected by ``conv1``.

        Returns:
            Tensor with ``2 * future_num_frames`` values per sample.
        """
        # ResNet stem
        x = self.backbone.conv1(x)
        x = self.backbone.bn1(x)
        x = self.backbone.relu(x)
        x = self.backbone.maxpool(x)

        # Residual stages
        x = self.backbone.layer1(x)
        x = self.backbone.layer2(x)
        x = self.backbone.layer3(x)
        x = self.backbone.layer4(x)

        # Global pooling, then flatten everything except the batch dimension
        x = self.backbone.avgpool(x)
        x = torch.flatten(x, 1)

        x = self.head(x)
        x = self.logit(x)

        return x

In [27]:
'''
LyftMultiModel(
  (backbone): ResNet(
    (conv1): Conv2d(25, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (layer2): Sequential(
      (0): BasicBlock(
        (conv1): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (downsample): Sequential(
          (0): Conv2d(64, 128, kernel_size=(1, 1), stride=(2, 2), bias=False)
          (1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (1): BasicBlock(
        (conv1): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (layer3): Sequential(
      (0): BasicBlock(
        (conv1): Conv2d(128, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (downsample): Sequential(
          (0): Conv2d(128, 256, kernel_size=(1, 1), stride=(2, 2), bias=False)
          (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (1): BasicBlock(
        (conv1): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (layer4): Sequential(
      (0): BasicBlock(
        (conv1): Conv2d(256, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (downsample): Sequential(
          (0): Conv2d(256, 512, kernel_size=(1, 1), stride=(2, 2), bias=False)
          (1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (1): BasicBlock(
        (conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (avgpool): AdaptiveAvgPool2d(output_size=(1, 1))
    (fc): Linear(in_features=512, out_features=1000, bias=True)
  )
  (head): Sequential(
    (0): Linear(in_features=512, out_features=4096, bias=True)
  )
  (logit): Linear(in_features=4096, out_features=303, bias=True)
)
'''

'\nLyftMultiModel(\n  (backbone): ResNet(\n    (conv1): Conv2d(25, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)\n    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n    (relu): ReLU(inplace=True)\n    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)\n    (layer1): Sequential(\n      (0): BasicBlock(\n        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n        (relu): ReLU(inplace=True)\n        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n      )\n      (1): BasicBlock(\n        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n        (bn1): BatchNorm2d(64, eps=1e-05, momentum

In [28]:
# Build the model, optimizer and loss.
# The device is chosen again here: first CUDA GPU if available, else the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

model = LyftModel(training_cfg)
model = model.to(device)

# Adam over every model parameter with a fixed learning rate.
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

# reduction="none" keeps the per-element squared errors so they can be masked later.
criterion = nn.MSELoss(reduction="none")

In [29]:
# Display the torch.device selected above (CPU or CUDA GPU).
device # shows cuda:0 when a CUDA GPU is available

device(type='cpu')

In [31]:
# Training loop: performs exactly max_num_steps optimizer steps, restarting the
# dataloader iterator whenever it runs out of batches.
train_params = training_cfg["train_params"]
# A missing or zero interval disables periodic checkpoints.
checkpoint_every_n_steps = train_params.get("checkpoint_every_n_steps")

tr_it = iter(train_dataloader)
progress_bar = tqdm(range(train_params["max_num_steps"]))

losses_train = []
running_loss_sum = 0.0  # running total so the average is not recomputed from the full list

# Nothing inside the loop changes these modes, so set them once up front.
model.train()
torch.set_grad_enabled(True)

for step in progress_bar:
    try:
        data = next(tr_it)
    except StopIteration:
        # Dataloader exhausted: start another pass over the data.
        tr_it = iter(train_dataloader)
        data = next(tr_it)

    # forward pass
    inputs = data["image"].to(device)
    target_availabilities = data["target_availabilities"].unsqueeze(-1).to(device)
    targets = data["target_positions"].to(device)

    outputs = model(inputs).reshape(targets.shape)
    loss = criterion(outputs, targets)

    # not all the output steps are valid, but we can filter them out from the loss using availabilities
    # NOTE(review): the mean still divides by every element, masked ones included.
    loss = loss * target_availabilities
    loss = loss.mean()

    # Backward pass
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    loss_value = loss.item()
    losses_train.append(loss_value)
    running_loss_sum += loss_value

    progress_bar.set_description(
        f"loss: {loss_value} loss(avg): {running_loss_sum / len(losses_train)}"
    )

    # Periodic checkpoint so an interrupted run does not lose all progress.
    # Step-numbered file names avoid overwriting any existing 'model_state_last.pth'.
    completed_steps = step + 1
    if checkpoint_every_n_steps and completed_steps % checkpoint_every_n_steps == 0:
        torch.save(model.state_dict(), f"model_state_{completed_steps}.pth")

loss: 13.90205192565918 loss(avg): 123.70076433817546:   0%|          | 3/20000 [00:36<70:23:04, 12.67s/it] 

KeyboardInterrupt: 

In [20]:
# # save full trained model
# torch.save(model.state_dict(), f'model_state_last.pth')

In [21]:
# Loader settings for the test split.
test_cfg = inference_cfg["test_data_loader"]

# NOTE: this rebinds the notebook-level `rasterizer`, now built from the inference config.
rasterizer = build_rasterizer(inference_cfg, dm)

# Open the test zarr resolved through the data manager.
test_zarr_path = dm.require(test_cfg["key"])
test_zarr = ChunkedDataset(test_zarr_path).open()

# Agent mask stored next to the scenes; it is passed to AgentDataset as agents_mask.
test_mask = np.load(f"{DIR_INPUT}/scenes/mask.npz")["arr_0"]
test_dataset = AgentDataset(inference_cfg, test_zarr, rasterizer, agents_mask=test_mask)

test_dataloader = DataLoader(
    test_dataset,
    batch_size=test_cfg["batch_size"],
    shuffle=test_cfg["shuffle"],
    num_workers=test_cfg["num_workers"],
)


print(test_dataloader)

<torch.utils.data.dataloader.DataLoader object at 0x7fa573f64a20>


In [22]:
# State dict saved by the training notebook.
WEIGHT_FILE = 'model_state_last.pth'

# map_location places the stored tensors on the active device.
# NOTE(review): torch.load unpickles the file, so only load checkpoints you trust.
model_state = torch.load(f=WEIGHT_FILE, map_location=device)

# Left as the last expression so the cell displays the key-matching report.
model.load_state_dict(state_dict=model_state)

<All keys matched successfully>

In [23]:
device # shows cuda:0 when a CUDA GPU is available, cpu otherwise

device(type='cpu')

In [24]:
# Inference over the test dataloader: collect predictions with their timestamps and track ids.
model.eval()

# store information for evaluation
future_coords_offsets_pd = []
timestamps = []
agent_ids = []

progress_bar = tqdm(test_dataloader)

# torch.no_grad() disables autograd only for this loop, rather than switching it
# off globally for every cell that runs afterwards.
with torch.no_grad():
    for data in progress_bar:
        inputs = data["image"].to(device)

        # Only the shape of the target tensor is needed to reshape the flat model
        # output, so neither the targets nor the availabilities are moved to the device.
        outputs = model(inputs).reshape(data["target_positions"].shape)

        # .copy() detaches each array from memory that may be shared with the tensor.
        future_coords_offsets_pd.append(outputs.cpu().numpy().copy())
        timestamps.append(data["timestamp"].numpy().copy())
        agent_ids.append(data["track_id"].numpy().copy())

100%|██████████| 4446/4446 [4:51:34<00:00,  2.72s/it]  


In [25]:
# Join the per-batch arrays into one array per field, then write the prediction CSV.
submission_arrays = {
    "timestamps": np.concatenate(timestamps),
    "track_ids": np.concatenate(agent_ids),
    "coords": np.concatenate(future_coords_offsets_pd),
}
write_pred_csv('xx.csv', **submission_arrays)  # name your own submission file