<center><img src='fig/YOLO-v3-network.jpg'></center>
<center><img src='fig/YOLOv3_layers.png'></center>

In [None]:
"""
Implementation of YOLOv3 architecture
"""

import torch
import torch.nn as nn

""" 
Information about the architecture config:
A tuple is structured as (filters, kernel_size, stride).
Every conv is a same convolution.
A list is structured as ["B", num_repeats]: "B" indicates a residual block, followed by the number of repeats.
"S" is for a scale prediction block and computing the YOLO loss.
"U" is for upsampling the feature map and concatenating with a previous layer.
"""
# Layer-by-layer description of the network, read from first entry to last.
#   tuple (out_channels, kernel_size, stride) -> one convolution
#   list  ["B", num_of_repeats]               -> residual block, repeated
#   "S"                                       -> scale prediction block
#   "U"                                       -> upsample the feature map and
#                                                concatenate with a previous layer
config = [
    # ---- Darknet-53 ----
    (32, 3, 1),
    (64, 3, 2), ["B", 1],
    (128, 3, 2), ["B", 2],
    (256, 3, 2), ["B", 8],
    (512, 3, 2), ["B", 8],
    (1024, 3, 2), ["B", 4],
    # ---- scaled prediction 1 ----
    (512, 1, 1), (1024, 3, 1), "S",
    # ---- upsampling, then scaled prediction 2 ----
    (256, 1, 1), "U",
    (256, 1, 1), (512, 3, 1), "S",
    # ---- upsampling, then scaled prediction 3 ----
    (128, 1, 1), "U",
    (128, 1, 1), (256, 3, 1), "S",
]

# CNN block
class CNNBlock(nn.Module):
    """Convolution optionally followed by batch norm and LeakyReLU.

    Args:
        in_channels: Number of channels of the input feature map.
        out_channels: Number of channels produced by the convolution.
        bn_act: When True (default), ``forward`` applies
            conv -> batch norm -> LeakyReLU(0.1). When False, only the
            convolution is applied (used for scaled predictions, which are
            outputs and should not be normalised or activated).
        **kwargs: Forwarded unchanged to ``nn.Conv2d`` (for example
            ``kernel_size``, ``stride``, ``padding``).
    """

    def __init__(self, in_channels: int, out_channels: int, bn_act: bool = True, **kwargs):
        super().__init__()
        # With batch normalization a conv bias is an unnecessary parameter,
        # so the bias is enabled only when bn_act is False.
        self.conv = nn.Conv2d(in_channels, out_channels, bias=not bn_act, **kwargs)
        # Created unconditionally; forward() only uses them when bn_act is True.
        self.bn = nn.BatchNorm2d(out_channels)
        self.leaky = nn.LeakyReLU(0.1)  # activation function
        self.use_bn_act = bn_act

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the convolution, plus batch norm and LeakyReLU if enabled."""
        if self.use_bn_act:
            return self.leaky(self.bn(self.conv(x)))
        # Scaled predictions are outputs: return the raw convolution result.
        return self.conv(x)

class ResidualBlock(nn.Module):
    """Stack of ``num_repeats`` two-convolution units with an optional skip connection.

    Each unit is a 1x1 ``CNNBlock`` that halves the number of channels
    followed by a 3x3 ``CNNBlock`` (padding 1) that brings it back to
    ``channels``.

    Args:
        channels: Number of input (and output) channels of every unit.
        use_residual: When True (default), the input of each unit is added to
            its output (skip connection). When False, the units are simply
            applied one after another.
        num_repeats: How many units to stack; corresponds to the number in
            the ``["B", num_repeats]`` config entries.
    """

    def __init__(self, channels: int, use_residual: bool = True, num_repeats: int = 1):
        super().__init__()
        # One Sequential per repeat: reduce the number of filters, then
        # bring it back again.
        self.layers = nn.ModuleList(
            [
                nn.Sequential(
                    CNNBlock(channels, channels // 2, kernel_size=1),
                    CNNBlock(channels // 2, channels, kernel_size=3, padding=1),
                )
                for _ in range(num_repeats)
            ]
        )
        self.use_residual = use_residual
        self.num_repeats = num_repeats

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run ``x`` through every unit, adding the skip connection if enabled."""
        for layer in self.layers:
            out = layer(x)
            # The units use same padding and restore the channel count, so
            # the input can be added to the output directly.
            x = out + x if self.use_residual else out
        return x
                
        
        
            
        
class ScalePrediction(nn.Module):
    # Empty placeholder: no layers and no forward() are defined yet.
    # NOTE(review): presumably the module behind the "S" (scale prediction)
    # entries of the architecture config -- confirm once implemented.
    pass

class YOLOv3(nn.Module):
    # Empty placeholder: no layers and no forward() are defined yet.
    # NOTE(review): presumably the full network assembled from the
    # architecture config -- confirm once implemented.
    pass

