# Create Dataset object 
- First, list all image (.png) and label (.txt) files 
- Augment the images to create more variability in the training data 
    - essential to avoid overfitting the model 
- Get the image, bounding-box info, and labels from the files 
    - target must be in the form: target = {'boxes': xxx, 'labels': xxx} 
    


In [0]:
import numpy as np 
import torch 
from torchvision import transforms
from PIL import Image 
import os 
import random 
from torch.utils.data import Dataset, DataLoader

#####################################
# image augmentation library 
# https://github.com/aleju/imgaug
# pip install git+https://github.com/aleju/imgaug.git
#####################################
import imgaug as ia 
import imgaug.augmenters as iaa 

class KITTI(Dataset):
    """KITTI 2D object-detection dataset yielding ``(image, target)`` pairs.

    Images are read from ``<basedir>/training/image_2`` (``.png``) and
    annotations from ``<basedir>/label_2`` (``.txt``).  Only samples that have
    both an image and a label file with the same stem are kept, so index ``i``
    always refers to a matching image/label pair.

    Each item is ``(image, target)`` where ``target`` is
    ``{'boxes': FloatTensor[N, 4], 'labels': Int64Tensor[N]}`` and every box
    is ``[left, top, right, bottom]`` as written in the label file.
    """

    def __init__(self, basedir, transforms=transforms.ToTensor()):
        """
        Args
            basedir: root directory containing ``training/image_2`` and ``label_2``
            transforms: callable applied to the augmented PIL image, or None
                to return the PIL image unchanged
        """
        self.basedir = basedir
        self.transforms = transforms

        self._image_dir = os.path.join(self.basedir, 'training', 'image_2')
        self._label_dir = os.path.join(self.basedir, 'label_2')

        # Pair files by stem rather than by sorted position, so a missing or
        # extra file in either directory cannot silently misalign the pairs.
        image_stems = {os.path.splitext(name)[0]
                       for name in os.listdir(self._image_dir)
                       if name.endswith('.png')}
        label_stems = {os.path.splitext(name)[0]
                       for name in os.listdir(self._label_dir)
                       if name.endswith('.txt')}
        common = image_stems & label_stems
        self.images = [stem + '.png' for stem in common]
        self.labels = [stem + '.txt' for stem in common]
        self.images.sort()
        self.labels.sort()

        # class type to integer
        # NOTE(review): torchvision detection models appear to reserve label 0
        # for background, which would make 'Car' indistinguishable from it.
        # Confirm, and if so shift these to 1..9 together with num_classes.
        self.CLASS = {'Car': 0, 'Van': 1, 'Truck': 2, 'Pedestrian': 3,
                      'Person_sitting': 4, 'Cyclist': 5, 'Tram': 6,
                      'Misc': 7, 'DontCare': 8}
        self.classes = ['Car', 'Van', 'Truck', 'Pedestrian', 'Person_sitting',
                        'Cyclist', 'Tram', 'Misc', 'DontCare']

    def __len__(self):
        """Return the number of image/label pairs."""
        return len(self.labels)

    def augment(self, image):
        """
        Augments a single image with 0 to 2 randomly chosen augmenters.

        Args
            image: RGB PIL image ---> converted TO numpy
        Returns
            aug: PIL image ----> converted FROM numpy
        """
        image = np.asarray(image)  # convert PIL image to numpy array
        # Upper bound (in px) for the crop augmenter: 0 for negative draws,
        # otherwise the drawn value in 0..5.
        value = max(0, random.randint(-5, 5))
        seq = iaa.Sequential([
            # The augmenters must be passed as SomeOf's children; with no
            # children SomeOf does nothing and every augmenter below would be
            # applied unconditionally to every image.
            iaa.SomeOf((0, 2), [
                iaa.Emboss(alpha=(0, 1.0), strength=(0, 0.75)),  # emboss images
                iaa.OneOf([
                    iaa.GaussianBlur((0, 2.0)),  # blur images with a sigma between 0 and 2.0
                    iaa.AverageBlur(k=(5, 7)),  # blur image using local means with kernel sizes between 5 and 7
                    iaa.MedianBlur(k=(3, 11)),  # blur image using local medians with kernel sizes between 3 and 11
                ]),
                iaa.OneOf([
                    # either change the brightness of the whole image (sometimes
                    # per channel) or add gaussian noise to the image
                    iaa.Multiply((0.8, 1.2), per_channel=0.5),
                    iaa.AdditiveGaussianNoise(loc=0, scale=(0.0, 0.05*255), per_channel=0.5),
                ]),
                iaa.OneOf([
                    iaa.Dropout(p=0.05, per_channel=True),
                    # crop images from each side by 0 to `value` px (randomly chosen)
                    # NOTE(review): cropping moves image content but the
                    # bounding boxes are not adjusted to match; confirm this
                    # small offset is acceptable.
                    iaa.Crop(px=(0, value)),
                ]),
            ]),
        ])

        img = seq.augment_image(image)
        img = Image.fromarray(img.astype('uint8'), 'RGB')  # convert numpy array to PIL image
        return img

    def __getitem__(self, idx):
        """
        Load, augment and transform one sample.

        Args
            idx: index into the sorted list of image/label pairs
        Returns
            (image, target): image after augmentation and ``self.transforms``;
            target = {'boxes': FloatTensor[N, 4], 'labels': Int64Tensor[N]}
        Raises
            ValueError: if a label line names a class missing from ``self.CLASS``
                or has fewer than four box coordinates
        """
        image_path = os.path.join(self._image_dir, self.images[idx])
        label_path = os.path.join(self._label_dir, self.labels[idx])

        # Get the image; convert() loads the pixel data so the file can be
        # closed right away, and guarantees the 3 channels augment() expects.
        with Image.open(image_path) as raw:
            image = raw.convert('RGB')
        image = self.augment(image)
        if self.transforms is not None:
            image = self.transforms(image)

        # Get the bounding box and class types
        bbox = []
        types = []
        with open(label_path, 'r') as label_file:
            for line in label_file:
                elem = line.split()
                if not elem:
                    continue  # tolerate blank lines
                # Field 0 is the class name, fields 4..7 are the box corners.
                left, top, right, bottom = (float(v) for v in elem[4:8])
                if elem[0] not in self.CLASS:
                    raise ValueError('Unknown class {!r} in {}'.format(elem[0], label_path))
                bbox.append([left, top, right, bottom])
                types.append(self.CLASS[elem[0]])

        # convert to tensor; reshape keeps boxes 2-D for a single object
        # ((1, 4)) and for an empty label file ((0, 4)).
        boxes = torch.as_tensor(bbox, dtype=torch.float32).reshape(-1, 4)
        labels = torch.as_tensor(types, dtype=torch.int64)

        # get target into right format
        target = {'boxes': boxes, 'labels': labels}
        return (image, target)
        

# ### Testing ### 
# basedir = '/content/drive/My Drive/Personal/2Dobject_detection/data'
# ds = KITTI(basedir)
# a = ds.__len__()
# b = ds.__getitem__(65)


# Finetune a pretrained model 


In [0]:
import torchvision 
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor

# Start from torchvision's Faster R-CNN (ResNet-50 FPN) with pretrained weights.
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
# One output per entry in KITTI.CLASS (9 entries).
# NOTE(review): torchvision detection heads appear to count background as
# class 0 — confirm whether this should be 10 with labels shifted to 1..9.
num_classes = 9 
# Input width of the existing classification layer, reused for the new head.
in_features = model.roi_heads.box_predictor.cls_score.in_features
# Replace the box predictor with a fresh head sized for num_classes.
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)


# Train model 

In [0]:
import torch.optim as optim 

# Root directory of the KITTI data (presumably a Google Drive mount in Colab — confirm).
basedir = '/content/drive/My Drive/Personal/2Dobject_detection/data'
ds = KITTI(basedir)
# batch_size=1 with the default collate function: each batch is a single
# sample with a leading batch dimension added to the image and target tensors.
train_loader = DataLoader(dataset=ds, batch_size=1, shuffle=True)
# Adam over every model parameter (backbone included) at a fixed learning rate.
optimizer = optim.Adam(model.parameters(), lr=.001)
# lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)

In [0]:
from tqdm import tqdm 

def train_one_epoch(model, optimizer, device, data_loader=None):
    """Run one optimisation pass over the training data.

    Args
        model: detection model called as ``model(images, targets)`` in training
            mode and returning a dict of loss tensors
        optimizer: optimizer over ``model``'s parameters
        device: torch device the images and targets are moved to
        data_loader: iterable of ``(images, targets)`` batches; defaults to the
            notebook-level ``train_loader`` when omitted
    Returns
        The loss dict of the last batch, or None if the loader was empty.

    Assumes batches of size 1: the collated target tensors carry a leading
    batch dimension that is dropped before being handed to the model.
    """
    if data_loader is None:
        data_loader = train_loader  # notebook-level loader

    model.train()  # set once; nothing inside the loop leaves training mode
    output = None  # stays None when the loader yields no batches
    for i, (images, targets) in enumerate(data_loader):
        optimizer.zero_grad()
        images = [image.to(device) for image in images]

        # The default collate function stacks each target tensor, adding a
        # leading batch dimension: boxes (N, 4) -> (1, N, 4) and
        # labels (N,) -> (1, N).  Drop it again so the model receives one
        # target dict per image.
        # TODO: use a collate_fn so images of different sizes can be batched.
        sample = dict(targets)  # do not mutate the loader's batch in place
        if sample['boxes'].dim() == 3:
            size = sample['boxes'].shape
            sample['boxes'] = sample['boxes'].reshape((size[1], size[2]))
        if sample['labels'].dim() == 2:
            sample['labels'] = sample['labels'].reshape((sample['labels'].shape[1]))

        targets = [{k: v.to(device) for k, v in sample.items()}]
        output = model(images, targets)
        losses = sum(loss for loss in output.values())
        losses.backward()
        optimizer.step()
        if i % 50 == 0:
            print('Loss: {}'.format(losses))
    return output



def train(model, optimizer, device, num_epochs=10):
    """Move ``model`` to ``device`` and run ``num_epochs`` training epochs.

    Args
        model: model to train
        optimizer: optimizer passed through to ``train_one_epoch``
        device: torch device the model is moved to
        num_epochs: number of passes to run (progress shown with tqdm)
    """
    model.to(device)
    epochs = range(1, num_epochs + 1)
    for _ in tqdm(epochs):
        train_one_epoch(model, optimizer, device)


# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Fine-tune for 15 epochs; the loss is printed every 50 batches.
train(model, optimizer, device, num_epochs=15)






Loss: 1.2268264293670654
Loss: 0.19299210608005524
Loss: 0.6901226043701172
Loss: 0.42334648966789246
Loss: 0.6255536675453186
Loss: 0.6094328761100769
Loss: 2.276123523712158
Loss: 5.634336471557617
Loss: 1.792634129524231
Loss: 9.558202743530273
Loss: 89.03678894042969
Loss: 8.754332542419434
Loss: 43.03195571899414
Loss: 1.3514418601989746
Loss: 0.7985383868217468
Loss: 0.7064328789710999
Loss: 0.6408401727676392
Loss: 0.6467110514640808
Loss: 0.9968171119689941
Loss: 0.5014932751655579
Loss: 1.2766413688659668
Loss: 0.5586798191070557
Loss: 0.6295728087425232
Loss: 0.5943513512611389
Loss: 0.47257745265960693
Loss: 0.5654472708702087
Loss: 0.6523834466934204
Loss: 0.6639809012413025
Loss: 0.8757748007774353
Loss: 0.47696396708488464
Loss: 0.7971283197402954
Loss: 0.6320591568946838
Loss: 0.3628688454627991
Loss: 0.4482140839099884
Loss: 0.5003513097763062
Loss: 0.3706916570663452
Loss: 0.7949826717376709
Loss: 0.3964989185333252
Loss: 0.38971754908561707
Loss: 0.48299115896224976
L




  7%|▋         | 1/15 [6:31:53<91:26:26, 23513.35s/it]

Loss: 0.34173908829689026
Loss: 0.24417048692703247
Loss: 1.0455904006958008
Loss: 0.5900596380233765
Loss: 0.8083372116088867
Loss: 0.24932146072387695
Loss: 0.4139249324798584
Loss: 0.42718467116355896
Loss: 0.3828098773956299
Loss: 0.640572726726532
Loss: 0.2501707673072815
Loss: 1.2918858528137207
Loss: 0.6096632480621338
Loss: 0.9390166401863098
Loss: 0.4835008978843689
Loss: 1.6715396642684937
Loss: 0.3358192443847656
Loss: 0.26861003041267395
Loss: 0.8295992612838745
Loss: 0.821998119354248
Loss: 0.7044740915298462
Loss: 0.23923178017139435
Loss: 0.37356552481651306
Loss: 0.5223569869995117
Loss: 0.22065499424934387
Loss: 0.44175073504447937
Loss: 1.2367305755615234
Loss: 0.5988364219665527
Loss: 0.5182716846466064
Loss: 1.068030834197998
Loss: 0.30778953433036804
Loss: 0.7365075945854187
Loss: 0.2532760798931122
Loss: 0.6996801495552063
Loss: 0.24263402819633484
Loss: 0.3704892098903656
Loss: 0.9332658052444458
Loss: 0.3760693073272705
Loss: 0.5687687397003174
Loss: 1.860356807