In [1]:
import os
from tqdm import tqdm
import yaml
import xml.etree.ElementTree as ET
from PIL import Image
import random
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Deep learning
from torch.utils.data.dataset import Dataset
from torch.utils.data.dataloader import DataLoader
import torchvision
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as T

# Project components
from Transformer_RPN.components.vision_transformer import VisionTransformer
from Transformer_RPN.components.region_proposal_network import RegionProposalNetwork
from Transformer_RPN.components.roi_head import ROIHead

In [2]:
# Switch to the parent directory so that relative paths such as
# 'config/config.yaml' resolve. The guard keeps the cell idempotent:
# re-running it no longer climbs a second directory level.
if not os.path.exists("config/config.yaml"):
    os.chdir("../")

In [3]:
def read_yaml(path):
    """Load a YAML file and return its parsed content.

    Args:
        path: Path of the YAML file to read.

    Returns:
        Whatever ``yaml.safe_load`` produces for the document.
    """
    # Explicit encoding: the default used by open() depends on the locale.
    with open(path, encoding="utf-8") as yaml_file:
        return yaml.safe_load(yaml_file)

In [4]:
# Parse the project configuration file (path is relative to the working directory).
content = read_yaml('config/config.yaml')

In [5]:
# Keep only the 'data_ingestion' section of the configuration.
data_ingestion = content['data_ingestion']

In [6]:
# Display the data-ingestion settings (image dir, XML dir, class names).
data_ingestion

{'img_dir': 'Data/Strawberry data/strawberry_data',
 'xml_dir': 'Data/Strawberry data/strawberry_labels',
 'class': ['angular_leafspot',
  'anthracnose_fruit_rot',
  'gray_mold',
  'leaf_scorch',
  'leaf_spot']}

In [7]:
class LoadData:
    """Collect image paths and bounding boxes from XML annotation files.

    Annotations are read from ``xml_dir/<folder>/<id>.xml`` and the matching
    image path is built as ``img_dir/<folder>/<id>.jpg``.

    Args:
        img_dir: Root directory of the images.
        xml_dir: Root directory of the XML annotation files.
        label2idx: Mapping from class name (text of the ``<name>`` tag) to
            the label value stored in each detection.
    """

    def __init__(self, img_dir, xml_dir, label2idx):
        self.img_dir = img_dir
        self.xml_dir = xml_dir
        self.label2idx = label2idx
        self.img_infos = []

    def _list_xml_files(self):
        """Return the sorted paths of all ``.xml`` files one level below ``xml_dir``."""
        xml_files = []
        # Sorted for a deterministic record order; os.listdir order is arbitrary.
        for folder in sorted(os.listdir(self.xml_dir)):
            folder_path = os.path.join(self.xml_dir, folder)
            if not os.path.isdir(folder_path):
                continue  # stray file next to the per-folder directories
            for file_name in sorted(os.listdir(folder_path)):
                if file_name.lower().endswith('.xml'):
                    xml_files.append(os.path.join(folder_path, file_name))
        return xml_files

    def load(self):
        """Parse every annotation file and return the per-image records.

        Returns:
            list[dict]: one dict per XML file with the keys ``id``, ``image``,
            ``height``, ``width`` and ``detections``; each detection is
            ``{'label': <value from label2idx>, 'bbox': [x1, y1, x2, y2]}``.

        Raises:
            KeyError: if an object's class name is missing from ``label2idx``.
        """
        # Start from an empty list so a second call does not duplicate records.
        self.img_infos = []
        for file in tqdm(self._list_xml_files(), desc='Processing XML files'):
            root = ET.parse(file).getroot()
            img_id = os.path.splitext(os.path.basename(file))[0]
            # The parent directory name selects the image sub-folder; derived
            # from the path itself rather than from a fixed path depth.
            folder = os.path.basename(os.path.dirname(file))
            size = root.find('size')
            img_info = {
                'id': img_id,
                'image': os.path.join(self.img_dir, folder, '{}.jpg'.format(img_id)),
                'height': int(size.find('height').text),
                'width': int(size.find('width').text),
            }

            detections = []
            for obj in root.findall('object'):
                name = obj.find('name').text
                if name not in self.label2idx:
                    # Name the offending file instead of a bare KeyError.
                    raise KeyError(
                        'Unknown class {!r} in annotation file {}'.format(name, file)
                    )
                bndbox = obj.find('bndbox')
                # Coordinates may be written as floats; the -1 presumably
                # converts 1-based pixel coordinates to 0-based — TODO confirm.
                bbox = [
                    int(float(bndbox.find(tag).text)) - 1
                    for tag in ('xmin', 'ymin', 'xmax', 'ymax')
                ]
                detections.append({'label': self.label2idx[name], 'bbox': bbox})

            img_info['detections'] = detections
            self.img_infos.append(img_info)
        return self.img_infos

In [8]:
# Class names in alphabetical order, with 'background' prepended as index 0.
classes = ['background'] + sorted(data_ingestion['class'])
# Lookup tables in both directions between class name and integer index.
label2idx = {name: idx for idx, name in enumerate(classes)}
idx2label = dict(enumerate(classes))

In [9]:
# Parse all annotation files into a list of per-image info dicts.
load_data = LoadData(
    img_dir=data_ingestion['img_dir'],
    xml_dir=data_ingestion['xml_dir'],
    label2idx=label2idx,
)
data = load_data.load()

Processing XML files:   0%|          | 0/941 [00:00<?, ?it/s]

Processing XML files: 100%|██████████| 941/941 [00:00<00:00, 4088.10it/s]


In [10]:
# Number of image records returned by LoadData.load().
total_len_data = len(data)
total_len_data

941

In [11]:
# 70 % of the records for training, 15 % for validation;
# the test split receives whatever is left after integer truncation.
train_size = int(0.7 * total_len_data)
val_size = int(0.15 * total_len_data)
test_size = total_len_data - (train_size + val_size)
print(
    f"Train Size: {train_size}\nValidation Size : {val_size}\nTest_size: {test_size}"
)

Train Size: 658
Validation Size : 141
Test_size: 142


In [12]:
# Shuffle a copy with a fixed seed so the split is reproducible and the cell is
# idempotent: re-running it yields the same partition and `data` is left intact.
# NOTE(review): reproducibility also requires `data` to arrive in a stable order.
SPLIT_SEED = 42
shuffled_data = list(data)
random.Random(SPLIT_SEED).shuffle(shuffled_data)

val_end = train_size + val_size
train_data = shuffled_data[:train_size]
val_data = shuffled_data[train_size:val_end]
test_data = shuffled_data[val_end:]

In [13]:
class CustomDataset(Dataset):
    """Detection dataset yielding a resized image tensor and rescaled targets.

    Args:
        data: Sequence of image-info dicts, each with an ``'image'`` path and
            a ``'detections'`` list of ``{'label': ..., 'bbox': [x1, y1, x2, y2]}``.
        target_size: ``(height, width)`` every image is resized to.
    """

    def __init__(self, data, target_size=(224, 224)):
        self.target_size = target_size
        self.images_info = data
        self.transform = torchvision.transforms.Compose([
            torchvision.transforms.Resize(self.target_size),  # Resize images
            torchvision.transforms.ToTensor(),  # Convert to tensor
        ])

    def __len__(self):
        """Return the number of image records."""
        return len(self.images_info)

    def _scale_bboxes(self, detections, original_w, original_h):
        """Rescale the detections' boxes from the original size to ``target_size``.

        Returns:
            A float32 tensor of shape ``(num_detections, 4)``; the shape is
            ``(0, 4)`` when there are no detections.
        """
        # Resize() interprets a size pair as (height, width).
        target_h, target_w = self.target_size
        scale_x = target_w / original_w
        scale_y = target_h / original_h

        # Scaled coordinates are truncated to whole pixels, as before.
        scaled = [
            [int(x1 * scale_x), int(y1 * scale_y), int(x2 * scale_x), int(y2 * scale_y)]
            for x1, y1, x2, y2 in (det['bbox'] for det in detections)
        ]
        # reshape keeps the trailing dimension of 4 even for an empty list.
        return torch.as_tensor(scaled, dtype=torch.float32).reshape(-1, 4)

    def __getitem__(self, index):
        """Return ``(image_tensor, targets, image_path)`` for record ``index``.

        ``targets`` is a dict with ``'bboxes'`` (float32, ``(N, 4)``) and
        ``'labels'`` (int64, ``(N,)``).
        """
        img_info = self.images_info[index]

        # The context manager closes the file handle; convert('RGB') gives
        # three channels for grayscale, palette and RGBA files as well.
        with Image.open(img_info['image']) as raw_img:
            img = raw_img.convert('RGB')

        # Original dimensions are needed to rescale the boxes.
        original_w, original_h = img.size
        img_tensor = self.transform(img)

        detections = img_info['detections']
        targets = {
            'bboxes': self._scale_bboxes(detections, original_w, original_h),
            'labels': torch.as_tensor([d['label'] for d in detections], dtype=torch.int64),
        }

        return img_tensor, targets, img_info['image']

In [14]:
# Dataset over the training split.
train_datasets = CustomDataset(train_data)

In [15]:
# Dataset over the test split.
test_datasets = CustomDataset(test_data)

In [16]:
# Dataset over the validation split.
val_datasets = CustomDataset(val_data)

In [17]:
# batch_size=1: presumably because each image carries a different number of
# boxes, which the default collate function cannot stack — TODO confirm.
# Only the training loader shuffles; the evaluation loaders keep a fixed order
# so that results are comparable between runs.
train_dl = DataLoader(train_datasets, batch_size=1, shuffle=True, num_workers=4)
test_dl = DataLoader(test_datasets, batch_size=1, shuffle=False, num_workers=4)
val_dl = DataLoader(val_datasets, batch_size=1, shuffle=False, num_workers=4)

In [18]:
# Pull a single batch from the training loader. The loop variables `image`
# and `target` stay bound after the `break` and are reused by later cells.
for image, target, _ in tqdm(train_dl):
    print(target['bboxes'])
    print(image.shape)
    break

  0%|          | 0/658 [00:00<?, ?it/s]

tensor([[[  8., 132.,  18., 142.],
         [  0., 114.,   9., 122.],
         [ 58., 123.,  66., 132.],
         [ 90., 171.,  98., 180.],
         [ 87., 148.,  96., 155.],
         [127., 170., 134., 181.],
         [133., 154., 140., 164.],
         [116., 144., 124., 151.],
         [124., 104., 130., 114.],
         [142., 136., 162., 162.],
         [148., 126., 155., 135.],
         [174., 164., 179., 175.],
         [151., 175., 158., 183.],
         [148., 183., 155., 191.],
         [152., 183., 162., 203.],
         [103., 178., 115., 192.],
         [109., 168., 116., 178.],
         [ 84.,  73., 109., 128.],
         [116.,  33., 122.,  41.],
         [109., 105., 114., 112.],
         [ 22., 106.,  28., 112.],
         [ 71., 158.,  78., 165.],
         [188., 180., 195., 189.],
         [140., 162., 149., 173.],
         [109., 160., 115., 167.],
         [ 99., 152., 109., 164.],
         [103., 135., 111., 141.],
         [ 61., 140.,  74., 148.],
         [ 17., 125.




In [19]:
# Shape of the sample image batch.
image.shape

torch.Size([1, 3, 224, 224])

In [20]:
# Targets of the sample batch (the 'bboxes' and 'labels' tensors).
target

{'bboxes': tensor([[[  8., 132.,  18., 142.],
          [  0., 114.,   9., 122.],
          [ 58., 123.,  66., 132.],
          [ 90., 171.,  98., 180.],
          [ 87., 148.,  96., 155.],
          [127., 170., 134., 181.],
          [133., 154., 140., 164.],
          [116., 144., 124., 151.],
          [124., 104., 130., 114.],
          [142., 136., 162., 162.],
          [148., 126., 155., 135.],
          [174., 164., 179., 175.],
          [151., 175., 158., 183.],
          [148., 183., 155., 191.],
          [152., 183., 162., 203.],
          [103., 178., 115., 192.],
          [109., 168., 116., 178.],
          [ 84.,  73., 109., 128.],
          [116.,  33., 122.,  41.],
          [109., 105., 114., 112.],
          [ 22., 106.,  28., 112.],
          [ 71., 158.,  78., 165.],
          [188., 180., 195., 189.],
          [140., 162., 149., 173.],
          [109., 160., 115., 167.],
          [ 99., 152., 109., 164.],
          [103., 135., 111., 141.],
          [ 61., 1

In [21]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [22]:
# Display the selected device string.
device

'cuda'

In [23]:
# Instantiate VisionTransformer with its default arguments and move it to `device`.
vision_model = VisionTransformer().to(device=device)

In [24]:
# Forward the sample image through the vision model. `image.to(device)` returns
# a moved copy; the `image` variable itself is not reassigned.
base_feature = vision_model(image.to(device))

In [25]:
# Shape of the feature tensor produced by the vision model.
base_feature.shape

torch.Size([1, 197, 768])

In [26]:
# Load the 'rpn_params' section of params.yaml.
rpn_params = read_yaml("params.yaml")['rpn_params']

In [27]:
# Display the region-proposal-network hyperparameters.
rpn_params

{'scales': [128, 256, 512],
 'ascpect_ratios': [0.5, 1, 2],
 'low_iou_threshold': 0.3,
 'high_iou_threshold': 0.7,
 'rpn_nms_threshold': 0.7,
 'rpn_batch_size': 256,
 'rpn_prenms_train_topk': 12000,
 'rpn_prenms_test_topk': 3000,
 'rpn_train_topk': 2000,
 'rpn_test_topk': 300,
 'input_channels': 768}

In [28]:
# Build the region proposal network from the loaded hyperparameters, using the
# *train* variants of the top-k settings, then move it to `device`.
rpn_model = RegionProposalNetwork(
    ascpect_ratios=rpn_params['ascpect_ratios'],
    scales=rpn_params['scales'],
    in_channels=rpn_params['input_channels'],
    rpn_prenms_topk=rpn_params['rpn_prenms_train_topk'],
    rpn_nms_threshold=rpn_params['rpn_nms_threshold'],
    rpn_topk=rpn_params['rpn_train_topk'],
    high_iou_threshold=rpn_params['high_iou_threshold'],
    low_iou_threshold=rpn_params['low_iou_threshold'],
).to(device=device)

In [29]:
# Run the region proposal network on the sample image, its features and targets.
# NOTE(review): `image` was never reassigned to the device copy, while
# `base_feature` was computed on `device` — confirm the RPN handles this mix.
rpn_output = rpn_model(image, base_feature, target=target, device=device)

In [30]:
# Display the RPN result dict (proposals, scores and the two RPN losses).
rpn_output

{'proposals': tensor([[  0.,   0., 224., 224.]], device='cuda:0'),
 'scores': tensor([1.], device='cuda:0'),
 'rpn_classification_loss': tensor(240.4287, device='cuda:0',
        grad_fn=<BinaryCrossEntropyWithLogitsBackward0>),
 'rpn_localization_loss': tensor(1369.7397, device='cuda:0', grad_fn=<DivBackward0>)}

In [31]:
# Load the 'roi_params' section of params.yaml (the file is read again here).
roi_head_param = read_yaml('params.yaml')['roi_params']

In [32]:
# Display the ROI-head hyperparameters.
roi_head_param

{'num_classes': 6,
 'roi_batch_size': 128,
 'fc_inner_dim': 1024,
 'roi_iou_threshold': 0.5,
 'roi_low_bg_iou': 0.0,
 'roi_pool_size': 7,
 'roi_nms_threshold': 0.3,
 'roi_topk_detections': 100,
 'roi_score_threshold': 0.05,
 'roi_pos_fraction': 0.25}

In [33]:
# Build the ROI head from its config section and move it to `device`.
# in_channels is hard-coded to 768, the same value as rpn_params['input_channels'].
roi_head_model = ROIHead(
    model_config=roi_head_param,
    num_classes=roi_head_param['num_classes'],
    in_channels=768,
).to(device)

In [34]:
# Shape of the sample image batch; its last two entries are passed to the ROI head below.
image.shape

torch.Size([1, 3, 224, 224])

In [35]:
# Run the ROI head on the features and the RPN proposals;
# image.shape[-2:] passes the last two dimensions of the image batch.
frcnn_output = roi_head_model(base_feature, rpn_output['proposals'], image.shape[-2:], target, device)

[0.0625, 0.0625]


In [36]:
# Display the ROI-head result dict (classification and localization losses).
frcnn_output

{'frcnn_classification_loss': tensor(343.4810, device='cuda:0', grad_fn=<NllLossBackward0>),
 'frcnn_localization_loss': tensor(143.5847, device='cuda:0', grad_fn=<DivBackward0>)}

In [37]:
from Transformer_RPN.pipeline.model import TransformerRPN

In [44]:
# Assemble the full detector from both hyperparameter sections.
model = TransformerRPN(
    roi_head_param=roi_head_param,
    rpn_params=rpn_params,
    device=device,
    training=True,
)

In [45]:
# Move the assembled model to `device`.
model = model.to(device)

In [46]:
# Display the module hierarchy of the assembled model.
model

TransformerRPN(
  (vision_model): VisionTransformer(
    (transformer_encoder_layers): Sequential(
      (0): TransformerEncoder(
        (MSA): MultiHeadSelfAttention(
          (multihead_attention): ModuleList(
            (0-11): 12 x SelfAttention()
          )
        )
        (MLP): MultiLayerPerceptron(
          (mlp): Sequential(
            (0): Linear(in_features=768, out_features=3072, bias=True)
            (1): GELU(approximate='none')
            (2): Linear(in_features=3072, out_features=768, bias=True)
          )
        )
        (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
        (dropout3): Dropout(p=0.1, inplace=False)
      )
      (1): TransformerEncoder(
        (MSA): MultiHeadSelfAttention(
          (multihead_attention): ModuleList(
            (0-11): 12 x

In [47]:
# Forward the sample batch through the assembled model and display its output.
model(image=image, target=target, device=device)

[0.0625, 0.0625]


({'proposals': tensor([[  0.0000,   0.0000, 224.0000, 224.0000],
          [  0.0000,   0.0000, 114.9902, 224.0000]], device='cuda:0'),
  'scores': tensor([1., 0.], device='cuda:0'),
  'rpn_classification_loss': tensor(267.4563, device='cuda:0',
         grad_fn=<BinaryCrossEntropyWithLogitsBackward0>),
  'rpn_localization_loss': tensor(1607.2919, device='cuda:0', grad_fn=<DivBackward0>)},
 {'frcnn_classification_loss': tensor(324.4942, device='cuda:0', grad_fn=<NllLossBackward0>),
  'frcnn_localization_loss': tensor(125.7053, device='cuda:0', grad_fn=<DivBackward0>)})