Welcome to the final project of Deep Learning for Robot Perception, University of Michigan. This project proposes a perception system involving object detection and depth estimation to localize cutting points for grape bunches. The idea is to come up with a vision system capable of allowing robots to perceive grape stems and motion plan to grasp them.

Run some imports

In [1]:
import torch
import torch.nn as nn
import torchvision.transforms as transforms
import torchvision.models as models
from torch.utils.data import Dataset

import matplotlib.pyplot as plt

from PIL import Image

import numpy as np

import json

  from .autonotebook import tqdm as notebook_tqdm


Google drive stuff, ignore for local.

In [2]:
# %load_ext autoreload
# %autoreload 2

# from google.colab import drive
# drive.mount("/content/drive", True)

# import os
# import sys

# path = '/content/drive/My Drive/grape_net'
# sys.path.append(path)

Test methods

In [3]:
from grape_net import hello_grape_net
from helpers import hello_helpers

hello_grape_net()
hello_helpers()

hello grape_net!
hello helpers!


Load the dataset and dataloader

In [4]:
from grape_net import GrapeNet
from grape_net import GrapeDataset
from torch.utils.data import DataLoader

#DATASET HERE
img_folder = '292034_grapes/grape_dataset/img'
label_folder = '292034_grapes/grape_dataset/ann'
grape_dataset = GrapeDataset(img_folder, label_folder)

print(f'Dataset loaded: {grape_dataset.__len__()} images and labels\n')

Loaded 300 images from folder 292034_grapes/grape_dataset/img
Loaded 300 labels from folder 292034_grapes/grape_dataset/ann
Dataset loaded: 300 images and labels



In [5]:
#DATALOADER
batch_size = 2
def collate_fn(batch):
    return tuple(zip(*batch))  

train_loader = DataLoader(grape_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

print(f'DataLoader is ready with {len(train_loader)} batches')

DataLoader is ready with 75 batches


Training code

In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models

class RPN(nn.Module):
    def __init__(self, in_channels, num_anchors):
        super(RPN, self).__init__()
        self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, padding=1)
        self.objectness_pred = nn.Conv2d(in_channels, num_anchors, kernel_size=1)
        self.bbox_pred = nn.Conv2d(in_channels, num_anchors * 4, kernel_size=1)

    def forward(self, x):
        features = F.relu(self.conv(x))
        objectness_scores = self.objectness_pred(features)  
        bbox_offsets = self.bbox_pred(features)  # Predict bounding box offsets
        return objectness_scores, bbox_offsets

class GrapeJuice(nn.Module):
    def __init__(self, num_anchors=5):
        super(GrapeJuice, self).__init__()
        
        # feature extractor
        resnet = models.resnet101(pretrained=True)
        self.feature_extractor = nn.Sequential(*list(resnet.children())[:-2])  # remove the last classification layer
        
        # rpn
        self.rpn = RPN(in_channels=2048, num_anchors=num_anchors)
        
        # bbox regression
        self.bbox_regression = nn.Linear(2048 * 7 * 7, 4 * num_anchors)  # 4 for bounding box coordinates (x1, y1, x2, y2)

    def forward(self, x):
        features = self.feature_extractor(x)
        objectness_scores, bbox_offsets = self.rpn(features)
        features = features.view(features.shape[0], -1)
        bbox_offsets = self.bbox_regression(features)
        return objectness_scores, bbox_offsets
    
def calculate_loss(outputs, verbose=False):
    loss_classifier = outputs['loss_classifier']
    loss_box_reg = outputs['loss_box_reg']
    loss_objectness = outputs['loss_objectness']
    loss_rpn_box_reg = outputs['loss_rpn_box_reg']

    # Add all the losses together
    total_loss = loss_classifier + loss_box_reg + loss_objectness + loss_rpn_box_reg

    if verbose:
        print(f'classifier loss: {loss_classifier:.4f}   box reg loss: {loss_box_reg:.4f}   objectness loss: {loss_objectness:.4f}   rpn loss: {loss_rpn_box_reg:.4f}   total loss: {total_loss:.4f}')

    return total_loss


def train(model, train_loader, optimizer, num_epochs=5):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.train()
    loss_history = []

    for epoch in range(num_epochs):
        running_loss = 0.0

        for images, targets in train_loader:
            
            images = list(image.to(torch.float32) for image in images)
            targets = [t for t in targets]
            
            no_bbox = False
            for t in targets:
                if t['boxes'].shape[0] == 0:
                    no_bbox = True

            if no_bbox:
                continue
            
            optimizer.zero_grad()
            loss_dict = model(images, targets)
            loss = calculate_loss(loss_dict, verbose=True)

            # model.eval()
            # predictions = model(images)
            # print(predictions[0]['boxes'])
            # model.train()
            
            if len(loss_history) > 1 and abs(loss_history[-1] - loss) > 2:
                print('Bad loss, discarding..')
                continue
            
            loss.backward()
            
            optimizer.step()

            running_loss += loss.item() * images[0].size(0)

            loss_history.append(loss.item())

        epoch_loss = running_loss / len(train_loader.dataset)
        print(f'Average epoch loss: {epoch_loss:.4f}\n')


    with torch.no_grad():
        print(f'Trained {num_epochs} epochs..')
        plt.plot(loss_history)
        plt.title('Training Loss')
        plt.xlabel('Iteration')
        plt.ylabel('Loss')
        plt.show()    

In [7]:
import torchvision.models.detection.faster_rcnn
from torchvision.models.detection.faster_rcnn import FasterRCNN_ResNet50_FPN_V2_Weights
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
import torch.optim as optim


model = torchvision.models.detection.fasterrcnn_resnet50_fpn_v2(weights=FasterRCNN_ResNet50_FPN_V2_Weights.DEFAULT)
num_classes = 2
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

params = [p for p in model.parameters() if p.requires_grad]
optimizer = optim.Adam(params, lr=5e-4, weight_decay=5e-4)

train(model, train_loader, optimizer, num_epochs=15)

classifier loss: 0.7812   box reg loss: 0.0000   objectness loss: 5.1862   rpn loss: 6.8931   total loss: 12.8606
classifier loss: 0.0001   box reg loss: 0.0000   objectness loss: 1.9793   rpn loss: 13.8934   total loss: 15.8728
classifier loss: 0.0000   box reg loss: 0.0000   objectness loss: 1.1099   rpn loss: 6.3006   total loss: 7.4105
Bad loss, discarding..


KeyboardInterrupt: 

In [None]:
torch.save(model.state_dict(), 'grapejuice_model1.pth')

Example usage

In [None]:
import torch
import torchvision.transforms as transforms
from PIL import Image
import numpy as np

# Load the saved model
model.load_state_dict(torch.load('grapejuice_model1.pth'))
model.eval()

print('model set to eval!')

# Load and preprocess the test image
input_image = Image.open('/Users/adibalaji/Desktop/UMICH-23-24/UMICH-Winter-2024/deeprob/final/292034_grapes/grape_dataset/img/CDY_2015.jpg')
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])
input_image = transform(input_image).unsqueeze(0)

# Run inference
with torch.no_grad():
    out = model(input_image)

print(out)
