In [None]:
import torch
import torchvision
import pandas as pd
import os
import numpy as np
from PIL import Image
#import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
import cv2 as cv
from torch.utils.data import Dataset, DataLoader
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
import torchvision.transforms as T

In [None]:
# os.listdir returns entries in arbitrary order; sorting keeps the listing
# (and anything sliced from it by position later on) reproducible between runs.
image_list = sorted(os.listdir('./data/'))

In [None]:
class MarginaliaDataset(torch.utils.data.Dataset):
    """Dataset wrapper around a pre-built sequence of per-image sample dicts.

    Every element of ``data`` must provide the keys ``"data"``, ``"boxes"``,
    ``"labels"`` and ``"image_id"``.
    """

    def __init__(self, data):
        # The length is captured once at construction time.
        self.data = data
        self.n_samples = len(data)

    def __getitem__(self, index):
        """Return ``(image, target, image_id)`` for the sample at ``index``.

        ``target`` is a dict holding the sample's ``'boxes'`` and ``'labels'``.
        """
        record = self.data[index]
        image = record["data"]
        target = {'boxes': record["boxes"], 'labels': record["labels"]}
        return image, target, record["image_id"]

    def __len__(self):
        """Return the number of samples recorded at construction time."""
        return self.n_samples

In [None]:
def preprocessing(imageID):
    """Read ``./data/{imageID}.png`` and return it as a model-ready tensor.

    Args:
        imageID: File stem (name without the ``.png`` extension) of an image
            stored in ``./data/``.

    Returns:
        torch.Tensor: float32 tensor in channel-first layout (C, H, W), with
        RGB channel order and pixel values divided by 255.

    Raises:
        FileNotFoundError: If OpenCV cannot read the image file.
    """
    path = f"./data/{imageID}.png"
    img = cv.imread(path)
    if img is None:
        # cv.imread reports failure by returning None rather than raising.
        raise FileNotFoundError(f"Could not read image: {path}")
    # OpenCV decodes to BGR, while the pretrained torchvision detection
    # weights expect RGB input, so swap the channel order before scaling.
    img = cv.cvtColor(img, cv.COLOR_BGR2RGB)
    img = torch.tensor(img / 255, dtype=torch.float32)
    return img.permute(2, 0, 1)  # (H, W, C) -> (C, H, W)


In [None]:
def generate_data(image_names=None, annotations=None):
    """Assemble one sample dict per PNG image for the detection dataset.

    Args:
        image_names: Iterable of file names to process. Entries that do not
            end in ``.png`` are skipped. Defaults to the module-level
            ``image_list``.
        annotations: DataFrame with the columns ``number``, ``xmin_scaled``,
            ``ymin_scaled``, ``xmax_scaled`` and ``ymax_scaled``. Defaults to
            the module-level ``boxes``.

    Returns:
        list[dict]: One dict per PNG with the keys ``"data"`` (result of
        ``preprocessing``), ``"boxes"`` (int32 tensor of shape ``(N, 4)``),
        ``"labels"`` (int64 tensor of ``N`` ones) and ``"image_id"`` (file
        name without the ``.png`` suffix). Images without any annotation row
        get an empty ``(0, 4)`` box tensor instead of raising.
    """
    # Fall back to the notebook globals so existing `generate_data()` calls
    # keep working; passing the inputs explicitly avoids depending on globals
    # that other cells may overwrite.
    if image_names is None:
        image_names = image_list
    if annotations is None:
        annotations = boxes

    coordinate_columns = ["xmin_scaled", "ymin_scaled", "xmax_scaled", "ymax_scaled"]
    data = []
    for file_name in image_names:
        if not file_name.endswith(".png"):
            continue
        image_id = file_name[:-4]  # strip the ".png" suffix

        image_rows = annotations[annotations["number"] == int(image_id)]
        num_boxes = len(image_rows)

        # int() truncates every coordinate before it is stored.
        coordinates = [
            [int(value) for value in row]
            for row in image_rows[coordinate_columns].itertuples(index=False, name=None)
        ]
        # NOTE(review): torchvision documents detection targets as
        # FloatTensor[N, 4]; int32 is kept to preserve existing behaviour.
        if coordinates:
            box_coordinates = torch.tensor(coordinates, dtype=torch.int32)
        else:
            # No annotations for this image: keep the (N, 4) layout with N == 0.
            box_coordinates = torch.zeros((0, 4), dtype=torch.int32)

        image_data = preprocessing(image_id)

        # Single foreground class: every box is labelled 1.
        labels = torch.ones(num_boxes, dtype=torch.int64)

        data.append({
            "data": image_data,
            "boxes": box_coordinates,
            "labels": labels,
            "image_id": image_id,
        })
    return data

In [None]:

def collate_fn(batch):
    """Transpose a batch of sample tuples into a tuple of per-field tuples.

    A batch ``[(a1, b1), (a2, b2)]`` becomes ``((a1, a2), (b1, b2))``; the
    fields are grouped as-is, without stacking them into tensors.
    """
    per_field = zip(*batch)
    return tuple(per_field)


In [None]:
# Ground-truth annotations: keep only the image number and the rescaled
# box corners, which are the columns generate_data() reads.
boxes = pd.read_csv("rescaled_data.csv")[
    ["number", "xmin_scaled", "ymin_scaled", "xmax_scaled", "ymax_scaled"]
]

# os.listdir returns names in arbitrary order; sort them so the sample order
# (and the positional train/test split taken from `data`) is reproducible.
image_list = sorted(os.listdir('./data/'))

data = generate_data()

In [None]:
# Sanity check: number of per-image sample dicts returned by generate_data().
print(len(data))

513


In [None]:
# Positional split: the first TRAIN_SIZE samples train the model and up to
# TEST_SIZE of the following ones are held out (slicing clamps to len(data)).
TRAIN_SIZE = 500
TEST_SIZE = 20
train_data = data[:TRAIN_SIZE]
test_data = data[TRAIN_SIZE:TRAIN_SIZE + TEST_SIZE]

train_dataset = MarginaliaDataset(train_data)
test_dataset = MarginaliaDataset(test_data)

# Shuffle the training samples so SGD does not see the same batch order
# every epoch; the held-out loader keeps its fixed order.
train_dl = DataLoader(train_dataset, batch_size=4, shuffle=True, num_workers=4, collate_fn=collate_fn, pin_memory=True)
val_dl = DataLoader(test_dataset, batch_size=4, collate_fn=collate_fn, pin_memory=True)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
num_classes = 2  # 1 class (marginalia) + background

# NOTE(review): newer torchvision releases prefer the `weights=` argument over
# `pretrained=True`; kept as-is because the installed version is not visible here.
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)

# Replace the pretrained box head with one sized for our class count.
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)
model = model.to(device)

params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.001, momentum=0.9, weight_decay=0.0005)
num_epochs = 5

# The detection model only returns its loss dict in training mode.
model.train()
for epoch in range(num_epochs):
    epoch_loss = 0
    for images, targets, _ in train_dl:
        optimizer.zero_grad()
        images = [image.to(device) for image in images]
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        loss_dict = model(images, targets)
        # Total loss is the sum of all loss terms returned by the model.
        losses = sum(loss for loss in loss_dict.values())
        epoch_loss += losses.item()

        losses.backward()
        optimizer.step()
    print(f"loss for epoch {epoch}: {epoch_loss / len(train_dl)}")

Downloading: "https://download.pytorch.org/models/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth" to /root/.cache/torch/hub/checkpoints/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth


  0%|          | 0.00/160M [00:00<?, ?B/s]

loss for epoch 0: 0.4919731457233429
loss for epoch 1: 0.3927506911754608
loss for epoch 2: 0.34403543615341187
loss for epoch 3: 0.3155714858174324
loss for epoch 4: 0.29589018678665163


In [None]:
results = []
# IoU threshold passed to NMS: a box overlapping a higher-scoring box by more
# than this is discarded, so the lower the value, the fewer boxes we keep.
detection_threshold = 0.1
model.eval()
model.to(device)

# Inference only: disable autograd so no graph is built for the forward pass.
with torch.no_grad():
    for images, _targets, image_ids in val_dl:
        images = [image.to(device) for image in images]
        outputs = model(images)

        # The model returns one prediction dict per input image, in order.
        for image_id, output in zip(image_ids, outputs):
            keep = torchvision.ops.nms(output['boxes'], output['scores'], detection_threshold)
            # Each entry: (image id, kept boxes, kept scores).
            results.append((image_id, output['boxes'][keep], output['scores'][keep]))

# print(results)

In [None]:
def visualize_prediction(imageID, tensor_bounding_box):
    """Draw bounding boxes on an image and save the result under ``./results/``.

    Args:
        imageID: File stem (name without ``.png``) of the image in ``data/``.
        tensor_bounding_box: Tensor whose rows start with
            ``(x_min, y_min, x_max, y_max)``; it is moved to the CPU and
            detached before drawing.

    Raises:
        FileNotFoundError: If the source image cannot be read.
        OSError: If the annotated image cannot be written.
    """
    source_path = f"data/{imageID}.png"
    image = cv.imread(source_path)
    if image is None:
        # cv.imread reports failure by returning None rather than raising.
        raise FileNotFoundError(f"Could not read image: {source_path}")

    color = (0, 0, 255)  # red in OpenCV's BGR channel order
    thickness = 2
    for box in tensor_bounding_box.cpu().detach().numpy():
        x_min, y_min, x_max, y_max = box[:4]
        # OpenCV expects integer pixel coordinates.
        cv.rectangle(image, (int(x_min), int(y_min)), (int(x_max), int(y_max)), color, thickness)

    # cv.imwrite returns False instead of raising when it cannot write, e.g.
    # when the target directory is missing, so create it and check the result.
    os.makedirs('./results', exist_ok=True)
    output_path = f'./results/prediction_{imageID}.png'
    if not cv.imwrite(output_path, image):
        raise OSError(f"Could not write image: {output_path}")

In [None]:
# visualize_prediction(184423, results[0][1])

In [None]:
# Each result starts with (image id, predicted boxes). Dedicated loop names
# keep this cell from overwriting the module-level `boxes` annotation table
# or shadowing the built-in `id`.
for image_id, predicted_boxes, *_ in results:
    visualize_prediction(image_id, predicted_boxes)

60.453262
88.122314
83.78
348.8205
151.51982
387.17358
205.75401
266.56982
150.90802
219.92976
186.408
67.07389
298.26456
47.800663
326.4887
438.77963
441.77823
447.08698
391.73105
0.0
190.64165
391.06906
108.2879
397.49762
392.7065
0.0
0.0
379.15863
323.119
309.61215
28.859343
304.3553
0.0
222.29736
0.0
306.56863
29.292852
30.104889
129.15248
436.52866
83.22048
111.397095
125.50752
430.20377
475.2548
114.87758
430.99478
490.85776
434.38147
131.74506
249.31404
486.62692
270.016
272.47885
251.65714
69.00178
298.59607
59.614204
69.0322
63.58235
310.95673
0.5416653
103.17096
37.035397
128.03517
163.42209
157.37466
185.63475
306.39703
287.3221
192.84843
271.06256
8.593128
131.39307
291.4219
422.70767
167.52316
40.725933
228.66705
123.299835
123.460754
150.1233
438.85358
112.84903
426.0348
354.1249
190.64333
145.24966
191.46797
351.60336
210.9639
193.28632
367.8495
286.94907
133.19489
242.20863
80.254906
329.40384
172.62808
144.2146
315.03537
14.549049
253.483
166.87564
192.85951
381.81836
