In [None]:
%pip install pandas
%pip install torch torchvision torchaudio -f https://download.pytorch.org/whl/cu121
%pip install opencv-python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.rpn import AnchorGenerator
from torchvision import datasets, transforms
from torch.utils.data import Dataset 
from torch.utils.data import DataLoader
from PIL import Image
import json
import os
import cv2
import random
from torchvision.transforms.functional import pad

In [8]:
# Preparing the data

card_to_int = {
    'Ah': 0, '2h': 1, '3h': 2, '4h': 3, '5h': 4, '6h': 5, '7h': 6, '8h': 7, '9h': 8, 'Th': 9, 'Jh': 10, 'Qh': 11, 'Kh': 12,
    'Ac': 13, '2c': 14, '3c': 15, '4c': 16, '5c': 17, '6c': 18, '7c': 19, '8c': 20, '9c': 21, 'Tc': 22, 'Jc': 23, 'Qc': 24, 'Kc': 25,
    'Ad': 26, '2d': 27, '3d': 28, '4d': 29, '5d': 30, '6d': 31, '7d': 32, '8d': 33, '9d': 34, 'Td': 35, 'Jd': 36, 'Qd': 37, 'Kd': 38,
    'As': 39, '2s': 40, '3s': 41, '4s': 42, '5s': 43, '6s': 44, '7s': 45, '8s': 46, '9s': 47, 'Ts': 48, 'Js': 49, 'Qs': 50, 'Ks': 51
}

# Number of classes in the card dataset.
num_classes = 52 + 1  # 52 cards + 1 background class
target_width = 980
target_height = 600

# Create subclass of the pytorch base custom dataset.
class ImageDataset(torch.utils.data.Dataset):

    # Initializes the parameters and loads in the native data.
    def __init__(self, root_dir, transform=None):
        self.root_dir = root_dir
        self.transform = transform
        with open(os.path.join(root_dir, 'Tablestest.json'), 'r') as f: self.table_info = json.load(f)

    # Gets total number of samples, used to determine the number of iterations required to go through entire dataset.
    def __len__(self):
            return len(self.table_info)
    
    def transform_boxes(self, idx):
         # Load table dimensions and raw boxes
        table_dims = self.table_info[idx]['table_dims']
        boxes = self.table_info[idx]['boxes']

        # Calculate the scaling factors
        scale_factor_x = target_width / table_dims[0]
        scale_factor_y = target_height / table_dims[1]

        # Apply the scaling factors to the raw bounding boxes
        boxes_transformed = [
            [
                box[0] * scale_factor_x,
                box[1] * scale_factor_y,
                box[2] * scale_factor_x,
                box[3] * scale_factor_y,
            ]
            for box in boxes
        ]

        return boxes_transformed

    # Load and return a sample from the dataset given an index. When using data loaders, this is called to fetch a specific sample from the dataset.
    def __getitem__(self, idx):
        # Retrieves the tables image path and associated label for the table image.
        img_path = os.path.join(self.root_dir, 'Table Images', self.table_info[idx]['table_image'])
        boxes = self.transform_boxes(idx)
        labels = [card_to_int[label] for label in self.table_info[idx]['table_label']]
        # Print the boxes and labels for debugging
        print(f"Index: {idx}, Boxes: {boxes}, Labels: {labels}")
        # Opens the image via the card and table image paths.
        img = Image.open(img_path)
        # Converts images to RGB that were currently RGB with Alpha (RGBA).
        img = img.convert("RGB")

        # Applies the transformations to the images when they are accessed through the dataset instances. 
        if self.transform:
            img = self.transform(img)

        target = {
            'boxes': torch.tensor(boxes, dtype=torch.float32),
            'labels': torch.tensor(labels, dtype=torch.int64)
        }

        return img, target

def custom_collate_fn(batch):
    images, targets = zip(*batch)
    images = torch.stack(images, 0)
    targets = [{k: v for k, v in t.items()} for t in targets]
    return images, targets
    
table_transform = transforms.Compose([transforms.Resize((target_height, target_width), Image.BICUBIC), transforms.ToTensor()])

# ImageDataset instances. Loader is used to provide access to the tensor data during training and testing.
root_dir = "C:/Users/Admin/Desktop/Primary Skills/Programming & ML/Machine Learning/Card Detection"
table_dataset = ImageDataset(root_dir=root_dir, transform=table_transform)
train_dataset = table_dataset
#train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=2,
    shuffle=True,
    num_workers=0,
    collate_fn=custom_collate_fn
)

print("Data preparation completed.")
print(f"Number of samples in training dataset: {len(train_dataset)}")

Data preparation completed.
Number of samples in training dataset: 2


In [9]:
# Load a pre-trained model for the backbone
backbone = torchvision.models.mobilenet_v2(pretrained=True).features
backbone.out_channels = 1280  # Set the number of output channels for the backbone

# Define the anchor generator
anchor_generator = AnchorGenerator(sizes=((32, 64, 128, 256, 512),),
                                   aspect_ratios=((0.5, 1.0, 2.0),))

# Create the Faster R-CNN model
model = FasterRCNN(backbone,
                   num_classes=num_classes,
                   rpn_anchor_generator=anchor_generator)

In [11]:
# Training the model on the table images

# Device selection
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Optimizer using an algorithm during training with the set learning rate.
optimizer = optim.RMSprop(model.parameters(), lr=0.001)

num_epochs = 40
print_frequency = 10

# Loop through the number of epochs.
for epoch in range(num_epochs):
    # Initialize the running loss.
    running_loss = 0.0
    # Iterate over the DataLoader (train_loader) to get batches of input images and their targets.
    for i, (inputs, targets) in enumerate(train_loader, 0):
        # Move the inputs and targets to the appropriate device (CPU or GPU).
        inputs, targets = inputs.to(device), [{k: v.to(device) for k, v in t.items()} for t in targets]
        # Clear the gradients of the model parameters before each forward pass.
        optimizer.zero_grad()
        # Pass the input images through the model to get the predicted outputs.
        outputs = model(inputs, targets)
        # Calculate the loss between the predicted outputs and the ground truth targets.
        loss = outputs['loss_classifier'] + outputs['loss_box_reg'] + outputs['loss_objectness'] + outputs['loss_rpn_box_reg']
        # Perform backpropagation to compute gradients of the loss with respect to the model parameters.
        loss.backward()
        # Update the model parameters using the optimizer.
        optimizer.step()
        # Accumulate the running loss for each batch.
        running_loss += loss.item()

    # Print the average loss every print_frequency epochs
    if (epoch + 1) % print_frequency == 0:
        average_loss = running_loss / (i + 1)
        print(f"Epoch {epoch + 1}, Loss: {average_loss}")

print("Finished training")

# Plotting the epoch loss and accuracy 
#fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
#ax1.plot(range(print_frequency, num_epochs + 1, print_frequency), losses)
#ax1.set(xlabel='Epoch', ylabel='Loss', title='Epoch Loss')
#ax2.plot(range(print_frequency, num_epochs + 1, print_frequency), accuracies)
#ax2.set(xlabel='Epoch', ylabel='Accuracy', title='Epoch Accuracy')
#plt.show()


Index: 0, Boxes: [[359.7971602434077, 184.74820143884892, 411.4807302231237, 246.0431654676259], [414.46247464503045, 184.74820143884892, 465.15212981744423, 246.0431654676259], [469.12778904665316, 184.74820143884892, 519.8174442190669, 246.0431654676259], [576.4705882352941, 184.74820143884892, 627.1602434077079, 246.0431654676259], [9.939148073022313, 529.2086330935251, 36.774847870182555, 561.1510791366907], [37.76876267748479, 529.2086330935251, 65.59837728194726, 561.1510791366907], [66.5922920892495, 529.2086330935251, 93.42799188640974, 561.1510791366907], [95.4158215010142, 529.2086330935251, 122.25152129817444, 561.1510791366907], [124.2393509127789, 529.2086330935251, 151.07505070993915, 561.1510791366907], [153.0628803245436, 529.2086330935251, 178.90466531440163, 561.1510791366907], [180.8924949290061, 529.2086330935251, 207.72819472616632, 561.1510791366907], [208.72210953346857, 529.2086330935251, 236.55172413793105, 561.1510791366907], [237.54563894523326, 529.208633093