In [None]:
import os
import io
import json
import csv
import random
import numpy as np
from PIL import Image
import lmdb
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import torchvision.transforms as transforms
import torch
from torch.utils.data import TensorDataset, Dataset, DataLoader, random_split
import torch.nn as nn
from torchsummary import summary

In [None]:
# Pick the best available backend: CUDA (NVIDIA GPU), then MPS (Apple GPU), then CPU.
# `torch.backends.mps` does not exist on older PyTorch builds, hence the getattr guard.
_mps_backend = getattr(torch.backends, "mps", None)

if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
elif _mps_backend is not None and _mps_backend.is_available():
    DEVICE = torch.device("mps")
else:
    DEVICE = torch.device("cpu")

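# Using consolidated training data
The consolidated ground truth combines two label sources into a single file:
1. Legibility (frame level, 0 or 1)
2. Digit classification (tracklet level, the jersey number)

Run the following cell to create the merged ground truths: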

In [None]:
# File paths (update these as needed)
txt_file_path = "./data/SoccerNetLegibility/train/train_gt.txt"  # legibility ground truth (frame level)
json_file_path = "./data/train_gt_numbers.json"  # numbers ground truth (tracklet level)
output_file_path = "./data/merged_ground_truths.csv"

# Step 1: Load legibility ground truths from TXT file.
# Each line is expected to look like "<trackletID>_<frameID>.jpg,<0 or 1>".
legibility_data = {}

with open(txt_file_path, "r") as txt_file:
    for line in txt_file:
        line = line.strip()
        if not line:
            continue  # tolerate blank lines (e.g. a trailing empty line)

        parts = line.split(",")
        frame_name = parts[0].strip()
        # Remove the extension as a suffix. str.strip(".jpg") would instead remove any of
        # the characters ".", "j", "p", "g" from BOTH ends of the name.
        if frame_name.endswith(".jpg"):
            frame_name = frame_name[:-len(".jpg")]

        legibility = int(parts[1])  # 0 or 1
        legibility_data[frame_name] = legibility

# Step 2: Load jersey number ground truths from JSON file
with open(json_file_path, "r") as json_file:
    jersey_numbers = json.load(json_file)  # Dictionary: {"trackletID": jersey_number}

# Step 3: Merge data and write one "<frame>, <legibility>, <jersey number>" row per frame
with open(output_file_path, "w") as output_file:
    for frame_name, legibility in legibility_data.items():
        tracklet_id = frame_name.split("_")[0]  # Part of the frame name before the first "_"
        jersey_number = jersey_numbers.get(tracklet_id, -1)  # Default to -1 if not found
        output_file.write(f"{frame_name}, {legibility}, {jersey_number}\n")

print(f"Merged ground truths saved to {output_file_path}")


In [None]:
# Image preprocessing pipeline applied to every loaded image
transform = transforms.Compose(
    [
        # Scale so that the shorter side becomes 128 px (aspect ratio is kept)
        transforms.Resize(size=128),
        # Cut out the central 128x128 window so every output has the same size
        transforms.CenterCrop(size=128),
        # Turn the image into a PyTorch tensor laid out as (C, H, W)
        transforms.ToTensor(),
    ]
)

In [None]:
# Loading ground truths
# For each image in the image directory (named as <tracklet>_<frame>.jpg), the file name without its extension
# is used as the key into the merged ground-truth CSV. The image is loaded, transformed, and collected with its label.

def load_merged_data(img_dir="./data/SoccerNetLegibility/train/images", gt_file="./data/merged_ground_truths.csv"):
    """Load all ``.jpg`` images in ``img_dir`` with their (legibility, jersey number) labels.

    Args:
        img_dir: Directory containing the images.
        gt_file: CSV file with rows of the form ``<frame_id>, <legibility>, <jersey_number>``.

    Returns:
        A tuple ``(images, labels)``. ``images`` is the stack of transformed image tensors,
        in sorted file-name order. ``labels`` is a ``torch.long`` tensor with one
        ``[legibility, jersey_number]`` row per image; rows whose legibility is 0 or whose
        jersey number is -1 are stored as ``[0, 0]``.

    Raises:
        KeyError: If a successfully loaded image has no row in ``gt_file``.
    """
    # Load the ground truths into a dictionary: frame_id -> (legibility, jersey_number)
    gt_data = {}

    with open(gt_file, "r", newline="") as csv_file:
        for row in csv.reader(csv_file):
            if not row:
                continue  # csv.reader yields an empty list for blank lines
            frame_id = row[0].strip()  # tracklet_number_frame_id
            legibility = int(row[1].strip())  # 0 or 1
            jersey_number = int(row[2].strip())  # -1 or 1 to 99
            gt_data[frame_id] = (legibility, jersey_number)

    images = []
    labels = []

    # os.listdir returns entries in arbitrary order; sorting makes the output reproducible.
    for filename in sorted(os.listdir(img_dir)):
        if not filename.endswith(".jpg"):
            continue

        img_path = os.path.join(img_dir, filename)
        try:
            img = Image.open(img_path).convert("RGB")
            img_tensor = transform(img)
        except Exception as e:
            # Best effort: report unreadable images and keep going.
            print(f"Error loading image {img_path}: {e}")
            continue

        # splitext drops only the final extension, so names containing extra dots keep their full key.
        frame_id = os.path.splitext(filename)[0]
        legibility, jersey_number = gt_data[frame_id]

        images.append(img_tensor)

        # We use separate labels for legibility and jersey number (so the model will have to output them separately).
        if jersey_number == -1 or legibility == 0:
            labels.append([0, 0])
        else:
            labels.append([legibility, jersey_number])

    return torch.stack(images), torch.tensor(labels, dtype=torch.long)  # return Tensors


In [None]:
# Load images and labels from merged_ground_truths.csv file for single-loss
def load_data_csv(img_dir="./data/SoccerNetLegibility/train/images", gt_file="./data/merged_ground_truths.csv"):
    """Load all ``.jpg`` images in ``img_dir`` with a single jersey-number label each.

    Args:
        img_dir: Directory containing the images.
        gt_file: CSV file with rows of the form ``<frame_id>, <legibility>, <jersey_number>``.
            Only the first and third columns are used here.

    Returns:
        A tuple ``(images, labels)``. ``images`` is the stack of transformed image tensors,
        in sorted file-name order. ``labels`` is a 1-D ``torch.long`` tensor holding the
        jersey number of each image, with -1 replaced by 0.

    Raises:
        KeyError: If a successfully loaded image has no row in ``gt_file``.
    """
    # Load the ground truths into a dictionary: frame_id -> jersey_number
    gt_data = {}

    with open(gt_file, "r", newline="") as csv_file:
        for row in csv.reader(csv_file):
            if not row:
                continue  # csv.reader yields an empty list for blank lines
            frame_id = row[0].strip()  # tracklet_number_frame_id
            jersey_number = int(row[2].strip())  # -1 or 1 to 99
            gt_data[frame_id] = jersey_number

    images = []
    labels = []

    # os.listdir returns entries in arbitrary order; sorting makes the output reproducible.
    for filename in sorted(os.listdir(img_dir)):
        if not filename.endswith(".jpg"):
            continue

        img_path = os.path.join(img_dir, filename)
        try:
            img = Image.open(img_path).convert("RGB")
            img_tensor = transform(img)
        except Exception as e:
            # Best effort: report unreadable images and keep going.
            print(f"Error loading image {img_path}: {e}")
            continue

        # splitext drops only the final extension, so names containing extra dots keep their full key.
        frame_id = os.path.splitext(filename)[0]
        jersey_number = gt_data[frame_id]

        images.append(img_tensor)

        # The label is a plain int here (not a tuple), so compare it directly.
        # -1 means "no jersey number" and is mapped to class 0.
        labels.append(0 if jersey_number == -1 else jersey_number)

    return torch.stack(images), torch.tensor(labels, dtype=torch.long)  # return Tensors

In [None]:
# Option A (currently disabled): two-column labels in the format <legibility>, <jersey number>
# images, labels = load_merged_data()

# Option B (active): a single label per image, using just the jersey number as the ground truth
images, labels = load_data_csv()

In [None]:
dataset = TensorDataset(images, labels)

# Fixed seed so the train/test split and the small subset are identical on every run
SPLIT_SEED = 42

# 95-5 train-test split
train_size = int(0.95 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(
    dataset,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# For initial model testing, try to overfit to a small subset of the training data to see if the model has sufficient complexity
small_train_size = int(0.15 * len(train_dataset))
# A dedicated Random instance keeps the subset reproducible without touching the global `random` state
small_indices = random.Random(SPLIT_SEED).sample(range(len(train_dataset)), small_train_size)
small_train_dataset = torch.utils.data.Subset(train_dataset, small_indices)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=2)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=2)
small_train_loader = DataLoader(small_train_dataset, batch_size=32, shuffle=True, num_workers=2)

print(f"Total dataset size: {len(dataset)}")
print(f"Train dataset size: {len(train_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")
# This is a subset of the TRAIN split, so label it as such
print(f"Small train dataset size: {len(small_train_dataset)}")

In [None]:
class JerseyNumberClassifier(nn.Module):
    """VGG-style CNN that maps a batch of RGB images to ``num_classes`` logits.

    Six convolutional blocks (two 3x3 conv + ReLU layers followed by a 2x2 max-pool)
    each halve the spatial resolution. A global average pool then collapses whatever
    spatial extent is left to 1x1, so the fully connected head always receives 1024
    features. For a 64x64 input the feature map is already 1x1 at that point and the
    pool is the identity; for a 128x128 input it averages the remaining 2x2 map
    instead of failing with a shape mismatch in the first linear layer.

    Args:
        num_classes: Number of output logits (default 100).

    Input:  tensor of shape (N, 3, H, W) with H and W at least 64.
    Output: tensor of shape (N, num_classes) with raw (unnormalized) logits.
    """

    def __init__(self, num_classes=100):
        super().__init__()

        # Spatial sizes in the comments below are for a 64x64 input.
        self.block1 = self._conv_block(3, 64)       # 64x64 -> 32x32
        self.block2 = self._conv_block(64, 128)     # 32x32 -> 16x16
        self.block3 = self._conv_block(128, 256)    # 16x16 -> 8x8
        self.block4 = self._conv_block(256, 512)    # 8x8   -> 4x4
        self.block5 = self._conv_block(512, 512)    # 4x4   -> 2x2
        self.block6 = self._conv_block(512, 1024)   # 2x2   -> 1x1

        # Parameter-free, so it adds no entries to the state_dict.
        self.global_pool = nn.AdaptiveAvgPool2d(1)

        self.fc = nn.Sequential(
            nn.Linear(1024, 8192),   # input is the 1024 pooled channels
            nn.ReLU(),
            nn.Linear(8192, 4096),
            nn.ReLU(),
            nn.Linear(4096, 1024),
            nn.ReLU(),
            nn.Linear(1024, num_classes)
        )

    @staticmethod
    def _conv_block(in_channels, out_channels):
        """Build one block: conv3x3 -> ReLU -> conv3x3 -> ReLU -> 2x2 max-pool."""
        return nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )

    def forward(self, x):
        """Run the six conv blocks, pool to 1x1, flatten, and apply the classifier head."""
        x = self.block1(x)
        x = self.block2(x)
        x = self.block3(x)
        x = self.block4(x)
        x = self.block5(x)
        x = self.block6(x)
        x = self.global_pool(x)             # (N, 1024, h, w) -> (N, 1024, 1, 1)
        x = torch.flatten(x, start_dim=1)   # (N, 1024)
        x = self.fc(x)
        return x
