# Start code

In [1]:
import torch
import os
from torch.utils.data import Dataset, DataLoader
from torchvision import models, transforms

import json
import cv2
import numpy as np

In [2]:
# Sanity check: report whether PyTorch can use a CUDA device.
# The bare expression on the last line is displayed as the cell's output.
import torch
torch.cuda.is_available()

True

In [3]:
# Pick the compute device once: the GPU when CUDA is usable, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Create Torch Dataset

In [4]:
class KeypointsDataset(Dataset):
    """Image/keypoint dataset driven by a single JSON annotation file.

    ``data_file`` is loaded with ``json.load`` and indexed by position. Each
    record is read through its ``'id'`` key (used to build the image path
    ``<img_dir>/<id>.png``) and its ``'kps'`` key (keypoint coordinates).

    NOTE: a later cell of this notebook defines another class that is also
    named ``KeypointsDataset``; once that cell has run, the name refers to
    that class rather than this one.

    Args:
        img_dir: Directory containing the ``.png`` images.
        data_file: Path to the JSON annotation file.
    """

    def __init__(self, img_dir, data_file):
        self.img_dir = img_dir
        with open(data_file, "r") as f:
            self.data = json.load(f)

        # Resize every image to 224x224, convert it to a tensor and normalise
        # each channel with the fixed mean/std values below.
        self.transforms = transforms.Compose([
            transforms.ToPILImage(),
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])

    def __len__(self):
        """Return the number of annotation records."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(image, keypoints)`` for the record at position ``idx``.

        Returns:
            A tuple of the transformed image and a flat ``float32`` NumPy
            array of keypoint values, rescaled by ``224 / width`` (even
            indices) and ``224 / height`` (odd indices) of the source image.

        Raises:
            FileNotFoundError: If the image file is missing or unreadable.
        """
        item = self.data[idx]
        img_path = f"{self.img_dir}/{item['id']}.png"

        # cv2.imread signals failure by returning None instead of raising.
        img = cv2.imread(img_path)
        if img is None:
            raise FileNotFoundError(f"Could not read image: {img_path}")
        h, w = img.shape[:2]

        # OpenCV loads BGR; the transform pipeline expects RGB.
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img = self.transforms(img)

        # The record is bound to `item`; the previous `items` was undefined
        # and raised NameError on every access.
        kps = np.array(item['kps']).flatten()
        kps = kps.astype(np.float32)

        # presumably the stored values are pixel coordinates of the original
        # image, laid out as x0, y0, x1, y1, ... — verify against the JSON.
        kps[::2] *= 224.0 / w  # Adjust x coordinates
        kps[1::2] *= 224.0 / h  # Adjust y coordinates

        return img, kps

In [5]:
def pad_keypoints(keypoints, max_keypoints=8):
    """Right-pad a flat keypoint vector with zeros up to ``max_keypoints`` entries.

    ``max_keypoints`` counts individual entries of the flat vector (it is
    compared against ``len(keypoints)``), not (x, y) pairs.

    Args:
        keypoints: Flat sequence or 1-D array of coordinate values.
        max_keypoints: Minimum length of the returned vector.

    Returns:
        ``keypoints`` itself, unchanged, when it already has at least
        ``max_keypoints`` entries. Otherwise a new 1-D NumPy array whose
        trailing entries are ``0.0``. Floating-point (and complex) dtypes are
        preserved, so a ``float32`` input stays ``float32``; any other input
        is converted to ``float64``.
    """
    if len(keypoints) >= max_keypoints:
        return keypoints

    flat = np.asarray(keypoints).ravel()
    # Appending the Python float 0.0 used to promote float32 data to float64;
    # padding with zeros of the array's own dtype avoids that upcast.
    if not np.issubdtype(flat.dtype, np.inexact):
        flat = flat.astype(np.float64)

    # Pad once instead of reallocating the array for every missing entry.
    missing = max(max_keypoints - flat.size, 0)
    return np.concatenate([flat, np.zeros(missing, dtype=flat.dtype)])


class KeypointsDataset(Dataset):
    """Image/keypoint dataset that reads one text label file per image.

    Every entry of ``img_dir`` is treated as an image. Its labels are read
    from the file in ``label_dir`` that has the same base name and a ``.txt``
    extension. Each non-empty label line contributes one (x, y) pair taken
    from its second and third whitespace-separated fields.

    Args:
        img_dir: Directory containing the images.
        label_dir: Directory containing the matching ``.txt`` label files.
    """

    def __init__(self, img_dir, label_dir):
        self.img_dir = img_dir
        self.label_dir = label_dir
        # Sorted so that the index -> file mapping is the same on every run.
        self.image_files = sorted(os.listdir(img_dir))

        # Resize every image to 224x224, convert it to a tensor and normalise
        # each channel with the fixed mean/std values below.
        self.transforms = transforms.Compose([
            transforms.ToPILImage(),
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])

    def __len__(self):
        """Return the number of entries found in ``img_dir``."""
        return len(self.image_files)

    def __getitem__(self, idx):
        """Return ``(image, keypoints)`` for the file at position ``idx``.

        Returns:
            A tuple of the transformed image and a flat ``float32`` NumPy
            array of keypoint values, zero-padded by ``pad_keypoints`` and
            rescaled by ``224 / width`` (even indices) and ``224 / height``
            (odd indices) of the source image.

        Raises:
            FileNotFoundError: If the image cannot be read or the label file
                does not exist.
        """
        img_name = self.image_files[idx]
        img_path = os.path.join(self.img_dir, img_name)
        # Swap only the final extension for ".txt". A plain
        # str.replace('.jpg', '.txt') misses other image types and rewrites
        # every ".jpg" occurrence inside the file name.
        label_name = os.path.splitext(img_name)[0] + '.txt'
        label_path = os.path.join(self.label_dir, label_name)

        # cv2.imread signals failure by returning None instead of raising.
        img = cv2.imread(img_path)
        if img is None:
            raise FileNotFoundError(f"Could not read image: {img_path}")
        h, w = img.shape[:2]

        # OpenCV loads BGR; the transform pipeline expects RGB.
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img = self.transforms(img)

        # Parse one (x, y) pair per label line.
        keypoints = []
        with open(label_path, 'r') as f:
            for line in f:
                parts = line.strip().split()
                if not parts:
                    # Blank lines carry no keypoint.
                    continue
                keypoints.append([float(parts[1]), float(parts[2])])  # Assuming x, y are at index 1, 2

        # Flatten to x0, y0, x1, y1, ... and pad to the fixed output length.
        keypoints = np.array(keypoints, dtype=np.float32).reshape(-1)
        keypoints = pad_keypoints(keypoints)
        # Guarantee float32 labels regardless of what the padding returned.
        keypoints = np.asarray(keypoints, dtype=np.float32)

        # NOTE(review): this rescale treats the label values as pixel
        # coordinates of the original image. If the label files store values
        # already normalised to [0, 1], the factor would need to be 224
        # instead — confirm against the label format.
        keypoints[::2] *= 224.0 / w  # Adjust x coordinates
        keypoints[1::2] *= 224.0 / h  # Adjust y coordinates

        return img, keypoints


In [None]:

# Train / validation datasets described by JSON annotation files.
# NOTE(review): these arguments fit the JSON-based KeypointsDataset
# (img_dir, data_file). The directory-based class of the same name expects
# (img_dir, label_dir) — confirm which definition is live before running.
train_dataset = KeypointsDataset("data/images", "data/data_train.json")
val_dataset = KeypointsDataset("data/images", "data/data_val.json")

# Both loaders serve batches of 8 and reshuffle on every pass.
train_loader = DataLoader(dataset=train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=8, shuffle=True)


In [6]:
# One dataset per split (train / validation / test), each reading from its own
# images and labels directories.
train_dataset, val_dataset, test_dataset = (
    KeypointsDataset(images, labels)
    for images, labels in (
        ("data/train/images", "data/train/labels"),
        ("data/valid/images", "data/valid/labels"),
        ("data/test/images", "data/test/labels"),
    )
)

# Batches of 8 everywhere; only the training loader reshuffles between epochs.
train_loader = DataLoader(dataset=train_dataset, batch_size=8, shuffle=True)
val_loader, test_loader = (
    DataLoader(dataset=split, batch_size=8, shuffle=False)
    for split in (val_dataset, test_dataset)
)


# Create model

In [7]:
# ResNet-50 backbone initialised with pretrained weights.
model = models.resnet50(pretrained=True)

# Swap the final fully connected layer for a regression head with
# 8 outputs: 4 keypoints × 2 coordinates.
head_in_features = model.fc.in_features
model.fc = torch.nn.Linear(in_features=head_in_features, out_features=8)




In [8]:
# Move the model onto the selected compute device.
model = model.to(device)

# Train model

In [9]:
# Mean-squared error between predicted and target keypoint vectors.
criterion = torch.nn.MSELoss(reduction="mean")
# Adam over everything returned by model.parameters(), learning rate 1e-4.
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-4)

In [10]:
epochs = 50
for epoch in range(epochs):
    for i, (imgs, kps) in enumerate(train_loader):
        # Cast the batch to float32, then move it onto the training device.
        imgs = imgs.to(dtype=torch.float32).to(device)
        kps = kps.to(dtype=torch.float32).to(device)

        # One optimisation step: clear old gradients, forward pass,
        # backward pass, parameter update.
        optimizer.zero_grad()
        outputs = model(imgs)
        loss = criterion(outputs, kps)
        loss.backward()
        optimizer.step()

        # Only every 10th iteration reports its batch loss.
        if i % 10 != 0:
            continue
        print(f"Epoch {epoch}, iter {i}, loss: {loss.item()}")

Epoch 0, iter 0, loss: 0.13672569394111633
Epoch 0, iter 10, loss: 0.014773351140320301
Epoch 0, iter 20, loss: 0.007442539092153311
Epoch 0, iter 30, loss: 0.005183512344956398
Epoch 0, iter 40, loss: 0.0013900813646614552
Epoch 0, iter 50, loss: 0.001122383284382522
Epoch 0, iter 60, loss: 0.0005513367941603065
Epoch 0, iter 70, loss: 0.002329389564692974
Epoch 0, iter 80, loss: 0.0004610777832567692
Epoch 1, iter 0, loss: 0.0008066309383139014
Epoch 1, iter 10, loss: 0.00036697747418656945
Epoch 1, iter 20, loss: 0.00070352473994717
Epoch 1, iter 30, loss: 0.0005108090699650347
Epoch 1, iter 40, loss: 0.0009007157641462982
Epoch 1, iter 50, loss: 0.0004993715556338429
Epoch 1, iter 60, loss: 0.00044928560964763165
Epoch 1, iter 70, loss: 0.0002599689760245383
Epoch 1, iter 80, loss: 0.00036963014281354845
Epoch 2, iter 0, loss: 0.0004243637959007174
Epoch 2, iter 10, loss: 0.0010871411068364978
Epoch 2, iter 20, loss: 0.00037053116830065846
Epoch 2, iter 30, loss: 0.0015145991928875

In [None]:
# Persist only the model's state_dict (not the whole module object)
# to "keypoints_model.pth", a path relative to the working directory.
torch.save(model.state_dict(), "keypoints_model.pth")