<a href="https://colab.research.google.com/github/agrawal-harsh2003/AI_Gym_Trainner/blob/HPE/AI_Gym_Trainner.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Importing necessary libraries

In [None]:
!pip install torch torchvision datasets transformers matplotlib
import torch
import torch.nn as nn
import torchvision
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
import numpy as np
import matplotlib.pyplot as plt
from datasets import load_dataset
from tqdm import tqdm
from transformers import ResNetConfig, ResNetModel, ResNetForImageClassification, AutoImageProcessor

Load the COCO-WholeBody dataset from the Hugging Face Hub.

In [None]:
from datasets import load_dataset

# Fetch the annotated dataset; `ds` is indexed by split name below.
ds = load_dataset("harsh-7070/COCO-Wholebody-annotated")

# Show the fields of the first training record, then the keys of `ds` itself.
print(ds["train"][0].keys())
print(ds.keys())

Supporting code to find the maximum number of keypoints per item in the dataset.

In [None]:
# def find_max_keypoints(dataset):
#     max_keypoints = 0
#     count = 0
#     for item in dataset:
#         keypoints = item['objects']['keypoints']  # Assuming the keypoints are in this field
#         num_keypoints = len(keypoints)
#         max_keypoints = max(max_keypoints, num_keypoints)
#         count += 1
#         if count % 1000 == 0:
#             print(count)

#     print(max_keypoints)

# find_max_keypoints(ds['train'])

Data augmentation pipeline and helper functions that pad keypoints and collate samples into batches.

In [None]:
# Data augmentation.
# Only photometric augmentation is used here. The dataset class applies this
# transform to the image alone and never to the keypoints, so geometric
# operations (horizontal flip, rotation, resized crop) would move the person in
# the image while the regression targets stayed at their old coordinates.
# Re-introduce geometric augmentation only together with code that applies the
# same transformation to the keypoints.
data_transforms = transforms.Compose([
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
])

#Padding the keypoints to create equal length across the annotation.
def pad_keypoints(keypoints_tensor, target_length, padding_value=-1):
    """Pad a keypoint tensor along dim 0 up to ``target_length`` rows.

    Args:
        keypoints_tensor: Tensor whose first dimension counts keypoint rows.
            Any trailing shape is accepted (the notebook uses rows of 3).
        target_length: Minimum number of rows the result should have.
        padding_value: Fill value for the added rows (default ``-1``).

    Returns:
        The input tensor itself when it already has at least
        ``target_length`` rows (it is never truncated); otherwise a new tensor
        with ``target_length`` rows whose extra rows are ``padding_value``.
    """
    current_length = keypoints_tensor.shape[0]
    if current_length >= target_length:
        return keypoints_tensor

    # new_full keeps the padding on the same device and in the same dtype as
    # the input, and the trailing shape is taken from the input instead of
    # being fixed to 3 columns.
    padding = keypoints_tensor.new_full(
        (target_length - current_length, *keypoints_tensor.shape[1:]),
        padding_value,
    )
    return torch.cat((keypoints_tensor, padding), dim=0)

def custom_collate_fn(batch):
    """Merge ``(pixel_values, keypoints)`` samples into batched tensors.

    Images are stacked along a new leading dimension. Keypoint tensors are
    padded with ``-1`` along their first dimension to the longest one in the
    batch and returned batch-first.
    """
    images = []
    keypoint_sets = []
    for sample in batch:
        images.append(sample[0])
        keypoint_sets.append(sample[1])

    stacked_images = torch.stack(images, dim=0)
    padded_keypoints = torch.nn.utils.rnn.pad_sequence(
        keypoint_sets,
        batch_first=True,
        padding_value=-1,
    )
    return stacked_images, padded_keypoints

Custom Dataset Class.

In [None]:
class COCOWholeBodyDataset(Dataset):
    """Dataset yielding ``(pixel_values, keypoints)`` pairs for training.

    Each record must expose an image under ``item['image']`` and a flat list
    of keypoint values under ``item['objects']['keypoints']``; the list is
    reshaped into rows of three values.

    Args:
        data: Indexable collection of records (e.g. one split of the loaded
            dataset).
        transform: Optional callable applied to the RGB PIL image *before* the
            image processor runs, so colour augmentations see ordinary image
            values rather than the processor's output.
        max_keypoints: Number of keypoint rows each sample is padded to.
            Defaults to 2660, the value previously hard-coded here.
    """

    def __init__(self, data, transform=None, max_keypoints=2660):
        self.data = data
        self.transform = transform
        self.max_keypoints = max_keypoints
        self.image_processor = AutoImageProcessor.from_pretrained("microsoft/resnet-50")

    def __len__(self):
        """Return the number of records in the wrapped collection."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the processed image tensor and padded keypoints for ``idx``."""
        item = self.data[idx]

        image = item['image']
        if image.mode != 'RGB':
            image = image.convert('RGB')

        keypoints = item['objects']['keypoints']
        keypoints_tensor = torch.tensor(keypoints, dtype=torch.float32).view(-1, 3)
        # Pad with -1 rows so every sample has the same number of keypoint rows.
        keypoints_tensor = pad_keypoints(keypoints_tensor, self.max_keypoints)

        # Augment the raw image first, then let the image processor produce the
        # model input.
        # NOTE(review): the transform only touches the image; keypoints are not
        # adjusted, so geometric transforms (flip/rotate/crop) would leave the
        # targets inconsistent with the augmented image.
        if self.transform:
            image = self.transform(image)

        inputs = self.image_processor(images=image, return_tensors="pt")
        # Drop only the leading batch dimension added by the processor.
        pixel_values = inputs['pixel_values'].squeeze(0)

        return pixel_values, keypoints_tensor

# Training hyperparameters
num_epochs = 15
batch_size = 512
learning_rate = 1e-3
# 20 annotations (the maximum merged into one record when the .jsonl was
# pre-processed before uploading to the Hub) times 133 keypoints each.
num_keypoints = 20 * 133

Dataset creation using pre-existing split and data loader.

In [None]:
# Only the training split receives the augmentation transform.
train_dataset = COCOWholeBodyDataset(ds['train'], transform=data_transforms)
val_dataset = COCOWholeBodyDataset(ds['validation'])

# Both loaders batch with the custom collate function; only training is shuffled.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=custom_collate_fn,
)
val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=custom_collate_fn,
)

Model initialization and configuration, using a pre-trained ResNet-50 backbone from torchvision together with a Hugging Face `ResNetConfig`.

In [None]:
class ResNetForImageClassification(nn.Module):
    """ResNet-50 backbone with a two-layer head that regresses keypoints.

    The torchvision backbone's final ``fc`` layer is replaced by an identity,
    and a small MLP maps the 2048 backbone features to ``num_keypoints * 2``
    outputs. ``num_keypoints`` is read from module scope when the model is
    constructed.

    Note: this class has the same name as the ``transformers`` class imported
    at the top of the notebook and shadows it from this cell onwards.

    Args:
        config: Configuration object; it is stored on the instance but not
            otherwise read by this class.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        # `pretrained=True` is deprecated in torchvision; IMAGENET1K_V1 is the
        # weight set that flag used to select.
        self.resnet_backbone = torchvision.models.resnet50(
            weights=torchvision.models.ResNet50_Weights.IMAGENET1K_V1
        )
        self.resnet_backbone.fc = nn.Identity()
        self.classifier = nn.Sequential(
            nn.Linear(2048, 2048),
            nn.ReLU(),
            nn.Linear(2048, num_keypoints * 2)
        )

    def forward(self, x):
        """Run the backbone and head on a batch of images.

        Raises:
            ValueError: If the backbone output is neither 2-D nor 4-D.
        """
        features = self.resnet_backbone(x)

        if features.dim() == 4:
            # Spatial feature map: pool it down to one vector per image.
            features = nn.functional.adaptive_avg_pool2d(features, (1, 1))
        elif features.dim() != 2:
            raise ValueError(f"Unexpected tensor shape: {features.shape}")

        features = torch.flatten(features, 1)
        return self.classifier(features)

# Configuration object handed to the model constructor. The model class in
# this notebook only stores it; the output size is mirrored in `num_labels`.
config = ResNetConfig(
    num_channels=3,
    embedding_size=64,
    hidden_sizes=[256, 512, 1024, 2048],
    depths=[3, 4, 6, 3],
    layer_type="bottleneck",
    hidden_act="relu",
    num_labels=num_keypoints * 2,
)

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

model = ResNetForImageClassification(config)
model = model.to(device)

In [None]:
# for images, keypoints in train_loader:
#     images = images.to(device)
#     keypoints = keypoints.to(device)
#     print(images.shape)
#     print(keypoints.shape)
#     output = model(images)
#     print(output.shape)
#     break
# torch.Size([32, 3, 224, 224])
# torch.Size([32, 238, 3])
# torch.Size([32, 476])

Model Training

In [None]:
import torch.optim as optim

def _masked_keypoint_loss(criterion, output, keypoints):
    """Average the element-wise loss over targets that are not padding (-1)."""
    mask = (keypoints != -1).float()
    masked_loss = criterion(output, keypoints) * mask
    # clamp avoids 0/0 = NaN when a batch contains nothing but padding.
    return masked_loss.sum() / mask.sum().clamp(min=1)


# Element-wise loss so padded targets can be masked out before averaging.
criterion = nn.MSELoss(reduction='none')
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

best_val_loss = float('inf')
best_model_path = "Best_HPE.pth"

# Each epoch trains on the full training loader, then validates and
# checkpoints, so the best model is tracked per epoch.
for epoch in range(num_epochs):
    # Training pass
    model.train()
    running_loss = 0.0
    total_samples = 0
    for batch_idx, (images, keypoints) in enumerate(tqdm(train_loader)):
        images = images.to(device)
        keypoints = keypoints.to(device)

        # Keep the first two values of every keypoint row and flatten per sample
        # to match the model's flat output.
        keypoints = keypoints[:, :, :2]
        keypoints = keypoints.reshape(images.size(0), -1)

        output = model(images)
        loss = _masked_keypoint_loss(criterion, output, keypoints)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item() * images.size(0)
        total_samples += images.size(0)

    # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
    avg_loss = running_loss / max(total_samples, 1)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")

    # Validation pass
    model.eval()

    val_loss = 0.0
    val_samples = 0

    with torch.no_grad():
        for batch_idx, (images, keypoints) in enumerate(tqdm(val_loader)):
            images = images.to(device)
            keypoints = keypoints.to(device)[:, :, :2]
            keypoints = keypoints.reshape(images.size(0), -1)

            output = model(images)
            loss = _masked_keypoint_loss(criterion, output, keypoints)

            val_loss += loss.item() * images.size(0)
            val_samples += images.size(0)

    avg_val_loss = val_loss / max(val_samples, 1)
    print(f"Epoch [{epoch+1}/{num_epochs}], Validation Loss: {avg_val_loss:.4f}")

    # Save the model if the validation loss is the best we've seen so far
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        torch.save(model.state_dict(), best_model_path)
        print(f"Saved best model with validation loss: {best_val_loss:.4f}")