In [1]:
# Local utilities
from util import *
environment_check()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

CUDA is available
Tensor on GPU: tensor([1., 2., 3.], device='cuda:0')

PyTorch3D is using CUDA


In [2]:
import torch
from torch import nn
from torchvision.models import resnet18

class SimplifiedImageToRTNetwork(nn.Module):
    def __init__(self, output_features=12):
        super().__init__()

        # Feature Extractor
        self.feature_extractor = resnet18(pretrained=True)
        # Modify the first convolutional layer to accept 1-channel input
        self.feature_extractor.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)
        # Remove the original fully connected layer (classification head)
        self.feature_extractor.fc = nn.Identity()

        # Additional layers could be added here to process the feature maps before the regression head
        # For instance, global average pooling could condense the spatial dimensions of the feature maps

        # Regression Head to output RT matrix (12 values for 3x4 matrix)
        self.regression_head = nn.Linear(512, output_features)  # Assuming the output of ResNet18 is 512 features

    def forward(self, x):
        # Extract features from the image
        feature_maps = self.feature_extractor(x)

        # Optional: Apply additional processing on feature_maps here

        # Adapt feature_maps for the regression head
        # If you added global average pooling in the network, this step might look different
        # Example: feature_vector = feature_maps.mean([2, 3]) if using global average pooling
        feature_vector = torch.flatten(feature_maps, 1)

        # Predict RT matrix
        rt_matrix = self.regression_head(feature_vector)

        # Reshape to 3x4 matrix
        rt_matrix = rt_matrix.view(-1, 3, 4)

        return rt_matrix

# Example usage:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SimplifiedImageToRTNetwork().to(device)

# Assuming 'img' is a single image tensor that you want to process
# Ensure 'img' is of the right shape, with a batch dimension and a channel dimension
img_tensor = img.unsqueeze(0).to(device)  # Add a batch dimension if needed
rt_pred = model(img_tensor)  # Predicted RT matrix
print("Predicted RT Matrix:")
print(rt_pred)




NameError: name 'img' is not defined

In [3]:
import torch
import json
from PIL import Image
from torchvision import transforms, models
from torch.utils.data import DataLoader
from torch import nn

# Define the dataset class
class PoseRefinementDataset(torch.utils.data.Dataset):
    def __init__(self, data_json_filepath):
        with open(data_json_filepath, 'r') as f:
            self.data_json = json.load(f)
 
    def __len__(self):
        return len(self.data_json)

    def __getitem__(self, idx):
        entry = self.data_json[idx]
        image = Image.open(entry['silhouette_path']).convert('L').convert('RGB')  # Convert to RGB
        rt_matrix = torch.tensor(entry['RT'])
        return image, rt_matrix

In [4]:
import torch
from torch.utils.data import DataLoader
from torch.optim import Adam
from torchvision import models, transforms
from torch import nn

class ViTImageToRTNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.vit = models.vit_b_16(pretrained=True)
        self.vit.heads = nn.Identity()  # Remove the classifier head

        # Define the linear layer for regression; assume 768 features from ViT B-16 model
        self.regressor = nn.Linear(768, 16)  # Output a 4x4 transformation matrix

    def forward(self, x):
        x = x.repeat(1, 3, 1, 1)  # Ensure input is RGB
        x = self.vit(x)
        rt_matrix = self.regressor(x[:, 0])  # Apply linear layer to [CLS] token output
        return rt_matrix.view(-1, 4, 4)


def train_model(dataset, epochs=10, batch_size=32, learning_rate=0.001):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    model = ViTImageToRTNetwork().to(device)
    optimizer = Adam(model.parameters(), lr=learning_rate)
    criterion = nn.MSELoss()

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for images, rt_matrices in loader:
            images, rt_matrices = images.to(device), rt_matrices.to(device)
            optimizer.zero_grad()
            outputs = model(images)

            # Only consider the translation part of the outputs and labels
            loss = criterion(outputs[:, :, 3], rt_matrices[:, :, 3])
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print(f"Epoch {epoch + 1}: Avg Loss = {total_loss / len(loader)}")

# Example usage:
# Assuming `data_json_path` is defined and points to the correct JSON file
dataset = PoseRefinementDataset("./pose_refine_dataset/dataset_info.json")
train_model(dataset, epochs=5, batch_size=4)




TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found <class 'PIL.Image.Image'>