In [66]:
import json
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import trimesh
from PIL import Image
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torchvision import transforms

# The project-local model package lives outside the notebook directory, so the
# search path must be extended before the imports below.
sys.path.append("../models")
import var_autoencoders
from var_autoencoders import encoder,Decoder,VAE

In [67]:
# Data preparation: read the Pix3D annotation JSON into memory.
json_dir = "../../pix3d.json"
# JSON text is UTF-8 by specification; naming the encoding explicitly keeps the
# read independent of the platform's locale default.
with open(json_dir, 'r', encoding='utf-8') as f:
    data = json.load(f)

In [68]:
# Tabulate the loaded annotation records and preview the first five rows.
df = pd.DataFrame(data=data)
df.head(n=5)

Unnamed: 0,img,category,img_size,2d_keypoints,mask,img_source,model,model_raw,model_source,3d_keypoints,voxel,rot_mat,trans_mat,focal_length,cam_position,inplane_rotation,truncated,occluded,slightly_occluded,bbox
0,img/bed/0001.png,bed,"[395, 244]","[[[182.5, 147.09375], [174.5, 225.09375], [16....",mask/bed/0001.png,ikea,model/bed/IKEA_MALM_2/model.obj,,ikea,model/bed/IKEA_MALM_2/3d_keypoints.txt,model/bed/IKEA_MALM_2/voxel.mat,"[[0.7813941591465821, 0.00095539348511137, -0....","[-0.00024347016915001151, 0.09068297313399999,...",35.270398,"[-0.7062195301276326, 0.2367305448542897, -0.8...",-0.078517,False,False,False,"[4, 22, 362, 228]"
1,img/bed/0002.png,bed,"[1007, 599]","[[[309.29285714285714, 543.9148660714286], [-1...",mask/bed/0002.png,ikea,model/bed/IKEA_MALM_2/model.obj,,ikea,model/bed/IKEA_MALM_2/3d_keypoints.txt,model/bed/IKEA_MALM_2/voxel.mat,"[[0.6331473196939317, 0.08400992130502279, -0....","[0.025652375712099995, 0.0434050556712, 1.1086...",32.378901,"[-0.8365679093356918, 0.3969797870961433, -0.6...",-0.107273,True,True,False,"[46, 47, 927, 599]"
2,img/bed/0003.png,bed,"[372, 292]","[[[308.0, 202.09375], [-1.0, -1.0], [-1.0, -1....",mask/bed/0003.png,ikea,model/bed/IKEA_HEMNES_1/model.obj,,ikea,model/bed/IKEA_HEMNES_1/3d_keypoints.txt,model/bed/IKEA_HEMNES_1/voxel.mat,"[[0.9337851369875168, 0.004246139989357373, -0...","[0.05160487022685999, 0.0022441725076000067, 0...",39.511348,"[-0.33341418148967134, 0.18052455616467913, -0...",-0.019073,True,False,False,"[0, 33, 344, 292]"
3,img/bed/0004.png,bed,"[1063, 755]","[[[230.06357142857144, 399.5266517857143], [93...",mask/bed/0004.png,ikea,model/bed/IKEA_HEMNES_1/model.obj,,ikea,model/bed/IKEA_HEMNES_1/3d_keypoints.txt,model/bed/IKEA_HEMNES_1/voxel.mat,"[[0.5029912563246631, 0.09073372306215703, -0....","[0.02463177822849999, 0.0738148517616, 1.09891...",31.871608,"[-0.9464253871842163, 0.2661593054097636, -0.4...",-0.120969,False,False,False,"[89, 74, 984, 738]"
4,img/bed/0005.png,bed,"[414, 449]","[[[-1.0, -1.0], [-1.0, -1.0], [-1.0, -1.0], [4...",mask/bed/0005.png,ikea,model/bed/IKEA_MALM_2/model.obj,,ikea,model/bed/IKEA_MALM_2/3d_keypoints.txt,model/bed/IKEA_MALM_2/voxel.mat,"[[0.7883156484286317, -0.04024974826747893, 0....","[-0.205655543756, 0.057996630364799975, 1.6363...",89.131102,"[1.1526451872054488, 0.3727803200217579, -1.12...",-0.083036,True,False,True,"[38, 44, 414, 420]"


In [69]:
class pix3d_dataset(Dataset):
    """Map-style dataset pairing each annotated image with its mask and mesh arrays.

    Every row of ``dataframe`` must provide the relative paths ``'img'``, ``'mask'``
    and ``'model'``. Meshes are brought to a fixed size (zero-padded or truncated)
    so that samples can be stacked into batches.

    Args:
        dataframe: pandas DataFrame with one row per sample.
        transform: optional callable applied to the image and, separately, to the
            mask. NOTE(review): a transform with random augmentation would draw
            different parameters for image and mask - confirm before adding one.
        max_vertices: number of vertex rows in every returned ``'vertices'`` tensor.
        max_faces: number of face rows in every returned ``'faces'`` tensor.
        root_dir: prefix prepended verbatim to every relative path, so it must end
            with a path separator. Defaults to the previously hard-coded ``"../../"``.
    """

    def __init__(self, dataframe, transform=None, max_vertices=2000, max_faces=2000,
                 root_dir="../../"):
        self.transform = transform
        self.dataframe = dataframe
        self.max_vertices = max_vertices
        self.max_faces = max_faces
        self.root_dir = root_dir

    def __len__(self):
        """Return the number of rows (samples) in the dataframe."""
        return len(self.dataframe)

    @staticmethod
    def _fit_rows(array, n_rows):
        """Zero-pad or truncate a 2-D ``array`` along axis 0 to exactly ``n_rows`` rows."""
        if array.shape[0] < n_rows:
            return np.pad(array, ((0, n_rows - array.shape[0]), (0, 0)), 'constant')
        return array[:n_rows]

    @staticmethod
    def _first_mesh(loaded, obj_path):
        """Return the mesh to use from whatever ``trimesh.load`` produced.

        A scene yields its first geometry (as before); any other object is assumed
        to already be a single mesh and is returned unchanged.
        """
        if isinstance(loaded, trimesh.Scene):
            geometries = list(loaded.geometry.values())
            if not geometries:
                # An IndexError escaping __getitem__ would silently end plain
                # iteration over the dataset, so fail with a distinct error.
                raise ValueError("no geometry found in {}".format(obj_path))
            return geometries[0]
        return loaded

    def __getitem__(self, idx):
        """Load sample ``idx``.

        Returns:
            dict with keys ``'image'`` (RGB image, transformed if a transform is
            set), ``'mask'`` (single-channel image, transformed likewise),
            ``'vertices'`` (float32 tensor with ``max_vertices`` rows) and
            ``'faces'`` (int64 tensor with ``max_faces`` rows). Padding rows are
            all zeros.

        Raises:
            ValueError: if the model file loads as a scene without any geometry.
        """
        row = self.dataframe.iloc[idx]
        img_path = self.root_dir + row['img']
        mask_path = self.root_dir + row['mask']
        obj_path = self.root_dir + row['model']

        # convert() returns a new image, so the file handles can be released at once.
        with Image.open(img_path) as img_file:
            image = img_file.convert('RGB')
        with Image.open(mask_path) as mask_file:
            mask = mask_file.convert('L')

        mesh = self._first_mesh(trimesh.load(obj_path), obj_path)
        vertices = np.array(mesh.vertices)
        faces = np.array(mesh.faces)

        if vertices.shape[0] > self.max_vertices:
            # Cutting the vertex list invalidates every face that references a
            # removed vertex; drop those faces instead of returning bad indices.
            faces = faces[(faces < self.max_vertices).all(axis=1)]

        vertices_padded = self._fit_rows(vertices, self.max_vertices)
        faces_padded = self._fit_rows(faces, self.max_faces)

        if self.transform:
            image = self.transform(image)
            mask = self.transform(mask)

        return {
            'image': image,
            'mask': mask,
            'vertices': torch.tensor(vertices_padded, dtype=torch.float32),
            'faces': torch.tensor(faces_padded, dtype=torch.long),
        }

In [70]:
# Preprocessing shared by images and masks: resize to 128x128, then convert to a tensor.
transform = transforms.Compose(transforms=[
    transforms.Resize(size=(128, 128)),
    transforms.ToTensor(),
])

# Wrap the annotation table in the dataset and serve it in fixed-order batches of four.
dataset = pix3d_dataset(df, transform)
dataloader = DataLoader(dataset=dataset, batch_size=4, shuffle=False)

In [71]:
# Sanity check: pull a single batch and report the shape of each field.
for batch in dataloader:
    images, masks, vertices, faces = (
        batch[key] for key in ('image', 'mask', 'vertices', 'faces')
    )

    for label, tensor in (("Images:", images), ("Masks:", masks),
                          ("Vertices:", vertices), ("Faces:", faces)):
        print(label, tensor.shape)

    # One batch is enough to confirm the collated shapes.
    break

Images: torch.Size([4, 3, 128, 128])
Masks: torch.Size([4, 1, 128, 128])
Vertices: torch.Size([4, 2000, 3])
Faces: torch.Size([4, 2000, 3])


In [52]:
# Inspect one model file: report the vertex and face array shapes of its first geometry.
sample_mesh = trimesh.load("../../model/bed/IKEA_BEDDINGE/model.obj")
first_geometry_name = list(sample_mesh.geometry.keys())[0]
mesh = sample_mesh.geometry[first_geometry_name]
vertices, faces = mesh.vertices, mesh.faces
print(vertices.shape, faces.shape)

(114, 3) (72, 3)


In [64]:
# Training configuration.
epochs = 2
lr = 3e-4
batch_size = 4
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The training loop calls this model's reparameterize() and loss_function().
model = VAE(3, 256, 128, 32).to(device)

# Stand-alone encoder / decoder instances that the training loop runs directly.
# The classes are resolved through the module because the instances are bound to
# the same names as the imported classes: with `encoder = encoder(...)` a second
# execution of this cell would call the instance instead of the class and fail.
encoder = var_autoencoders.encoder(3, 256, 128).to(device)
Decoder = var_autoencoders.Decoder(128, 256, 32).to(device)

# Hand the optimizer the parameters of every module involved. Built from
# `model.parameters()` alone it never updated `encoder` / `Decoder`, which are
# the networks the training loop actually back-propagates through.
trainable_params = (
    list(model.parameters())
    + list(encoder.parameters())
    + list(Decoder.parameters())
)
optimizer = torch.optim.Adam(trainable_params, lr=lr)
criterion = torch.nn.BCELoss()

In [65]:
# Training loop: encode each image batch, sample a latent, decode it, and step the
# optimizer on the model's loss.
# NOTE(review): the recorded run of this cell stopped with
# "mat1 and mat2 shapes cannot be multiplied (4x262144 and 16384x128)", i.e. the
# flattened encoder features are 16x larger than a linear layer expects. This looks
# like an input-resolution mismatch with the model definition - confirm the image
# size var_autoencoders expects.
for epoch in range(epochs):
    for step, batch in enumerate(dataloader, start=1):
        # Move every field to the training device; only `images` is used below.
        images, masks, vertices, faces = (
            batch[key].to(device) for key in ('image', 'mask', 'vertices', 'faces')
        )

        optimizer.zero_grad()

        mu, logvar = encoder(images)
        z = model.reparameterize(mu, logvar)
        recon_images = Decoder(z)

        loss = model.loss_function(recon_images, images, mu, logvar)
        loss.backward()
        optimizer.step()

        # Report on the first step and on every tenth step after it.
        if step % 10 == 1:
            message = 'Epoch[{}/{}], Step [{}/{}], Loss: {:.4f}'.format(
                epoch + 1, epochs, step, len(dataloader), loss.item())
            print(message)

RuntimeError: mat1 and mat2 shapes cannot be multiplied (4x262144 and 16384x128)