In [2]:
%load_ext autoreload
%autoreload 2
from util.shape_implicit import ShapeImplicit
from deepsdf.model import DeepSDFDecoder
import trimesh
import torch
import pickle
from util.tet_grid import Grid
import numpy as np
import clip;
import kaolin as kal
import os
from kaolin.ops.conversions import marching_tetrahedra
from torch.nn import functional as F
from matplotlib import pyplot as plt
from torch import nn
tet_res = 32
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
latent_size = 256
text = "L shaped sofa"


In [3]:
g = Grid(-1,1, tet_res, tet_res, tet_res, device)
v = g.v.clone()
T = g.T.clone()
mean = torch.as_tensor((0.48145466, 0.4578275, 0.40821073), dtype=torch.float, device=device)
std = torch.as_tensor((0.26862954, 0.26130258, 0.27577711), dtype=torch.float, device=device)
model = DeepSDFDecoder(latent_size)
model.load_state_dict(torch.load("checkpoints/sdf_best.ckpt", map_location='cpu'))
model.eval()
model.to(device)
model_clip, preprocess = clip.load("ViT-B/32", device=device)
model_clip.eval()

os.makedirs("images", exist_ok=True)
cam_proj = kal.render.camera.generate_perspective_projection(1, ratio=1.0, dtype=torch.float32).cuda()
p = np.array([-3, -2, 2 ,3])
coords = [torch.from_numpy(np.random.permutation(p)[:3]).float() for i in range(10)]
# print(coords)
camera_positions = torch.stack([torch.FloatTensor([0 , 0, -2])]).cuda()
look_at = torch.FloatTensor([0, 0, 0]).cuda().unsqueeze(0).expand_as(camera_positions)
camera_up_direction = torch.FloatTensor([0, 1, 0]).cuda().unsqueeze(0).expand_as(camera_positions)
cam_transform =  kal.render.camera.generate_transformation_matrix(camera_positions, look_at, camera_up_direction).cuda().float()
v.requires_grad = False
T.requires_grad = False
criterion = torch.nn.CosineSimilarity()
def laplace_regularizer_const(mesh_verts, mesh_faces):
    term = torch.zeros_like(mesh_verts)
    norm = torch.zeros_like(mesh_verts[..., 0:1])

    v0 = mesh_verts[mesh_faces[:, 0], :]
    v1 = mesh_verts[mesh_faces[:, 1], :]
    v2 = mesh_verts[mesh_faces[:, 2], :]

    term.scatter_add_(0, mesh_faces[:, 0:1].repeat(1,3), (v1 - v0) + (v2 - v0))
    term.scatter_add_(0, mesh_faces[:, 1:2].repeat(1,3), (v0 - v1) + (v2 - v1))
    term.scatter_add_(0, mesh_faces[:, 2:3].repeat(1,3), (v0 - v2) + (v1 - v2))

    two = torch.ones_like(v0) * 2.0
    norm.scatter_add_(0, mesh_faces[:, 0:1], two)
    norm.scatter_add_(0, mesh_faces[:, 1:2], two)
    norm.scatter_add_(0, mesh_faces[:, 2:3], two)

    term = term / torch.clamp(norm, min=1.0)

    return torch.mean(term**2)
timelapse = kal.visualize.Timelapse("logs")


In [None]:
Z = torch.ones(1, 256).normal_(mean=0, std=0.01).to(device)
Z.requires_grad = True
init_Z = Z.clone().detach().cpu().numpy()
best_sim = 0
v = g.v.clone()
T = g.T.clone()
# os.system("rm -r logs")
all_texts = ["l shaped couch", "chesterfield"]
for text_orig in all_texts:    
    text = clip.tokenize([text_orig]).to(device)
    text_features = model_clip.encode_text(text)
    nof_iterations = 1000
    for jp in range(8):
        Z = torch.from_numpy(init_Z).float().cuda()
        Z.requires_grad = True

        best_sim = 0
        optimizer = torch.optim.Adam([Z], lr=1e-2)
        v = g.v.clone()
        T = g.T.clone()
        try:

            for i in range(nof_iterations):
                optimizer.zero_grad()
                sdfs_tensors = []
                deform_tensors = []
                color_tensors = []
                split_size = v.shape[1] // 32
                q = torch.split(v,  split_size, dim=1)
                for pts in q:
                    pts = pts.squeeze(0).float()
                    sdfs_tensors.append(model(torch.cat([Z.expand(pts.shape[0], -1), pts.float()], dim=-1)))

                sdfs = torch.stack(sdfs_tensors).unsqueeze(0).view(1,-1)

                vertices, faces = marching_tetrahedra(v, T, sdfs)

                if i == 0:
                    in_mesh = trimesh.Trimesh(vertices[0].detach().cpu().numpy(), torch.fliplr(faces[0]).detach().cpu().numpy())
                    in_mesh.export("in.obj")
                vertices = vertices[0].float()
                faces = faces[0].long()
                faces = torch.fliplr(faces)
                # t = torch.rand( size=(1, faces.shape[0], faces.shape[1], 3)).cuda().expand(camera_positions.shape[0], faces.shape[0], faces.shape[1], 3)
                t = torch.randint(high=254, size=(1, faces.shape[0], faces.shape[1], 3)).cuda().float().expand(camera_positions.shape[0], faces.shape[0], faces.shape[1], 3)
                face_vertices_camera, face_vertices_image, face_normals =  kal.render.mesh.prepare_vertices(vertices, faces, cam_proj, camera_transform=cam_transform)
                im = kal.render.mesh.rasterize(224, 224, face_vertices_camera[:,:,:,-1].float(),
                            face_vertices_image.float(), t)
                img = im[0] / 255 
                img = img
                img = ((img - mean) / std)
                img = img.permute(0, 3, 1, 2)
                image_features = model_clip.encode_image(img).float()
                cos_sim = F.cosine_similarity(image_features, text_features.float()).mean()
                if cos_sim.item() > best_sim:
                    best_sim = cos_sim.item()
                    best_vertices = vertices.clone()
                    best_faces = faces.clone()
                sims = criterion(image_features, text_features.float()) 
                loss =  -10*(sims.sum())
                loss.backward(retain_graph=True)
                print(loss.item(), (Z.grad**2).mean())
                optimizer.step()
                timelapse.add_mesh_batch(category=text_orig + 'output' + str(jp),
                                         iteration=i,
                                         faces_list=[faces],
                                         vertices_list=[vertices])
                plt.imsave(os.path.join("images", f"{i+1:02d}.jpg"),    arr = im[0][0].squeeze(0).detach().cpu().numpy() / 255)
        except Exception as e:
            print("failed 1 train", e)
            continue

-27.417808532714844 tensor(0.0002, device='cuda:0')
-27.559158325195312 tensor(0.0001, device='cuda:0')
-26.81973648071289 tensor(9.5569e-05, device='cuda:0')
-26.923654556274414 tensor(0.0003, device='cuda:0')
-27.583513259887695 tensor(8.0721e-05, device='cuda:0')
-27.359153747558594 tensor(0.0005, device='cuda:0')
-27.10374641418457 tensor(0.0002, device='cuda:0')
-27.427783966064453 tensor(0.0003, device='cuda:0')
-27.70981788635254 tensor(0.0002, device='cuda:0')
-27.59756088256836 tensor(0.0001, device='cuda:0')
-27.087188720703125 tensor(0.0002, device='cuda:0')
-27.325918197631836 tensor(8.5413e-05, device='cuda:0')
-27.198497772216797 tensor(0.0003, device='cuda:0')
-26.923538208007812 tensor(0.0001, device='cuda:0')
-27.19244956970215 tensor(0.0002, device='cuda:0')
-27.527170181274414 tensor(0.0002, device='cuda:0')
-27.60561180114746 tensor(0.0002, device='cuda:0')
-27.018281936645508 tensor(0.0002, device='cuda:0')
-27.048683166503906 tensor(0.0002, device='cuda:0')
-27.78

In [4]:
best_mesh = trimesh.Trimesh(best_vertices.detach().cpu().numpy(), best_faces.detach().cpu().numpy())
last_mesh = trimesh.Trimesh(vertices.detach().cpu().numpy(), faces.detach().cpu().numpy())
last_mesh.show()

In [18]:
criterion(image_features.expand_as(text_features), text_features.float())

RuntimeError: The expanded size of the tensor (1) must match the existing size (2) at non-singleton dimension 0.  Target sizes: [1, 512].  Tensor sizes: [2, 512]

In [None]:
in_mesh.show()

In [6]:
best_mesh.show()

In [53]:
import cv2
image_folder = 'images'
video_name = 'video.avi'

images = [img for img in os.listdir(image_folder) if img.endswith(".jpg")]
frame = cv2.imread(os.path.join(image_folder, images[0]))
height, width, layers = frame.shape



video = cv2.VideoWriter(video_name, 0, 5, (width,height))

for image in images:
    video.write(cv2.imread(os.path.join(image_folder, image)))

cv2.destroyAllWindows()
video.release()

In [23]:
from IPython.display import Video

f = Video("video.mp4", embed=True)
f