In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.parameter import Parameter
import numpy as np
    
class FeatureNet(nn.Sequential):
    def __init__(self):
        super().__init__()
        self.model = nn.Sequential(
          nn.Conv2d(4, 32, 3),
          nn.Conv2d(32, 32, 3),
          nn.Conv2d(32, 32, 3),
          nn.BatchNorm2d(32),
          nn.ReLU(),
          nn.Conv2d(32, 64, 3, stride=2),
          nn.Conv2d(64, 128, 3),
          nn.Conv2d(128, 128, 3),
          nn.BatchNorm2d(128),
          nn.ReLU(),    
          nn.Conv2d(128, 256, 3, stride=2),
          nn.Conv2d(256, 256, 3),
          nn.Conv2d(256, 256, 3),
          nn.BatchNorm2d(256),
          nn.ReLU(),    
          nn.Conv2d(256, 196, 1),
          nn.BatchNorm2d(196),
          nn.ReLU(),     
        )
    
    def forward(self, data):
        return self.model(data)
    
class PoseRegression(nn.Sequential):
  
    def __init__(self):
        super().__init__()
        self.model = nn.Sequential(
          nn.Linear(588, 294),
          nn.BatchNorm1d(294),
          nn.Linear(294, 147),
          nn.BatchNorm1d(147),
          nn.ReLU(),
          nn.Linear(147, 42),
          nn.BatchNorm1d(42),
          nn.ReLU(),
          nn.Linear(42, 21),
          nn.BatchNorm1d(21),
          nn.Tanh()
        )
    
    def forward(self, data):
        return self.model(data)
    
    

class Transformation(nn.Module):
    
    def __init__(self, rotation, translation):
        super().__init__()
        self.rotation = rotation
        self.translation = translation
        self.br = torch.transpose(rotation, 1, 0)
        self.bt = torch.matmul(self.br, translation)
        
    def forward(self, data):
        reprojected_pt = torch.matmul(self.rotation,data) + self.translation#[:,None]
        return reprojected_pt
    
    def backward(self, data):
        reprojected_pt = torch.matmul(self.br,data) + self.bt#[:,None]
        return reprojected_pt
        
class ExtendedSpatialSoftargMax(nn.Module):
  
    def __init__(self, height, width, channel, temperature=None, data_format='NCHW'):
        super(ExtendedSpatialSoftargMax, self).__init__()
        self.data_format = data_format
        self.height = height
        self.width = width
        self.channel = channel

        if temperature:
            self.temperature = Parameter(torch.ones(1)*temperature)
        else:
            self.temperature = 1.

        pos_x, pos_y = np.meshgrid(
                np.linspace(-1., 1., self.height),
                np.linspace(-1., 1., self.width)
                )
        pos_x = torch.from_numpy(pos_x.reshape(self.height*self.width)).float()
        pos_y = torch.from_numpy(pos_y.reshape(self.height*self.width)).float()
        self.register_buffer('pos_x', pos_x)
        self.register_buffer('pos_y', pos_y)

    def forward(self, feature, depth):
        # Output:
        #   (N, C*2) x_0 y_0 ...
        if self.data_format == 'NHWC':
            feature = feature.transpose(1, 3).tranpose(2, 3).view(-1, self.height*self.width)
        else:
            feature = feature.view(-1, self.height*self.width)

        softmax_attention = F.softmax(feature/self.temperature, dim=-1)
        expected_x = torch.sum(self.pos_x*softmax_attention, dim=1, keepdim=True)
        expected_y = torch.sum(self.pos_y*softmax_attention, dim=1, keepdim=True)
#         expected_xy = torch.cat([expected_x, expected_y], 1)
#         feature_keypoints = expected_xy.view(-1, self.channel*2)
        
        image_height = depth.shape[3]
        image_weight = depth.shape[2]
        
        ix = torch.round(expected_x * image_weight).long()
        iy = torch.round(expected_y * image_height).long()
        
        z = depth[:, 0, ix, iy]
        
        result = torch.cat([expected_x * z, expected_y * z, z], 1)
        feature_keypoints = result.view(-1, self.channel*3)

        return feature_keypoints


In [2]:
class Keyframe(nn.Module):
    def __init__(self, rotation, translation):
        super(Keyframe, self).__init__()
        self.feature = FeatureNet()
        self.extended_spatial_max = ExtendedSpatialSoftargMax(31,21,196)
        self.transform = Transformation(rotation, translation)
        self.pose_regress = PoseRegression()
        self.rotation = rotation
        self.translation = translation
              
    def forward(self, data, depth):
        output = self.feature(data)
        output = self.extended_spatial_max(output, depth)
        output = self.transform(output)
        output = self.pose_regress(output)
        return output

In [3]:
import math

def euler2rotm(theta):
    R_x = np.array([[1,         0,                  0                   ],
                    [0,         math.cos(theta[0]), -math.sin(theta[0]) ],
                    [0,         math.sin(theta[0]), math.cos(theta[0])  ]
                    ])
    R_y = np.array([[math.cos(theta[1]),    0,      math.sin(theta[1])  ],
                    [0,                     1,      0                   ],
                    [-math.sin(theta[1]),   0,      math.cos(theta[1])  ]
                    ])         
    R_z = np.array([[math.cos(theta[2]),    -math.sin(theta[2]),    0],
                    [math.sin(theta[2]),    math.cos(theta[2]),     0],
                    [0,                     0,                      1]
                    ])            
    R = np.dot(R_z, np.dot( R_y, R_x ))
    return R

def point_in_camera(point, rotation, translation):
    reprojected_pt = torch.matmul(rotation,point) + translation#[:,None]
    return reprojected_pt

In [4]:
rotation = torch.Tensor(euler2rotm([np.pi / 4, np.pi, -np.pi / 2]))
translation = torch.Tensor(np.array([1.0, 0, 0.75]).reshape((3,1)))

k = Keyframe(rotation, translation)

In [5]:
# coding=utf-8
# Copyright 2021 The Yan Li, UTK, Knoxville, TN.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch
import cv2 # for resize image
from os import listdir
from os.path import join
import pickle
from torch.utils.data import Dataset
import random as r
import numpy as np


class dataset(Dataset):
    def __init__(self, data_dir, transform=None, target_transform=None):
        self.data_dir = data_dir
        self.mean = [0.485, 0.456, 0.406] 
        self.std = [0.229, 0.224, 0.225]
        self.image_folder = join(data_dir, 'color')
        self.depth_folder = join(data_dir, 'depth')
        self.info_folder = join(data_dir, 'info')
        self.action_folder = join(data_dir, 'action')
        self.image_files = listdir(self.image_folder)
        self.depth_files = listdir(self.depth_folder)
        self.info_files = listdir(self.info_folder)
        self.action_files = listdir(self.action_folder)
        
    def __len__(self):
        return len(self.image_files)
    
    def __getitem__(self, idx):
        # image
        data = self.loadfile(join(self.image_folder, self.image_files[idx]))
        selected_frame = r.randint(1, data.shape[0])
        img = data[selected_frame,0]
        img = cv2.resize(img, dsize=(160, 120), interpolation=cv2.INTER_CUBIC)
        img = img / 255.0
        img = img - self.mean
        img = img / self.std
        
        # depth
        data = self.loadfile(join(self.depth_folder, self.depth_files[idx]))            
        depth = data[selected_frame,0]
        td = cv2.resize(depth, dsize=(160, 120), interpolation=cv2.INTER_CUBIC)        
        depth = torch.Tensor(data[selected_frame,0])
        color = torch.Tensor(img)
        td = torch.Tensor(td)
        td = torch.reshape(td, (120,160,1))
        data = torch.cat((color, td), 2)        
        data = data.permute(2, 0 ,1)
        data = torch.reshape(data, (1, 4, 120, 160))
        
        # info
        info = self.loadfile(join(self.info_folder, self.info_files[idx]))
        #action 
        action = self.loadfile(join(self.action_folder, self.action_files[idx]))        
        gt = self.resolveInfo(info, action, selected_frame)
        
        return data, depth, gt
    
    def loadfile(self, filename):
        with open(filename, 'rb') as f:
            return pickle.load(f)
        
    def resolveInfo(self, info, action, selected_frame):

        x = np.concatenate((np.array(info[selected_frame][5][0]), 
             np.array(info[selected_frame][5][1])),axis=0)
        y = np.concatenate((np.array(info[selected_frame][6][0]), 
             np.array(info[selected_frame][6][1])), axis=0)
        z = np.concatenate((np.array(action[selected_frame]['pose'][0]), 
             np.array(action[selected_frame]['pose'][1])),axis=0)
        
        gt = np.concatenate((x,y,z),axis=0)
        gt = torch.Tensor(gt).reshape([1,gt.shape[0]])
        return gt
        
#mean = [0.485, 0.456, 0.406] 
#std = [0.229, 0.224, 0.225]
#
#data_path = '../datasets/key_frame_identifier/block-insertion-test/'
#img_folder = data_path + 'color/'
#depth_folder = data_path + 'depth/'
#pcl_file = '000000-1'

In [6]:
data_dir = 'datasets/key_frame_identifier/block-insertion-test'
data = dataset(data_dir)

In [7]:
print(len(data))
img, depth, gt = data[999]
print(gt.shape)

1000
torch.Size([1, 21])


In [8]:
x = torch.Tensor(7,1)
y = torch.Tensor(7,1)
z = torch.Tensor(7,1)

o = torch.cat((x,y,z),dim=0)
print(o.shape)

torch.Size([21, 1])


In [9]:
import datetime
import torch
import cv2 # for resize image
from os import listdir
from os.path import join
import pickle
from torch.utils.data import Dataset
import random as r
import numpy as np


class dataset(Dataset):
    def __init__(self, data_dir, transform=None, target_transform=None):
        self.data_dir = data_dir
        self.mean = [0.485, 0.456, 0.406] 
        self.std = [0.229, 0.224, 0.225]
        self.image_folder = join(data_dir, 'color')
        self.depth_folder = join(data_dir, 'depth')
        self.info_folder = join(data_dir, 'info')
        self.action_folder = join(data_dir, 'action')
        self.image_files = listdir(self.image_folder)
        self.depth_files = listdir(self.depth_folder)
        self.info_files = listdir(self.info_folder)
        self.action_files = listdir(self.action_folder)
        
    def __len__(self):
        return len(self.image_files) - 1
    
    def __getitem__(self, idx):
        # image
        data = self.loadfile(join(self.image_folder, self.image_files[idx]))
        r.seed(datetime.datetime.now())
        selected_frame = r.randint(1, data.shape[0])
        img = data[selected_frame,0]
        img = cv2.resize(img, dsize=(160, 120), interpolation=cv2.INTER_CUBIC)
        img = img / 255.0
        img = img - self.mean
        img = img / self.std
        
        # depth
        data = self.loadfile(join(self.depth_folder, self.depth_files[idx]))            
        depth = data[selected_frame,0]
        td = cv2.resize(depth, dsize=(160, 120), interpolation=cv2.INTER_CUBIC)
        depth = torch.Tensor(depth)
        color = torch.Tensor(img)
        td = torch.Tensor(td)
        td = torch.reshape(td, (120,160,1))
        data = torch.cat((color, td), 2)        
        data = data.permute(2, 0 ,1)
        data = torch.reshape(data, (1, 4, 120, 160))
        
        # info
        info = self.loadfile(join(self.info_folder, self.info_files[idx]))
        #action 
        action = self.loadfile(join(self.action_folder, self.action_files[idx]))        
        gt = self.resolveInfo(info, action, selected_frame)
        
        return data, depth, gt
    
    def loadfile(self, filename):
        with open(filename, 'rb') as f:
            return pickle.load(f)
        
    def resolveInfo(self, info, action, selected_frame):
        
        print(selected_frame)

        x = np.concatenate((np.array(info[selected_frame][5][0]), 
             np.array(info[selected_frame][5][1])),axis=0)
        y = np.concatenate((np.array(info[selected_frame][6][0]), 
             np.array(info[selected_frame][6][1])), axis=0)
        z = np.concatenate((np.array(action[selected_frame]['pose'][0]), 
             np.array(action[selected_frame]['pose'][1])),axis=0)
        
        gt = np.concatenate((x,y,z),axis=0)
        gt = torch.Tensor(gt)
#        gt = torch.Tensor(gt).reshape([1,gt.shape[0]])
        return gt
        
#mean = [0.485, 0.456, 0.406] 
#std = [0.229, 0.224, 0.225]
#
#data_path = '../datasets/key_frame_identifier/block-insertion-test/'
#img_folder = data_path + 'color/'
#depth_folder = data_path + 'depth/'
#pcl_file = '000000-1'


In [10]:
import torch
import torch.nn as nn

    
class FeatureNet(nn.Sequential):
    def __init__(self):
        super().__init__()
        self.model = nn.Sequential(
          nn.Conv2d(4, 32, 3),
          nn.Conv2d(32, 32, 3),
          nn.Conv2d(32, 32, 3),
          nn.BatchNorm2d(32),
          nn.ReLU(),
          nn.Conv2d(32, 64, 3, stride=2),
          nn.Conv2d(64, 128, 3),
          nn.Conv2d(128, 128, 3),
          nn.BatchNorm2d(128),
          nn.ReLU(),    
          nn.Conv2d(128, 256, 3, stride=2),
          nn.Conv2d(256, 256, 3),
          nn.Conv2d(256, 256, 3),
          nn.BatchNorm2d(256),
          nn.ReLU(),    
          nn.Conv2d(256, 196, 1),
          nn.BatchNorm2d(196),
          nn.ReLU(),     
        )
    
    def forward(self, data):
        return self.model(data)
    
    
class PoseRegression(nn.Sequential):
  
    def __init__(self):
        super().__init__()
        self.model = nn.Sequential(
          nn.Linear(588, 294),
          nn.BatchNorm1d(294),
          nn.Linear(294, 147),
          nn.BatchNorm1d(147),
          nn.ReLU(),
          nn.Linear(147, 42),
          nn.BatchNorm1d(42),
          nn.ReLU(),
          nn.Linear(42, 21),
          nn.BatchNorm1d(21),
          nn.Tanh()
        )
    
    def forward(self, data):
        return self.model(data)


class Transformation(nn.Module):
    
    def __init__(self, rotation, translation):
        super().__init__()
        self.rotation = rotation
        self.translation = translation
        self.br = torch.transpose(rotation, 1, 0)
        self.bt = torch.matmul(self.br, translation)
        
    def forward(self, data):
        reprojected_pt = torch.matmul(self.rotation,data) + self.translation#[:,None]
        return reprojected_pt
    
    def backward(self, data):
        reprojected_pt = torch.matmul(self.br,data) + self.bt#[:,None]
        return reprojected_pt
        


In [11]:
class Keyframe(nn.Module):
    def __init__(self, rotation, translation, spatial_height=31, 
                 spatial_weight=21, spatial_channel=196):
        super(Keyframe, self).__init__()
        self.feature = FeatureNet()
#        self.extended_spatial_max = ExtendedSpatialSoftargMax(31,21,196)
        self.extended_spatial_max = ExtendedSpatialSoftargMax(spatial_height, 
                                                              spatial_weight, 
                                                              spatial_channel)
        self.transform = Transformation(rotation, translation)
        self.pose_regress = PoseRegression()
        self.rotation = rotation
        self.translation = translation
              
    def forward(self, data, depth):
        output = self.feature(data)
        output = self.extended_spatial_max(output, depth)
        output = self.transform(output)
        output = self.pose_regress(output)
        return output
    
    

In [12]:
import math
import numpy as np

    
def selectDevice():        
    torch.cuda.is_available()    
    if torch.cuda.is_available():  
      dev = "cuda:0" 
    else:  
      dev = "cpu"      
    return torch.device(dev)  


def euler2rotm(theta):
    R_x = np.array([[1,         0,                  0                   ],
                    [0,         math.cos(theta[0]), -math.sin(theta[0]) ],
                    [0,         math.sin(theta[0]), math.cos(theta[0])  ]
                    ])
    R_y = np.array([[math.cos(theta[1]),    0,      math.sin(theta[1])  ],
                    [0,                     1,      0                   ],
                    [-math.sin(theta[1]),   0,      math.cos(theta[1])  ]
                    ])         
    R_z = np.array([[math.cos(theta[2]),    -math.sin(theta[2]),    0],
                    [math.sin(theta[2]),    math.cos(theta[2]),     0],
                    [0,                     0,                      1]
                    ])            
    R = np.dot(R_z, np.dot( R_y, R_x ))
    return R


def camTrans():
    rotation = torch.Tensor(euler2rotm([np.pi / 4, np.pi, -np.pi / 2]))
    translation = torch.Tensor(np.array([1.0, 0, 0.75]).reshape((3,1)))
    
    return rotation, translation

In [13]:
import torch
import torch.nn.functional as F
import torch.nn as nn
from torch.nn.parameter import Parameter
import numpy as np

class ExtendedSpatialSoftargMax(nn.Module):
  
    def __init__(self, height, width, channel, temperature=None, data_format='NCHW'):
        super(ExtendedSpatialSoftargMax, self).__init__()
        self.data_format = data_format
        self.height = height
        self.width = width
        self.channel = channel

        if temperature:
            self.temperature = Parameter(torch.ones(1)*temperature)
        else:
            self.temperature = 1.

        pos_x, pos_y = np.meshgrid(
                np.linspace(-1., 1., self.height),
                np.linspace(-1., 1., self.width)
                )
        pos_x = torch.from_numpy(pos_x.reshape(self.height*self.width)).float()
        pos_y = torch.from_numpy(pos_y.reshape(self.height*self.width)).float()
        self.register_buffer('pos_x', pos_x)
        self.register_buffer('pos_y', pos_y)

    def forward(self, feature, depth):
        # Output:
        #   (N, C*2) x_0 y_0 ...
        if self.data_format == 'NHWC':
            feature = feature.transpose(1, 3).tranpose(2, 3).view(-1, self.height*self.width)
        else:
            feature = feature.view(-1, self.height*self.width)

        softmax_attention = F.softmax(feature/self.temperature, dim=-1)
        expected_x = torch.sum(self.pos_x*softmax_attention, dim=1, keepdim=True)
        expected_y = torch.sum(self.pos_y*softmax_attention, dim=1, keepdim=True)
#         expected_xy = torch.cat([expected_x, expected_y], 1)
#         feature_keypoints = expected_xy.view(-1, self.channel*2)
        
        image_height = depth.shape[3]
        image_weight = depth.shape[2]
        
        ix = torch.round(expected_x * image_weight).long()
        iy = torch.round(expected_y * image_height).long()
        
        z = depth[:, 0, ix, iy] 
        z_prime_x = z / 900 * image_weight
        z_prime_y = z / 900 * image_height
        
        result = torch.cat([expected_x * z_prime_x, expected_y * z_prime_y, z], 1)
        feature_keypoints = result.view(-1, self.channel*3)

        return feature_keypoints

In [46]:
# coding=utf-8
# Copyright 2021 The Yan Li, UTK, Knoxville, TN.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import datetime
import torch
import torch.nn as nn


def main():
    device = selectDevice()
    
    rot, t = camTrans()
    
    model = Keyframe(rot, t).to(device)    
    # preparing dataset
    print('preparing dataset')
    data_path = '/workspace/datasets/key_frame_identifier/block-insertion-test/'    
    training_set = dataset(data_path)
    params = {'batch_size': 64,
              'shuffle': True,
              'num_workers': 1}    
    train_generator = torch.utils.data.DataLoader(training_set, **params)
    # setup loss fucntion
    print('setting up loss function')
    criterion = nn.MSELoss()
    # setup optimizer 
    print('setup optimizer')
    learning_rate = 1e-3
    optimizer = torch.optim.RMSprop(model.parameters(), lr=learning_rate)

    # maximum epochs
    max_epochs = 200000
    
    # start the training
    print('starting the training process')
    for epoch in range(max_epochs):
#        name = f'{FLAGS.task}-{FLAGS.agent}-{FLAGS.n_demos}-{train_run}'
        
#        curr_time = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
#        log_dir = os.path.join(data_path, 'logs', curr_time, 'train')
#        
        for data, depth, gt in train_generator:
            print(data.shape, depth.shape, gt.shape)
            ps = model(data, depth)
            loss = criterion(ps, gt)
            if t % 100 == 99:
                print(t, loss.item())
            
            optimizer.zero_grad()
            loss.backward()            
            optimizer.step()
            

if __name__ == '__main__':
    main()

preparing dataset
setting up loss function
setup optimizer
starting the training process
74
10
44
71
64
31
68
90
56
11
47
7
19
19
41
83
24
29
29
80
103
85
82
4
10
28
72
13
92
46
45
51
34
84
43
44
85
79
35
9
102
93
43
23
86
56
23
59
87
30
91
86
34
79
33
28
111
38
3
63
17
76
88


TypeError: Caught TypeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/opt/conda/lib/python3.8/site-packages/torch/utils/data/_utils/worker.py", line 202, in _worker_loop
    data = fetcher.fetch(index)
  File "/opt/conda/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 44, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/opt/conda/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 44, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "<ipython-input-41-ae80c77d169e>", line 56, in __getitem__
    gt = self.resolveInfo(info, action, selected_frame)
  File "<ipython-input-41-ae80c77d169e>", line 72, in resolveInfo
    z = np.concatenate((np.array(action[selected_frame]['pose'][0]),
TypeError: 'NoneType' object is not subscriptable


52
47
66
63
26
97
63
71
26
34
65
80
51
66
17
9
81
72
31
10
34
13
2
71
72
1
39
7
77
30
68
86
23
46
9
37
55
73
33
27
78
84
10
42
46
48
49
33
72
81
6
76
24
53
101
65
31
39
69
99
75
57
65
73


In [None]:
from datetime import datetime
#and the access its now method simpler
d1 = datetime.now()
print(d1)

In [94]:
x = torch.rand([12544,1])
y = torch.rand([12544,1])
x[x>1] = 1
x[x<-1] = -1
y[y>1] = 1
y[y<-1] = -1

coord = (x * 320 + 320) + (y * 240 + 239) * 640

print(sum(coord>480*640))

baseline = torch.arange(0, 64) * 640*480.0
baseline = baseline.repeat(196,1)
# baseline = np.meshgrid(1,64) * 640*480
# print(np.array(baseline).shape)
baseline = torch.transpose(baseline, 1, 0)
print(baseline.shape)
baseline = baseline.reshape(64 * 196, 1)
print(baseline.shape)
print(coord.shape)

tensor([0])
torch.Size([64, 196])
torch.Size([12544, 1])
torch.Size([12544, 1])


In [95]:
coord = torch.round(coord + baseline).long()

print(coord.shape)
d = torch.rand([64*480*640])
print(d.shape)
z = torch.take(d, coord)
print(z.shape)
# print(baseline[1:12544:196])

torch.Size([12544, 1])
torch.Size([19660800])
torch.Size([12544, 1])
