<a href="https://colab.research.google.com/github/alwnraj/compvisionalgo/blob/main/deepslam.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import time
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

# Run every model and tensor on the CPU; no GPU device is requested.
device = torch.device("cpu")

# Defining a simple CNN for feature extraction
class FeatureExtractor(nn.Module):
    """Small convolutional encoder: single-channel image -> 128-element vector.

    Two conv/ReLU/max-pool stages (each pool halves height and width) are
    followed by one linear layer. The linear layer expects 32 * 8 * 8 inputs,
    which corresponds to a 32x32 input image.
    """

    def __init__(self):
        super().__init__()
        # Both convolutions share the same 3x3, stride-1, padding-1 geometry,
        # so they keep the spatial size unchanged.
        conv_kwargs = dict(kernel_size=3, stride=1, padding=1)
        self.conv1 = nn.Conv2d(1, 16, **conv_kwargs)
        self.conv2 = nn.Conv2d(16, 32, **conv_kwargs)
        self.fc = nn.Linear(32 * 8 * 8, 128)  # Assuming input size is 32x32

    def forward(self, x):
        """Encode a batch ``x`` of shape (batch, 1, H, W) into (batch, 128)."""
        for conv in (self.conv1, self.conv2):
            x = torch.max_pool2d(torch.relu(conv(x)), 2)
        # Collapse channels and spatial dimensions, keeping the batch axis.
        flat = x.view(x.size(0), -1)
        return self.fc(flat)

# SLAM algorithm
class SLAM:
    """Toy 2-D SLAM loop: store image features per grid position while moving.

    Attributes:
        feature_extractor: CNN used to turn an image into a feature vector.
        optimizer: Adam optimizer over the extractor's parameters.
        map: dict mapping a position tuple to the features stored there.
        current_position: (x, y) tuple, starting at (0, 0).
        start_time: wall-clock time at which ``run`` started (None before).
        checkpoints: positions reached after every ``move`` call.
    """

    def __init__(self):
        extractor = FeatureExtractor().to(device)
        self.feature_extractor = extractor
        self.optimizer = optim.Adam(extractor.parameters())
        self.map = {}
        self.current_position = (0, 0)
        self.start_time = None
        self.checkpoints = []

    def extract_features(self, image):
        """Return the extractor output for one 2-D ``image`` as a NumPy array."""
        with torch.no_grad():
            # Add batch and channel axes: (H, W) -> (1, 1, H, W).
            batch = image[None, None].float().to(device)
            features = self.feature_extractor(batch)
        return features.squeeze().cpu().numpy()

    def update_map(self, features):
        """Store ``features`` under the current position (overwrites any entry)."""
        self.map[self.current_position] = features

    def move(self, dx, dy):
        """Shift the current position by (dx, dy) and log it as a checkpoint."""
        pos_x = self.current_position[0] + dx
        pos_y = self.current_position[1] + dy
        self.current_position = (pos_x, pos_y)
        self.checkpoints.append(self.current_position)

    def run(self, data):
        """Process ``data``, an iterable of (image, (dx, dy)) pairs.

        For each pair the image features are stored at the current position
        first, then the movement is applied. Progress and the total runtime
        are printed.
        """
        self.start_time = time.time()

        for i, sample in enumerate(data):
            image, movement = sample
            self.update_map(self.extract_features(image))
            self.move(*movement)
            print(f"Checkpoint {i+1}: Position {self.current_position}")

        end_time = time.time()
        print(f"Algorithm runtime: {end_time - self.start_time:.2f} seconds")

# Example usage
def main():
    """Run the toy SLAM loop on four random 32x32 images with fixed movements."""
    slam = SLAM()

    # Simulated data: one random image per movement step, as (image, movement).
    movements = [(1, 0), (0, 1), (-1, 0), (-1, 0)]
    simulated_data = [(torch.randn(32, 32), step) for step in movements]

    slam.run(simulated_data)

# Run the demo only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()

Checkpoint 1: Position (1, 0)
Checkpoint 2: Position (1, 1)
Checkpoint 3: Position (0, 1)
Checkpoint 4: Position (-1, 1)
Algorithm runtime: 0.09 seconds


In [None]:
# Unpack the rgbd_dataset_freiburg1_xyz archive into "outputfolder" (verbose listing below).
# NOTE(review): assumes Google Drive is already mounted at /content/drive — the mount step is not shown here.
!tar -xzvf "/content/drive/MyDrive/Colab Notebooks/rgbd_dataset_freiburg1_xyz.tgz"  -C "/content/drive/MyDrive/Colab Notebooks/outputfolder"

rgbd_dataset_freiburg1_xyz/
rgbd_dataset_freiburg1_xyz/accelerometer.txt
rgbd_dataset_freiburg1_xyz/rgb/
rgbd_dataset_freiburg1_xyz/rgb/1305031107.143260.png
rgbd_dataset_freiburg1_xyz/rgb/1305031125.650575.png
rgbd_dataset_freiburg1_xyz/rgb/1305031106.711508.png
rgbd_dataset_freiburg1_xyz/rgb/1305031119.747193.png
rgbd_dataset_freiburg1_xyz/rgb/1305031114.211303.png
rgbd_dataset_freiburg1_xyz/rgb/1305031103.743326.png
rgbd_dataset_freiburg1_xyz/rgb/1305031124.249327.png
rgbd_dataset_freiburg1_xyz/rgb/1305031103.543444.png
rgbd_dataset_freiburg1_xyz/rgb/1305031124.850535.png
rgbd_dataset_freiburg1_xyz/rgb/1305031105.211268.png
rgbd_dataset_freiburg1_xyz/rgb/1305031110.311404.png
rgbd_dataset_freiburg1_xyz/rgb/1305031107.343509.png
rgbd_dataset_freiburg1_xyz/rgb/1305031112.043270.png
rgbd_dataset_freiburg1_xyz/rgb/1305031107.411271.png
rgbd_dataset_freiburg1_xyz/rgb/1305031120.315196.png
rgbd_dataset_freiburg1_xyz/rgb/1305031106.075330.png
rgbd_dataset_freiburg1_xyz/rgb/1305031114.57923

In [None]:
import os
import numpy as np
import cv2
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import pandas as pd
from PIL import Image
import time

# Keep all models and tensors on the CPU.
device = torch.device("cpu")

# Helper for parsing timestamped index files (used below for rgb.txt and depth.txt).
def read_file_list(filename):
    """Read a timestamped text file into a dict.

    Each non-empty line that does not start with ``#`` is split into fields;
    commas and tabs are treated like spaces. The first field is converted to
    ``float`` and used as the key, the remaining fields (as strings) form the
    value. Lines with fewer than two fields are ignored. If a key repeats,
    the last occurrence wins.

    Args:
        filename: Path of the text file to read.

    Returns:
        dict mapping ``float`` timestamp -> list of the remaining string fields.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the first field of a data line is not a valid float.
    """
    # Context manager guarantees the handle is closed even if reading fails.
    with open(filename) as handle:
        text = handle.read()

    entries = {}
    for line in text.replace(",", " ").replace("\t", " ").split("\n"):
        # Skip blank lines and comment lines.
        if not line or line[0] == "#":
            continue
        fields = [token.strip() for token in line.split(" ") if token.strip()]
        if len(fields) > 1:
            entries[float(fields[0])] = fields[1:]
    return entries

class TUM_RGBD_Dataset(Dataset):
    """Dataset of time-synchronised RGB/depth image pairs with a ground-truth position.

    The directory ``base_dir`` must contain ``rgb.txt``, ``depth.txt`` and
    ``groundtruth.txt``. Every RGB timestamp is paired with the nearest depth
    timestamp; pairs further apart than ``max_time_diff`` are dropped.

    Args:
        base_dir: Dataset root directory; image paths from the index files
            are joined onto it.
        transform: Optional callable applied to both the RGB and the depth
            image.
        max_time_diff: A pair is kept only when the RGB/depth timestamp
            difference is strictly below this value (default 0.02, the
            original 20 ms threshold).
    """

    def __init__(self, base_dir, transform=None, max_time_diff=0.02):
        self.base_dir = base_dir
        self.transform = transform
        self.max_time_diff = max_time_diff

        self.rgb_dict = read_file_list(os.path.join(base_dir, 'rgb.txt'))
        self.depth_dict = read_file_list(os.path.join(base_dir, 'depth.txt'))
        self.groundtruth = pd.read_csv(os.path.join(base_dir, 'groundtruth.txt'),
                                       sep=' ', comment='#', header=None,
                                       names=['timestamp', 'tx', 'ty', 'tz', 'qx', 'qy', 'qz', 'qw'])

        # Synchronize RGB and depth images
        self.rgb_timestamps = list(self.rgb_dict.keys())
        self.depth_timestamps = list(self.depth_dict.keys())
        self.synced_timestamps = self.synchronize_timestamps()

    def synchronize_timestamps(self):
        """Pair each RGB timestamp with its nearest depth timestamp.

        Returns:
            List of ``(rgb_time, depth_time)`` tuples, in RGB order, whose
            absolute difference is below ``self.max_time_diff``. Empty when
            there are no depth timestamps.
        """
        from bisect import bisect_left

        depth_sorted = sorted(self.depth_timestamps)
        if not depth_sorted:
            return []

        synced = []
        for rgb_time in self.rgb_timestamps:
            # The nearest value in a sorted list is one of the two neighbours
            # of the insertion point, so only those need to be compared.
            pos = bisect_left(depth_sorted, rgb_time)
            neighbours = depth_sorted[max(pos - 1, 0):pos + 1]
            depth_time = min(neighbours, key=lambda t: abs(t - rgb_time))
            if abs(rgb_time - depth_time) < self.max_time_diff:
                synced.append((rgb_time, depth_time))
        return synced

    def __len__(self):
        """Return the number of synchronised RGB/depth pairs."""
        return len(self.synced_timestamps)

    def __getitem__(self, idx):
        """Load the ``idx``-th pair.

        Returns:
            Tuple ``(rgb_img, depth_img, pose)`` where the images are PIL
            images (or whatever ``transform`` returns) and ``pose`` is an
            array with the ``tx, ty, tz`` values of the ground-truth row
            closest in time to the RGB timestamp.
        """
        rgb_time, depth_time = self.synced_timestamps[idx]

        # The first field after the timestamp is the path relative to base_dir.
        rgb_path = os.path.join(self.base_dir, self.rgb_dict[rgb_time][0])
        depth_path = os.path.join(self.base_dir, self.depth_dict[depth_time][0])

        rgb_img = Image.open(rgb_path)
        depth_img = Image.open(depth_path)

        if self.transform:
            # NOTE(review): the same transform is applied to the depth image;
            # confirm it produces values/channels suitable for the network.
            rgb_img = self.transform(rgb_img)
            depth_img = self.transform(depth_img)

        # Get the closest ground truth pose (argmin avoids sorting every row).
        time_gap = (self.groundtruth['timestamp'] - rgb_time).abs()
        closest_gt = self.groundtruth.iloc[time_gap.values.argmin()]
        pose = closest_gt[['tx', 'ty', 'tz']].values

        return rgb_img, depth_img, pose

class FeatureExtractor(nn.Module):
    """CNN that maps an RGB-D frame to a 3-value pose and a flat feature vector.

    Three conv/ReLU/max-pool stages (each pool halves height and width) feed
    two linear layers. ``fc1`` expects 256 * 8 * 8 inputs, i.e. 64x64 input
    frames. The module also accumulates how long its forward passes take.

    Attributes:
        feature_extraction_time: Summed duration of all forward passes, in seconds.
        feature_extraction_count: Number of completed forward passes.
    """

    def __init__(self):
        super().__init__()
        # 4 input channels: rgb and depth are concatenated along dim 1
        # (presumably 3 colour channels + 1 depth channel — confirm with the data).
        self.conv1 = nn.Conv2d(4, 64, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.fc1 = nn.Linear(256 * 8 * 8, 512)
        self.fc2 = nn.Linear(512, 3)  # Output 3D pose (x, y, z)
        self.feature_extraction_time = 0
        self.feature_extraction_count = 0

    def forward(self, rgb, depth):
        """Run the network on one batch.

        Args:
            rgb: Tensor of shape (batch, C_rgb, H, W).
            depth: Tensor of shape (batch, C_depth, H, W); C_rgb + C_depth
                must equal 4.

        Returns:
            Tuple ``(pose, features)``: ``pose`` has shape (batch, 3) and
            ``features`` is the flattened conv output of shape
            (batch, 256 * 8 * 8).

        Raises:
            RuntimeError: If the spatial size does not reduce to 8x8, instead
                of silently folding the surplus into the batch dimension.
        """
        # perf_counter is monotonic, so the measured duration cannot be
        # distorted by system clock adjustments.
        start_time = time.perf_counter()
        x = torch.cat((rgb, depth), dim=1)
        x = self.pool(torch.relu(self.conv1(x)))
        x = self.pool(torch.relu(self.conv2(x)))
        x = self.pool(torch.relu(self.conv3(x)))
        # Flatten per sample so the batch dimension is always preserved.
        features = torch.flatten(x, start_dim=1)
        x = torch.relu(self.fc1(features))
        x = self.fc2(x)
        end_time = time.perf_counter()
        self.feature_extraction_time += (end_time - start_time)
        self.feature_extraction_count += 1
        return x, features

class SLAM:
    """Online pose-regression loop around ``FeatureExtractor``.

    Attributes:
        feature_extractor: Network producing (estimated pose, features).
        optimizer: Adam optimizer (lr 0.001) over the network parameters.
        criterion: Mean-squared-error loss between estimate and ground truth.
        map: dict mapping a position tuple to the features stored for it.
        current_position: Latest position as a NumPy array (starts at zeros).
        trajectory: List of all positions, beginning with the start position.
        checkpoint_times: (start, end) wall-clock pairs, one per ``update``.
    """

    def __init__(self):
        network = FeatureExtractor().to(device)
        self.feature_extractor = network
        self.optimizer = optim.Adam(network.parameters(), lr=1e-3)
        self.criterion = nn.MSELoss()
        self.map = {}
        self.current_position = np.zeros(3)
        self.trajectory = [self.current_position]
        self.checkpoint_times = []

    def update(self, rgb, depth, gt_pose):
        """Train on one frame and record its pose and features.

        Performs a single optimisation step of the network against
        ``gt_pose``. The position that is stored (current position,
        trajectory, map key) is the ground-truth pose, not the estimate.

        Returns:
            The loss of this step as a Python float.
        """
        began = time.time()

        # One gradient step on the pose regression loss.
        self.optimizer.zero_grad()
        estimated_pose, features = self.feature_extractor(rgb, depth)
        loss = self.criterion(estimated_pose, gt_pose.float())
        loss.backward()
        self.optimizer.step()

        # Record the ground-truth position and the features observed there.
        position = gt_pose.detach().cpu().numpy().squeeze()
        self.current_position = position
        self.trajectory.append(position)
        self.map[tuple(position)] = features.detach().cpu().numpy().squeeze()

        finished = time.time()
        self.checkpoint_times.append((began, finished))

        return loss.item()

def main():
    """Train the pose regressor frame by frame on the freiburg1_xyz sequence.

    Loads the dataset from the hard-coded Google Drive path, resizes every
    image to 64x64, feeds the frames one at a time (in order) to ``SLAM`` and
    prints the loss and position for every 10th frame, followed by overall
    timing statistics.
    """
    base_dir = '/content/drive/MyDrive/Colab Notebooks/outputfolder/rgbd_dataset_freiburg1_xyz'

    # 64x64 matches the input size the network's first linear layer expects.
    transform = transforms.Compose([
        transforms.Resize((64, 64)),
        transforms.ToTensor(),
    ])

    dataset = TUM_RGBD_Dataset(base_dir, transform=transform)
    dataloader = DataLoader(dataset, batch_size=1, shuffle=False)

    slam = SLAM()

    total_start_time = time.time()

    for i, (rgb, depth, gt_pose) in enumerate(dataloader):
        rgb, depth, gt_pose = rgb.to(device), depth.to(device), gt_pose.to(device)

        loss = slam.update(rgb, depth, gt_pose)

        if i % 10 == 0:
            checkpoint_start, checkpoint_end = slam.checkpoint_times[-1]
            print(f"Frame {i}, Loss: {loss:.4f}, Position: {slam.current_position}")
            print(f"Checkpoint time: {checkpoint_end - checkpoint_start:.4f} seconds")

    total_end_time = time.time()

    print("SLAM completed. Total frames processed:", len(dataset))
    print(f"Total runtime: {total_end_time - total_start_time:.2f} seconds")

    # Guard against an empty dataset: with zero forward passes the average
    # would otherwise raise ZeroDivisionError.
    extraction_count = slam.feature_extractor.feature_extraction_count
    if extraction_count:
        avg_feature_extraction_time = slam.feature_extractor.feature_extraction_time / extraction_count
    else:
        avg_feature_extraction_time = 0.0
    print(f"Average feature extraction time: {avg_feature_extraction_time:.4f} seconds")
    print(f"Total features extracted: {slam.feature_extractor.feature_extraction_count}")
    print(f"Total feature extraction time: {slam.feature_extractor.feature_extraction_time:.2f} seconds")

# Start the training run only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()

Frame 0, Loss: 9093.6416, Position: [1.3405 0.6266 1.6575]
Checkpoint time: 0.3735 seconds
Frame 10, Loss: 1830.6576, Position: [1.2582 0.6251 1.5661]
Checkpoint time: 0.1663 seconds
Frame 20, Loss: 13.5613, Position: [1.1772 0.6252 1.4713]
Checkpoint time: 0.2325 seconds
Frame 30, Loss: 0.0265, Position: [1.1165 0.6209 1.3836]
Checkpoint time: 0.2489 seconds
Frame 40, Loss: 0.1646, Position: [1.1096 0.6227 1.3785]
Checkpoint time: 0.1727 seconds
Frame 50, Loss: 0.3122, Position: [1.1695 0.6239 1.4841]
Checkpoint time: 0.1838 seconds
Frame 60, Loss: 0.3337, Position: [1.2535 0.6284 1.6032]
Checkpoint time: 0.1680 seconds
Frame 70, Loss: 0.3656, Position: [1.3282 0.6228 1.6883]
Checkpoint time: 0.1629 seconds
Frame 80, Loss: 1.6642, Position: [1.3963 0.6302 1.7508]
Checkpoint time: 0.1701 seconds
Frame 90, Loss: 0.5730, Position: [1.3518 0.6325 1.7038]
Checkpoint time: 0.2326 seconds
Frame 100, Loss: 4.0989, Position: [1.2514 0.6158 1.6012]
Checkpoint time: 0.1776 seconds
Frame 110, Los