In [1]:
%matplotlib widget

In [2]:
import numpy as np
import torch

In [3]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# Load CLIP embeddings

In [5]:
# Load the cached probe results (a dict that holds at least 'embedding' and 'poses').
# NOTE: torch.load unpickles the file, so only open checkpoints from a trusted source.
scene_data = torch.load('../logs/probe/emb_blender_paper_lego_clip_vit.pth')
# The stored embedding is a NumPy array; wrap it as a float32 tensor in one step.
embedding = torch.from_numpy(scene_data['embedding']).float()

In [6]:
# Quick look: first embedding dimension against the second, one point per row.
plt.scatter(embedding[:, 0], embedding[:, 1])

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

<matplotlib.collections.PathCollection at 0x7f101ede8280>

In [7]:
def pairwise_cosine_similarity(features, eps=1e-8):
    """Return the matrix of cosine similarities between all pairs of rows.

    Args:
        features: 2-D tensor of shape [B, D], one feature vector per row.
        eps: Lower bound applied to each row norm before dividing, so an
            all-zero row yields similarity 0 instead of NaN.

    Returns:
        Tensor of shape [B, B]; entry (i, j) is the cosine of the angle
        between rows i and j, clamped to [-1, 1].

    Raises:
        AssertionError: If ``features`` is not 2-D.
    """
    assert features.ndim == 2  # [B, D]
    # Keep the reduced dim so the division broadcasts row-wise; the floor
    # avoids 0 / 0 for zero vectors.
    norm = torch.norm(features, dim=1, keepdim=True).clamp_min(eps)
    features_normalized = features / norm
    similarity = features_normalized.mm(features_normalized.transpose(0, 1))
    # Rounding can push entries (notably the diagonal) slightly outside
    # [-1, 1], which would turn torch.acos(similarity) into NaN.
    return similarity.clamp(min=-1.0, max=1.0)

In [8]:
# Cosine similarity between every pair of rows of `embedding`.
similarity = pairwise_cosine_similarity(embedding)
# Bare expression so the notebook displays the resulting matrix.
similarity

tensor([[1.0000, 0.9112, 0.8482,  ..., 0.8342, 0.8256, 0.8758],
        [0.9112, 1.0000, 0.9407,  ..., 0.9150, 0.9343, 0.9582],
        [0.8482, 0.9407, 1.0000,  ..., 0.9444, 0.9623, 0.9370],
        ...,
        [0.8342, 0.9150, 0.9444,  ..., 1.0000, 0.9338, 0.9125],
        [0.8256, 0.9343, 0.9623,  ..., 0.9338, 1.0000, 0.9413],
        [0.8758, 0.9582, 0.9370,  ..., 0.9125, 0.9413, 1.0000]])

In [9]:
# Open a dedicated figure first: with the interactive widget backend, drawing
# without one paints the heatmap into the figure left open by an earlier cell.
plt.figure()
# Heatmap of the pairwise cosine similarities of the CLIP embeddings.
plt.imshow(similarity.numpy())
plt.colorbar()

<matplotlib.colorbar.Colorbar at 0x7f0fe9216100>

# Pose similarity (ground truth)

In [10]:
# Display the first stored pose; the recorded output shows a 4x4 float32 matrix.
scene_data['poses'][0]

array([[-9.9990219e-01,  4.1922452e-03, -1.3345719e-02, -5.3798322e-02],
       [-1.3988681e-02, -2.9965907e-01,  9.5394367e-01,  3.8454704e+00],
       [-4.6566129e-10,  9.5403719e-01,  2.9968831e-01,  1.2080823e+00],
       [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00,  1.0000000e+00]],
      dtype=float32)

In [11]:
# Wrap the stored NumPy pose array as a float tensor for the torch operations in later cells.
poses = torch.from_numpy(scene_data['poses']).float()

In [28]:
# 3-D scatter of the first three entries of each pose's last column.
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
# unbind(dim=1) splits the [N, 3] slice into its x, y and z columns.
ax.scatter(*poses[:, :3, -1].unbind(dim=1))
ax.set_title('Camera origins')
fig.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [13]:
# Camera positions: all but the last entry of column 3 of every pose matrix.
xyz = poses[:, :-1, 3]
# Broadcast against itself to get every pairwise difference vector.
diffs = xyz[None, :, :] - xyz[:, None, :]
# Euclidean length of each difference vector gives the pairwise distance matrix.
pose_distances = diffs.pow(2).sum(dim=-1).sqrt()

In [14]:
# Angle of each camera position in the x-y plane, atan2(y, x); later cells use it to color points.
thetas = torch.atan2(xyz[:, 1], xyz[:, 0])

In [15]:
# Cosine similarity between the camera position vectors in `xyz`.
# NOTE(review): the variable name is misspelled ("similiarty"); later cells reference it as-is.
pose_similiarty = pairwise_cosine_similarity(xyz)

In [38]:
# Heatmap of the pairwise cosine similarity between camera positions.
fig, ax = plt.subplots()
heatmap = ax.imshow(pose_similiarty)
fig.colorbar(heatmap, ax=ax)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

<matplotlib.colorbar.Colorbar at 0x7f0fdc4ede20>

In [16]:
# pose_similarity = pairwise_cosine_similarity(poses[:, :, 3].flatten(1))
# Heatmap of the pairwise Euclidean distances between camera positions.
fig, ax = plt.subplots()
heatmap = ax.imshow(pose_distances)
fig.colorbar(heatmap, ax=ax)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

<matplotlib.colorbar.Colorbar at 0x7f0fe90c8670>

In [17]:
# Color every (i, j) pair by the angle of camera j: tiling `thetas` once per
# row lines up with the row-major flattening of the N x N matrices below.
colors = thetas.repeat(len(thetas))

plt.figure()
plt.scatter(pose_distances.flatten(), similarity.flatten(), c=colors)
# Label spelling fixed ("similiarity" -> "similarity") to match the other figures.
plt.ylabel('Cosine similarity of CLIP ViT embeddings')
plt.xlabel('Euclidean distance between cameras')
plt.colorbar()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

<matplotlib.colorbar.Colorbar at 0x7f0fe911d490>

In [93]:
# Two-panel 2-D histogram of CLIP embedding similarity against camera distance
# (left) and against camera-position cosine similarity (right), saved as a PDF.
cmap = 'Greens'
bins = 30
ylim = [0, 1]

# The y data is shared by both panels, so flatten it once.
clip_sim_flat = similarity.flatten().cpu().numpy()
# (x data, x range, x label) for each panel, left to right.
panels = [
    (pose_distances, [0, 8], 'Euclidean distance\nbetween camera origins'),
    (pose_similiarty, [-1, 1], 'Cosine similarity\nof camera origins'),
]

fig, axes = plt.subplots(1, 2, sharey=True, figsize=(6, 3))
for ax, (pose_metric, xlim, xlabel) in zip(axes, panels):
    ax.hist2d(x=pose_metric.flatten().cpu().numpy(), y=clip_sim_flat,
              cmap=cmap, bins=bins, range=[xlim, ylim])
    ax.set_xlabel(xlabel)
# Only the left panel is given a y label; the panels share their y axis.
axes[0].set_ylabel('Cosine similarity of\nCLIP ViT embeddings')

axes[1].set_ylim(ylim)
axes[1].set_xlim([-1, 1])

plt.tight_layout()
plt.savefig("files/representation_similarity_clip_vit.pdf")

# plt.title('Camera position versus representation similarity')

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [46]:
# 2-D histogram (default binning) of CLIP similarity against camera distance.
fig, ax = plt.subplots()
ax.hist2d(x=pose_distances.flatten().cpu().numpy(),
          y=similarity.flatten().cpu().numpy())
ax.set_ylabel('Cosine similarity of CLIP ViT embeddings')
ax.set_xlabel('Euclidean distance between camera origins')

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Text(0.5, 0, 'Euclidean distance between camera origins')

In [47]:
# `thetas` tiled once per row of the N x N matrices (not used by the histogram
# below; it belonged to a scatter variant of this plot that is disabled).
colors = thetas.repeat(len(thetas))

# 2-D histogram (default binning) of CLIP similarity against the cosine
# similarity of the camera positions.
fig, ax = plt.subplots()
ax.hist2d(x=pose_similiarty.flatten().cpu().numpy(),
          y=similarity.flatten().cpu().numpy())
ax.set_ylabel('Cosine similarity of CLIP ViT embeddings')
ax.set_xlabel('Cosine similarity of camera origins')

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Text(0.5, 0, 'Cosine similarity of camera origins')

In [18]:
# Color every (i, j) pair by the angle of camera j (thetas tiled once per row).
colors = torch.cat([thetas for _ in range(len(thetas))])

plt.figure()
# Clamp before acos: rounding can leave cosines marginally above 1 (typically
# on the diagonal), and acos of such a value is NaN, which drops the point.
angles = torch.acos(pose_similiarty.clamp(-1.0, 1.0))
plt.scatter(angles.flatten(), similarity.flatten(), c=colors)
plt.ylabel('Cosine similarity of CLIP ViT embeddings')
plt.xlabel('Angle between cameras')
plt.colorbar()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

<matplotlib.colorbar.Colorbar at 0x7f0ffe353820>

In [19]:
# Color every (i, j) pair by the angle of camera j (thetas tiled once per row).
colors = torch.cat([thetas for _ in range(len(thetas))])

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

# Plot a random subset of the pairs. A fixed seed makes the figure
# reproducible, and the size is capped so small scenes (< 1000 pairs) do not
# make the without-replacement draw fail.
num_pairs = len(thetas) ** 2
rng = np.random.default_rng(0)
idx = rng.choice(num_pairs, size=min(1000, num_pairs), replace=False)

# Clamp before acos: cosines marginally outside [-1, 1] would become NaN.
angles = torch.acos(pose_similiarty.clamp(-1.0, 1.0)).flatten()
ax.scatter(angles[idx], pose_distances.flatten()[idx], similarity.flatten()[idx], c=colors[idx])
# Label spelling fixed ("similiarity" -> "similarity") to match the other figures.
ax.set_zlabel('Cosine similarity of CLIP ViT embeddings')
ax.set_xlabel('Angle between cameras')
ax.set_ylabel('Euclidean distance between cameras')

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Text(0.5, 0, 'Euclidean distance between cameras')

In [29]:
# 100-bin histogram of all pairwise camera distances.
fig, ax = plt.subplots()
_ = ax.hist(pose_distances.flatten().numpy(), bins=100)
ax.set_title('Distribution of pairwise euclidean distance between camera origins')

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Text(0.5, 1.0, 'Distribution of pairwise euclidean distance between camera origins')

In [33]:
# 100-bin histogram of all pairwise embedding cosine similarities.
fig, ax = plt.subplots()
_ = ax.hist(similarity.numpy().flatten(), bins=100)
ax.set_title('Distribution of pairwise cosine similarity\nbetween training image ViT embeddings')

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Text(0.5, 1.0, 'Distribution of pairwise cosine similarity\nbetween training image ViT embeddings')

In [22]:
# Cumulative, density-normalized histogram of the negated pairwise similarities.
fig, ax = plt.subplots()
_ = ax.hist(-similarity.numpy().flatten(), bins=100, cumulative=True, density=True)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

# Dimensionality reduction

In [23]:
def dim_reduction(X, color='blue', n_neighbors=10, n_components=2, mds_max_iter=100):
    """Embed ``X`` with eight manifold-learning methods and plot the results.

    Draws one figure: a 3-D scatter of the first three columns of ``X``
    followed by a 2-D scatter of the first two output dimensions for each of
    LLE (standard, LTSA, Hessian, modified), Isomap, MDS, spectral embedding
    and t-SNE. The fit time of every method is printed and shown in its
    panel title.

    Args:
        X: Array-like with one sample per row and at least three columns
            (the first three are used for the 3-D panel).
        color: Value passed as ``c`` to every scatter call, e.g. one scalar
            per sample.
        n_neighbors: Neighborhood size for the LLE variants, Isomap and
            spectral embedding.
        n_components: Output dimensionality for every method; only the first
            two dimensions are plotted.
        mds_max_iter: Iteration limit for MDS.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    from collections import OrderedDict
    from functools import partial
    from time import time

    import matplotlib.pyplot as plt
    from mpl_toolkits.mplot3d import Axes3D
    from matplotlib.ticker import NullFormatter

    from sklearn import manifold

    # Next line to silence pyflakes. This import is needed.
    Axes3D

    n_points = len(X)

    # Create figure; the title reports the real sample count rather than a
    # hard-coded 1000.
    fig = plt.figure(figsize=(15, 8))
    fig.suptitle("Manifold Learning with %i points, %i neighbors"
                 % (n_points, n_neighbors), fontsize=14)

    # Add 3d scatter plot of the first three input columns
    ax = fig.add_subplot(251, projection='3d')
    ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=color, cmap=plt.cm.Spectral)
    ax.view_init(4, -72)

    # Set-up manifold methods. Arguments are passed by keyword because recent
    # scikit-learn releases reject positional estimator arguments.
    LLE = partial(manifold.LocallyLinearEmbedding,
                  n_neighbors=n_neighbors, n_components=n_components,
                  eigen_solver='auto')

    methods = OrderedDict()
    methods['LLE'] = LLE(method='standard')
    methods['LTSA'] = LLE(method='ltsa')
    methods['Hessian LLE'] = LLE(method='hessian')
    methods['Modified LLE'] = LLE(method='modified')
    methods['Isomap'] = manifold.Isomap(n_neighbors=n_neighbors,
                                        n_components=n_components)
    methods['MDS'] = manifold.MDS(n_components=n_components,
                                  max_iter=mds_max_iter, n_init=1)
    methods['SE'] = manifold.SpectralEmbedding(n_components=n_components,
                                               n_neighbors=n_neighbors)
    methods['t-SNE'] = manifold.TSNE(n_components=n_components, init='pca',
                                     random_state=0)

    # Plot results
    for i, (label, method) in enumerate(methods.items()):
        t0 = time()
        Y = method.fit_transform(X)
        t1 = time()
        print("%s: %.2g sec" % (label, t1 - t0))
        # 2x5 grid: slot 1 holds the 3-D panel and slot 6 is skipped, so the
        # second row of methods starts at slot 7.
        ax = fig.add_subplot(2, 5, 2 + i + (i > 3))
        ax.scatter(Y[:, 0], Y[:, 1], c=color, cmap=plt.cm.Spectral)
        ax.set_title("%s (%.2g sec)" % (label, t1 - t0))
        ax.xaxis.set_major_formatter(NullFormatter())
        ax.yaxis.set_major_formatter(NullFormatter())
        ax.axis('tight')

    plt.show()

In [24]:
# Dimensionality reduction of the camera origins (`xyz`), colored by theta = atan2(y, x).
# NOTE(review): the original comment calls theta a rotation around the vertical axis,
# which presumes z is up — confirm.
dim_reduction(xyz, thetas, n_neighbors=10, n_components=2)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …



LLE: 0.15 sec
LTSA: 0.056 sec
Hessian LLE: 0.064 sec
Modified LLE: 0.057 sec
Isomap: 0.038 sec
MDS: 0.028 sec
SE: 0.01 sec
t-SNE: 0.59 sec


In [25]:
# Dimensionality reduction of the c2w matrices, each flattened to one row per camera.
# Colored by theta (rotation around vertical axis)
dim_reduction(poses.flatten(1), thetas, n_neighbors=10, n_components=2)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

LLE: 0.045 sec
LTSA: 0.06 sec




Hessian LLE: 0.079 sec
Modified LLE: 0.065 sec
Isomap: 0.053 sec
MDS: 0.024 sec
SE: 0.012 sec
t-SNE: 0.51 sec


In [26]:
# Dimensionality reduction of CLIP ViT embeddings. Colored by theta (rotation around vertical axis)
# MDS gets a higher iteration limit here (1000 instead of the default 100).
dim_reduction(embedding, thetas, n_neighbors=10, n_components=2, mds_max_iter=1000)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

LLE: 0.043 sec
LTSA: 0.067 sec




Hessian LLE: 0.078 sec
Modified LLE: 0.068 sec
Isomap: 0.032 sec
MDS: 0.19 sec
SE: 0.011 sec
t-SNE: 0.77 sec


In [27]:
# Sanity check of the embedding matrix size; the recorded output is [138, 512].
embedding.shape

torch.Size([138, 512])