In [1]:
import torch

from semantic_memory import vsm
from tqdm import trange

In [2]:
embeddings = []
with open("spose_embedding_66d_sorted.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        vector = [float(x) for x in line.split()]
        embeddings.append(vector)

embeddings = torch.tensor(embeddings)

In [3]:
vocab = open("unique_id.txt", "r").readlines()
vocab = [x.strip() for x in vocab]

In [4]:
things = vsm.VectorSpaceModel("THINGS 66d")
things.load_vectors_from_tensor(embeddings, vocab)

In [5]:
things.neighbor("mouse1",k=10)

[('rat', 0.98600834608078),
 ('warthog', 0.9813143014907837),
 ('chipmunk', 0.980313241481781),
 ('rhinoceros', 0.9765945076942444),
 ('bear', 0.9756324291229248),
 ('fox', 0.9725315570831299),
 ('chinchilla', 0.9714034795761108),
 ('cougar', 0.9698286652565002),
 ('coyote', 0.9682605266571045),
 ('mongoose', 0.9668706655502319)]

In [6]:
def dot_product_similarity(v1, v2):
    return torch.dot(v1, v2)

In [7]:
dot_product_similarity(things("mouse1").squeeze(0), things("rat").squeeze(0))

tensor(7.5295)

In [8]:
def embedding2sim(embedding):
    # Convert embedding to similarity matrix
    n_objects = embedding.size(0)

    # Compute the similarity matrix
    sim = torch.mm(embedding, embedding.T)
    esim = torch.exp(sim)

    print("Initialized similarities")

    # Initialize the cp matrix
    cp = torch.zeros(n_objects, n_objects)

    print("Initialized matrix")

    # Compute the similarity matrix
    for i in range(n_objects):
        for j in trange(i + 1, n_objects):
            ctmp = torch.zeros(n_objects)
            for k in range(n_objects):
                if k == i or k == j:
                    continue
                ctmp[k] = esim[i, j] / (esim[i, j] + esim[i, k] + esim[j, k])
            cp[i, j] = torch.sum(ctmp)

    # Normalize the cp matrix
    cp = cp / (n_objects - 2)
    cp = cp + cp.T  # Make the matrix symmetric
    cp[torch.eye(n_objects).bool()] = 1  # Set the diagonal to 1

    return cp

In [9]:
1854 * 1853

3435462

In [11]:
torch.mm(things.embeddings, things.embeddings.T).shape

torch.Size([1854, 1854])

In [9]:
spose_sim = embedding2sim(embeddings)

Initialized similarities
Initialized matrix


 40%|████      | 746/1853 [00:12<00:18, 59.53it/s]


KeyboardInterrupt: 

In [13]:
n_objects = embeddings.size(0)
c=0
for i in range(n_objects):
    for j in range(i + 1, n_objects):
        # c+=1
        # ctmp = torch.zeros(n_objects)
        for k in range(n_objects):
            if k == i or k == j:
                continue
            c+=1

KeyboardInterrupt: 

In [15]:
1854 * 1854 * 1854

6372783864

In [10]:
def embedding2sim_optimized(embedding):
    # Number of objects
    n_objects = embedding.size(0)

    # Compute the similarity matrix
    sim = torch.mm(embedding, embedding.T)
    esim = torch.exp(sim)

    # Initialize the cp matrix
    cp = torch.zeros(n_objects, n_objects)

    # Use broadcasting to create a mask that avoids diagonal elements
    mask = ~torch.eye(n_objects, dtype=bool, device=embedding.device)

    # Calculate for all pairs (i, j)
    for i in trange(n_objects):
        # For each i, calculate the contribution to all j simultaneously
        esim_i = esim[i, :].unsqueeze(1)
        denom = esim_i + esim + esim[i, i].unsqueeze(0).unsqueeze(1)

        # Avoid self-interactions by masking
        ctmp = esim[i, :] / denom
        ctmp[~mask] = 0

        # Sum across all k for each (i, j)
        cp[i, :] = ctmp.sum(dim=0)

    # Normalize the cp matrix
    cp = cp / (n_objects - 2)
    cp = cp + cp.T  # Make the matrix symmetric
    cp[torch.eye(n_objects).bool()] = 1  # Set the diagonal to 1

    return cp

In [11]:
spose_sim_optimized = embedding2sim_optimized(embeddings)

100%|██████████| 1854/1854 [00:06<00:00, 305.25it/s]


In [13]:
spose_sim_optimized

tensor([[1.0000, 0.0065, 0.0042,  ..., 0.9605, 0.0067, 0.0186],
        [0.0065, 1.0000, 0.1884,  ..., 0.0114, 0.1025, 0.0135],
        [0.0042, 0.1884, 1.0000,  ..., 0.0089, 0.0688, 0.0057],
        ...,
        [0.9605, 0.0114, 0.0089,  ..., 1.0000, 0.0072, 0.0119],
        [0.0067, 0.1025, 0.0688,  ..., 0.0072, 1.0000, 0.0071],
        [0.0186, 0.0135, 0.0057,  ..., 0.0119, 0.0071, 1.0000]])

In [43]:
things.vocab2idx['mouse1']

1026

In [14]:
spose_sim_optimized[1026, 1285]

tensor(1.9245)

In [18]:
(spose_sim_optimized > 1.0)

tensor([70,  0,  2,  ..., 20,  1, 23])

In [26]:
def embedding2sim_optimized_corrected(embedding):
    # Number of objects
    n_objects = embedding.size(0)

    # Compute the similarity matrix
    sim = torch.mm(embedding, embedding.T)
    esim = torch.exp(sim)

    # Initialize the cp matrix
    cp = torch.zeros(n_objects, n_objects)  

    for i in range(n_objects):
        # Compute similarity scores for all pairs (i, j) where j > i
        esim_ij = esim[i, i+1:]  # esim[i, j] for all j > i
        esim_i = esim[i, :]      # esim[i, k] for all k
        esim_j = esim[i+1:, :]   # esim[j, k] for all k where j > i
        
        if esim_ij.numel() == 0:
            continue  # Skip if there are no valid j > i
        
        # Calculate ctmp for all j > i at once
        denom = esim_ij.unsqueeze(1) + esim_i[i+1:].unsqueeze(0) + esim_j[:, i+1:]
        ctmp = esim_ij.unsqueeze(1) / denom
        
        # Exclude contributions where k == i or k == j
        ctmp[:, 0] = 0  # Exclude the first element corresponding to the current i
        for j_idx in range(ctmp.size(0)):
            ctmp[j_idx, j_idx] = 0  # Exclude j == k (diagonal of ctmp)

        # Sum ctmp and store in cp
        cp[i, i+1:] = ctmp.sum(dim=1)

    # Normalize the cp matrix
    cp = cp / (n_objects - 2)
    cp = cp + cp.T  # Make the matrix symmetric
    cp[torch.eye(n_objects).bool()] = 1  # Set the diagonal to 1

    return cp

In [27]:
spose_sim_optimized_corrected = embedding2sim_optimized_corrected(embeddings)

In [51]:
spose_sim_optimized_corrected[0][:10]

tensor([1.0000, 0.0873, 0.1030, 0.4509, 0.0986, 0.1185, 0.1033, 0.1188, 0.2154,
        0.1588])

In [52]:
def embedding2sim(embedding):
    # This function converts an embedding to a similarity matrix
    n_objects = embedding.shape[0]
    
    # Compute similarity matrix
    sim = torch.matmul(embedding, embedding.t())
    esim = torch.exp(sim)
    
    # Create mask to exclude i==k and j==k cases
    mask = torch.ones(n_objects, n_objects, n_objects, dtype=torch.bool)
    mask.diagonal(dim1=0, dim2=1).fill_(False)
    mask.diagonal(dim1=0, dim2=2).fill_(False)
    
    # Compute cp matrix
    i, j = torch.triu_indices(n_objects, n_objects, offset=1)
    esim_ij = esim[i, j].unsqueeze(-1)
    esim_ik = esim[i].unsqueeze(1)
    esim_jk = esim[j].unsqueeze(1)
    
    ctmp = esim_ij / (esim_ij + esim_ik + esim_jk)
    ctmp = ctmp.masked_fill(~mask[i, j], 0)
    
    cp = torch.zeros(n_objects, n_objects)
    cp[i, j] = ctmp.sum(dim=-1) / (n_objects - 2)
    
    # Make cp symmetric
    cp = cp + cp.t()
    cp.diagonal().fill_(1)
    
    return cp

In [53]:
spose_sim = embedding2sim(embeddings)

RuntimeError: [enforce fail at alloc_cpu.cpp:83] err == 0. DefaultCPUAllocator: can't allocate memory: you tried to allocate 21881648030485176 bytes. Error code 12 (Cannot allocate memory)