In [None]:
import copy 
from typing import Optional

import torch
from torch import nn
from torch import Tensor, device

import numpy as np
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm

# Seed PyTorch's global RNG so the random embeddings generated below are reproducible.
torch.manual_seed(0)

<torch._C.Generator at 0x7fca3e066f90>

## Faster and batched cosine similarity

In [None]:
# Synthetic test data: n_emb random embeddings of width emb_size, drawn uniformly from [0, 1).
n_emb = 10000 # larger values cause RAM and GPU memory issues
emb_size = 768
a = torch.rand(n_emb, emb_size)
# Show dtype and shape as a quick sanity check.
a.dtype, a.shape

(torch.float32, torch.Size([10000, 768]))

In [None]:
def original_cos_sim(a: Tensor, b: Tensor):
    """
    Reference (unbatched) pairwise cosine similarity.

    Non-tensor inputs are converted with ``torch.tensor``, 1-D inputs are
    promoted to a single-row matrix, every row is L2-normalised and a single
    matrix product yields all pairwise scores.

    :param a: Tensor (or array-like) of shape (n, dim) or (dim,)
    :param b: Tensor (or array-like) of shape (m, dim) or (dim,)
    :return: Matrix with res[i][j]  = cos_sim(a[i], b[j])
    """
    def _as_rows(x):
        # Coerce to a tensor and make sure there is a leading "row" axis.
        if not isinstance(x, torch.Tensor):
            x = torch.tensor(x)
        return x.unsqueeze(0) if len(x.shape) == 1 else x

    a = _as_rows(a)
    b = _as_rows(b)

    unit_a = torch.nn.functional.normalize(a, p=2, dim=1)
    unit_b = torch.nn.functional.normalize(b, p=2, dim=1)

    # Dot products of unit-length rows are exactly the cosine similarities.
    return unit_a.mm(unit_b.transpose(0, 1))

In [None]:
from pynvml import *

def get_size_of_tensor(a):
    """Return the size of tensor ``a``'s elements in bytes (element count x bytes per element)."""
    n_elements = a.nelement()
    bytes_per_element = a.element_size()
    return bytes_per_element * n_elements

def get_free_gpu_memory():
    """
    Return the amount of free memory (in bytes, as reported by NVML) on GPU index 0.

    NVML is initialised for the duration of the query and shut down again
    afterwards, so repeated calls do not accumulate NVML references.

    :return: ``free`` field of ``nvmlDeviceGetMemoryInfo`` for device 0
    :raises NVMLError: if NVML cannot be initialised or device 0 is unavailable
    """
    nvmlInit()
    try:
        handle = nvmlDeviceGetHandleByIndex(0)
        return nvmlDeviceGetMemoryInfo(handle).free
    finally:
        # Balance the nvmlInit() above even if the query raised.
        nvmlShutdown()

# Free memory on GPU 0, scaled from bytes to MB for readability.
get_free_gpu_memory()/1e6

14715.45344

In [None]:
def suggest_batch_size(a, b, fp16):
    """
    Suggest how many rows of ``a`` to multiply against ``b`` per batch so that
    the intermediate similarity block fits in the currently free GPU memory.

    The estimate assumes ``a`` is the batched operand:

        (free_mem - size_a - size_b) * batches = len(a) * len(b) * size_of_float
        (free_mem - size_a - size_b) / size_of_float / len(b) = len(a) / batches = batch_size

    Embeddings are budgeted at 4 bytes per value (2 bytes when ``fp16`` is
    set) and each similarity score at 4 bytes. The raw estimate is capped at
    ``len(a)``, rounded down to a multiple of 8 and divided by a safety factor
    of 2.

    :param a: embeddings that will be processed in batches, shape (n, dim)
    :param b: embeddings every batch is compared against, shape (m, dim)
    :param fp16: whether the embeddings will be stored in half precision
    :return: batch size as an ``int``, always at least 1 (also when fewer than
        8 rows are available, when memory is insufficient, or when ``b`` is empty)
    """
    n_a, n_b = len(a), len(b)
    dim = a.shape[-1]

    # Bytes taken by one embedding row on the device.
    bytes_per_value = 2 if fp16 else 4
    size_of_emb = dim * bytes_per_value

    size_of_float = 4
    safety_factor = 2

    if n_b == 0:
        # Nothing to compare against: the result has no columns, so memory is not a constraint.
        rows_that_fit = n_a
    else:
        free_gpu_mem = get_free_gpu_memory()
        rows_that_fit = (free_gpu_mem - n_a * size_of_emb - n_b * size_of_emb) / size_of_float / n_b

    # Round down to a multiple of 8, then keep headroom via the safety factor.
    aligned = int(min(n_a, rows_that_fit) / 8) * 8

    # Never return 0 or a negative value: callers divide by / split on the result.
    return max(1, aligned // safety_factor)

# Suggested batch size for comparing `a` against itself with float32 embeddings.
suggest_batch_size(a, a, fp16=False)

10000.0

In [None]:
def get_tensor_for_cos_sim(a: Tensor, compute_device: Optional[str]=None, fp16:bool=False, normalize:bool=True):
    """
    Prepare an embedding matrix for the batched cosine-similarity product.

    :param a: tensor or array-like; a 1-D input becomes a single-row matrix
    :param compute_device: device to move the data to; falsy leaves it in place
    :param fp16: cast to ``torch.float16`` when True
    :param normalize: L2-normalise each row (dim=1) when True
    :return: the prepared 2-D tensor
    """
    emb = a if isinstance(a, torch.Tensor) else torch.tensor(a)

    if emb.dim() == 1:
        emb = emb.unsqueeze(0)

    if compute_device:
        emb = emb.to(compute_device)

    if fp16:
        emb = emb.to(torch.float16)

    if not normalize:
        return emb
    return torch.nn.functional.normalize(emb, p=2, dim=1)


def cos_sim(a: Tensor, b: Optional[Tensor]=None, compute_device: Optional[str]=None, return_device: str="cpu", fp16:bool=False, batch_size="auto", normalize:bool=True):
    """
    Computes the cosine similarity cos_sim(a[i], b[j]) for all i and j.

    ``a`` is processed in row batches; each partial result is moved to the CPU
    as soon as it is computed so that only one block of scores lives on the
    compute device at a time.

    :param a: embeddings of shape (n, dim) (tensor or array-like; 1-D is treated as one row)
    :param b: embeddings of shape (m, dim); when None, ``a`` is compared with itself
    :param compute_device: device on which the matrix products run; falsy keeps the inputs where they are
    :param return_device: device of the returned matrix; falsy leaves it on the CPU where batches are gathered
    :param fp16: run the product in float16; the result is converted back to float32
    :param batch_size: rows of ``a`` per batch, or "auto". With "auto" the size
        comes from ``suggest_batch_size`` when ``a`` is on a CUDA device and is
        ``len(a)`` (a single batch) otherwise, since free GPU memory is
        irrelevant for a CPU product. Values below 1 are clamped to 1.
    :param normalize: L2-normalise the rows of both ``a`` and ``b``; pass False
        if the inputs are already normalised (the result is then the plain
        dot-product matrix)
    :return: Matrix with res[i][j]  = cos_sim(a[i], b[j])
    """
    a = get_tensor_for_cos_sim(a, compute_device, fp16, normalize)

    if b is None:
        b = a
    else:
        # Forward `normalize` so both operands are treated the same way.
        b = get_tensor_for_cos_sim(b, compute_device, fp16, normalize)

    if batch_size == "auto":
        # Only consult GPU free memory when the product actually runs on a GPU.
        batch_size = suggest_batch_size(a, b, fp16=fp16) if a.is_cuda else len(a)

    # Accept float batch sizes and guard against zero/negative values.
    batch_size = max(1, int(batch_size))

    # The transpose is a view, so hoisting it out of the loop costs nothing.
    b_t = b.transpose(0, 1)

    # torch.split yields batches of at most `batch_size` rows (the last may be
    # smaller) and still yields one (empty) batch for an empty `a`.
    parts = [torch.mm(rows, b_t).to("cpu") for rows in torch.split(a, batch_size)]
    sim = torch.cat(parts)

    # Drop references to the prepared operands before any further copies are made.
    del a, b, b_t, parts

    if fp16:
        sim = sim.type(torch.float32)

    if return_device:
        sim = sim.to(return_device)

    return sim

In [None]:
%%timeit
# Baseline: the unbatched reference implementation, run on whatever device `a` lives on.
original_cos_sim(a, a)

1 loop, best of 5: 1.8 s per loop


In [None]:
# Batched version with b omitted (a is compared with itself), no device move, float32.
%timeit x = cos_sim(a, compute_device=None, fp16=False)

1 loop, best of 5: 1.89 s per loop


In [None]:
# Same as above but with b passed explicitly, so b is prepared separately from a.
%timeit x = cos_sim(a, a, compute_device=None, fp16=False)

1 loop, best of 5: 1.92 s per loop


In [None]:
# Matrix products on the GPU in float32; the timing includes moving inputs to CUDA and results back.
%timeit y = cos_sim(a, a, compute_device="cuda", fp16=False)

1 loop, best of 5: 274 ms per loop


In [None]:
# GPU float32 with b omitted, so only one copy of the embeddings is prepared.
%timeit y = cos_sim(a, compute_device="cuda", fp16=False)

1 loop, best of 5: 261 ms per loop


In [None]:
# GPU with half-precision embeddings.
%timeit z = cos_sim(a, a, compute_device="cuda", fp16=True)

1 loop, best of 5: 209 ms per loop


In [None]:
# Compute the same similarity matrix three ways so the results can be compared below:
# x: no device move, float32; y: CUDA, float32; z: CUDA, float16.
x = cos_sim(a, a, compute_device=None, fp16=False)
y = cos_sim(a, a, compute_device="cuda", fp16=False)
z = cos_sim(a, a, compute_device="cuda", fp16=True)

In [None]:
# Do the two float32 results (x vs y) agree within isclose's default tolerances?
torch.all(torch.isclose(x,y))

tensor(True)

In [None]:
# float32 vs float16 result with a tight absolute tolerance.
torch.all(torch.isclose(x,z, atol=1e-06)) # cosine sim up to 6 decimal places

tensor(False)

In [None]:
# float32 vs float16 result with a looser absolute tolerance.
torch.all(torch.isclose(x,z, atol=1e-03)) # cosine sim up to 3 decimal places is the same, which is good enough

tensor(True)

In [None]:
# Release cached, unused GPU memory held by PyTorch's allocator so the
# nvidia-smi readings that follow are not inflated by earlier runs.
torch.cuda.empty_cache()

In [None]:
# GPU memory usage before running cos_sim on CUDA.
!nvidia-smi

Sun Jun 13 19:19:53 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   63C    P0    29W /  70W |   1054MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Run the batched similarity on the GPU with default settings (float32, automatic batch size, result on CPU).
cos_sim(a, a, compute_device="cuda")

tensor([[1.0000, 0.7659, 0.7367,  ..., 0.7177, 0.7564, 0.7208],
        [0.7659, 1.0000, 0.7562,  ..., 0.7554, 0.7495, 0.7510],
        [0.7367, 0.7562, 1.0000,  ..., 0.7476, 0.7662, 0.7431],
        ...,
        [0.7177, 0.7554, 0.7476,  ..., 1.0000, 0.7458, 0.7512],
        [0.7564, 0.7495, 0.7662,  ..., 0.7458, 1.0000, 0.7495],
        [0.7208, 0.7510, 0.7431,  ..., 0.7512, 0.7495, 1.0000]])

In [None]:
# GPU memory usage after the CUDA cos_sim call, for comparison with the earlier reading.
!nvidia-smi

Sun Jun 13 19:19:53 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   63C    P0    29W /  70W |   1528MiB / 15109MiB |     13%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Sanity check: the input embeddings are unchanged, (n_emb, emb_size).
a.shape

torch.Size([10000, 768])

In [None]:
# CPU, float32, automatic batch size; return_device=None skips the final device move.
z = cos_sim(a, a, compute_device="cpu", fp16=False, return_device=None)
z.shape

torch.Size([10000, 10000])

In [None]:
# Peek at the top-left corner (first 2 rows, first 10 columns) for a visual comparison across runs.
z[:2, :10]

tensor([[1.0000, 0.7659, 0.7367, 0.7538, 0.7710, 0.7362, 0.7396, 0.7318, 0.7641,
         0.7414],
        [0.7659, 1.0000, 0.7562, 0.7546, 0.7559, 0.7533, 0.7500, 0.7396, 0.7458,
         0.7540]])

In [None]:
# Explicit batch size: one fifth of the rows per batch (note that len(a)/5 is a float).
batch_size = len(a)/5
z = cos_sim(a, a, compute_device="cpu", fp16=False, return_device=None, batch_size=batch_size)
z.shape

torch.Size([10000, 10000])

In [None]:
# Same corner as before; batching should not change the values.
z[:2, :10]

tensor([[1.0000, 0.7659, 0.7367, 0.7538, 0.7710, 0.7362, 0.7396, 0.7318, 0.7641,
         0.7414],
        [0.7659, 1.0000, 0.7562, 0.7546, 0.7559, 0.7533, 0.7500, 0.7396, 0.7458,
         0.7540]])

In [None]:
# GPU, float16, same explicit batch size; the result is returned on the CPU.
z = cos_sim(a, a, compute_device="cuda", fp16=True, return_device='cpu', batch_size=batch_size)
z.shape

torch.Size([10000, 10000])

In [None]:
# Same corner in half precision; compare with the float32 values above.
z[:2, :10]

tensor([[1.0000, 0.7661, 0.7368, 0.7539, 0.7710, 0.7363, 0.7397, 0.7319, 0.7642,
         0.7412],
        [0.7661, 1.0000, 0.7563, 0.7549, 0.7559, 0.7534, 0.7500, 0.7397, 0.7461,
         0.7539]])

In [None]:
# Reference result from the unbatched implementation.
z = original_cos_sim(a, a)
z.shape

torch.Size([10000, 10000])

In [None]:
# Same corner from the reference implementation, for comparison with the batched runs.
z[:2, :10]

tensor([[1.0000, 0.7659, 0.7367, 0.7538, 0.7710, 0.7362, 0.7396, 0.7318, 0.7641,
         0.7414],
        [0.7659, 1.0000, 0.7562, 0.7546, 0.7559, 0.7533, 0.7500, 0.7396, 0.7458,
         0.7540]])