In [20]:
import torch
import pandas
import matplotlib.pyplot as plt
import os

In [8]:
# Root directory of the stored embeddings; files are looked up under
# EMBEDDING_PATH/MODEL/<language>/.
EMBEDDING_PATH = "embeddings"

# Languages whose mean embedding is computed and compared.
LANGUAGES = [
    "java",
    "javascript",
    "php",
    "python",
]

# Sub-directory of EMBEDDING_PATH holding the embeddings to read.
MODEL = "codet5"

# Length of the accumulator used when averaging embeddings.
EMBEDDING_DIM = 256

In [15]:
def stream_embeddings(lang: str):
    """Lazily yield every stored embedding for one language.

    Files are read from ``EMBEDDING_PATH/MODEL/<lang>`` and loaded one at a
    time, mapped onto the CPU.

    Args:
        lang: Name of the language sub-directory to read.

    Yields:
        The object stored in each file, as returned by ``torch.load``
        (presumably one embedding tensor per file -- TODO confirm against the
        code that wrote these files).

    Raises:
        FileNotFoundError: On first iteration, if the language directory does
            not exist.
    """
    lang_embedding_path = os.path.join(EMBEDDING_PATH, MODEL, lang)

    # os.listdir returns entries in arbitrary order; sorting makes the stream
    # (and any order-sensitive float accumulation built on it) reproducible.
    filenames = sorted(os.listdir(lang_embedding_path))

    cpu = torch.device('cpu')
    for filename in filenames:
        # NOTE: torch.load unpickles the file, so only load embeddings that
        # come from a trusted source.
        yield torch.load(os.path.join(lang_embedding_path, filename), map_location=cpu)

In [17]:
def calculate_mean_embedding(lang: str):
    """Return the element-wise mean of all embeddings stored for ``lang``.

    Embeddings are streamed one by one from ``stream_embeddings`` and summed
    into a single accumulator of length ``EMBEDDING_DIM``, so the full set is
    never collected into a list here.

    Args:
        lang: Language whose embeddings should be averaged.

    Returns:
        A tensor of length ``EMBEDDING_DIM`` holding the mean embedding.

    Raises:
        ValueError: If no embeddings were found for ``lang``. Without this
            check the division below would silently produce an all-NaN
            tensor.
    """
    count = 0
    total = torch.zeros(EMBEDDING_DIM)

    for embedding in stream_embeddings(lang):
        # In-place add keeps the accumulator's shape; assumes each embedding
        # broadcasts to (EMBEDDING_DIM,) -- TODO confirm stored shape.
        total += embedding
        count += 1

    if count == 0:
        raise ValueError(f"no embeddings found for language {lang!r}")

    return total / count

In [18]:
# Mean embedding per language, keyed by language name and filled in the
# order given by LANGUAGES.
mean_embeddings: dict[str, torch.Tensor] = {}
for lang in LANGUAGES:
    mean_embeddings.update({lang: calculate_mean_embedding(lang)})

In [27]:
def calculate_similarity_matrix(lang2embedding: dict[str, torch.Tensor]):
    """Build the symmetric matrix of pairwise cosine similarities.

    Row/column ``k`` corresponds to the ``k``-th key of ``lang2embedding`` in
    its iteration order.

    Args:
        lang2embedding: Mapping from language name to its embedding. The
            similarity is taken along ``dim=0``, so each embedding is
            presumably a 1-D tensor and all share the same length -- TODO
            confirm with the caller.

    Returns:
        A ``(num_langs, num_langs)`` tensor. Entry ``[i][j]`` is the cosine
        similarity between the embeddings of languages ``i`` and ``j``; the
        diagonal is exactly 1.
    """
    langs = list(lang2embedding.keys())
    num_langs = len(langs)
    sim_matrix = torch.empty((num_langs, num_langs))

    for i in range(num_langs):
        # Self-similarity is 1 by definition; set it directly instead of
        # computing it, which avoids float round-off on the diagonal.
        sim_matrix[i][i] = 1.0

        # Only the upper triangle is computed; the result is mirrored.
        for j in range(i + 1, num_langs):
            similarity = torch.nn.functional.cosine_similarity(
                lang2embedding[langs[i]], lang2embedding[langs[j]], dim=0
            )
            sim_matrix[i][j] = similarity
            sim_matrix[j][i] = similarity

    return sim_matrix

In [None]:
# Pairwise cosine similarity between the per-language mean embeddings.
sim_matrix = calculate_similarity_matrix(mean_embeddings)

# Print the language order first: the matrix rows/columns follow the key
# order of `mean_embeddings`.
print(list(mean_embeddings))
print(sim_matrix)

In [35]:
# Persist the per-language mean embeddings. The file name is derived from
# MODEL so it always matches the model whose embeddings were averaged.
torch.save(mean_embeddings, f"{MODEL}_mean_embeddings.dict")