In [None]:
import os

import pandas as pd
import torch
from transformers import AutoTokenizer

from load import load_test_dataset
from metrics import Metrics
from models.diffpool import DiffPoolModel
from utils import get_embeddings

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Ensemble configuration.
# `models`, `tokenizers`, `saved_paths` and `metrics` are parallel lists:
# entry i of each list belongs to the same ensemble member, so their order
# must be kept in sync.

MINILM_NAME = "sentence-transformers/all-MiniLM-L6-v2"
MPNET_NAME = "sentence-transformers/all-mpnet-base-v2"

# Extra constructor arguments used only by the third ensemble member.
custom_diffpool_kwargs = dict(
    d_pooling_layers=[30, 10, 3, 1],
    d_encoder_hidden_dims=[300, 600, 1200, 1200],
    d_encoder_linear_layers=[[300], [600], [1200, 600], [1200, 600]],
    d_encoder_num_heads=[3, 6, 12, 12],
    d_encoder_num_layers=[10, 5, 3, 1],
    d_linear=1200,
    dropout=0,
)

models = [
    DiffPoolModel(model_name=MINILM_NAME, num_node_features=300, nout=384),
    DiffPoolModel(model_name=MPNET_NAME, num_node_features=300, nout=768),
    DiffPoolModel(
        model_name=MINILM_NAME,
        num_node_features=300,
        nout=384,
        **custom_diffpool_kwargs,
    ),
]

# One tokenizer per model, loaded from the same checkpoint name as the model.
tokenizers = [
    AutoTokenizer.from_pretrained(checkpoint_name)
    for checkpoint_name in (MINILM_NAME, MPNET_NAME, MINILM_NAME)
]

# Trained weights for each ensemble member, in the same order as `models`.
saved_paths = [
    "./outputs/saved/circle_loss/circle70.pt",
    "./outputs/saved/mpnet/model60.pt",
    "./outputs/saved/diffpool30M/model63.pt",
]

# A separate Metrics instance for every ensemble member.
metrics = [Metrics() for _ in models]

In [None]:
# For every ensemble member: load its trained weights, embed the test set,
# compute the text/graph similarity matrix, min-max normalize it row by row and
# store it on disk so only one matrix has to be held in memory at a time.

# Make sure the folder receiving the per-model similarity matrices exists,
# otherwise torch.save below fails on a fresh checkout.
os.makedirs("./outputs/similarities", exist_ok=True)

for k, (model, tokenizer, saved_path, metric) in enumerate(
    zip(models, tokenizers, saved_paths, metrics)
):
    print(f"Processing model {k+1}")
    test_loader, test_text_loader = load_test_dataset(tokenizer, batch_size=8)

    # Deserialize the checkpoint on the CPU: this works on machines without
    # CUDA (matching the CPU fallback used for `device`) and avoids holding a
    # second copy of the weights on the GPU. load_state_dict copies the values
    # into the parameters that already live on `device`.
    checkpoint = torch.load(saved_path, map_location="cpu")
    model.to(device)
    model.load_state_dict(checkpoint["model_state_dict"])
    model.eval()

    graph_embeddings, text_embeddings = get_embeddings(
        model.get_graph_encoder(),
        model.get_text_encoder(),
        test_loader,
        test_text_loader,
        device,
        low_memory=True,
    )

    similarity = metric.similarity(text_embeddings, graph_embeddings)

    # Normalize this matrix of shape (n_samples, n_labels) to [0, 1] for each row.
    # keepdims=True keeps the per-row min/max as column vectors of shape
    # (n_samples, 1) so they broadcast across each row. Without it the vector
    # of shape (n_samples,) is aligned with the last axis, which scales
    # column j by row j's statistics on a square matrix and fails otherwise.
    # NOTE(review): assumes `similarity` follows NumPy reduction semantics
    # (`.min(axis=...)` returns an array) -- confirm against Metrics.similarity.
    # A row whose values are all equal has a zero range and yields NaN/inf.
    row_min = similarity.min(axis=1, keepdims=True)
    row_max = similarity.max(axis=1, keepdims=True)
    similarity = (similarity - row_min) / (row_max - row_min)

    # save to file (because it is huge)
    torch.save(similarity, f"./outputs/similarities/similarity{k}.pt")

    # free memory
    del similarity
    # Move the weights off the GPU; the model object itself stays referenced by
    # the `models` list, so `del model` only drops the loop variable.
    model.to("cpu")
    del model
    torch.cuda.empty_cache()

In [None]:
# Average the normalized similarity matrices of all ensemble members.
# The matrices are read back from disk one at a time and summed in place into
# the first one, so at most two of them are in memory simultaneously.
n_models = len(models)

for k in range(n_models):
    similarity = torch.load(f"./outputs/similarities/similarity{k}.pt")
    if k > 0:
        # Accumulate into the running sum without allocating a new matrix.
        avg_similarity += similarity
    else:
        # The first matrix becomes the accumulator.
        avg_similarity = similarity
    del similarity

# Turn the running sum into the mean.
avg_similarity /= n_models

In [None]:
# Build the submission table: one row per entry of `avg_similarity`, with the
# row number exposed as a leading "ID" column followed by the score columns.
solution = pd.DataFrame(avg_similarity)
solution.insert(0, "ID", solution.index)

# Write without the DataFrame index, since "ID" already carries it.
solution.to_csv("outputs/ensemble_solution.csv", index=False)