In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import clip
import re
import time
import yaml
import os
import pandas as pd
from sentence_transformers import SentenceTransformer
from networks import SCLIPNN, SCLIPNN3
from utils import EmbeddingsDataset, get_models_to_train

In [2]:
# Use the GPU when PyTorch can see one, otherwise run everything on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Loading Models")
# clip.load returns two values; only clip_model is used by the visible cells,
# `preprocess` is kept but not referenced in them.
clip_model, preprocess = clip.load("ViT-B/32", device=device)
clip_model.eval()
# Sentence encoder behind get_sbert_embeddings (transformer == 'sbert').
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')
sbert_model.eval()
# Sentence encoder behind get_bertin_embeddings (transformer == 'bertin').
bertin_model = SentenceTransformer('hackathon-pln-es/bertin-roberta-base-finetuning-esnli')
# Last expression of the cell, so the notebook displays the value .eval() returns.
bertin_model.eval()

Loading Models


SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)

In [8]:
def get_test_file_path(directory):
    """Return the path of ``test_sentences.txt`` inside ``directory``."""
    return os.path.join(directory, 'test_sentences.txt')
    
def get_sentences_from_file(filename):
    """Read a UTF-8 text file and return its lines as a list.

    Each entry is one line exactly as read, so line terminators are kept.
    """
    with open(filename, mode='rt', encoding='utf-8') as handle:
        return handle.readlines()

def regexification(sentences):
    """Clean every sentence by applying four substitutions in a fixed order.

    Each pattern match is replaced by a single space. The patterns are applied
    one after another to the whole collection, and a new list is returned.
    """
    patterns = (
        r"[^A-Za-z0-9]+|[a-zA-Z][0-9]",  # runs of non-alphanumerics, or a letter followed by a digit
        r"(?<!\d)[0]\d*(?!\d)",          # whole digit runs that start with 0
        r"\s+",                          # runs of whitespace
        r"[0-9]+",                       # any remaining digit runs
    )
    cleaned = list(sentences)
    for pattern in patterns:
        cleaned = [re.sub(pattern, " ", text) for text in cleaned]
    return cleaned

def get_clip_embeddings(sentences):
    """Encode sentences with the CLIP text encoder.

    The sentences are tokenized with ``clip.tokenize``, the tokens are moved to
    the module-level ``device``, and ``clip_model.encode_text`` is run with
    gradient tracking disabled.

    Args:
        sentences: Text input accepted by ``clip.tokenize``.

    Returns:
        The tensor returned by ``clip_model.encode_text``, one row per
        sentence, on whatever device the model produced it on.
    """
    tokenized_text = clip.tokenize(sentences).to(device)
    with torch.no_grad():
        clip_embeddings = clip_model.encode_text(tokenized_text)
    # A bare `clip_embeddings.to('cpu')` used to follow here. Tensor.to returns
    # a new tensor instead of moving in place, so that call did nothing except
    # suggest a CPU result. It is dropped rather than assigned so the device of
    # the returned tensor stays exactly what existing callers already receive.
    return clip_embeddings

def get_sbert_embeddings(sentences):
    """Encode sentences with the module-level ``sbert_model``.

    Returns the array produced by ``sbert_model.encode`` wrapped as a torch
    tensor via ``torch.from_numpy``.
    """
    with torch.no_grad():
        encoded = sbert_model.encode(sentences)
        return torch.from_numpy(encoded)

def get_bertin_embeddings(sentences):
    """Encode sentences with the module-level ``bertin_model``.

    Returns the array produced by ``bertin_model.encode`` wrapped as a torch
    tensor via ``torch.from_numpy``.
    """
    with torch.no_grad():
        encoded = bertin_model.encode(sentences)
        return torch.from_numpy(encoded)

def show_embeddings_return_size(sentences, clip_embeddings, sbert_embeddings):
    """Print the first sentence with its embedding sizes and return the SBERT size.

    Args:
        sentences: Sequence of sentences; only the first one is shown.
        clip_embeddings: Indexable collection of tensors; the first one's
            leading dimension is printed.
        sbert_embeddings: Indexable collection of tensors; the first one's
            leading dimension is printed and returned.

    Returns:
        ``sbert_embeddings[0].size()[0]``.

    Raises:
        ValueError: If any of the three inputs is empty. Previously this
            surfaced as an UnboundLocalError on ``input_size``.
    """
    if len(sentences) == 0 or len(clip_embeddings) == 0 or len(sbert_embeddings) == 0:
        raise ValueError("Need at least one sentence and one embedding of each kind")
    # The opening rule was written as a bare expression `("-"*10)` and so was
    # never printed; it is now emitted to match the closing rule.
    print("-"*10)
    sentence = sentences[0]
    clip_embedding = clip_embeddings[0]
    sbert_embedding = sbert_embeddings[0]
    print("Sentence:", sentence)
    input_size = sbert_embedding.size()[0]
    print("Sbert Embedding: ", input_size)
    print("Clip Embedding: ", clip_embedding.size()[0])
    print("-"*10)
    return input_size

In [11]:
def get_test_embeddings(directory,transformer):
    """Build CLIP and sentence-encoder embeddings for a dataset's test sentences.

    The test file is read once. CLIP always encodes the regex-cleaned
    sentences. The sentence encoder receives the cleaned sentences for
    ``'sbert'`` and the raw lines for ``'bertin'``.

    Args:
        directory: Directory passed to ``get_test_file_path``.
        transformer: ``'sbert'`` or ``'bertin'``.

    Returns:
        Tuple ``(test_clip_embeddings, test_embeddings)``.

    Raises:
        ValueError: If ``transformer`` is not one of the supported names.
    """
    # Validate before the expensive CLIP pass. An unknown name used to fail
    # only at the return statement with an UnboundLocalError.
    if transformer not in ('sbert', 'bertin'):
        raise ValueError(f"Given transformer is not valid: {transformer!r}")
    test_file = get_test_file_path(directory)
    raw_sentences = get_sentences_from_file(test_file)
    test_sentences = regexification(raw_sentences)
    print("CLIP encoding...")
    test_clip_embeddings = get_clip_embeddings(test_sentences)
    if transformer == 'sbert':
        print("SBERT encoding...")
        # This branch used to assign `test_sbert_embeddings`, leaving the
        # returned `test_embeddings` undefined.
        test_embeddings = get_sbert_embeddings(test_sentences)
    else:
        print("BERTIN encoding...")
        # BERTIN is fed the lines exactly as read from the file.
        test_embeddings = get_bertin_embeddings(raw_sentences)
    return test_clip_embeddings, test_embeddings

def get_train_embeddings(directory,transformer = 'sbert'):
    """Build CLIP and sentence-encoder embeddings for train and validation sentences.

    CLIP always encodes the regex-cleaned sentences. The sentence encoder
    receives the cleaned sentences for ``'sbert'`` and the raw lines for
    ``'bertin'``.

    Args:
        directory: Directory passed to ``get_files_paths``.
        transformer: ``'sbert'`` (default) or ``'bertin'``.

    Returns:
        Tuple ``(train_clip_embeddings, valid_clip_embeddings,
        train_embeddings, valid_embeddings)``.

    Raises:
        ValueError: If ``transformer`` is not one of the supported names.
    """
    # An unknown name used to print a message and then crash at the return
    # statement with an UnboundLocalError, after the CLIP pass had already run.
    if transformer not in ('sbert', 'bertin'):
        raise ValueError(f"Given transformer is not valid: {transformer!r}")
    # NOTE(review): get_files_paths is not defined in the visible cells;
    # confirm it is provided by another cell or module before running this.
    train_file, valid_file = get_files_paths(directory)
    raw_train_sentences = get_sentences_from_file(train_file)
    raw_valid_sentences = get_sentences_from_file(valid_file)
    train_sentences = regexification(raw_train_sentences)
    valid_sentences = regexification(raw_valid_sentences)
    print("CLIP encoding...")
    train_clip_embeddings = get_clip_embeddings(train_sentences)
    valid_clip_embeddings = get_clip_embeddings(valid_sentences)
    if transformer == 'sbert':
        print("SBERT encoding...")
        train_embeddings = get_sbert_embeddings(train_sentences)
        valid_embeddings = get_sbert_embeddings(valid_sentences)
    else:
        print("BERTIN encoding...")
        # BERTIN is fed the lines exactly as read from the files.
        train_embeddings = get_bertin_embeddings(raw_train_sentences)
        valid_embeddings = get_bertin_embeddings(raw_valid_sentences)
    return train_clip_embeddings, valid_clip_embeddings, train_embeddings, valid_embeddings

def cosin_calculator(targets, predictions):
    """Return the cosine similarity of each target/prediction pair.

    The two inputs are walked in parallel (stopping at the shorter one) and
    each pair of vectors is compared along dimension 0 with ``eps=1e-6``.

    Returns:
        A NumPy array with one Python-float similarity per pair.
    """
    similarities = [
        nn.functional.cosine_similarity(target_vec, predicted_vec, dim=0, eps=1e-6).item()
        for target_vec, predicted_vec in zip(targets, predictions)
    ]
    return np.array(similarities)

def evaluate(models, input_size, test_dataset, trainset, transformer = 'sbert'):
    """Score saved networks against the targets of a test dataset.

    For every name in ``models`` the checkpoint
    ``models/<transformer>_<trainset>_<name>.pt`` is loaded into a fresh
    ``SCLIPNN3`` (when the name contains ``'NN3'``) or ``SCLIPNN`` and run over
    ``test_dataset`` in batches of 4 with gradients disabled.

    Args:
        models: Mapping whose keys are model names. Only the names are used;
            the last three characters of each name must be digits and are
            passed as the second constructor argument of the network.
        input_size: First constructor argument of the network.
        test_dataset: Dataset yielding ``(inputs, labels)`` pairs and exposing
            a tensor attribute ``Y`` (presumably the full label tensor; verify
            against ``EmbeddingsDataset``).
        trainset: Training-set name used in the checkpoint file name.
        transformer: Sentence-encoder name used in the checkpoint file name.

    Returns:
        Tuple ``(cosines, euclideans)`` of two lists with one entry per model,
        each rounded to 3 decimals. ``cosines`` holds the mean cosine
        similarity between every label and its prediction. ``euclideans``
        holds the mean of ``torch.cdist`` between every row of
        ``test_dataset.Y`` and every prediction (all pairs, not only matching
        ones).

    Raises:
        ValueError: If ``test_dataset`` yields no batches.
    """
    cosines = []
    euclideans = []
    # Fixed order: with shuffling, a partial last batch made the score vary
    # from run to run. Neither metric needs a random order.
    test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False)
    with torch.no_grad():
        for name in models:
            path = os.path.join('models',transformer + '_' + trainset + '_' + name+'.pt')
            size_suffix = int(name[-3:])
            network_cls = SCLIPNN3 if 'NN3' in name else SCLIPNN
            loaded_model = network_cls(input_size, size_suffix).to(device)
            # map_location lets a checkpoint saved on GPU load on a CPU-only machine.
            loaded_model.load_state_dict(torch.load(path, map_location=device))
            # Inference mode for layers such as dropout or batch norm, if any.
            loaded_model.eval()
            pair_cosines = []
            predictions = []
            for inputs, labels in test_loader:
                tclip = labels.to(device)
                tsbert = inputs.to(device)
                prediction = loaded_model(tsbert)
                predictions.append(prediction)
                pair_cosines.extend(cosin_calculator(tclip, prediction))
            if not predictions:
                raise ValueError("test_dataset produced no batches to evaluate")
            # Average over samples, not over batch means, so a smaller last
            # batch is not over-weighted. Identical when all batches are full.
            cosines.append(round(float(np.mean(pair_cosines)), 3))
            # torch.cat copes with a smaller last batch, where torch.stack
            # raised. The mean over all pairs is unchanged for full batches.
            all_predictions = torch.cat(predictions).to(torch.float64)
            # Move the targets to the predictions' device before comparing.
            all_targets = test_dataset.Y.to(device).to(torch.float64)
            # NOTE(review): this averages distances over ALL target/prediction
            # pairs; confirm that matched-pair distance was not the intent.
            euclidean = torch.cdist(all_targets, all_predictions)
            avg_euclidean = torch.mean(euclidean)
            euclideans.append(round(avg_euclidean.item(),3))
    return cosines, euclideans

In [12]:
# Evaluation driver: for each test dataset, embed its test sentences once and
# score every saved model variant for each training-set name.
print("Evaluating...")
# Each entry is used both as a test-set directory and as a training-set name.
directories = ['europarl_es'] #params['test_dataset']
model_dict = {}
# Sentence encoder to evaluate; get_test_embeddings handles 'sbert' and 'bertin'.
trans = 'bertin'
for directory in directories:
    print(f'Evaluating with test dataset {directory}...')    
    test_clip_emb, test_emb = get_test_embeddings(directory, trans)
    # Network input width is the length of a single sentence embedding.
    input_size = test_emb[0].size()[0]
    # Sentence embeddings go in as inputs, CLIP embeddings as targets.
    test_dataset = EmbeddingsDataset(test_emb, test_clip_emb)
    for train_directory in directories:
        # Only the keys (model names) of this mapping are read below and in evaluate().
        model_dict[train_directory] = get_models_to_train(input_size)
        print(f'...the model trained on {train_directory}')
        start_time = time.time()       
        cosines, euclideans = evaluate(model_dict[train_directory],input_size,test_dataset,trainset=train_directory,transformer = trans)
        # Despite its name, end_time holds the elapsed duration as a struct_time.
        end_time = time.gmtime(time.time() - start_time)
        evaluation_time = time.strftime("%H:%M:%S", end_time)
        print("Evaluation Time: {}".format(evaluation_time))
        data = {"Cosin":cosines, "Euclidean":euclideans}
        # Row labels are "<train_directory>_<model name>", in the dict's key order.
        indices = []
        for km in model_dict[train_directory].keys():
            indices.append(train_directory+'_'+km)        
        results = pd.DataFrame(data, index=indices)
        # `display` is not imported in the visible cells; presumably the
        # IPython-provided builtin, so this cell needs a notebook kernel.
        display(results)
print("End of Evaluation")

Evaluating...
Evaluating with test dataset europarl_es...
CLIP encoding...
BERTIN encoding...
Creating Models to train...
4 models created.
...the model trained on europarl_es
Evaluation Time: 00:00:01


Unnamed: 0,Cosin,Euclidean
europarl_es_NN_700,0.897,5.145
europarl_es_NN3_700,0.923,4.581
europarl_es_NN_800,0.899,5.105
europarl_es_NN3_800,0.924,4.566


End of Evaluation
