In [1]:
from sentence_transformers import SentenceTransformer, util
import torch
from scipy.spatial.distance import cosine
from transformers import AutoModel, AutoTokenizer, BertTokenizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import euclidean, cityblock
from scipy.stats import pearsonr
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# L1 Normalization
def normalize_l1(vector):
    """Scale ``vector`` so that the sum of its absolute values equals 1.

    Parameters
    ----------
    vector : array-like
        Numeric values to normalize.

    Returns
    -------
    numpy.ndarray
        ``vector`` divided by its L1 norm (sum of absolute values).
        An all-zero input has norm 0; it is returned as zeros rather than
        the NaNs that a 0/0 division would produce.
    """
    values = np.asarray(vector)
    norm = np.sum(np.abs(values))
    if norm == 0:
        # Nothing to scale. Dividing by 1 returns a fresh array of the same
        # zeros, using true division just like the normal path below.
        return values / 1
    return values / norm

# L2 Normalization
def normalize_l2(vector):
    """Scale ``vector`` to unit Euclidean length.

    Parameters
    ----------
    vector : array-like
        Numeric values to normalize.

    Returns
    -------
    numpy.ndarray
        ``vector`` divided by its L2 norm (square root of the sum of squares).
        An all-zero input has norm 0; it is returned as zeros rather than
        the NaNs that a 0/0 division would produce.
    """
    values = np.asarray(vector)
    norm = np.sqrt(np.sum(np.square(values)))
    if norm == 0:
        # Nothing to scale. Dividing by 1 returns a fresh array of the same
        # zeros, using true division just like the normal path below.
        return values / 1
    return values / norm

In [3]:
# Loading Models and Tokenizer
model_path = './all-MiniLM-L6-v2'
file_path_gd = './data-noprompt/Cr-Ti-gd.txt'
# file_path_answer = './data-noprompt/Al-Co-gd.txt'
# file_path_improved = './data-traditionmethods/Al-Co.txt'
file_path_tradition = './data-traditionmethods/Cr-Ti.txt'

# Token cap handed to the tokenizer; anything longer is cut off at this length.
MAX_LENGTH = 512

model = AutoModel.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)


def tokenize_document(text):
    """Tokenize one document as a single unpadded sequence of PyTorch tensors.

    The sequence is truncated to at most ``MAX_LENGTH`` tokens. Both documents
    go through this helper so they are always encoded with identical options.
    """
    return tokenizer(
        text,
        return_tensors="pt",
        padding=False,
        truncation=True,
        max_length=MAX_LENGTH,
    )


# Read text from TXT files
with open(file_path_gd, 'r', encoding='utf-8') as file:
    text1 = file.read()

with open(file_path_tradition, 'r', encoding='utf-8') as file:
    text2 = file.read()

# Processing Sentences with tokenizers
inputs1 = tokenize_document(text1)
inputs2 = tokenize_document(text2)

# print total tokens
num_tokens1 = inputs1['input_ids'].size(1)
num_tokens2 = inputs2['input_ids'].size(1)
print("Total number of tokens for text1:", num_tokens1)
print("Total number of tokens for text2:", num_tokens2)

# Truncation happens without any message from the tokenizer call above, so
# flag a text that sits exactly at the cap: its tail may have been dropped and
# every score below would then describe only the beginning of the file.
for label, num_tokens in (("text1", num_tokens1), ("text2", num_tokens2)):
    if num_tokens >= MAX_LENGTH:
        print(
            f"Warning: {label} reached the {MAX_LENGTH}-token limit and may have "
            "been truncated; similarity scores may not reflect the whole file."
        )

# Get the last layer of hidden state of the model output
with torch.no_grad():
    outputs1 = model(**inputs1)
    outputs2 = model(**inputs2)

# Using average pooling over the token axis. A plain mean is used (no
# attention-mask weighting); each text is encoded on its own with
# padding=False, so there are no padding tokens to exclude.
embeddings1 = outputs1.last_hidden_state.mean(dim=1)
embeddings2 = outputs2.last_hidden_state.mean(dim=1)
print(embeddings1.shape, embeddings2.shape)

# Calculate the cosine similarity between the two pooled document embeddings.
vec1 = embeddings1.squeeze().numpy()
vec2 = embeddings2.squeeze().numpy()

# scipy's `cosine` returns a distance, hence the `1 - ...`.
# NOTE: the printed label says "numpy" but the value comes from scipy; the
# label is left unchanged so it still matches previously recorded output.
cosine_sim_scipy = 1 - cosine(vec1, vec2)
print("CS based on numpy:", cosine_sim_scipy)

# Cross-check with sklearn, passing explicit NumPy arrays (still with the
# leading batch axis) instead of relying on implicit tensor conversion.
cosine_sim = cosine_similarity(embeddings1.numpy(), embeddings2.numpy())
print("CS based on sklearn:", cosine_sim)

# Calculate the Euclidean Distance (on L2-normalized vectors)
norml2_vec1 = normalize_l2(vec1)
norml2_vec2 = normalize_l2(vec2)
euclidean_dist = euclidean(norml2_vec1, norml2_vec2)
print("ED:", euclidean_dist)

# Calculate the Manhattan Distance (on L1-normalized vectors)
norml1_vec1 = normalize_l1(vec1)
norml1_vec2 = normalize_l1(vec2)
manhattan_dist = cityblock(norml1_vec1, norml1_vec2)
print("MD:", manhattan_dist)

Total number of tokens for text1: 293
Total number of tokens for text2: 342
torch.Size([1, 384]) torch.Size([1, 384])
CS based on numpy: 0.9052212834358215
CS based on sklearn: [[0.9052214]]
ED: 0.4353819787502289
MD: 0.445708
