In [1]:
# Sentence-embedding model setup
from sentence_transformers import SentenceTransformer

# Instantiate a SentenceTransformer from the 'sentence-transformers/LaBSE' checkpoint.
# NOTE(review): `model` is not referenced by the cells visible in this part of the
# notebook (they read embeddings already stored in the TSV files) -- confirm it is
# still needed before paying the load cost.
model = SentenceTransformer('sentence-transformers/LaBSE')

import pandas as pd

# Dataset splits: tab-separated files whose first row holds the column names
location = "../Fixed_dataset_with_embeddings/"
dev, test, train = (
	pd.read_csv(location + file_name, sep='\t', header=0)
	for file_name in ("dev.tsv", "test.tsv", "train.tsv")
)

# ATOMIC table (same TSV layout); its 'embedding' column is parsed in a later cell
location = "../sloatomic2020/"
atomic = pd.read_csv(location + "sloatomic_train_fixed_embedding_final.tsv", sep="\t", header=0)



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import numpy as np

#function that converts a string (that represents the sentence embedding) to a numpy array
def convert_to_array(string, dim=768):
	"""Parse a stringified embedding such as "[0.1 0.2 ...]" into a 1-D float array.

	Parameters
	----------
	string : str
		Whitespace-separated numbers, optionally containing square brackets.
	dim : int, optional
		Length of the all-NaN fallback vector. Defaults to 768, the embedding
		width used throughout this notebook.

	Returns
	-------
	numpy.ndarray
		The parsed values, or an array of ``dim`` NaNs when ``string`` is not a
		string (e.g. a missing cell, which pandas reads as float NaN) or holds
		a token that cannot be converted to float.
	"""
	try:
		#strip the brackets, then split on whitespace
		tokens = string.replace("[", "").replace("]", "").split()
		return np.array([float(token) for token in tokens])
	except (AttributeError, TypeError, ValueError):
		#only parsing failures fall back to NaN; KeyboardInterrupt and other
		#unrelated errors are no longer swallowed
		return np.full(dim, np.nan)

#parse every stringified ATOMIC embedding and stack the vectors into one 2-D matrix
atomic_embedding = np.vstack(atomic['embedding'].apply(convert_to_array))

print(atomic_embedding.shape)

(1026498, 768)


In [3]:
from scipy import spatial

#sanity check on a single example: the dev premise stored at index label 10
test_embedding = convert_to_array(dev.loc[10, 'premise_embedding'])
a = atomic_embedding

print(test_embedding.shape)
print(a.shape)

#cdist returns cosine distance, so 1 - distance is the cosine similarity (one row, one column per ATOMIC entry)
similarity = 1 - spatial.distance.cdist(test_embedding[np.newaxis, :], a, metric='cosine')
print(similarity)

#index of the most similar entry; nanargmax skips the NaN similarities
closest = np.nanargmax(similarity)
print(closest)
print(atomic.iloc[closest])

(768,)
(1026498, 768)
[[0.19194544        nan 0.18364071 ...        nan        nan        nan]]
216367
head_event    [A] odgovori [B] oseba se po odgovoru verjetno...
relation                                                 oReact
tail_event    oseba se po odgovoru verjetno počuti srečno al...
embedding     [-3.54067013e-02  3.78127471e-02  1.26430318e-...
Name: 216367, dtype: object


In [4]:
#add embedding
import torch

#report the PyTorch / CUDA environment
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA version: {torch.version.cuda}")

print(f"get all available devices: {torch.cuda.device_count()}")

#prefer the GPU when one is available
if torch.cuda.is_available():
	device = "cuda"
else:
	device = "cpu"
print(f"Using device: {device}")

#torch.tensor already places the copy on `device`, so no follow-up .to(device) is required
atomic_embedding_gpu = torch.tensor(atomic_embedding, device=device)

PyTorch version: 2.1.0+cu121
CUDA available: True
CUDA version: 12.1
get all available devices: 1
Using device: cuda


In [5]:
from scipy import spatial

def _closest_head_event(embedding):
	"""Return the head_event of the ATOMIC row most cosine-similar to `embedding`, or "" when nothing is comparable."""
	similarities = 1 - spatial.distance.cdist([embedding], atomic_embedding, 'cosine')
	#an unparsable query is an all-NaN vector, which makes every similarity NaN;
	#np.nanargmax raises ValueError on all-NaN input, so handle that case explicitly
	if np.isnan(similarities).all():
		return ""
	return atomic.iloc[np.nanargmax(similarities)]["head_event"]


def _fill_placeholders(sentence, osebek, predmet):
	"""Substitute the [A] / [B] markers in `sentence` with the subject / object, using generic words when they are missing."""
	subject = "osebek" if pd.isna(osebek) else str(osebek)
	obj = "predmet" if pd.isna(predmet) else str(predmet)
	#same order as before: [A] first, then [B]
	return sentence.replace("[A]", subject).replace("[B]", obj)


#function that goes through a dataset and finds the closest sentence in atomic
def add_closest_sentence(df, name=""):
	"""Pair every premise and hypothesis in `df` with its nearest ATOMIC head event.

	Relies on the notebook globals `atomic` (DataFrame with a 'head_event'
	column), `atomic_embedding` (matrix of ATOMIC embeddings) and
	`convert_to_array`.

	Parameters
	----------
	df : pandas.DataFrame
		Must provide the columns 'premise', 'osebek_premise', 'predmet_premise',
		'premise_embedding', 'hypothesis', 'osebek_hypothesis',
		'predmet_hypothesis' and 'hypothesis_embedding'.
	name : str, optional
		Prefix for the progress line printed after each row.

	Returns
	-------
	pandas.DataFrame
		Columns: premise, hypothesis, premise_atomic, hypothesis_atomic, label.
		'label' is always the empty string. An atomic column is the empty string
		when the row's embedding yields no valid similarity (previously this
		raised ValueError and aborted the run).
	"""
	columns = ["premise", "hypothesis", "premise_atomic", "hypothesis_atomic", "label"]
	#collect plain rows and build the frame once; concatenating inside the loop is quadratic
	records = []

	for index, row in df.iterrows():
		premise_atomic = _fill_placeholders(
			_closest_head_event(convert_to_array(row['premise_embedding'])),
			row['osebek_premise'],
			row['predmet_premise'],
		)
		hypothesis_atomic = _fill_placeholders(
			_closest_head_event(convert_to_array(row['hypothesis_embedding'])),
			row['osebek_hypothesis'],
			row['predmet_hypothesis'],
		)

		records.append([row['premise'], row['hypothesis'], premise_atomic, hypothesis_atomic, ""])

		#progress
		print(name+":" + str(index) + "/" + str(len(df)))

	return pd.DataFrame(records, columns=columns)

location = "../Fixed_dataset_with_embeddings/"

#write each split as soon as it is computed: the nearest-neighbour search is slow,
#so a failure in a later split must not discard the splits that already finished
dev_out = add_closest_sentence(dev, "dev")
dev_out.to_csv(location + "dev_atomic.tsv", sep="\t", index=False)

test_out = add_closest_sentence(test, "test")
test_out.to_csv(location + "test_atomic.tsv", sep="\t", index=False)

train_out = add_closest_sentence(train, "train")
train_out.to_csv(location + "train_atomic.tsv", sep="\t", index=False)

dev:0/440
dev:1/440
dev:2/440
dev:3/440
dev:4/440
dev:5/440
dev:6/440
dev:7/440
dev:8/440
dev:9/440
dev:10/440
dev:11/440
dev:12/440
dev:13/440
dev:14/440
dev:15/440
dev:16/440
dev:17/440
dev:18/440
dev:19/440
dev:20/440
dev:21/440
dev:22/440
dev:23/440
dev:24/440
dev:25/440
dev:26/440
dev:27/440
dev:28/440
dev:29/440
dev:30/440
dev:31/440
dev:32/440
dev:33/440
dev:34/440
dev:35/440
dev:36/440
dev:37/440
dev:38/440
dev:39/440
dev:40/440
dev:41/440
dev:42/440
dev:43/440
dev:44/440
dev:45/440
dev:46/440
dev:47/440
dev:48/440
dev:49/440
dev:50/440
dev:51/440
dev:52/440
dev:53/440
dev:54/440
dev:55/440
dev:56/440
dev:57/440
dev:58/440
dev:59/440
dev:60/440
dev:61/440
dev:62/440
dev:63/440
dev:64/440
dev:65/440
dev:66/440
dev:67/440
dev:68/440
dev:69/440
dev:70/440
dev:71/440
dev:72/440
dev:73/440
dev:74/440
dev:75/440
dev:76/440
dev:77/440
dev:78/440
dev:79/440
dev:80/440
dev:81/440
dev:82/440
dev:83/440
dev:84/440
dev:85/440
dev:86/440
dev:87/440
dev:88/440
dev:89/440
dev:90/440
dev:91/44