In [7]:
%pip install -U sentence-transformers

import torch
# Import sentence transformer package. More information can be found here: https://www.sbert.net/
from sentence_transformers import SentenceTransformer, util
from google.colab import drive

drive.mount('/content/drive')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
# If you have a cuda capable device we will send the tensors to that
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Assuming that we are on a CUDA machine, this should print a CUDA device:
print(device)

# Read in the test data into vectors.
readFileName = "/content/drive/MyDrive/Datasets/Test_Data/mergedInputFiles.txt"
sentences1, sentences2 = [], []
with open(readFileName, 'r', encoding='utf8') as readFileName:
    for line in readFileName.readlines():
        sentences = line.split('\t')
        sentences1.append(sentences[0])
        sentences2.append(sentences[1])

cuda:0


In [9]:
# Reading the training data into vectors.
from sentence_transformers import SentencesDataset, InputExample, losses

training_sentences_1, training_sentences_2, training_labels = [], [], []

# Now we read in the sentences and their labels
# Read in the input files
readFileName = "/content/drive/MyDrive/Datasets/Training_Data/mergedInputFiles.txt"
with open(readFileName, 'r', encoding='utf8') as readFileName:
    for line in readFileName.readlines():
        sentences = line.split('\t')
        training_sentences_1.append(sentences[0])
        training_sentences_2.append(sentences[1])

readFileName = "/content/drive/MyDrive/Datasets/Training_Data/mergedGoldStandard.txt"
with open(readFileName, 'r', encoding='utf8') as readFileName:
    for line in readFileName.readlines():
        training_labels.append(float(line))

train_examples = []
for sent_1, sent_2, label in zip(training_sentences_1, training_sentences_2, training_labels):
    train_examples.append(InputExample(texts=[sent_1, sent_2], label=label))

In [10]:
# In this module we, will train one of the transformer models and leave the other in its pre-trained state.
# We train this model using the following steps: https://www.sbert.net/examples/training/sts/README.html, https://www.sbert.net/docs/training/overview.html
from torch.utils.data import DataLoader

# The following are SBERT models. More information here: https://www.sbert.net/docs/package_reference/models.html#main-classes
model_name = 'distiluse-base-multilingual-cased-v2'
# Place the models into an array so we can iterate over them
models = {'With_Training': SentenceTransformer(model_name), 'Without_Training': SentenceTransformer(model_name)}


# Train the model
training_model = models['With_Training']

train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
train_loss = losses.CosineSimilarityLoss(training_model)

training_model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, show_progress_bar=True)

Downloading:   0%|          | 0.00/610 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/539M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/531 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/341 [00:00<?, ?B/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/188 [00:00<?, ?it/s]

In [11]:
# Run each model on the test data and write their output to a file
print(len(sentences1))
print(len(sentences2))
for key, model in models.items():
    resultsFileName = f'/content/drive/MyDrive/Datasets/{key}_{model_name}.txt'
    #Compute embedding for both lists
    embeddings1 = model.encode(sentences1, convert_to_tensor=True, device=device)
    embeddings2 = model.encode(sentences2, convert_to_tensor=True, device=device)

    #Compute cosine-similarities
    cosine_scores = util.cos_sim(embeddings1, embeddings2)

    # TODO: Confirm taking the absolute value is correct here
    # Normalize cosine_scores by taking the absolute value and multiplying by 5 then rounding. Note, the multiplication broadcasts over the tensor.
    cosine_scores = torch.round(abs(cosine_scores) * 5)

    with open(resultsFileName, 'w', encoding='utf8') as outfile:
        for i in range(len(sentences1)):
            outfile.writelines(str(int(cosine_scores[i][i].item())) + '\n')

print('Done testing phase.')

1186
1186
Done testing phase.


In [12]:
gold_standard, with_training, without_training = [], [], []

readFileName = "/content/drive/MyDrive/Datasets/Test_Data/mergedGoldStandard.txt"
with open(readFileName, 'r', encoding='utf8') as readFileName:
    for line in readFileName.readlines():
        gold_standard.append(int(line))

readFileName = "/content/drive/MyDrive/Datasets/With_Training_distiluse-base-multilingual-cased-v2.txt"
with open(readFileName, 'r', encoding='utf8') as readFileName:
    for line in readFileName.readlines():
        with_training.append(int(line))

readFileName = "/content/drive/MyDrive/Datasets/Without_Training_distiluse-base-multilingual-cased-v2.txt"
with open(readFileName, 'r', encoding='utf8') as readFileName:
    for line in readFileName.readlines():
        without_training.append(int(line))

# convert them all to tensors
print(f'{len(gold_standard)}, {len(with_training)}, {len(without_training)}')
gold_standard = torch.tensor(gold_standard)
with_training = torch.tensor(with_training)
without_training = torch.tensor(without_training)

# subtract both tensors from the gold standard
print(f'With training: {abs(torch.sum(gold_standard.sub(with_training)))}')
print(f'Without training: {abs(torch.sum(gold_standard.sub(without_training)))}')



1186, 1186, 1186
With training: 1102
Without training: 1133
