In [1]:
import torch
# Import sentence transformer package. More information can be found here: https://www.sbert.net/
from sentence_transformers import SentenceTransformer, util

In [2]:
# Preprocessing the test data code. Concatenate all the gold standard results into one file.
# So long as the input and gold standard files are in the same order then this is safe.
import glob
import shutil

# Concatenate all the results into one file
outFileName = "data/Test_Data/Result_Files/mergedGoldStandard.txt"
with open(outFileName, 'w', encoding='utf8') as outfile:
    for filename in glob.glob('data/Test_Data/Gold_Standard_Files/*.txt'):
        if filename == outFileName:
            # don't want to copy the output into the output
            continue
        with open(filename, 'r', encoding='utf8') as readfile:
            for line in readfile:
                # Remove all the empty lines
                if not line.isspace():
                    outfile.write(line)

# Merge all the input files into one file
outFileName = "data/Test_Data/Result_Files/mergedInputFiles.txt"
with open(outFileName, 'w', encoding='utf8') as outfile:
    for filename in glob.glob('data/Test_Data/Input_Files/*.txt'):
        if filename == outFileName:
            # don't want to copy the output into the output
            continue
        with open(filename, 'r', encoding='utf8') as readfile:
            shutil.copyfileobj(readfile, outfile)


# Note that we need to modify the gold standard files slightly so that the values are compressed to a range 0-1.
# As they are currently on a scale from 0-5 we need only divide them by 5. Rounded to 5 decimal places.

# Concatenate all the results into one file
outFileName = "data/Training_Data/Result_Files/mergedGoldStandard.txt"
with open(outFileName, 'w', encoding='utf8') as outfile:
    for filename in glob.glob('data/Training_Data/Gold_Standard_Files/*.txt'):
        with open(filename, 'r', encoding='utf8') as readfile:
            for line in readfile:
                # Remove all the empty lines
                if not line.isspace():
                    # Convert the string to a double and then round it to the nearest integer
                    val = str(round((float(line) / 5), 5)) + "\n"
                    outfile.write(val)

# Merge all the input files into one file
outFileName = "data/Training_Data/Result_Files/mergedInputFiles.txt"
with open(outFileName, 'w', encoding='utf8') as outfile:
    for filename in glob.glob('data/Test_Data/Input_Files/*.txt'):
        with open(filename, 'r', encoding='utf8') as readfile:
            shutil.copyfileobj(readfile, outfile)

print('Done pre-processing training and test data.')

Done pre-processing training and test data.


In [3]:
# If you have a cuda capable device we will send the tensors to that
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Assuming that we are on a CUDA machine, this should print a CUDA device:
print(device)

# Read in the test data into vectors.
readFileName = "data/Test_data/Result_Files/mergedInputFiles.txt"
sentences1, sentences2 = [], []
with open(readFileName, 'r', encoding='utf8') as readFileName:
    for line in readFileName.readlines():
        sentences = line.split('\t')
        sentences1.append(sentences[0])
        sentences2.append(sentences[1])

cuda:0


In [4]:
# Reading the training data into vectors.
from sentence_transformers import SentencesDataset, InputExample, losses

training_sentences_1, training_sentences_2, training_labels = [], [], []

# Now we read in the sentences and their labels
# Read in the input files
readFileName = "data/Training_Data/Result_Files/mergedInputFiles.txt"
with open(readFileName, 'r', encoding='utf8') as readFileName:
    for line in readFileName.readlines():
        sentences = line.split('\t')
        training_sentences_1.append(sentences[0])
        training_sentences_2.append(sentences[1])

readFileName = "data/Training_Data/Result_Files/mergedGoldStandard.txt"
with open(readFileName, 'r', encoding='utf8') as readFileName:
    for line in readFileName.readlines():
        training_labels.append(float(line))

train_examples = []
for sent_1, sent_2, label in zip(training_sentences_1, training_sentences_2, training_labels):
    train_examples.append(InputExample(texts=[sent_1, sent_2], label=label))

In [5]:
# In this module we, will train one of the transformer models and leave the other in its pre-trained state.
# We train this model using the following steps: https://www.sbert.net/examples/training/sts/README.html, https://www.sbert.net/docs/training/overview.html
from torch.utils.data import DataLoader

# The following are SBERT models. More information here: https://www.sbert.net/docs/package_reference/models.html#main-classes
model_name = 'distiluse-base-multilingual-cased-v2'
# Place the models into an array so we can iterate over them
models = {'With_Training': SentenceTransformer(model_name), 'Without_Training': SentenceTransformer(model_name)}


# Train the model
training_model = models['With_Training']

train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=5)
train_loss = losses.CosineSimilarityLoss(training_model)

training_model.fit(train_objectives=[(train_dataloader, train_loss)], show_progress_bar=True)

In [6]:
# Run each model on the test data and write their output to a file
for key, model in models.items():
    resultsFileName = f'data/Result_Files/{key}_{model_name}.txt'
    #Compute embedding for both lists
    embeddings1 = model.encode(sentences1, convert_to_tensor=True, device=device)
    embeddings2 = model.encode(sentences2, convert_to_tensor=True, device=device)

    #Compute cosine-similarities
    cosine_scores = util.cos_sim(embeddings1, embeddings2)

    # TODO: Confirm taking the absolute value is correct here
    # Normalize cosine_scores by taking the absolute value and multiplying by 5 then rounding. Note, the multiplication broadcasts over the tensor.
    cosine_scores = torch.round(abs(cosine_scores) * 5)

    with open(resultsFileName, 'w', encoding='utf8') as outfile:
        for i in range(len(sentences1)):
            outfile.writelines(str(int(cosine_scores[i][i].item())) + '\n')

print('Done testing phase.')

RuntimeError: CUDA out of memory. Tried to allocate 1.26 GiB (GPU 0; 4.00 GiB total capacity; 2.34 GiB already allocated; 0 bytes free; 2.70 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF