In [None]:
# Preprocessing code. Concatenate all the gold standard results into one file.
# So long as the input and gold standard files are in the same order then this is safe.
import glob
import shutil
import os

def merge_text_files(pattern, out_path):
    """Concatenate every file matching ``pattern`` into ``out_path``.

    Parameters
    ----------
    pattern : str
        Glob pattern selecting the files to merge.
    out_path : str
        File that receives the concatenated text. It is overwritten, and it is
        never used as one of its own inputs even when it matches ``pattern``.

    Returns
    -------
    list of str
        The merged source paths, in the order they were written.

    Notes
    -----
    * ``glob.glob`` returns names in arbitrary order, so the matches are sorted.
      The gold standard and input merges are only line-aligned if both are
      written in the same, deterministic order.
    * Lines are copied unchanged (blank lines included). The only addition is a
      newline after a file whose last line lacks one, so that line cannot fuse
      with the first line of the next file.
    """
    out_key = os.path.normcase(os.path.normpath(out_path))
    # Resolve the inputs *before* the output is created/truncated, and compare
    # normalised paths so the check also works when glob returns '\\' separators.
    sources = sorted(
        path for path in glob.glob(pattern)
        if os.path.normcase(os.path.normpath(path)) != out_key
    )
    if not sources:
        print(f'WARNING: no files matched {pattern!r}; {out_path!r} will be empty.')

    # The output directory may not exist yet (e.g. on a fresh checkout).
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(out_path, 'w', encoding='utf8') as outfile:
        for path in sources:
            last_line = ''
            with open(path, 'r', encoding='utf8') as readfile:
                for line in readfile:
                    outfile.write(line)
                    last_line = line
            if last_line and not last_line.endswith('\n'):
                outfile.write('\n')
    return sources


# Concatenate all the gold standard results into one file
outFileName = "data/Result_Files/mergedGoldStandard.txt"
merge_text_files('data/Gold_Standard_Files/*.txt', outFileName)

# Merge all the input files into one file
outFileName = "data/Input_Files/mergedInputFiles.txt"
merge_text_files('data/Input_Files/*.txt', outFileName)

print('Done pre-processing.')

Done pre-processing.


In [None]:
!pip install nltk
!pip install sentence_transformers

In [None]:
# Import sentence transformer package. More information can be found here: https://www.sbert.net/
from sentence_transformers import SentenceTransformer, util
import torch

# Pick where tensors will live: the first CUDA GPU when torch can see one,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
# Show the selected device (a CUDA device is expected on a GPU machine).
print(device)

# Read in the input files: one sentence pair per line, the two sentences
# separated by a tab. Only the first two tab-separated columns are used.
# TODO: Do this over a pandas dataframe
readFileName = "data/Input_Files/mergedInputFiles.txt"
sentences1, sentences2 = [], []
# Use a distinct name for the handle so `readFileName` keeps holding the path.
with open(readFileName, 'r', encoding='utf8') as readfile:
    for line_number, line in enumerate(readfile, start=1):
        # Drop the line terminator so it does not end up inside the second sentence.
        columns = line.rstrip('\r\n').split('\t')
        if len(columns) < 2:
            # Fail loudly with the location: silently skipping the line would shift
            # every later score relative to the gold standard file.
            raise ValueError(
                f'{readFileName}, line {line_number}: expected two tab-separated '
                f'sentences, got {line!r}'
            )
        sentences1.append(columns[0])
        sentences2.append(columns[1])

# The following are SBERT models. More information here: https://www.sbert.net/docs/package_reference/models.html#main-classes
model_names = ['all-MiniLM-L6-v2', 'all-mpnet-base-v2', 'paraphrase-mpnet-base-v2', 'distiluse-base-multilingual-cased-v2']
# Place the models into a list so we can iterate over them
models = [SentenceTransformer(name) for name in model_names]

# Run each model on the test data and write one integer score per sentence pair
# to data/Result_Files/<model name>.txt
for name, model in zip(model_names, models):
    resultsFileName = f'data/Result_Files/{name}.txt'
    # Compute embeddings for both sentence lists; row i of each tensor belongs to pair i
    embeddings1 = model.encode(sentences1, convert_to_tensor=True, device=device)
    embeddings2 = model.encode(sentences2, convert_to_tensor=True, device=device)

    # Only the similarity of sentence i with its own partner is needed, so compare
    # the embeddings row by row instead of building the full N x N matrix that
    # util.cos_sim returns and then reading its diagonal.
    cosine_scores = torch.nn.functional.cosine_similarity(embeddings1, embeddings2, dim=1)

    # TODO: Confirm taking the absolute value is correct here
    # NOTE(review): abs() turns a strongly negative similarity into a high score;
    # clamping negatives to 0 may be what is intended - confirm before changing.
    # Scale |cosine| to the 0-5 range and round to the nearest integer score.
    cosine_scores = torch.round(abs(cosine_scores) * 5)

    with open(resultsFileName, 'w', encoding='utf8') as outfile:
        # One tolist() call moves all scores off the device at once.
        outfile.writelines(f'{int(score)}\n' for score in cosine_scores.tolist())

print('Done testing phase.')

In [None]:
!pip install gensim
!pip install torch

In [None]:
# This is the doc2Vec model. It comes from a different package so it requires some different steps.
import gensim
import gensim.downloader as api
# Fetch the "text8" corpus through the gensim downloader and materialise it
# as a list so it can be iterated more than once.
dataset = api.load("text8")
data = list(dataset)
def tagged_document(list_of_list_of_words):
    """Lazily wrap each word list in a gensim ``TaggedDocument``.

    Each document is tagged with a one-element list holding its position in
    ``list_of_list_of_words`` (0, 1, 2, ...). Documents are produced one at a
    time as the generator is consumed.
    """
    make_tagged = gensim.models.doc2vec.TaggedDocument
    yield from (
        make_tagged(words, [position])
        for position, words in enumerate(list_of_list_of_words)
    )
# Materialise the tagged corpus: it is consumed twice below, once to build the
# vocabulary and once to train.
data_for_training = [document for document in tagged_document(data)]

# Hyper-parameters of the doc2vec model.
doc2vec_params = dict(vector_size=40, min_count=2, epochs=30)
doc2Vec_model = gensim.models.doc2vec.Doc2Vec(**doc2vec_params)
doc2Vec_model.build_vocab(data_for_training)
doc2Vec_model.train(
    data_for_training,
    total_examples=doc2Vec_model.corpus_count,
    epochs=doc2Vec_model.epochs,
)

print('Done training doc2Vec model.')

In [None]:
import nltk
# word_tokenize needs the Punkt tokenizer data. Newer NLTK releases look for the
# 'punkt_tab' resource instead of the older 'punkt' one, so request both; a
# resource that cannot be fetched only produces a message, not an exception.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)
from nltk.tokenize import word_tokenize
from numpy import dot



# Position of the next score to fill in, and one score slot per sentence pair.
index = 0
cosine_scores = torch.zeros(len(sentences1))

def similarity_unseen_docs(model, doc_words1, doc_words2, alpha=0.1, min_alpha=0.0001, steps=5):
    """
    Compute the cosine similarity between two documents that were not part of
    the model's training data.

    A vector is inferred for each document with ``model.infer_vector`` and the
    result is the dot product of the two unit-normalised vectors.

    Parameters
    ----------
    model : object providing ``infer_vector`` (e.g. a trained gensim Doc2Vec model)
    doc_words1, doc_words2 : list of str
        The documents, each as a list of (word) tokens.
    alpha, min_alpha : float
        Start and end learning rate forwarded to ``infer_vector``.
    steps : int
        Number of inference passes. The name is kept for existing callers, but
        the value is forwarded as ``epochs``: gensim 4 removed the ``steps``
        keyword from ``infer_vector`` and passing it raises ``TypeError``.

    Returns
    -------
    The cosine similarity as returned by ``numpy.dot`` on the two unit vectors.
    """
    d1 = model.infer_vector(doc_words=doc_words1, alpha=alpha, min_alpha=min_alpha, epochs=steps)
    d2 = model.infer_vector(doc_words=doc_words2, alpha=alpha, min_alpha=min_alpha, epochs=steps)
    return dot(gensim.matutils.unitvec(d1), gensim.matutils.unitvec(d2))


# Score every sentence pair with the doc2vec model: tokenize both sentences,
# infer a vector for each and take the cosine similarity of the two vectors.
pair_similarities = []
for sent_1, sent_2 in zip(sentences1, sentences2):
    tokens_1 = word_tokenize(sent_1)
    tokens_2 = word_tokenize(sent_2)
    #cos_distance = doc2Vec_model.similarity_unseen_docs(vec1, vec2)
    similarity = similarity_unseen_docs(doc2Vec_model, tokens_1, tokens_2)
    pair_similarities.append(float(similarity))

# Scale |cosine| to the 0-5 range and round to the nearest integer score.
# NOTE(review): abs() turns a strongly negative similarity into a high score;
# clamping negatives to 0 may be what is intended - confirm before changing.
cosine_scores = torch.round(abs(torch.tensor(pair_similarities)) * 5)
print(cosine_scores)
# The file is named 'word2vec' (not 'doc2vec') because the cleaning cell in this
# notebook looks the results up under that name.
resultsFileName = 'data/Result_Files/word2vec.txt'
with open(resultsFileName, 'w', encoding='utf8') as outfile:
    for score in cosine_scores.tolist():
        outfile.write(f'{int(score)}\n')


In [None]:
# Result files to clean: data/Result_Files/<name>.txt for every name below.
filenames = ['all-MiniLM-L6-v2', 'all-mpnet-base-v2', 'paraphrase-mpnet-base-v2', 'distiluse-base-multilingual-cased-v2','word2vec']

# Load the merged gold standard, one stripped line per entry. An empty entry
# marks a sentence pair for which no gold score exists.
goldStandardFileName = "data/Result_Files/mergedGoldStandard.txt"
with open(goldStandardFileName, 'r', encoding='utf8') as readfile:
    goldStandard = [line.strip() for line in readfile]

# Print a summary instead of dumping the whole list into the cell output.
print(f'Loaded {len(goldStandard)} gold standard lines ({goldStandard.count("")} blank).')


for files in filenames:
    modelFileName = f"data/Result_Files/{files}.txt"
    with open(modelFileName, 'r', encoding='utf8') as readfile:
        modelResults = [line.strip() for line in readfile]

    # Check the alignment before writing anything, so a mismatch cannot leave a
    # half-written *_cleaned.txt behind.
    if len(modelResults) > len(goldStandard):
        raise ValueError(
            f'{modelFileName} has {len(modelResults)} lines but the gold standard '
            f'only has {len(goldStandard)}; the files are not aligned.'
        )
    if len(modelResults) < len(goldStandard):
        print(
            f'WARNING: {modelFileName} has {len(modelResults)} lines, fewer than '
            f'the {len(goldStandard)} gold standard lines.'
        )

    # Blank out every model score whose gold standard line is empty, so the
    # cleaned file has blanks in exactly the same positions as the gold standard.
    with open("data/Result_Files/"+files+"_cleaned.txt", "w", encoding='utf8') as txt_file:
        for elem, gold in zip(modelResults, goldStandard):
            if gold == '':
                txt_file.write("\n")
            else:
                txt_file.write(elem + "\n")
