# Embedding Metrics

### Importing libraries

*here you can use any other gensim.model as you like*

In [1]:
from gensim.models import KeyedVectors
import numpy as np
import docx
import json
import os

*use any vector embeddings, but GoogleNews-vectors-negative300.bin.gz is highly recommended*

In [2]:
# Download the pre-trained Word2Vec model (you can also use your own trained model)
# This example uses the Google News Word2Vec embeddings (large model)
word2vec_path = 'GoogleNews-vectors-negative300.bin.gz'  # Update this with the correct path
# Load the Word2Vec model
word2vec_model = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)


### Calculate here your scores: 

#### If using a String: 

In [9]:
chatbot_answer = "Every student have two attempts to finish teh assesments, Good luck! "
reference_answer = "Students have two attempts available for each module."

similarity_score_greedy = calculate_similarity(chatbot_answer, reference_answer, model='greedy')
print("Your Greedy Matching score: ", similarity_score_greedy)
similarity_score_greedy = calculate_similarity(chatbot_answer, reference_answer, model='extrema')
print("Your Vector Extrema score: ", similarity_score_greedy)
similarity_score_greedy = calculate_similarity(chatbot_answer, reference_answer, model='average')
print("Your Embedding Average score: ", similarity_score_greedy)

Your Greedy Matching score:  -0.63866776
Your Vector Extrema score:  0.90642244
Your Embedding Average score:  0.732627


#### If using files: 

In [10]:
chatbot_answer_files = "LONG_REFERENCE.docx"
reference_answer_files = "GPT-3_LONG.txt"

similarity_score_greedy = calculate_similarity_from_files(chatbot_answer_files, reference_answer_files, model='greedy')
print("Your Greedy Matching score: ", similarity_score_greedy)
similarity_score_greedy = calculate_similarity_from_files(chatbot_answer_files, reference_answer_files, model='extrema')
print("Your Vector Extrema score: ", similarity_score_greedy)
similarity_score_greedy = calculate_similarity_from_files(chatbot_answer_files, reference_answer_files, model='average')
print("Your Embedding Average score: ", similarity_score_greedy)

Your Greedy Matching score:  -0.014854658
Your Vector Extrema score:  0.99419194
Your Embedding Average score:  0.99313843


### Preprocessing, tokenising sentences and reading files

In [4]:
def preprocess_sentence(sentence):
    # Tokenize and preprocess the sentence
    return [token.lower() for token in sentence.split()]

def read_text_from_file(filepath):
    # Read text content from a text file
    with open(filepath, 'r') as file:
        text = file.read()
    return text

def read_text_from_docx(filepath):
    # Read text content from a docx file
    doc = docx.Document(filepath)
    text = ""
    for para in doc.paragraphs:
        text += para.text + "\n"
    return text

def read_text_from_json(filepath, key):
    # Read text content from a JSON file under a specific key
    with open(filepath, 'r') as file:
        data = json.load(file)
    return data.get(key, "")

### Greedy Matching score

In [5]:
def find_closest_word_in_reference(token, reference_sentence, model):
    # Find the word in the reference sentence that is closest in meaning to the given token
    similarities = [(word, model.similarity(token, word)) for word in reference_sentence if word in model]
    if similarities:
        closest_word, _ = max(similarities, key=lambda x: x[1])
        return closest_word
    else:
        # Return the token itself if no words in the reference sentence are present in the model
        return token

def sentence_to_vector_greedy(sentence, reference_sentence, model):
    # Convert a sentence to a vector representation using word embeddings
    vectors = [model[token] - model[find_closest_word_in_reference(token, reference_sentence, model)] for token in sentence if token in model]
    if vectors:
        # Take the sum of the word vectors to represent the sentence
        sentence_vector = np.sum(vectors, axis=0)
    else:
        # Return zero vector if none of the tokens are present in the model
        sentence_vector = np.zeros(model.vector_size)
    return sentence_vector


### Vector Extrema score

In [6]:
def sentence_to_vector_extrema(sentence, model):
    # Convert a sentence to a vector representation using word embeddings
    vectors = [model[token] for token in sentence if token in model]
    if vectors:
        # Take the element-wise maximum and minimum of word vectors to represent the sentence
        max_vector = np.max(vectors, axis=0)
        min_vector = np.min(vectors, axis=0)
        # Combine the maximum and minimum vectors (e.g., concatenation)
        sentence_vector = np.concatenate([max_vector, min_vector])
    else:
        # Return zero vector if none of the tokens are present in the model
        sentence_vector = np.zeros(model.vector_size * 2)  # Double the dimension for max and min vectors
    return sentence_vector

### Embeddings Average score 

In [7]:
def sentence_to_vector_average(sentence, model):
    # Convert a sentence to a vector representation using word embeddings
    vectors = [model[token] for token in sentence if token in model]
    if vectors:
        # Take the average of word vectors to represent the sentence
        sentence_vector = np.mean(vectors, axis=0)
    else:
        # Return zero vector if none of the tokens are present in the model
        sentence_vector = np.zeros(model.vector_size)
    return sentence_vector

### Core function Calculator

In [8]:
def calculate_similarity(chatbot_answer, reference_answer, model='extrema'):
    # Preprocess the sentences
    chatbot_answer = preprocess_sentence(chatbot_answer)
    reference_answer = preprocess_sentence(reference_answer)

    if model == 'extrema':
        # Convert sentences to vectors using Extrema method
        chatbot_vector = sentence_to_vector_extrema(chatbot_answer, word2vec_model)
        reference_vector = sentence_to_vector_extrema(reference_answer, word2vec_model)
    elif model == 'average':
        # Convert sentences to vectors using Average method
        chatbot_vector = sentence_to_vector_average(chatbot_answer, word2vec_model)
        reference_vector = sentence_to_vector_average(reference_answer, word2vec_model)
    elif model == 'greedy':
        # Convert sentences to vectors using Greedy Matching method
        chatbot_vector = sentence_to_vector_greedy(chatbot_answer, reference_answer, word2vec_model)
        reference_vector = sentence_to_vector_greedy(reference_answer, chatbot_answer, word2vec_model)
    else:
        raise ValueError("Invalid model. Choose 'extrema', 'average', or 'greedy'.")

    # Check if either vector is a zero vector
    if np.all(chatbot_vector == 0) or np.all(reference_vector == 0):
        similarity_score = 0.0
    else:
        # Calculate cosine similarity between the two vectors
        similarity_score = np.dot(chatbot_vector, reference_vector) / (np.linalg.norm(chatbot_vector) * np.linalg.norm(reference_vector))
    return similarity_score

def calculate_similarity_from_files(chatbot_file, reference_file, model='extrema'):
    # Read content from chatbot and reference files
    if chatbot_file.endswith('.docx'):
        chatbot_answer = read_text_from_docx(chatbot_file)
    elif chatbot_file.endswith('.json'):
        chatbot_answer = read_text_from_json(chatbot_file, "chatbot")
    else:
        chatbot_answer = read_text_from_file(chatbot_file)

    if reference_file.endswith('.docx'):
        reference_answer = read_text_from_docx(reference_file)
    elif reference_file.endswith('.json'):
        reference_answer = read_text_from_json(reference_file, "reference")
    else:
        reference_answer = read_text_from_file(reference_file)

    return calculate_similarity(chatbot_answer, reference_answer, model)
