In [10]:
import csv
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
import numpy as np

In [11]:
# Parallel containers, one entry per evaluated example:
# the question asked, the human reference answer, and the bot's answer.
questions, human_answers, bot_answers = [], [], []

In [12]:
# Load the evaluation data from accuracy.csv. Each data row is read from the
# right: the last column is the bot answer, the one before it the human
# answer, and the one before that the question.
# newline='' is what the csv module asks for so that quoted fields containing
# line breaks are parsed correctly.
with open('accuracy.csv', newline='') as f:
    reader = csv.reader(f)
    next(reader, None)  # discard csv headers (no error if the file is empty)
    # csv.reader yields [] for blank lines; skip them instead of failing
    rows = [row for row in reader if row]

# Rebuild the lists from scratch so that re-running this cell does not append
# the same rows a second time.
questions = [row[-3] for row in rows]
human_answers = [row[-2] for row in rows]
bot_answers = [row[-1] for row in rows]

In [13]:
# Score every human/bot answer pair by the cosine similarity of their
# bag-of-words count vectors.
vectorizer = CountVectorizer()
cosines = []
for human_text, bot_text in zip(human_answers, bot_answers):
    # Fit on just this pair so the vocabulary is the union of their tokens.
    # fit_transform already returns both count vectors (row 0 = human answer,
    # row 1 = bot answer), so no separate transform calls are needed.
    counts = vectorizer.fit_transform([human_text, bot_text]).toarray()
    # cosine_similarity expects 2-D inputs, hence the one-row slices
    cosine_sim = cosine_similarity(counts[0:1], counts[1:2])
    cosines.append(cosine_sim.flatten()[0])

print(cosines)

[0.9655402033694289, 0.9999999999999999, 0.7915594835766295, 0.9545454545454546, 0.6607934284572003, 0.5956098817361238, 0.6546536707079771, 0.7071067811865476, 0.618688224889746, 0.8261527758779903]


In [14]:
# Turn each similarity score into a binary label: a score strictly above
# the threshold counts as correct (1), anything else as incorrect (0).
threshold = 0.7
predicted_labels = [int(value > threshold) for value in cosines]

# Show the resulting labels
print(predicted_labels)

[1, 1, 1, 1, 0, 0, 0, 1, 0, 1]


In [15]:
# The reference labels form the control group: every answer is treated as
# correct, so this is a list of 1s with one entry per prediction.
true_labels = [1] * len(predicted_labels)

In [16]:
# Array form of the predictions, thresholded straight from the similarity
# scores with the same strict '>' rule used for predicted_labels. Comparing
# the already-binary labels against the threshold instead only gives the same
# answer while 0 < threshold <= 1.
predicted_classes = np.where(np.asarray(cosines) > threshold, 1, 0)

In [20]:
# Compute confusion matrix.
# labels=[0, 1] pins the matrix to 2x2 even when only one class occurs in the
# data (true_labels are all 1 here); without it the matrix can collapse to
# 1x1 and the four-way unpacking below raises ValueError.
confusion = confusion_matrix(np.array(true_labels), predicted_classes, labels=[0, 1])
# true negatives (TN) false positives (FP) false negatives (FN) true positives (TP)
tn, fp, fn, tp = confusion.ravel()
# display the flattened counts in [TN FP FN TP] order
print(confusion.ravel())

[0 0 4 6]


In [19]:
# F1 of the reference labels against each form of the predictions:
# the plain Python list (f1_simple) and the NumPy array (f1).
f1_simple = f1_score(y_true=true_labels, y_pred=predicted_labels)
f1 = f1_score(y_true=true_labels, y_pred=predicted_classes)
for caption, score in (("F1 score simple:", f1_simple), ("F1 score:", f1)):
    print(caption, score)

F1 score simple: 0.7499999999999999
F1 score: 0.7499999999999999
