In [2]:
import json
import numpy as np

from sentence_transformers import SentenceTransformer

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score

import math

from sklearn.metrics import f1_score

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /cfs/home/u021320/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
model = SentenceTransformer('all-MiniLM-L6-v2')

characters = ['Monica Geller', 'Joey Tribbiani', 'Chandler Bing', 'Phoebe Buffay', 'Ross Geller', 'Rachel Green', 'others']

#characters = ['Monica Geller', 'Joey Tribbiani', 'Chandler Bing', 'Phoebe Buffay', 'Ross Geller', 'Rachel Green']


#test set 1 = season 8

with open('sets/test_set1.json') as f:
    test_set1 = json.load(f)

with open('sets/train_set1.json') as f:
    train_set1 = json.load(f)

################################################################

#test set 2 = 10% of each season

with open('sets/test_set2.json') as f:
    test_set2 = json.load(f)

with open('sets/train_set2.json') as f:
    train_set2 = json.load(f)


#CHANGE HERE
test_set = test_set1
train_set = train_set1
#number = 1

all_lines = []

#corpus
for line in train_set:
    all_lines.append(line['transcript'])

In [4]:
def compute_similarity(vector):

    angles = {}

    for character in characters:
        angles[character] = cosine_similarity(vector.reshape(1,-1), embeddings[character].reshape(1,-1))[0][0]

    
    
    #Smaller angles between vectors produce larger cosine values, indicating greater cosine similarity

    character = [i for i in angles if angles[i]==max(angles.values())]


    return character[0]

Approach 1

In [10]:
#change x for number of lines 
values = [5,10,20,50,100,150,200,500,1000,2000,5000]
#values = [2000]

for value in values:
    sentences = []

    for character in characters:
        f = open("embeddings" + "/" + character + str(value) + ".txt", "r")
        lines = f.readlines()
        sentences.append(lines)

    i=0
    embeddings = {}
    for character in characters:
        embeddings[character] = model.encode(sentences[i])
        i+=1

    #to confirm number of lines used to create the embedding
    print("size embedding", len(embeddings['Monica Geller']))

    for character in characters:

        mean = embeddings[character][0]
        for embedding in embeddings[character]:
            mean += embedding
        
        embeddings[character] = mean
    

    #####test set computation
    
    predicted = []
    real = []

    correct = 0
    total = 0

    for utterance in test_set:
        if len(utterance['speakers']) == 1:

            total += 1

            line_embed = model.encode(utterance['transcript'])
        
            pred = compute_similarity(line_embed)

            predicted.append(pred)
            real.append(utterance['speakers'][0])

            if pred == utterance['speakers'][0]:
                correct+=1
            elif pred == "others" and utterance['speakers'][0] not in ['Monica Geller', 'Joey Tribbiani', 'Chandler Bing', 'Phoebe Buffay', 'Ross Geller', 'Rachel Green']:
                correct+=1

                        
    print(accuracy_score(real, predicted)*100)

    accuracy = (float(correct)/total)*100

    print("accuracy: ", accuracy)

    #print("f1 score: ", f1_score(real, predicted, average='macro'))

    print(len(characters))

size embedding 2000
16.557484441532917
accuracy:  21.274156567310843
7


Approach 2

In [5]:
def tf_idf_calculator(line, character_lines, idf):

    counts = 0

    for sentence in character_lines:
        if line in sentence:
            counts += 1

    tf = counts / len(character_lines)

    tf_idf = tf * idf[line]

    return tf_idf

In [15]:
def calculate_custom_idf(all_lines, all_lines_set):
    idf = {}
    total_documents = len(all_lines)

    term_document_count = {}
    for line in all_lines:
        counts = 0
        for sentence in all_lines_set:
            if line in sentence:
                counts += 1
        term_document_count[line] = counts

    # Compute the IDF values with customization
    for line, count in term_document_count.items():

        idf[line] = math.log(total_documents / (count + 1))  # Adding 1 to avoid division by zero

        # Customization: Lower the IDF for frequently occurring terms
        if count > total_documents * 0.007:  # Adjust the threshold as needed
            idf[line] *= 0.05  # Adjust the penalty factor as needed

    return idf

In [17]:
import string

test_lines = []

for line in test_set:
    test_lines.append(line['transcript'])

idf = {}

all_lines_set = [sentence.translate(str.maketrans('', '', string.punctuation)).lower() for sentence in test_lines]

all_lines_set_no_stopwords = []

for line in all_lines_set:
    if line not in stopwords.words('english'):
        all_lines_set_no_stopwords.append(line)

idf = calculate_custom_idf(all_lines_set_no_stopwords, all_lines_set)
 
tf_idfs = []

lines_processed = []

for line in test_lines:
    line = line.translate(str.maketrans('', '', string.punctuation)).lower()
    if line not in stopwords.words('english'):
        lines_processed.append(line)

for line in lines_processed:
    tf_idfs.append(tf_idf_calculator(line, lines_processed, idf))


#associate values with sentences
sentences_tfidf = []
for i in range(len(lines_processed)):
    sentences_tfidf.append({'sentence': test_lines[i], 'value': tf_idfs[i]})

#sort sentences by value
sentences_tfidf = sorted(sentences_tfidf, key = lambda i: i['value'], reverse=True)

sentences_only = [sentence['sentence'] for sentence in sentences_tfidf]

for line in sentences_tfidf[:-10]:
    print(line)

{'sentence': 'Look, I understand if you came by to hit me, I deserve it.', 'value': 0.034218328993015516}
{'sentence': 'I know.', 'value': 0.03223829062742065}
{'sentence': 'Okay. Years ago, when I was backpacking across Western Europe...', 'value': 0.030897345742822425}
{'sentence': "Just then or-or all the time, 'cause we-we have jobs y'know.", 'value': 0.030897345742822425}
{'sentence': 'Yeah, we tried them all. We went for a walk, uh we tried a special tea, caster oil, spicy food nothing has worked.', 'value': 0.030897345742822425}
{'sentence': 'Oh. (He takes the picture and hugs her.', 'value': 0.030220264390600743}
{'sentence': "I know, but I don't think that's what she wants.", 'value': 0.030220264390600743}
{'sentence': 'Yeah!', 'value': 0.030220264390600743}
{'sentence': 'Welcome back!', 'value': 0.030220264390600743}
{'sentence': 'Hey!', 'value': 0.030220264390600743}
{'sentence': 'Are you two talking about the same baby? Hey! Have you started off thinking of names yet?', 'va

In [None]:
#change x for number of lines 
values = [100,150,500,1000,2000]

threshold = 30

for value in values:
    sentences = []

    for character in characters:
        f = open("embeddings" + "/" + character + str(value) + ".txt", "r")
        lines = f.readlines()
        sentences.append(lines)

    i=0
    embeddings = {}
    for character in characters:
        embeddings[character] = model.encode(sentences[i])
        i+=1

    #to confirm number of lines used to create the embedding
    print("size embedding", len(embeddings['Monica Geller']))

    for character in characters:

        mean = embeddings[character][0]
        for embedding in embeddings[character]:
            mean += embedding
        
        embeddings[character] = mean
    

    #####test set computation
    
    predicted = []
    real = []

    correct = 0
    total = 0

    for utterance in test_set:
        if len(utterance['speakers']) == 1 and utterance['transcript'] != "":

            total += 1

            index = sentences_only.index(utterance['transcript'])

        if sentences_tfidf[index][1] >= threshold:

            line_embed = model.encode(utterance['transcript'])
        
            pred = compute_similarity(line_embed)

            predicted.append(pred)
            real.append(utterance['speakers'][0])

            if pred in utterance['speakers']:
                correct+=1
            elif pred == "others" and utterance['speakers'][0] not in ['Monica Geller', 'Joey Tribbiani', 'Chandler Bing', 'Phoebe Buffay', 'Ross Geller', 'Rachel Green']:
                correct+=1

                        
    print(accuracy_score(real, predicted)*100)

    accuracy = (float(correct)/total)*100

    print("accuracy: ", accuracy)

    #print("f1 score: ", f1_score(real, predicted, average='macro'))

    print(len(characters))

In [None]:
all_lines = []
speakers = []


for utterance in test_set:
    if utterance['speakers'] != []:
        all_lines.append(utterance['transcript'])

for utterance in test_set:
    if utterance['speakers'] != []:
        speakers.append(utterance['speakers'][0])

#remove duplicates
#all_lines = list( dict.fromkeys(all_lines) )


tf_idf = tf_idf_calculator(all_lines)

all_lines_tf_idf = []

#format: all_lines_tf_idf = [...,[tf-idf score ,sentence],...]
for i,sentence in enumerate(all_lines):
    cell = []
    cell= [tf_idf[i], sentence, speakers[i]]
    all_lines_tf_idf.append(cell)
    
# lower -> higher     
sorted_list = sorted(all_lines_tf_idf)

tf_idfs_utterances = []

#get only the strings
for elem in sorted_list:
    tf_idfs_utterances.append(elem[1])


for elem in sorted_list:
    print(elem)


In [104]:
predicted = []
real = []

correct = 0
total = 0

for utterance in test_set:
    if len(utterance['speakers']) == 1:

        total += 1

        line_embed = model.encode(utterance['transcript'])
    
        pred = compute_similarity(line_embed)

        predicted.append(pred)
        real.append(utterance['speakers'][0])

        if pred == utterance['speakers'][0]:
            correct+=1
        elif pred == "others" and utterance['speakers'][0] not in ['Monica Geller', 'Joey Tribbiani', 'Chandler Bing', 'Phoebe Buffay', 'Ross Geller', 'Rachel Green']:
            correct+=1

                    
print(accuracy_score(real, predicted)*100)

accuracy = (float(correct)/total)*100

print("accuracy: ", accuracy)

print("f1 score: ", f1_score(real, predicted, average='macro'))

print(len(characters))

15.067147068457254
accuracy:  19.84932852931543
f1 score:  0.0034750003354247954
7


classifier WITH filter in the test set

In [15]:
predicted = []
real = []

correct = 0
total = 0


#threshold = 80
threshold = 30

all = 0
for utterance in test_set:
    if len(utterance['speakers']) == 1 :
        all += 1

        index = tf_idfs_utterances.index(utterance['transcript'])

        #if sorted_list[index][0] <= threshold:
        if sorted_list[index][0] >= threshold:

            total += 1

            line_embed = model.encode(utterance['transcript'])
        
            pred = compute_similarity(line_embed)

            predicted.append(pred)
            real.append(utterance['speakers'][0])

            if pred == utterance['speakers'][0]:
                correct+=1
            elif pred == "others" and utterance['speakers'][0] not in ['Monica Geller', 'Joey Tribbiani', 'Chandler Bing', 'Phoebe Buffay', 'Ross Geller', 'Rachel Green']:
                correct+=1


                    
print(accuracy_score(real, predicted)*100)

accuracy = (float(correct)/total)*100

print(accuracy)
print(len(characters))
print(threshold)

print(100 - float(total/all)*100)

14.319952067106051
18.184541641701617
7
30
46.25664144260183
