In [93]:
import json
import numpy as np

from sentence_transformers import SentenceTransformer

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score

import math

from sklearn.metrics import f1_score

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

from sklearn.feature_extraction.text import TfidfVectorizer

import pandas as pd

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/catarina/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [99]:
model = SentenceTransformer('all-MiniLM-L6-v2')

characters = ['Monica Geller', 'Joey Tribbiani', 'Chandler Bing', 'Phoebe Buffay', 'Ross Geller', 'Rachel Green', 'others']

#characters = ['Monica Geller', 'Joey Tribbiani', 'Chandler Bing', 'Phoebe Buffay', 'Ross Geller', 'Rachel Green']


#test set 1 = season 8

with open('sets/test_set1.json') as f:
    test_set1 = json.load(f)

with open('sets/train_set1.json') as f:
    train_set1 = json.load(f)

################################################################

#test set 2 = 10% of each season

with open('sets/test_set2.json') as f:
    test_set2 = json.load(f)

with open('sets/train_set2.json') as f:
    train_set2 = json.load(f)


#CHANGE HERE
test_set = test_set1
train_set = train_set1
#number = 1

all_lines = []

#corpus
for line in train_set:
    all_lines.append(line['transcript'])

In [79]:
def compute_similarity(vector):

    angles = {}

    for character in characters:
        angles[character] = cosine_similarity(vector.reshape(1,-1), embeddings[character].reshape(1,-1))[0][0]

    
    
    #Smaller angles between vectors produce larger cosine values, indicating greater cosine similarity

    character = [i for i in angles if angles[i]==max(angles.values())]


    return character[0]

Approach 1

In [92]:
#change x for number of lines 
#values = [5,10,20,50,100,150,200,500,1000,2000,5000]
#values = [2000]

values = [100,250,500, 1000]

for value in values:
    sentences = []

    for character in characters:
        f = open("embeddings" + "/" + character + str(value) + ".txt", "r")
        lines = f.readlines()
        sentences.append(lines)

    i=0
    embeddings = {}
    for character in characters:
        embeddings[character] = model.encode(sentences[i])
        i+=1

    #to confirm number of lines used to create the embedding
    print("size embedding", len(embeddings['Monica Geller']))

    for character in characters:

        mean = embeddings[character][0]
        for embedding in embeddings[character]:
            mean += embedding
        
        embeddings[character] = mean
    

    #####test set computation
    
    predicted = []
    real = []

    correct = 0
    total = 0

    for utterance in test_set:
        if len(utterance['speakers']) == 1:

            total += 1

            line_embed = model.encode(utterance['transcript'])
        
            pred = compute_similarity(line_embed)

            predicted.append(pred)
            real.append(utterance['speakers'][0])

            if pred == utterance['speakers'][0]:
                correct+=1
            elif pred == "others" and utterance['speakers'][0] not in ['Monica Geller', 'Joey Tribbiani', 'Chandler Bing', 'Phoebe Buffay', 'Ross Geller', 'Rachel Green']:
                correct+=1

                        
    print(accuracy_score(real, predicted)*100)

    accuracy = (float(correct)/total)*100

    print("accuracy: ", accuracy)

    #print("f1 score: ", f1_score(real, predicted, average='macro'))

    print(len(characters))

size embedding 200
15.640353750409433
accuracy:  19.079593842122502
7
size embedding 500
14.657713724205697
accuracy:  18.53914182771045
7
size embedding 1000
15.705863085489682
accuracy:  19.931215198165738
7
size embedding 2000
14.330167048804455
accuracy:  20.340648542417295
7


Filtration

In [100]:
import string

vectorizer = TfidfVectorizer()
vectors = vectorizer.fit(all_lines)

feature_names = vectorizer.get_feature_names_out()

test_lines = []
speakers = []

for line in test_set:
    test_lines.append(line['transcript'])
    speakers.append(line['speakers'][0])

tf_idf = vectorizer.transform(test_lines)

dense = tf_idf.todense()
denselist = dense.tolist()
df = pd.DataFrame(denselist, columns=feature_names)

values = []

for sentence in dense:
    values.append(sentence.sum())


#associate values with sentences
sentences_tfidf = []
for i in range(len(test_lines)):
    sentences_tfidf.append({'sentence': test_lines[i], 'value': values[i], 'speaker': speakers[i]})

#sort sentences by value
sentences_tfidf = sorted(sentences_tfidf, key = lambda i: i['value'], reverse=True)

sentences_only = [sentence['sentence'] for sentence in sentences_tfidf]

In [103]:
total = 0
removed = 0

for line in sentences_tfidf:
    total+=1

    if line['value'] < 3:
        removed+=1


print(removed/total)


0.6994052403150619


In [91]:
#change x for number of lines 
values = [100,250,500, 1000]

threshold = [1,2,3]

for t in threshold:

    print("threshold", t)

    for value in values:
        sentences = []

        for character in characters:
            f = open("embeddings" + "/" + character + str(value) + ".txt", "r")
            lines = f.readlines()
            sentences.append(lines)

        i=0
        embeddings = {}
        for character in characters:
            embeddings[character] = model.encode(sentences[i])
            i+=1

        #to confirm number of lines used to create the embedding
        print("size embedding", len(embeddings['Monica Geller']))

        for character in characters:

            mean = embeddings[character][0]
            for embedding in embeddings[character]:
                mean += embedding
            
            embeddings[character] = mean
        

        #####test set computation
        
        predicted = []
        real = []

        correct = 0
        total = 0

        for utterance in test_set:
            if len(utterance['speakers']) == 1 and utterance['transcript'] != "":

                index = sentences_only.index(utterance['transcript'])

            if sentences_tfidf[index]['value'] >= t:

                total += 1

                line_embed = model.encode(utterance['transcript'])
            
                pred = compute_similarity(line_embed)

                predicted.append(pred)
                real.append(utterance['speakers'][0])

                if pred == utterance['speakers'][0]:
                    correct+=1
                elif pred == "others" and utterance['speakers'][0] not in ['Monica Geller', 'Joey Tribbiani', 'Chandler Bing', 'Phoebe Buffay', 'Ross Geller', 'Rachel Green']:
                    correct+=1

                        
        print(accuracy_score(real, predicted)*100)

        accuracy = (float(correct)/total)*100

        print("accuracy: ", accuracy)

    print('\n')


threshold 1
size embedding 200
15.551921504497138
accuracy:  18.969746524938675
size embedding 500
14.63614063777596
accuracy:  18.47914963205233
size embedding 1000
15.682747342600164
accuracy:  19.869174161896975
size embedding 2000
14.292722812755521
accuracy:  20.29435813573181


threshold 2
size embedding 200
16.865284974093264
accuracy:  20.803108808290155
size embedding 500
16.06217616580311
accuracy:  19.533678756476682
size embedding 1000
17.875647668393782
accuracy:  20.544041450777204
size embedding 2000
17.564766839378237
accuracy:  20.958549222797927


threshold 3
size embedding 200
17.805953693495038
accuracy:  21.499448732083792
size embedding 500
17.585446527012127
accuracy:  20.286659316427784
size embedding 1000
20.17640573318633
accuracy:  22.105843439911794
size embedding 2000
19.570011025358326
accuracy:  21.775082690187432


