In [5]:
import requests
import json
import numpy as np

from sentence_transformers import SentenceTransformer

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score

import math

In [164]:
model = SentenceTransformer('all-MiniLM-L6-v2')

#characters = ['Monica Geller', 'Joey Tribbiani', 'Chandler Bing', 'Phoebe Buffay', 'Ross Geller', 'Rachel Green', 'others']

characters = ['Monica Geller', 'Joey Tribbiani', 'Chandler Bing', 'Phoebe Buffay', 'Ross Geller', 'Rachel Green']

sentences = []

for character in characters:
    f = open("embeddings/" + character +"1000"+ ".txt", "r")
    lines = f.readlines()
    sentences.append(lines)

i=0
embeddings = {}
for character in characters:
    embeddings[character] = model.encode(sentences[i])
    i+=1


#to confirm number of lines used to create the embedding
print(len(embeddings['Monica Geller']))

for character in characters:

    mean = embeddings[character][0]
    for embedding in embeddings[character]:
        mean += embedding
    
    embeddings[character] = mean



1000


In [7]:
#test set 1 = season 10

json_file = 'https://raw.githubusercontent.com/emorynlp/character-mining/master/json/friends_season_10.json'
    
r = requests.get(json_file)
    
test_set = json.loads(r.text)

In [8]:
def tf_idf_calculator(lines):    
    # Compute the term frequency of each word in each sentence
    tf = []
    
    for sentence in lines:
        tf_sentence = {}
        for word in sentence.split():
            tf_sentence[word] = tf_sentence.get(word, 0) + 1
        tf.append(tf_sentence)

    # Compute the inverse document frequency of each word
    idf = {}
    for sentence in lines:
        for word in sentence.split():
            idf[word] = idf.get(word, 0) + 1
    for word in idf:
        idf[word] = math.log(len(lines) / idf[word])

    # Compute the tf-idf of each sentence
    tf_idf = []
    for i, sentence in enumerate(lines):
        tf_idf_sentence = 0
        for word in tf[i]:
            tf_idf_sentence += tf[i][word] * idf[word]
        tf_idf.append(tf_idf_sentence)
    
    return tf_idf

In [9]:
all_lines = []
for episode in test_set['episodes']:
    for scene in episode['scenes']:
        for utterance in scene['utterances']:
            if utterance['speakers'] != []:
                all_lines.append(utterance['transcript'])

#remove duplicates
all_lines = list( dict.fromkeys(all_lines) )

tf_idf = tf_idf_calculator(all_lines)

all_lines_tf_idf = []

#format: all_lines_tf_idf = [...,[tf-idf score ,sentence],...]
for i,sentence in enumerate(all_lines):
    cell = []
    cell= [tf_idf[i], sentence]
    all_lines_tf_idf.append(cell)
    
# lower -> higher     
sorted_list = sorted(all_lines_tf_idf)

tf_idfs_utterances = []

#get only the strings
for elem in sorted_list:
    tf_idfs_utterances.append(elem[1])


#for elem in sorted_list:
#    print(elem)


In [10]:
def compute_similarity(vector):

    angles = {}

    for character in characters:
        angles[character] = cosine_similarity(vector.reshape(1,-1), embeddings[character].reshape(1,-1))[0][0]

    
    
    #Smaller angles between vectors produce larger cosine values, indicating greater cosine similarity

    character = [i for i in angles if angles[i]==max(angles.values())]


    return character[0]


all_sentences = []

for s in sentences:
    for l in s:
        all_sentences.append(l)


In [60]:
predicted = []
real = []

correct = 0
total = 0

for episode in test_set['episodes']:
    for scene in episode['scenes']:
        for utterance in scene['utterances']:
            if len(utterance['speakers']) == 1 and utterance['transcript'] not in all_sentences:

                total += 1

                line_embed = model.encode(utterance['transcript'])
    
                pred = compute_similarity(line_embed)

                predicted.append(pred)
                real.append(utterance['speakers'][0])

                if pred == utterance['speakers'][0]:
                    correct+=1
                elif pred == "others" and utterance['speakers'][0] not in ['Monica Geller', 'Joey Tribbiani', 'Chandler Bing', 'Phoebe Buffay', 'Ross Geller', 'Rachel Green']:
                    correct+=1

                #print("predicted :", pred)
                #print("real:", utterance['speakers'][0])

                    
print(accuracy_score(real, predicted)*100)

accuracy = (float(correct)/total)*100

print(accuracy)

print(len(characters))

13.525923091639564
18.691409986607997
7


classifier WITH filter in the test set

In [169]:
predicted = []
real = []

correct = 0
total = 0


threshold = 10

all = 0
for episode in test_set['episodes']:
    for scene in episode['scenes']:
        for utterance in scene['utterances']:
            if len(utterance['speakers']) == 1 and utterance['transcript'] not in all_sentences:
                all += 1

                index = tf_idfs_utterances.index(utterance['transcript'])

                if sorted_list[index][0] >= threshold:

                    total += 1

                    line_embed = model.encode(utterance['transcript'])
        
                    pred = compute_similarity(line_embed)

                    predicted.append(pred)
                    real.append(utterance['speakers'][0])

                    if pred == utterance['speakers'][0]:
                        correct+=1
                    elif pred == "others" and utterance['speakers'][0] not in ['Monica Geller', 'Joey Tribbiani', 'Chandler Bing', 'Phoebe Buffay', 'Ross Geller', 'Rachel Green']:
                        correct+=1


                    
print(accuracy_score(real, predicted)*100)

accuracy = (float(correct)/total)*100

print(accuracy)
print(len(characters))
print(threshold)

print(100 - float(total/all)*100)

17.54855994641661
17.54855994641661
6
10
14.310311842356995
