In [26]:
import torch
from transformers import GPT2Tokenizer, GPT2Model

import requests
import json

import math
from collections import Counter

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score

In [2]:
characters = ['Monica Geller', 'Joey Tribbiani', 'Chandler Bing', 'Phoebe Buffay', 'Ross Geller', 'Rachel Green', 'others']

#test set 1 = season 8

with open('sets/test_set1.json') as f:
    test_set1 = json.load(f)

with open('sets/train_set1.json') as f:
    train_set1 = json.load(f)

################################################################

#test set 2 = 10% of each season

with open('sets/test_set2.json') as f:
    test_set2 = json.load(f)


In [3]:
def tf_idf_calculator(lines):    
    # Compute the term frequency of each word in each sentence
    tf = []
    
    for sentence in lines:
        tf_sentence = {}
        for word in sentence.split():
            tf_sentence[word] = tf_sentence.get(word, 0) + 1
        tf.append(tf_sentence)

    # Compute the inverse document frequency of each word
    idf = {}
    for sentence in lines:
        for word in sentence.split():
            idf[word] = idf.get(word, 0) + 1
    for word in idf:
        idf[word] = math.log(len(lines) / idf[word])

    # Compute the tf-idf of each sentence
    tf_idf = []
    for i, sentence in enumerate(lines):
        tf_idf_sentence = 0
        for word in tf[i]:
            tf_idf_sentence += tf[i][word] * idf[word]
        tf_idf.append(tf_idf_sentence)
    
    return tf_idf

In [13]:
#substitue train_set1/2

values = []


for character in characters:

    lines_withcontext = []
    lines = []

    for idx, utterance in enumerate(train_set1):
        if utterance['speakers'] != [] and character in utterance['speakers'] and idx !=0:
            lines_withcontext.append([utterance['transcript'], train_set1[idx-1]['transcript']])
            lines.append(utterance['transcript'])
        elif idx ==0:
            lines_withcontext.append([utterance['transcript'], ""])
            lines.append(utterance['transcript'])
    
    
    tf_idf = tf_idf_calculator(lines)

    final = []

    #format: final = [...,[sentence, tf-idf score],...]
    for i,sentence in enumerate(lines):
        cell = []
        cell= [tf_idf[i], sentence]
        final.append(cell)
    
    # lower -> higher     
    sorted_list = sorted(final)

    l = []

    #get only the strings
    for elem in sorted_list:
        l.append(elem[1])

    #remove duplicates
    l = list( dict.fromkeys(l) )

    ''''
    final = []
    for utterance in l[-5:]:
        index = lines.index(utterance)
        final.append(lines_withcontext[index])
    

    values.append(final)'''

    #change x for number of lines 
    values = [5,10,20,50,100,150,200,500,1000,2000,5000, 6000]

    for x in values:

        higher = l[-x:]
        lower = l[:x]

        #substitute embeddings1/2

        f = open("embeddingscontext1/" + character + str(x)+ ".txt", "w")
        for line in higher:
            index = lines.index(line)

            for utterance in lines_withcontext[index]:
                f.write(utterance +"\n")
        f.close()

        f = open("embeddingscontext1/" + "others" + str(x)+ ".txt", "w")
        for line in lower:
            index = lines.index(line)
            
            for utterance in lines_withcontext[index]:
                f.write(utterance +"\n")
        f.close()




In [33]:
# Load pre-trained GPT-2 model and tokenizer
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2Model.from_pretrained(model_name)

def compute_embedding(previous_sentence,current_sentence):
    # Example input sentences
    # previous_sentence = "We were on a break."
    # current_sentence = "How you doin'?"

    previous_tokens = tokenizer.tokenize(previous_sentence)
    current_tokens = tokenizer.tokenize(current_sentence)

    input_tokens = ['[CLS]'] + previous_tokens + ['[SEP]'] + current_tokens + ['[SEP]']

    # Convert tokens to token IDs
    input_ids = tokenizer.convert_tokens_to_ids(input_tokens)

    # Convert the token IDs to a PyTorch tensor
    input_tensor = torch.tensor([input_ids])

    # Generate contextual embeddings
    with torch.no_grad():
        model_output = model(input_tensor)

    # Get the contextual embeddings for the current sentence
    embedding = model_output.last_hidden_state.mean(dim=1)[0]  # Mean pooling over tokens

    return embedding


In [30]:
sentences = []

for character in characters:
    f = open("embeddingscontext1/" + character +"100"+ ".txt", "r")
    lines = f.readlines()
    sentences.append(lines)


i=0
embeddings = {}
for character in characters:
    embeddings[character] = []
    for idx, utterance in enumerate(sentences[i]):
        if (idx % 2)==0:
            embeddings[character].append(compute_embedding(sentences[i][idx+1], utterance))

    i+=1

#print(embeddings)

for character in characters:

    sum = 0
    for embedding in embeddings[character]:
        sum += embedding

    
    embeddings[character] = sum


In [27]:
def compute_similarity(vector):

    angles = {}

    for character in characters:
        angles[character] = cosine_similarity(vector.reshape(1,-1), embeddings[character].reshape(1,-1))[0][0]

    
    
    #Smaller angles between vectors produce larger cosine values, indicating greater cosine similarity

    character = [i for i in angles if angles[i]==max(angles.values())]


    return character[0]

In [31]:
predicted = []
real = []

correct = 0
total = 0

for idx, utterance in enumerate(test_set1):
    if len(utterance['speakers']) == 1:

        total += 1
        if idx == 0:
            line_embed = compute_embedding("", utterance['transcript'])
        else:
            line_embed = compute_embedding(test_set1[idx-1]['transcript'], utterance['transcript'])
    
        pred = compute_similarity(line_embed)

        predicted.append(pred)
        real.append(utterance['speakers'][0])

        if pred == utterance['speakers'][0]:
            correct+=1
        elif pred == "others" and utterance['speakers'][0] not in ['Monica Geller', 'Joey Tribbiani', 'Chandler Bing', 'Phoebe Buffay', 'Ross Geller', 'Rachel Green']:
            correct+=1

                    
print(accuracy_score(real, predicted)*100)

accuracy = (float(correct)/total)*100

print("accuracy: ", accuracy)

#print("f1 score: ", f1_score(real, predicted, average='macro'))

print(len(characters))

11.012719368861697
accuracy:  14.168410883915634
7
