In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
import pandas as pd

from sklearn.metrics import accuracy_score

from scipy.sparse import hstack
from sklearn.metrics import confusion_matrix

import requests
import json

import numpy as np

import random

from sentence_transformers import SentenceTransformer

from sklearn.metrics.pairwise import cosine_similarity


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model = SentenceTransformer('all-MiniLM-L6-v2')

characters = ['Monica Geller', 'Joey Tribbiani', 'Chandler Bing', 'Phoebe Buffay', 'Ross Geller', 'Rachel Green', 'others']

#test set 1 = season 8

with open('sets/test_set1.json') as f:
    test_set1 = json.load(f)

with open('sets/train_set1.json') as f:
    train_set1 = json.load(f)

################################################################

#test set 2 = 10% of each season

with open('sets/test_set2.json') as f:
    test_set2 = json.load(f)

with open('sets/train_set2.json') as f:
    train_set2 = json.load(f)


#CHANGE HERE
test_set = test_set2
train_set = train_set2

all_lines = []

#corpus
for line in train_set:
    all_lines.append(line['transcript'])

Filtration

In [3]:
import string
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
vectors = vectorizer.fit(all_lines)

feature_names = vectorizer.get_feature_names_out()

test_lines = []
speakers = []

for line in test_set:
    test_lines.append(line['transcript'])
    speakers.append(line['speakers'][0])

tf_idf = vectorizer.transform(test_lines)

dense = tf_idf.todense()
denselist = dense.tolist()
df = pd.DataFrame(denselist, columns=feature_names)

values = []

for sentence in dense:
    values.append(sentence.sum())


#associate values with sentences
sentences_tfidf = []
for i in range(len(test_lines)):
    sentences_tfidf.append({'sentence': test_lines[i], 'value': values[i], 'speaker': speakers[i]})

#sort sentences by value
sentences_tfidf = sorted(sentences_tfidf, key = lambda i: i['value'], reverse=True)

sentences_only = [sentence['sentence'] for sentence in sentences_tfidf]

Random Embeddings

In [4]:
for character in characters:
    character_lines = []
    for line in train_set:
        if character != 'others' and character in line['speakers']:
            character_lines.append(line['transcript'])
        elif character == 'others' and line['speakers'][0] not in characters:
            character_lines.append(line['transcript'])

    #change x for number of lines 
    values = [2000]

    for x in values:

        randomlist = random.sample(character_lines, x)

        f = open("embeddings" + "/" + character + str(x)+ ".txt", "w")
        for line in randomlist:
            f.write(line+"\n")
        f.close()

In [4]:
def compute_similarity(vector):

    angles = {}

    for character in characters:
        angles[character] = cosine_similarity(vector.reshape(1,-1), embeddings[character].reshape(1,-1))[0][0]

    
    
    #Smaller angles between vectors produce larger cosine values, indicating greater cosine similarity

    character = [i for i in angles if angles[i]==max(angles.values())]


    return character[0]

In [5]:
#change x for number of lines 
#values = [5,10,20,50,100,150,200,500,1000,2000,5000]
#values = [2000]

values = [2000]

for value in values:
    sentences = []

    for character in characters:
        f = open("embeddings" + "/" + character + str(value) + ".txt", "r")
        lines = f.readlines()
        sentences.append(lines)

    i=0
    embeddings = {}
    for character in characters:
        embeddings[character] = model.encode(sentences[i])
        i+=1

    #to confirm number of lines used to create the embedding
    print("size embedding", len(embeddings['Monica Geller']))

    for character in characters:

        mean = embeddings[character][0]
        for embedding in embeddings[character]:
            mean += embedding
        
        embeddings[character] = mean
    

    #####test set computation
    
    predicted = []
    real = []

    correct = 0
    total = 0

    for utterance in test_set:
        #filtration
        index = sentences_only.index(utterance['transcript'])
        if sentences_tfidf[index]['value'] >= 3:
            if len(utterance['speakers']) == 1:

                total += 1

                line_embed = model.encode(utterance['transcript'])
            
                pred = compute_similarity(line_embed)

                predicted.append(pred)
                real.append(utterance['speakers'][0])

                if pred == utterance['speakers'][0]:
                    correct+=1
                elif pred == "others" and utterance['speakers'][0] not in ['Monica Geller', 'Joey Tribbiani', 'Chandler Bing', 'Phoebe Buffay', 'Ross Geller', 'Rachel Green']:
                    correct+=1

                        
    print(accuracy_score(real, predicted)*100)

    accuracy = (float(correct)/total)*100

    print("accuracy: ", accuracy)

    #print("f1 score: ", f1_score(real, predicted, average='macro'))

    print(len(characters))

size embedding 2000
20.133111480865225
accuracy:  21.686078757626177
7


Machine Learning: Nayve Bayes

Filtration

In [3]:
import string
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
vectors = vectorizer.fit(all_lines)

feature_names = vectorizer.get_feature_names_out()

test_lines = []
speakers = []

for line in test_set:
    test_lines.append(line['transcript'])
    speakers.append(line['speakers'][0])

tf_idf = vectorizer.transform(test_lines)

dense = tf_idf.todense()
denselist = dense.tolist()
df = pd.DataFrame(denselist, columns=feature_names)

values = []

for sentence in dense:
    values.append(sentence.sum())


#associate values with sentences
sentences_tfidf = []
for i in range(len(test_lines)):
    sentences_tfidf.append({'sentence': test_lines[i], 'value': values[i], 'speaker': speakers[i]})

#sort sentences by value
sentences_tfidf = sorted(sentences_tfidf, key = lambda i: i['value'], reverse=True)

sentences_only = [sentence['sentence'] for sentence in sentences_tfidf]

In [11]:
train_set_lines = []
train_set_speakers = []

for utterance in train_set:
    #ensure it is NOT a scene annotation and it is an actual line
    if utterance['speakers']!= []:
        train_set_lines.append(utterance['transcript'])
        train_set_speakers.append(utterance['speakers'][0])


test_set_lines = []
test_set_speakers = []

for utterance in test_set:
    #filtration
    index = sentences_only.index(utterance['transcript'])
    if sentences_tfidf[index]['value'] >= 3:
        #ensure it is NOT a scene annotation and it is an actual line
        if utterance['speakers']!= []:
            test_set_lines.append(utterance['transcript'])
            test_set_speakers.append(utterance['speakers'][0])

# Word Counts
vectorizer = CountVectorizer()
train_vectors = vectorizer.fit_transform(train_set_lines)
test_vectors = vectorizer.transform(test_set_lines)

# Train the Naive Bayes classifier
clf = MultinomialNB()
clf.fit(train_vectors, train_set_speakers)

# Classify the test set
test_predictions = clf.predict(test_vectors)

accuracy = accuracy_score(test_set_speakers, test_predictions)

print("Accuracy: {:.2f}%".format(accuracy*100))

Accuracy: 31.45%


In [1]:
from transformers import RobertaForSequenceClassification, RobertaTokenizer
import torch
import json

# Load the trained RoBERTa model and tokenizer
model_name = "roberta-large"
tokenizer = RobertaTokenizer.from_pretrained(model_name)
model = RobertaForSequenceClassification.from_pretrained(model_name)

# Load your test set JSON
with open('sets/train_set1.json', 'r') as json_file:
    test_data = json.load(json_file)

# Define a function to predict the speaker for a given transcript
def predict_speaker(transcript):
    input_ids = tokenizer.encode(transcript, return_tensors="pt")
    with torch.no_grad():
        logits = model(input_ids).logits
    predicted_class = logits.argmax().item()

    # Map predicted_class to speaker names or "others"
    speakers = ['Monica Geller', 'Joey Tribbiani', 'Chandler Bing', 'Phoebe Buffay', 'Ross Geller', 'Rachel Green', 'others']
    predicted_speaker = speakers[predicted_class]
    
    return predicted_speaker

# Make predictions on the test set
predictions = []

for data_point in test_data:
    transcript = data_point['transcript']
    predicted_speaker = predict_speaker(transcript)
    predictions.append({"transcript": transcript, "predicted_speaker": predicted_speaker})

# Save the predictions to a JSON file
with open('predictions.json', 'w') as json_file:
    json.dump(predictions, json_file, indent=4)

print("Predictions saved to 'predictions.json'")


  from .autonotebook import tqdm as notebook_tqdm
Downloading (…)olve/main/vocab.json: 100%|██████████| 899k/899k [00:00<00:00, 1.88MB/s]
Downloading (…)olve/main/merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 1.41MB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 482/482 [00:00<00:00, 1.22MB/s]
Downloading model.safetensors: 100%|██████████| 1.42G/1.42G [05:55<00:00, 4.00MB/s]
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
