In [2]:
import string
import operator

import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter

import pandas as pd
import json

from sklearn.metrics import accuracy_score

# NLTK's English stop-word list as a set; build_dict filters unigrams against it.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /cfs/home/u021320/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
from nltk.stem import PorterStemmer

def generate_ngrams(sentence, n):
    """Return the word-level n-grams of ``sentence`` as space-joined strings.

    The sentence is lowercased, ASCII punctuation (``string.punctuation``) is
    removed, and the result is split on whitespace. Stop-word removal and
    stemming are not applied, so n-grams keep function words.

    Parameters
    ----------
    sentence : str
        Raw text to split into n-grams.
    n : int
        Number of words per n-gram; must be at least 1.

    Returns
    -------
    list of str
        N-grams in order of appearance (duplicates kept). Empty when the
        sentence has fewer than ``n`` words.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1. Without this check the slicing below would
        silently produce empty or misaligned "n-grams".
    """
    if n < 1:
        raise ValueError("n must be a positive integer, got " + repr(n))

    # translate() only deletes characters, so lowercasing once is enough.
    cleaned = sentence.lower().translate(str.maketrans('', '', string.punctuation))
    words = cleaned.split()

    # Slide a window of n words over the sentence.
    return [' '.join(words[i:i+n]) for i in range(len(words) - n + 1)]

In [4]:
def build_dict(train_set, characters,n):
    """Count unigrams or n-grams per character over the training utterances.

    Parameters
    ----------
    train_set : iterable of dict
        Utterances; each one is read through its ``'speakers'`` (list of
        names) and ``'transcript'`` (str) keys.
    characters : list of str
        Character names to build counts for. The special name ``'Others'``
        collects every utterance whose first speaker is not in ``characters``.
    n : int
        ``1`` counts single words (tokenised with ``word_tokenize``, with stop
        words and the typographic apostrophe dropped); any other value counts
        the n-grams produced by ``generate_ngrams``.

    Returns
    -------
    dict
        Maps each character to a list of ``(token, count)`` tuples sorted by
        decreasing count.

    Notes
    -----
    Utterances with an empty transcript are ignored. Utterances with an empty
    ``speakers`` list cannot be attributed to anybody and are skipped
    (previously they raised ``IndexError`` while processing ``'Others'``).
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)
    ranked_by_character = {}

    for character in characters:
        # Collect every non-empty transcript attributed to this character.
        lines = []
        for utterance in train_set:
            transcript = utterance['transcript']
            speakers = utterance['speakers']
            if transcript == "":
                continue
            if character in speakers:
                lines.append(transcript)
            elif character == 'Others' and speakers and speakers[0] not in characters:
                lines.append(transcript)

        tokens = []
        for line in lines:
            # remove punctuation and normalise case
            line = line.translate(strip_punctuation).lower()

            if n==1:
                tokens.extend(
                    word for word in word_tokenize(line)
                    if word not in stop_words and word != "’"
                )
            else:
                tokens.extend(generate_ngrams(line,n))

        frequency = Counter(tokens)

        # Both branches sort by decreasing count but break ties differently
        # (first-seen first for unigrams, last-seen first for n-grams). This is
        # kept as is because callers slice the top-M entries, so tie order
        # affects which tokens end up in a profile.
        if n==1:
            ranked = sorted(frequency.items(), key=operator.itemgetter(1), reverse=True)
        else:
            ranked = sorted(frequency.items(), key=operator.itemgetter(1))[::-1]

        ranked_by_character[character] = ranked

    return ranked_by_character

In [24]:
# Load the train/test splits. The encoding is given explicitly so that
# non-ASCII characters in the transcripts are decoded the same way on every
# platform instead of depending on the locale default.
with open('sets/train_set1.json', encoding='utf-8') as f:
    train_set1 = json.load(f)

with open('sets/train_set2.json', encoding='utf-8') as f:
    train_set2 = json.load(f)

with open('sets/test_set1.json', encoding='utf-8') as f:
    test_set1 = json.load(f)

with open('sets/test_set2.json', encoding='utf-8') as f:
    test_set2 = json.load(f)


# CHANGE HERE: pick which split the cells below train and evaluate on
test_set = test_set2
train_set = train_set2


# The character list is written out manually; 'Others' groups every other speaker.
characters = ['Monica Geller', 'Joey Tribbiani', 'Chandler Bing', 'Phoebe Buffay', 'Ross Geller', 'Rachel Green', 'Others']


In [12]:
# Inspect the most frequent n-grams of each character.
# Change the 3rd argument to 1 for unigrams, 2 for bigrams, 3 for trigrams, etc.
# The result is bound to `ngram_counts` rather than `dict` so the builtin is not
# shadowed, and it uses the `train_set` selected above instead of a hardcoded split.
ngram_counts = build_dict(train_set, characters, 8)

df = pd.DataFrame.from_dict(ngram_counts, orient='index')

# keep only the 5 most frequent n-grams of each character
df_new = df.iloc[:, 0:5]

# display the DataFrame with one column per character
df_new.T.style

Unnamed: 0,Monica Geller,Joey Tribbiani,Chandler Bing,Phoebe Buffay,Ross Geller,Rachel Green,Others
0,"('get it ill get it ill get it', 12)","('i am there i am there i am', 6)","('no no no no no no no no', 6)","('you cry and you cry and you cry', 4)","('ago there were these people called the maccabees', 4)","('ow ow ow ow ow ow ow ow', 9)","('confident woman who does not need to smoke', 4)"
1,"('ill get it ill get it ill get', 12)","('there i am there i am there i', 6)","('ow ow ow ow ow ow ow ow', 5)","('cat smelly cat what are they feeding you', 3)","('years ago there were these people called the', 4)","('my god oh my god oh my god', 6)","('strong confident woman who does not need to', 4)"
2,"('it ill get it ill get it ill', 11)","('can i talk to you for a second', 5)","('good game good game good game good game', 4)","('smelly cat smelly cat what are they feeding', 3)","('and years ago there were these people called', 4)","('oh my god oh my god oh my', 6)","('a strong confident woman who does not need', 4)"
3,"('no no no no no no no no', 4)","('am there i am there i am there', 4)","('get out get out get out get out', 4)","('cat what are they feeding you smelly cat', 3)","('years and years ago there were these people', 4)","('god oh my god oh my god oh', 5)","('see whats going on here this man is', 4)"
4,"('why wont i be married when im 40', 4)","('quack tweet tweet quack quack tweet tweet quack', 3)","('game good game good game good game good', 3)","('cat smellly cat what are they feeding you', 3)","('lets cool off okay lets get some frozen', 3)","('i am yes i am yes i am', 4)","('you see whats going on here this man', 4)"


In [11]:
def compute_similarity(profiles_character, profile_utterance, characters):
    """Pick the character whose profile is closest to an utterance profile.

    For each character profile a dissimilarity is accumulated over the n-grams
    that also occur in the utterance profile::

        sum(((2 * (f_char - f_utt)) / (f_char + f_utt)) ** 2)

    and the character with the smallest value wins (the first one on ties).

    Parameters
    ----------
    profiles_character : iterable of dict
        One ``{ngram: frequency}`` mapping per character, in the same order as
        ``characters``.
    profile_utterance : dict
        ``{ngram: frequency}`` mapping of the utterance to classify.
    characters : sequence of str
        Character names, indexed in parallel with ``profiles_character``.

    Returns
    -------
    str or None
        The best-matching character name, or ``None`` when no character
        profile shares any n-gram with the utterance. This holds for any
        number of characters, not only seven.
    """
    best_index = None
    best_distance = None

    for index, profile_character in enumerate(profiles_character):
        distance = 0
        matches = 0
        for ngram in profile_character:
            # only common n-grams are taken into account
            if ngram in profile_utterance:
                matches += 1
                difference = profile_character[ngram] - profile_utterance[ngram]
                total = profile_character[ngram] + profile_utterance[ngram]
                distance += ((2*difference) / total)**2

        if matches == 0:
            # there are no common n-grams (intersection is empty)
            continue

        # strict '<' keeps the earliest character when distances are equal
        if best_distance is None or distance < best_distance:
            best_distance = distance
            best_index = index

    if best_index is None:
        return None

    return characters[best_index]

In [7]:
def build_profile_utterance(utterance, n):
    """Build the n-gram frequency profile of a single utterance.

    The text is stripped of punctuation, lowercased and split into n-grams
    with ``generate_ngrams``. Each distinct n-gram is mapped to its count
    divided by the number of distinct n-grams in the utterance.

    Keys are ordered by decreasing count; among equal counts the n-gram seen
    last comes first. Returns an empty dict when the utterance yields no
    n-grams.
    """
    cleaned = utterance.translate(str.maketrans('', '', string.punctuation)).lower()

    # tally occurrences, keeping first-seen order in the dict
    occurrences = {}
    for gram in generate_ngrams(cleaned, n):
        occurrences[gram] = occurrences.get(gram, 0) + 1

    # ascending stable sort followed by a full reversal
    ranked = list(occurrences.items())
    ranked.sort(key=lambda pair: pair[1])
    ranked.reverse()

    distinct = len(ranked)
    profile = {}
    for gram, count in ranked:
        profile[gram] = count/distinct
    return profile

In [26]:
# Grid evaluation: for every profile size (M) and n-gram size (N), build one
# profile per character from the training set and classify each single-speaker
# test utterance by its closest profile.
values = [5,10,20,50,100,150,200,500,1000,2000,5000]  # profile size
n = [2,3,4,5,6]  # n gram size

# The ranked n-gram counts depend only on N, so they are built once per N
# instead of once per (M, N) pair.
ngram_counts_by_size = {size: build_dict(train_set, characters, size) for size in n}

for value in values:

    print("profile size (M): " + str(value))

    for size in n:
        print("n-gram size (N): " + str(size))

        # Character profiles: the `value` most frequent n-grams, each weighted
        # by its count divided by the number of distinct n-grams of the character.
        ngram_counts = ngram_counts_by_size[size]
        profiles = []
        for character in ngram_counts:
            ranked = ngram_counts[character]
            total_n_grams = len(ranked)
            profiles.append({ngram: count/total_n_grams for ngram, count in ranked[0:value]})

        ##### test set computation

        predicted = []
        real = []

        correct = 0
        total = 0

        for utterance in test_set:
            speakers = utterance['speakers']
            if len(speakers) != 1 or utterance['transcript'] == "":
                continue

            profile_utterance = build_profile_utterance(utterance['transcript'], size)
            if profile_utterance == {}:
                # the utterance is too short to contain any n-gram of this size
                continue

            # keep at most `value` n-grams, like the character profiles
            profile_utterance = {k: profile_utterance[k] for k in list(profile_utterance.keys())[:value]}

            pred = compute_similarity(profiles, profile_utterance, characters)
            if pred is None:
                # no profile shares an n-gram with the utterance: not counted
                continue

            total += 1

            predicted.append(pred)
            real.append(speakers[0])

            if pred in speakers:
                correct+=1
            elif pred == "Others" and speakers[0] not in characters:
                # any speaker outside the character list counts as 'Others'
                correct+=1

        print("correct: ", correct)
        print("total: ", total)
        # With long n-grams and small profiles no utterance may match at all;
        # report nan instead of dividing by zero.
        accuracy = (float(correct)/total)*100 if total else float('nan')

        print("accuracy: ", accuracy)

    print('\n')
    

profile size (M): 5
n-gram size (N): 2
correct:  122
total:  622
accuracy:  19.614147909967848
n-gram size (N): 3
correct:  38
total:  168
accuracy:  22.61904761904762
n-gram size (N): 4
correct:  16
total:  61
accuracy:  26.229508196721312
n-gram size (N): 5
correct:  11
total:  25
accuracy:  44.0
n-gram size (N): 6
correct:  4
total:  8
accuracy:  50.0


profile size (M): 10
n-gram size (N): 2
correct:  208
total:  1113
accuracy:  18.68823000898473
n-gram size (N): 3
correct:  90
total:  381
accuracy:  23.62204724409449
n-gram size (N): 4
correct:  29
total:  117
accuracy:  24.786324786324787
n-gram size (N): 5
correct:  11
total:  44
accuracy:  25.0
n-gram size (N): 6
correct:  7
total:  18
accuracy:  38.88888888888889


profile size (M): 20
n-gram size (N): 2
correct:  332
total:  2022
accuracy:  16.419386745796242
n-gram size (N): 3
correct:  149
total:  666
accuracy:  22.372372372372375
n-gram size (N): 4
correct:  46
total:  220
accuracy:  20.909090909090907
n-gram size (N): 5
c