In [2]:
import pandas as pd
import numpy as np
import math

from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re
from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer

import time

## functions

In [3]:
# remove stopwords from a sentece and lemmatize it
# lemmatizer = WordNetLemmatizer()
# stopwords = stopwords.words()

def clean_sentence(sentence, lemmatizer, stopwords_list):
    # remove special characters and lower letters
    sentence = re.sub("[^A-Za-z0-9]+", ' ', sentence).lower()
    sentence_new = []

    for word in sentence.split():
        if word not in stopwords_list and len(word) > 1:
            sentence_new.append(lemmatizer.lemmatize(word))
        else:
            continue
    
    if len(sentence_new) > 2:
        return ' '.join(sentence_new)

## preparing data

In [4]:
df = pd.read_csv('data/simpsons_dataset.csv')
df = df.dropna().reset_index(drop=True)
df.shape

(131853, 2)

In [4]:
df.head()

Unnamed: 0,raw_character_text,spoken_words
0,Miss Hoover,"No, actually, it was a little of both. Sometim..."
1,Lisa Simpson,Where's Mr. Bergstrom?
2,Miss Hoover,I don't know. Although I'd sure like to talk t...
3,Lisa Simpson,That life is worth living.
4,Edna Krabappel-Flanders,The polls will be open from now until the end ...


In [5]:
# lemmatization of words, removing special characters, lower words and remove stopwords

lemmatizer = WordNetLemmatizer()
stopwords_list = stopwords.words()

start_time = time.time()

spoken_words = df.spoken_words.apply(lambda x: clean_sentence(x, lemmatizer, stopwords_list))
spoken_words.dropna(inplace = True)
spoken_words.drop_duplicates(inplace = True)

print(time.time() - start_time)

59.02401065826416


In [65]:
spoken_words.shape

(90002,)

## tfidf version 1

In [66]:
# using tf idf in order to remove frequent words

sentences = spoken_words.values
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(sentences)

In [83]:
# argument percentage indicates how many percent of the most important words we want to keep in each sentece (values between 0 and 1)
# sentences is a list of strings

def remove_frequent_words(sentences, percentage):
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform(sentences)
    
    # vectors[i, j] is a tf-ids value for word feature_names[j] for sentence sentences[i]
    feature_names = vectorizer.get_feature_names()
    
    new_sentences = []
    
    for i in range(vectors.shape[0]):
        # vector form of a sentence
        sentence_vector = vectors[i].toarray()[0]
        indexes = np.where(sentence_vector != 0)[0]
        # words which occurs in a sentence with corresponding tf-idf values
        words_values = np.array([[feature_names[index], sentence_vector[index]] for index in indexes])
        # sort words by their tf-idf value and take only a percentage of words with the biggest tf-idf value
        try:
            new_sentence = words_values[words_values[:,1].argsort()][:,0]
            new_sentence = new_sentence[- int(len(new_sentence) * percentage) : ]
            # new_sentence = ' '.join(new_sentence)
        except:
            new_sentence = sentences[i].split()
        
        new_sentences.append(new_sentence)
        
    return new_sentences

In [84]:
new_sentences = remove_frequent_words(sentences, 0.8)

In [96]:
i = 4

print(new_sentences[i])
print(sentences[i])

['say' 'anything' 'left']
think anything left say


In [97]:
for word in sentences[i].split():
    no_occurances = 0

    for sentence in sentences:
        no_occurances += len(np.where(np.array(sentence.split()) == word)[0])

    print(word, no_occurances)

think 2653
anything 752
left 557
say 2398


## tfidf version 2

In [19]:
# suppose we have given a list of documents (each document is just a single string)
# bow argument is a bag of words for a single document
# word_dict is dictionary with all words from all documents as keys and 
# values indicates how many times these words appears in a single document for which we have given bow
# it returns a dictionary with values indicating tf value for each word in a word_dict and for given document

def computeTF(bow, word_dict):
    tfDict = {}
    bowCount = len(bow)
    for word, count in word_dict.items():
        tfDict[word] = count / float(bowCount)

    return tfDict


def computeIDF(word_dict_List):
    N = len(word_dict_List)
    
    # count the number of documents that contain a word w
    idfDict = dict.fromkeys(word_dict_List[0].keys(), 0)
    for doc in word_dict_List:
        for word, val in doc.items():
            if val > 0:
                idfDict[word] += 1
                
    for word, val in idfDict.items():
        idfDict[word] = math.log(N / float(val))
        
    return idfDict

# computing tf-idf values for each document in a documents_list
# it returns a list with dictionaries, i-th dictionary contains words which appears in all documents as keys 
# and values indicates tf-idf values for given word for i-th document

def computeTFIDF(documents_list):
    # word set is a set with words from all documents
    word_set = set()
    for doc in documents_list:
        bow = doc.split()
        word_set = word_set.union(set(bow))
    
    # list of tf values for each document
    tf_documents = []
    # word_dict_List will be used to calculate idf
    word_dict_List = []
    
    # calculating tf values for each document
    for doc in documents_list:
        # bag of words for a document
        bow = doc.split()
        # word_dict is a dictionary which indicates how many times each word (from word_set) appears in a document
        word_dict = dict.fromkeys(word_set, 0)
        for word in bow:
            word_dict[word] += 1
            
        word_dict_List.append(word_dict)
        tf_documents.append(computeTF(bow, word_dict))
        
    # calculating idf for each word
    idfs =  computeIDF(word_dict_List)
    
    # calculating tf-idf values for all words in each document
    tfidf_list = []
    
    for tfDict in tf_documents:
        tfidf = {}
        for word, val in tfDict.items():
            tfidf[word] = val * idfs[word]
            
        tfidf_list.append(tfidf)
        
    return tfidf_list

In [16]:
doc_list = spoken_words.values[:2]

start_time = time.time()

tf_idfs = computeTFIDF(doc_list)

print(start_time - time.time())

0.0


In [17]:
tf_idfs

[{'show': 0.0,
  'sometimes': 0.07701635339554948,
  'disease': 0.07701635339554948,
  'think': 0.07701635339554948,
  'little': 0.07701635339554948,
  'lesson': 0.0,
  'teachshow': 0.0,
  'like': 0.0,
  'actually': 0.07701635339554948,
  'talk': 0.0,
  'although': 0.0,
  'natural': 0.07701635339554948,
  'news': 0.07701635339554948,
  'touch': 0.0,
  'magazine': 0.07701635339554948,
  'know': 0.0,
  'plan': 0.0,
  'sure': 0.0},
 {'show': 0.0,
  'sometimes': 0.0,
  'disease': 0.0,
  'think': 0.0,
  'little': 0.0,
  'lesson': 0.06931471805599453,
  'teachshow': 0.06931471805599453,
  'like': 0.06931471805599453,
  'actually': 0.0,
  'talk': 0.06931471805599453,
  'although': 0.06931471805599453,
  'natural': 0.0,
  'news': 0.0,
  'touch': 0.06931471805599453,
  'magazine': 0.0,
  'know': 0.06931471805599453,
  'plan': 0.06931471805599453,
  'sure': 0.06931471805599453}]

In [18]:
doc_list

array(['actually little sometimes disease magazine news show natural think',
       'know although sure like talk touch lesson plan teachshow show'],
      dtype=object)

## word embeddings model with word2vec

In [7]:
# finding phrases in spoken_words

sent = [row.split() for row in spoken_words]
# the bigger threshold parameter, the fewer phrases, it depends on min_count also
phrases = Phrases(sent, min_count = 30, threshold = 10)
# use Phraser to cut down memory consumption
bigram = Phraser(phrases)
sentences = list(bigram[sent])

In [15]:
# check what phrases a model has found
print(len(bigram.find_phrases(sent)))
bigram.find_phrases(sent)

125


{'capital_city': 156.57407407407408,
 'feel_like': 10.07566438138214,
 'little_girl': 21.5048803764934,
 'let_go': 12.686611423893615,
 'last_year': 11.496963931764087,
 'dr_hibbert': 467.58712121212125,
 'last_night': 43.36695329229122,
 'good_lord': 13.496176933223298,
 'next_week': 30.333402061855672,
 'homer_simpson': 20.336465149180203,
 'mr_simpson': 31.12648738301149,
 'young_lady': 12.761946516365196,
 'uh_huh': 20.25305448597987,
 'wait_wait': 28.819873624765414,
 'wait_minute': 113.8391940450399,
 'five_hundred': 30.95109373305866,
 'little_bit': 19.135373009220455,
 'god_bless': 22.764261126137203,
 'go_ahead': 36.55549802231905,
 'haw_haw': 640.7076124567474,
 'hee_hee': 1089.696016296967,
 'look_like': 11.149331297707505,
 'three_hundred': 12.484653792104927,
 'let_see': 13.553745928338762,
 'comic_book': 199.39813266124412,
 'ooh_ooh': 13.553097270393225,
 'leave_alone': 31.102659574468085,
 'hundred_dollar': 59.83576674500587,
 'woo_hoo': 814.1322293623331,
 'patty_selma

In [26]:
w2v_model = Word2Vec(sentences,
                     min_count=20,
                     window=2,
                     vector_size=300,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20)

In [27]:
w2v_model.train(sentences, total_examples = w2v_model.corpus_count, epochs = 30, report_delay = 1)

(6809110, 17400750)

In [144]:
w2v_model.wv.most_similar('marge')

[('homer', 0.5787737369537354),
 ('abe', 0.5406563878059387),
 ('homie', 0.5033149719238281),
 ('badly', 0.498450368642807),
 ('livin', 0.48590725660324097),
 ('mom', 0.4801368713378906),
 ('settled', 0.47922274470329285),
 ('sorry', 0.47424599528312683),
 ('fault', 0.4683988690376282),
 ('sweetheart', 0.4633442461490631)]

In [31]:
w2v_model.wv['badly']

array([-7.82764927e-02, -1.04250707e-01, -9.59689766e-02,  7.13768229e-02,
        2.44302392e-01, -2.89452463e-01,  2.62507141e-01,  3.25873405e-01,
        2.52810687e-01, -7.94884488e-02, -1.10158287e-01, -2.13204414e-01,
       -1.26720360e-02, -2.02310309e-01, -7.16938227e-02,  3.05563654e-03,
        1.90176740e-01,  2.48435184e-01,  1.26867741e-01, -1.76872879e-01,
        1.51332229e-01, -1.70148730e-01, -1.43923024e-02,  9.82186347e-02,
        3.65428701e-02,  1.41942948e-01, -1.07179031e-01,  1.94095522e-01,
       -2.97538415e-02, -1.40040785e-01,  1.97938114e-01,  1.43576384e-01,
        3.50714549e-02, -3.54532897e-02, -8.84125102e-03, -7.70406872e-02,
       -5.39456494e-02, -7.72691816e-02,  7.70236403e-02,  7.00644404e-02,
       -2.37879232e-02, -2.87032891e-02, -3.30884866e-02, -4.17106152e-02,
        2.98272640e-01,  1.99935794e-01,  5.96361868e-02, -1.81941882e-01,
       -1.55064955e-01,  7.60004818e-02, -1.28048226e-01, -5.84208332e-02,
       -1.85599953e-01, -