In [5]:
import re  # For preprocessing
import pandas as pd  # For data handling
from time import time  # To time our operations
from collections import defaultdict  # For word frequency

import spacy  # For preprocessing

import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

In [6]:
df = pd.read_csv("simpsons_dataset.csv")
df.shape

(158314, 2)

In [7]:
df.head()

Unnamed: 0,raw_character_text,spoken_words
0,Miss Hoover,"No, actually, it was a little of both. Sometim..."
1,Lisa Simpson,Where's Mr. Bergstrom?
2,Miss Hoover,I don't know. Although I'd sure like to talk t...
3,Lisa Simpson,That life is worth living.
4,Edna Krabappel-Flanders,The polls will be open from now until the end ...


In [8]:
df.isnull().sum()

raw_character_text    17814
spoken_words          26459
dtype: int64

In [9]:
# Removing the missing values:
df = df.dropna().reset_index(drop=True)
df.isnull().sum()

raw_character_text    0
spoken_words          0
dtype: int64

In [10]:
# We are lemmatizing and removing the stopwords and non-alphabetic characters for each line of dialogue
nlp = spacy.load("en",disable=["ner","parser"])    # disabling Named Entity Recognition for speed

def cleaning(doc):
    # Lemmatizes and removes stopwords
    # doc needs to be a spacy Doc object
    
    txt = [token.lemma_ for token in doc if not token.is_stop]
    
    # Word2Vec uses context words to learn the vector representation of a target word,
    # if a sentence is only one or two words long,
    # the benefit for the training is very small
    
    if len(txt) > 2:
        return ' '.join(txt)

In [11]:
# Removes non-alphabetic characters:
brief_cleaning = (re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in df['spoken_words'])

In [12]:
# Taking advantage of spaCy .pipe() attribute to speed-up the cleaning process:
t = time()

txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=5000, n_threads=-1)]

print('Time to clean up everything: {} mins'.format(round((time() - t) / 60, 2)))

Time to clean up everything: 2.32 mins


In [13]:
# Put the results in a DataFrame to remove missing values and duplicates:
df_clean = pd.DataFrame({'clean': txt})
df_clean = df_clean.dropna().drop_duplicates()
df_clean.shape

(92412, 1)

In [31]:
from gensim.models.phrases import Phrases, Phraser

In [32]:
sent = [row.split() for row in df_clean['clean']]

In [57]:
phraser = Phrases(sent, min_count=30, progress_per=10000)

INFO - 23:51:42: collecting all words and their counts
INFO - 23:51:42: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 23:51:42: PROGRESS: at sentence #10000, processed 67396 words and 50551 word types
INFO - 23:51:42: PROGRESS: at sentence #20000, processed 140465 words and 95808 word types
INFO - 23:51:43: PROGRESS: at sentence #30000, processed 207950 words and 132011 word types
INFO - 23:51:43: PROGRESS: at sentence #40000, processed 270207 words and 164407 word types
INFO - 23:51:43: PROGRESS: at sentence #50000, processed 334085 words and 196195 word types
INFO - 23:51:43: PROGRESS: at sentence #60000, processed 400877 words and 228659 word types
INFO - 23:51:44: PROGRESS: at sentence #70000, processed 467802 words and 260712 word types
INFO - 23:51:44: PROGRESS: at sentence #80000, processed 534361 words and 292095 word types
INFO - 23:51:44: PROGRESS: at sentence #90000, processed 602037 words and 321944 word types
INFO - 23:51:44: collected 328658 word typ

In [59]:
sentence = bigram(sent)

# most freuent words
word_freq = defaultdict(int)
for sent in sentences:
    for i in sent:
        word_freq[i] += 1
len(word_freq)

TypeError: 'Phrases' object is not callable

In [44]:
sorted(word_freq, key=word_freq.get, reverse=True)[:10]

[]

In [45]:
# Training the model
import multiprocessing

from gensim.models import Word2Vec

In [47]:
cores = multiprocessing.cpu_count()      # Count the number of cores in a computer

In [48]:
w2v_model = Word2Vec(min_count=20,
                     window=2,
                     size=300,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=cores-1)

In [49]:
t = time()

w2v_model.build_vocab(sentences, progress_per=10000)

print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

NameError: name 'sentences' is not defined