# This Jupyter notebook is basically a "copy" of the [one made by Pierre Megret](https://www.kaggle.com/pierremegret/gensim-word2vec-tutorial)

# Imports

In [None]:
import re
import spacy
import numpy as np
import pandas as pd
import multiprocessing

from time import time
from collections import defaultdict
from gensim.models import Word2Vec
from gensim.models.phrases import Phrases, Phraser

from utils.tsne import tsne_scatterplot

%load_ext autoreload
%autoreload 2

## Load

In [None]:
# Read the dataset from a path relative to the notebook's working directory
df = pd.read_csv('inputs/simpsons_dataset.csv')
# (rows, columns) of the raw data
print(df.shape)

In [None]:
# Preview the first 10 rows
df.head(n=10)

## Search for some values

In [None]:
# Rows whose 'spoken_words' value is exactly 'I love you!' (the == comparison is case- and punctuation-sensitive)
df[df['spoken_words'] == 'I love you!']

## Show null values

In [None]:
# Number of missing values in each column
df.isnull().sum()

## Remove null values

In [None]:
# Drop every row that has a missing value in any column, then renumber the index from 0
df = df.dropna().reset_index(drop=True)
# Count the missing values per column again to confirm none are left
df.isnull().sum()

## After removing null values

In [None]:
# (rows, columns) of df at this point in the notebook
print(df.shape)

## Cleaning

In [None]:
def custom_cleaning(doc):
    """Lemmatize a document, drop its stop words and join what is left.

    doc: an iterable of tokens exposing ``lemma_`` and ``is_stop``
         (a spaCy Doc object in this notebook).

    Returns the remaining lemmas joined by single spaces, or None when two
    or fewer lemmas remain. Word2Vec learns a word's vector from its context
    words, so such short sentences add very little to the training.
    """
    # TODO: consider to not filter stop words
    lemmas = []
    for token in doc:
        if token.is_stop:
            continue
        lemmas.append(token.lemma_)

    # Too short to be useful for training: signal it with None
    if len(lemmas) <= 2:
        return None
    return ' '.join(lemmas)

In [None]:
# Load the spaCy English pipeline with the 'ner' (Named Entity Recognition) and
# 'parser' components disabled for speed.
# NOTE(review): the 'en' shortcut name is presumably spaCy 2.x only - confirm against the installed version
nlp = spacy.load('en', disable=['ner', 'parser'])

# For each row of 'spoken_words': replace every run of characters other than ASCII
# letters and apostrophes with a single space, then lowercase the result.
# This is a generator expression, so it is evaluated lazily and can be consumed only once;
# re-run this cell before re-running the cell that iterates over it.
brief_cleaning = (re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in df['spoken_words'])

In [None]:
# Use spaCy's .pipe() method to process the pre-cleaned texts in batches of 5000,
# then reduce every resulting Doc with custom_cleaning (None for discarded sentences).
# NOTE(review): n_threads looks like a spaCy 2.x-era argument - confirm against the installed version
t = time()
txt = [custom_cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=5000, n_threads=-1)]
# f-string for consistency with the other timing messages in this notebook; the printed text is unchanged
print(f'Time to clean up everything: {round((time() - t) / 60, 2)} mins')

## Show the new data structure

In [None]:
# First 10 cleaned entries; None marks a sentence that custom_cleaning discarded
txt[:10]

## New DataFrame

In [None]:
# Single-column DataFrame ('clean') holding the cleaned sentences
df_clean = pd.DataFrame({'clean': txt})
df_clean.shape

## Remove missing values and duplicates

In [None]:
# TODO: consider not dropping duplicates
# Drop the rows with a missing value first, then the rows that repeat an earlier sentence
df_clean = df_clean.dropna().drop_duplicates()
df_clean.shape

## Store the sentences in a list as a list of words

In [None]:
# Split each cleaned sentence on whitespace: a list of sentences, each one a list of words
sentence_stream = [row.split() for row in df_clean['clean']]
# Show the first tokenized sentence as a sanity check
print(sentence_stream[0])

## Find relevant phrases
Bigrams like "Homer Simpson"

In [None]:
# Collect phrase statistics over the tokenized corpus.
# Per the gensim docs, min_count=30 ignores words and bigrams whose total count is below 30.
phrases = Phrases(sentence_stream, min_count=30, progress_per=10000)
# Wrap the trained Phrases model in a Phraser; this is the object used below to transform the corpus
bigram = Phraser(phrases)

## Transform the corpus based on the bigrams detected

In [None]:
# Apply the bigram model to the whole corpus, so that a detected bigram such as
# "Homer Simpson" is replaced with a single token
sentences = bigram[sentence_stream]

## Most frequent words

In [None]:
# Count how many times each token occurs in the bigram-transformed corpus
word_freq = defaultdict(int)
for sentence in sentences:
    for word in sentence:
        word_freq[word] += 1
# Number of distinct tokens
len(word_freq)

In [None]:
# Tokens ordered from most to least frequent
most_freq_words = sorted(word_freq, key=word_freq.get, reverse=True)
for idx, word in enumerate(most_freq_words):
    print(f'{word}: {word_freq[word]}')
    # The check runs after printing, so up to 12 entries (idx 0 to 11) are shown
    if idx > 10:
        break

## Check for the bigrams

In [None]:
# Walk the tokens from most to least frequent and print those containing '_'
# (the joined bigram tokens, such as the 'homer_simpson' queried further down)
idx = 0  # number of '_' tokens printed so far
for word in most_freq_words:
    if '_' in word:
        idx += 1
        print(word)
    # Stop once 11 such tokens have been printed
    if idx > 10:
        break

## Number of cores in the computer

In [None]:
# Number of CPUs reported by the system; used below to choose the number of Word2Vec workers
cores = multiprocessing.cpu_count()
print(cores)

# Word2Vec
- Word2Vec Model
- Build Vocab
- Train

## Word2Vec Model

In [None]:
# Only configure the model here: no corpus is passed in, so the vocabulary is
# built and the training is run as separate steps.
# Parameter notes follow the gensim Word2Vec documentation.
# Leave one core free for the rest of the system. cores - 1 is 0 on a single-core
# machine, which would leave gensim without any worker thread, so never go below 1.
w2v_model = Word2Vec(min_count=20,      # ignore words seen fewer than 20 times
                     window=2,          # max distance between the current and the predicted word
                     size=300,          # dimensionality of the word vectors
                     sample=6e-5,       # threshold for down-sampling frequent words
                     alpha=0.03,        # initial learning rate
                     min_alpha=0.0007,  # learning rate drops linearly to this value
                     negative=20,       # number of "noise words" drawn for negative sampling
                     workers=max(1, cores - 1))

## Build Vocab

In [None]:
t = time()
# Build the vocabulary from the bigram-transformed corpus; progress_per controls how often progress is logged
w2v_model.build_vocab(sentences, progress_per=1e4)
print(f'Time to build vocab: {round((time() - t)/60, 2)} mins')

## Train

In [None]:
t = time()
# Train for 30 epochs over the corpus. total_examples is taken from the model's own corpus_count,
# and per the gensim docs report_delay is the number of seconds to wait between progress reports.
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)
print(f'Time to train the model: {round((time() - t) / 60, 2)} mins')

## For memory efficiency

In [None]:
# Per the gensim docs, replace=True keeps only the L2-normalized vectors to save memory;
# the model cannot be trained any further after this call
w2v_model.init_sims(replace=True)

## Exploring the model

### Most similar words to homer

In [None]:
# Three most similar vocabulary entries for a few characters;
# 'homer_simpson' is a bigram token, looked up separately from 'homer'
print(w2v_model.wv.most_similar(positive=['homer'], topn=3))
print(w2v_model.wv.most_similar(positive=['homer_simpson'], topn=3))
print(w2v_model.wv.most_similar(positive=['marge'], topn=3))
print(w2v_model.wv.most_similar(positive=['bart'], topn=3))

## Similarities between words

In [None]:
def print_similarity(words):
    """Print the similarity the trained model assigns to a pair of words.

    words: sequence whose first two items are the words to compare; both are
           looked up in the notebook-level w2v_model.
    """
    first, second = words[0], words[1]
    similarity = w2v_model.wv.similarity(first, second)
    print(f'The similarity between {first} and {second} is {similarity:.3f}')

# Word pairs to compare, each passed as a two-item list
print_similarity(['moe', 'tavern'])
print_similarity(['maggie', 'baby'])
print_similarity(['bart', 'nelson'])

## Odd-one-out
Which word does not belong to the group?

In [None]:
# A list of bullies plus milhouse: the expected odd one out is milhouse,
# but (as the author observed) the model gets this one wrong
print(w2v_model.wv.doesnt_match(['jimbo', 'milhouse', 'kearney']))

# Two more groups to check
print(w2v_model.wv.doesnt_match(['nelson', 'bart', 'milhouse']))
print(w2v_model.wv.doesnt_match(['homer', 'patty', 'selma']))

## Analogies

In [None]:
# Analogy queries: words in `positive` are added and words in `negative` are subtracted.
# First query: woman + homer - marge; second query: woman + bart - man
print(w2v_model.wv.most_similar(positive=["woman", "homer"], negative=["marge"], topn=3))
print(w2v_model.wv.most_similar(positive=["woman", "bart"], negative=["man"], topn=3))

In [None]:
# The woman + king - man analogy, showing the 20 closest entries
w2v_model.wv.most_similar(positive=["woman", "king"], negative=["man"], topn=20)

## Trigrams and fourgrams (chunk of code added by me)

In [None]:
# Stack three Phrases models: each one is trained on the output of the previous one,
# so it can merge already-joined tokens into longer phrases (up to four words).
# delimiter=b' ' joins the merged words with a space, and min_count=5 is much lower
# than the 30 used for the bigram model earlier in the notebook.
bigrams = Phrases(sentence_stream, min_count=5, delimiter=b' ')
trigrams = Phrases(bigrams[sentence_stream], min_count=5, delimiter=b' ')
fourgrams = Phrases(trigrams[bigrams[sentence_stream]], min_count=5, delimiter=b' ')

In [None]:
# Count how often each detected bigram, trigram and fourgram occurs in the corpus.
# The Phrases models were built with a space delimiter, so a token made of n words
# is recognised by containing exactly n - 1 spaces.
all_bigrams, all_trigrams, all_fourgrams = dict(), dict(), dict()
for sentence in sentence_stream:
    # Apply each model once per sentence and feed its output to the next level
    bigram_tokens = bigrams[sentence]
    trigram_tokens = trigrams[bigram_tokens]
    fourgram_tokens = fourgrams[trigram_tokens]

    # The loop variables are deliberately NOT named `bigram`: that name holds the
    # Phraser built earlier in the notebook, and overwriting it with a string
    # breaks any later re-run of the cell that applies it to the corpus.
    for bigram_token in bigram_tokens:
        if bigram_token.count(' ') == 1:
            all_bigrams[bigram_token] = all_bigrams.get(bigram_token, 0) + 1

    for trigram_token in trigram_tokens:
        if trigram_token.count(' ') == 2:
            all_trigrams[trigram_token] = all_trigrams.get(trigram_token, 0) + 1

    for fourgram_token in fourgram_tokens:
        if fourgram_token.count(' ') == 3:
            all_fourgrams[fourgram_token] = all_fourgrams.get(fourgram_token, 0) + 1

# Show up to 11 fourgrams with their counts, in the order they were first seen (not sorted by frequency)
for idx, (word, freq) in enumerate(all_fourgrams.items()):
    print(word, freq)
    if idx >= 10:
        break

## Visualization

### 10 most similar words vs. 8 Random words

In [None]:
# Plot 'homer' together with 8 hand-picked words.
# NOTE(review): tsne_scatterplot comes from utils.tsne; presumably it also adds the most similar words itself - confirm there
tsne_scatterplot(w2v_model, 'homer', ['dog', 'bird', 'ah', 'maude', 'bob', 'mel', 'apu', 'duff'])

In [None]:
# Plot 'maggie' together with the words returned for negative=["maggie"], i.e. the entries
# ranked closest to the opposite of its vector (most_similar returns 10 results by default, per the gensim docs)
tsne_scatterplot(w2v_model, 'maggie', [i[0] for i in w2v_model.wv.most_similar(negative=["maggie"])])

### 10 most similar words vs. 11th to 20th most similar words

In [None]:
# Ask for the 20 most similar words to "mr_burns" and keep only the last ten (ranks 11 to 20) for the plot
tsne_scatterplot(w2v_model, "mr_burns", [t[0] for t in w2v_model.wv.most_similar(positive=["mr_burns"], topn=20)][10:])