# Goal: Create word vectors from a Game of Thrones dataset and analyze them to see semantic similarity

In [None]:
from __future__ import absolute_import, division, print_function

# for word encoding
import codecs

# file name pattern matching (globbing)
import glob

# concurrency
import multiprocessing

# operating system
import os

# pretty printing
import pprint

# regular expressions
import re

# natural language toolkit
import nltk

# word2vec
import gensim.models.word2vec as w2v

# dimensionality reduction
import sklearn.manifold

# math
import numpy as np

#plotting
import matplotlib.pyplot as plt

# pandas
import pandas as pd

# visualization
import seaborn as sns

# Step 1: process our data
### Clean data

In [None]:
# Fetch the NLTK resources needed by this notebook.
# 'punkt': pre-trained tokenizer models; used here to split a piece of text into sentences.
nltk.download('punkt')
# 'stopwords': lists of very common words such as "and", "the", "an", "of".
nltk.download('stopwords')

## Get the book file names (every .txt file in the data folder)

In [None]:
# Collect every .txt file directly under ./data; sorting makes the reading order deterministic.
book_filenames = sorted(glob.glob("./data/*.txt"))
print(book_filenames)

## Combine the books into one string

In [None]:
# Read every book as UTF-8 text and combine them, in order, into one unicode
# string (`corpus_raw`), reporting the running corpus size after each book.
#
# The pieces are collected in a list and joined once at the end: repeatedly
# doing `corpus_raw += ...` on an ever-growing string can re-copy the whole
# corpus for every book that is added.
corpus_parts = []
corpus_length = 0
for book_filename in book_filenames:
    print("Reading '{}'...".format(book_filename))
    with codecs.open(book_filename, 'r', 'utf-8') as book_file:
        book_text = book_file.read()
    corpus_parts.append(book_text)
    # Track the total length separately so the progress message stays the same
    # without having to build the combined string on every iteration.
    corpus_length += len(book_text)
    print("Corpus raw is now {} characters long".format(corpus_length))
    print()
# Books are concatenated back to back, with no separator inserted between them.
corpus_raw = u"".join(corpus_parts)

## split the corpus into sentences

## Convert the sentences into lists of words

In [None]:
# Turn one raw sentence into a plain list of words: everything that is not an
# ASCII letter (punctuation, digits, hyphens, ...) is treated as a separator.
def sentence_to_wordlist(raw):
    """Return the words of `raw` as a list of strings.

    Every character outside a-z / A-Z is replaced by a space, then the result
    is split on whitespace, so the returned words contain letters only and
    their original case is kept. An input without any letters gives [].
    """
    return re.sub('[^a-zA-Z]', ' ', raw).split()

# Tokenized corpus: one list of words per non-empty raw sentence.
# NOTE(review): `raw_sentences` is assigned by the punkt tokenizer cell, which
# appears further down in this file; that cell has to be run before this one.
sentences = [
    sentence_to_wordlist(raw_sentence)
    for raw_sentence in raw_sentences
    if raw_sentence
]

In [None]:
# Spot check: show entry 60 of the raw sentences and of the tokenized sentences.
# NOTE(review): the two lists only line up index-for-index if no empty raw
# sentence was skipped when `sentences` was built — confirm.
print(raw_sentences[60])
print(sentences[60])

In [None]:
# Total number of word tokens over all tokenized sentences.
token_count = sum(len(words) for words in sentences)
print("The book corpus contains {0:,} tokens".format(token_count))

# Step 2: Build Word2Vec

## Word Embedding

![Linear relationships](./images/linear-relationships.png)

### Once we have vectors, they help with 3 main tasks:
- DISTANCE
- SIMILARITY
- RANKING

### Hyperparameters

In [None]:
# Dimensionality of the resulting word vectors.
# More dimensions are more computationally expensive to train,
# but generally also more accurate.
num_features = 300

# Minimum word count threshold: passed to the model as `min_count`,
# i.e. words with a lower total frequency are ignored.
min_word_count = 3

# Number of threads to run in parallel (one per CPU reported by the OS).
num_workers = multiprocessing.cpu_count()

# Context window length: maximum distance between the current and the
# predicted word within a sentence.
context_size = 7

# Downsampling threshold for frequent words.
# 1e-3 is the documented default; the documented useful range is (0, 1e-5).
downsampling = 1e-3

# Seed for the random number generator, to make the result reproducible
# (good for debugging).
# NOTE: per the gensim docstring quoted below, a fully deterministic run also
# requires a single worker thread, which is not what `num_workers` sets here.
seed = 1

## Actual model using gensim
Docstring:     
Class for training, using and evaluating neural networks described in https://code.google.com/p/word2vec/

- `sg` defines the training algorithm. By default (`sg=0`), CBOW is used.
Otherwise (`sg=1`), skip-gram is employed.

- `size` is the dimensionality of the feature vectors.

- `window` is the maximum distance between the current and predicted word within a sentence.

- `seed` = for the random number generator. Initial vectors for each
word are seeded with a hash of the concatenation of word + str(seed).
Note that for a fully deterministically-reproducible run, you must also limit the model to
a single worker thread, to eliminate ordering jitter from OS thread scheduling. (In Python
3, reproducibility between interpreter launches also requires use of the PYTHONHASHSEED
environment variable to control hash randomization.)

- `min_count` = ignore all words with total frequency lower than this.

- `sample` = threshold for configuring which higher-frequency words are randomly downsampled;
    default is 1e-3, useful range is (0, 1e-5).

- `workers` = use this many worker threads to train the model (=faster training with multicore machines).

In [None]:
# Create the Word2Vec model from the hyperparameters defined above.
# No corpus is passed here; the vocabulary is built and the model is trained
# in separate calls further down.
# NOTE(review): `size` is the gensim 3.x keyword (renamed `vector_size` in
# gensim 4) — confirm against the installed gensim version.
thrones2vec = w2v.Word2Vec(
    sg=1,  # 1 selects skip-gram; 0 would select CBOW
    seed=seed,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context_size,
    sample=downsampling
)

In [None]:
# Build the model's vocabulary from the tokenized sentences before training.
thrones2vec.build_vocab(sentences)

# Step 3: Train the model

In [None]:
# Train on the same sentences; the example count and the number of epochs are
# taken from values stored on the model itself.
# NOTE(review): `.iter` is the gensim 3.x attribute name (gensim 4 uses
# `.epochs`) — confirm against the installed gensim version.
thrones2vec.train(sentences, total_examples=thrones2vec.corpus_count, epochs=thrones2vec.iter)

### Save to file

In [None]:
# Persist the trained model as trained/thrones2vec.w2v, creating the
# output directory first if it is not there yet.
output_dir = "trained"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
thrones2vec.save(os.path.join(output_dir, "thrones2vec.w2v"))

# Step 4: Explore the trained model

In [None]:
# Reload the model from the file written by the save step, so exploration can
# start from the saved model.
thrones2vec = w2v.Word2Vec.load(os.path.join("trained", "thrones2vec.w2v"))

## Compress the word vectors into 2D space and plot them

In [None]:
# Load NLTK's English punkt tokenizer (the 'punkt' resource downloaded earlier).
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [None]:
# Split the whole raw corpus into sentences with the punkt tokenizer.
raw_sentences = tokenizer.tokenize(corpus_raw)

In [None]:
# t-SNE reducer targeting 2 output components, with a fixed random_state.
tsne = sklearn.manifold.TSNE(n_components=2, random_state=0)
# Word-vector matrix taken from the trained model; this is the input for t-SNE.
# NOTE(review): `wv.syn0` is the gensim 3.x attribute name (later `wv.vectors`)
# — confirm against the installed gensim version.
all_word_vectors_matrix = thrones2vec.wv.syn0

## train t-SNE