In [1]:
from __future__ import absolute_import, division, print_function

In [2]:
import codecs
import glob
import logging
import multiprocessing
import os
import pprint
import re


In [3]:
import nltk
import gensim.models.word2vec as w2v
import sklearn.manifold
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [4]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [5]:
# nltk.download("punkt")
# nltk.download("stopwords")

In [6]:
book_filenames = sorted(glob.glob("./data/*.txt"))

In [7]:
print("Found books:")
book_filenames

Found books:


['./data/got1.txt',
 './data/got2.txt',
 './data/got3.txt',
 './data/got4.txt',
 './data/got5.txt']

In [8]:
corpus_raw = u""
for book_filename in book_filenames:
    print("Reading '{0}'...".format(book_filename))
    with codecs.open(book_filename, "r", "utf-8") as book_file:
        corpus_raw += book_file.read()
    print("Corpus is now {0} characters long".format(len(corpus_raw)))
    print()

Reading './data/got1.txt'...
Corpus is now 1770659 characters long

Reading './data/got2.txt'...
Corpus is now 4071041 characters long

Reading './data/got3.txt'...
Corpus is now 6391405 characters long

Reading './data/got4.txt'...
Corpus is now 8107945 characters long

Reading './data/got5.txt'...
Corpus is now 9719485 characters long



In [9]:
tokenizer  = nltk.data.load('tokenizers/punkt/english.pickle')

In [10]:
raw_sentences = tokenizer.tokenize(corpus_raw)


In [11]:
#convert into a list of words
#rtemove unnnecessary,, split into words, no hyphens
#list of words
def sentence_to_wordlist(raw):
    clean = re.sub("[^a-zA-Z]"," ", raw)
    words = clean.split()
    return words


In [12]:

#sentence where each word is tokenized#sentenc 
sentences = []
for raw_sentence in raw_sentences:
    if len(raw_sentence) > 0:
        sentences.append(sentence_to_wordlist(raw_sentence))


In [13]:
print(raw_sentences[5])
print(sentence_to_wordlist(raw_sentences[5]))


Heraldic crest by Virginia Norey.
['Heraldic', 'crest', 'by', 'Virginia', 'Norey']


In [14]:
token_count = sum([len(sentence) for sentence in sentences])
print("The book corpus contains {0:,} tokens".format(token_count))

The book corpus contains 1,818,103 tokens


In [15]:
#ONCE we have vectors
#step 3 - build model
#3 main tasks that vectors help with
#DISTANCE, SIMILARITY, RANKING

# Dimensionality of the resulting word vectors.
#more dimensions, more computationally expensive to train
#but also more accurate
#more dimensions = more generalized
num_features = 300
# Minimum word count threshold.
min_word_count = 3

# Number of threads to run in parallel.
#more workers, faster we train
num_workers = multiprocessing.cpu_count()

# Context window length.
context_size = 7

# Downsample setting for frequent words.
#0 - 1e-5 is good for this
downsampling = 1e-3

# Seed for the RNG, to make the results reproducible.
#random number generator
#deterministic, good for debugging
seed = 1

In [16]:
thrones2vec = w2v.Word2Vec(
    sg=1,
    seed=seed,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context_size,
    sample=downsampling
)

In [17]:
thrones2vec.build_vocab(sentences)

2019-03-14 12:34:42,691 : INFO : collecting all words and their counts
2019-03-14 12:34:42,692 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-03-14 12:34:42,730 : INFO : PROGRESS: at sentence #10000, processed 140984 words, keeping 10280 word types
2019-03-14 12:34:42,774 : INFO : PROGRESS: at sentence #20000, processed 279730 words, keeping 13558 word types
2019-03-14 12:34:42,814 : INFO : PROGRESS: at sentence #30000, processed 420336 words, keeping 16598 word types
2019-03-14 12:34:42,849 : INFO : PROGRESS: at sentence #40000, processed 556581 words, keeping 18324 word types
2019-03-14 12:34:42,881 : INFO : PROGRESS: at sentence #50000, processed 686247 words, keeping 19714 word types
2019-03-14 12:34:42,920 : INFO : PROGRESS: at sentence #60000, processed 828497 words, keeping 21672 word types
2019-03-14 12:34:42,961 : INFO : PROGRESS: at sentence #70000, processed 973830 words, keeping 23093 word types
2019-03-14 12:34:43,000 : INFO : PROGRESS: at 

In [18]:
print("Word2Vec vocabulary length:", len(thrones2vec.wv.vocab))

Word2Vec vocabulary length: 17277


In [19]:
thrones2vec.train(sentences,total_words=token_count, epochs=100)


2019-03-14 12:34:50,595 : INFO : training model with 8 workers on 17277 vocabulary and 300 features, using sg=1 hs=0 sample=0.001 negative=5 window=7
2019-03-14 12:34:51,687 : INFO : EPOCH 1 - PROGRESS: at 26.90% words, 349722 words/s, in_qsize 15, out_qsize 0
2019-03-14 12:34:52,693 : INFO : EPOCH 1 - PROGRESS: at 55.47% words, 373576 words/s, in_qsize 15, out_qsize 0
2019-03-14 12:34:53,707 : INFO : EPOCH 1 - PROGRESS: at 85.11% words, 385639 words/s, in_qsize 15, out_qsize 0
2019-03-14 12:34:54,116 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-03-14 12:34:54,120 : INFO : worker thread finished; awaiting finish of 6 more threads
2019-03-14 12:34:54,125 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-03-14 12:34:54,130 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-03-14 12:34:54,142 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-03-14 12:34:54,154 : INFO : worker thread finished; await

(140439602, 181810300)

In [20]:
if not os.path.exists("trained"):
    os.makedirs("trained")

In [21]:
thrones2vec.save(os.path.join("trained", "thrones2vec.w2v"))

2019-03-15 09:44:14,276 : INFO : saving Word2Vec object under trained/thrones2vec.w2v, separately None
2019-03-15 09:44:14,283 : INFO : not storing attribute vectors_norm
2019-03-15 09:44:14,286 : INFO : not storing attribute cum_table
2019-03-15 09:44:14,895 : INFO : saved trained/thrones2vec.w2v


In [22]:
tsne = sklearn.manifold.TSNE(n_components=2, random_state=0)

In [23]:
all_word_vectors_matrix  = thrones2vec.wv.vectors

In [24]:
# all_word_vectors_matrix_2d = tsne.fit_transform(all_word_vectors_matrix)

In [25]:
points = pd.DataFrame(
    [
        (word, coords[0], coords[1])
        for word, coords in [
            (word, all_word_vectors_matrix_2d[thrones2vec.vocab[word].index])
            for word in thrones2vec.vocab
        ]
    ],
    columns=["word", "x", "y"]
)

AttributeError: 'Word2Vec' object has no attribute 'vocab'

In [26]:
points.head(10)

NameError: name 'points' is not defined

In [27]:
thrones2vec.most_similar("khaleesi")

  """Entry point for launching an IPython kernel.
2019-03-15 09:44:48,095 : INFO : precomputing L2-norms of word weight vectors


[('Unburnt', 0.4277729094028473),
 ('dosh', 0.3785097002983093),
 ('khaleen', 0.37776172161102295),
 ('Stormborn', 0.369415283203125),
 ('decrees', 0.3501581847667694),
 ('Drogo', 0.3439655900001526),
 ('Cohollo', 0.3422265350818634),
 ('khal', 0.3406676650047302),
 ('freeborn', 0.3384665250778198),
 ('Custom', 0.3382281959056854)]

In [28]:
def nearest_similarity_cosmul(start1, end1, end2):
    similarities = thrones2vec.wv.most_similar_cosmul(
        positive=[end2, start1],
        negative=[end1]
    )
    start2 = similarities[0][0]
    print("{start1} is related to {end1}, as {start2} is related to {end2}".format(**locals()))
    return start2

In [29]:
nearest_similarity_cosmul("Stark", "Arya", "Tyrion")

Stark is related to Arya, as Lannister is related to Tyrion


'Lannister'

In [33]:
thrones2vec.wv.word_vec("Stark")

array([-0.06165697, -0.3293274 ,  0.22624172,  0.00830119,  0.20641072,
        0.30942324,  0.07763162, -0.07171744, -0.01893655, -0.07911871,
        0.0665457 ,  0.11606652, -0.00208107, -0.09011709, -0.18714705,
       -0.22201811,  0.12116595, -0.11905318, -0.286598  , -0.08385568,
        0.06675431,  0.04649086,  0.25603536, -0.15616813,  0.11828049,
        0.27480122, -0.2532734 , -0.12636727, -0.34013882, -0.18799742,
       -0.16478088,  0.16576071,  0.0092566 ,  0.16176093, -0.14885361,
       -0.39490294, -0.2776572 ,  0.4688105 , -0.34504762, -0.3339576 ,
        0.26457655, -0.20786762, -0.02293019,  0.03249054,  0.14647329,
        0.21113123, -0.08296027,  0.09239488,  0.10389933,  0.33918548,
        0.0883604 , -0.44662791,  0.05458034,  0.3469728 , -0.1727128 ,
        0.15748732,  0.43677452, -0.11845618, -0.536396  ,  0.15173092,
       -0.09977715,  0.0144603 ,  0.06117773,  0.0132992 , -0.16897514,
       -0.25916383,  0.31721932, -0.28560653, -0.24640004,  0.06

In [2]:
# thrones2vec.wv.most_similar??