In [3]:
# Import standard libraries
from __future__ import absolute_import, division, print_function

import codecs # for word encoding
import glob # for filename pattern matching (shell-style wildcards)
import multiprocessing # concurrency
import os # os stuff, like reading a file
import pprint # pretty printing
import re # regular expressions


In [1]:
# Import external libraries
import nltk # natural language processing
import gensim.models.word2vec as w2v # word 2 vec
import sklearn.manifold #dimensionality reduction
import numpy as np # math
import matplotlib.pyplot as plt # plotting
import pandas as pd
import seaborn as sns

In [4]:
# Step 1 - process the data
# Fetch the NLTK resources; the punkt tokenizer is loaded in a later cell.

nltk.download('punkt') # pretrained tokenizer
nltk.download('stopwords') # words like and, the, a, an, of
# NOTE(review): no visible cell references the stopwords corpus - confirm it is still needed.

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/bergsfamily/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/bergsfamily/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# Collect the path of every text file under data/, in sorted order
book_filenames = glob.glob("data/*.txt")
book_filenames.sort()
book_filenames

['data/got1.txt',
 'data/got2.txt',
 'data/got3.txt',
 'data/got4.txt',
 'data/got5.txt']

In [5]:
# Read every book and assemble a single unicode corpus.
# Each book's text is collected in a list and joined once at the end, which
# avoids re-copying the ever-growing corpus string on every iteration.
corpus_parts = []
corpus_length = 0

for book_filename in book_filenames:
    print("Reading '{0}'...".format(book_filename))
    with codecs.open(book_filename, "r", "utf-8") as book_file:
        book_text = book_file.read()
    corpus_parts.append(book_text)
    # Running total equals the length of the corpus assembled so far.
    corpus_length += len(book_text)
    print("Corpus in now {0} characters long".format(corpus_length))
    print()

corpus_raw = u"".join(corpus_parts)

Reading 'data/got1.txt'...
Corpus in now 1770659 characters long

Reading 'data/got2.txt'...
Corpus in now 4071041 characters long

Reading 'data/got3.txt'...
Corpus in now 6391405 characters long

Reading 'data/got4.txt'...
Corpus in now 8107945 characters long

Reading 'data/got5.txt'...
Corpus in now 9719485 characters long



In [6]:
# Load the English punkt tokenizer from the NLTK data files.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [7]:
# Run the tokenizer over the whole corpus; the result is indexed and iterated sentence by sentence below.
raw_sentences = tokenizer.tokenize(corpus_raw)

In [8]:
def sentence_to_wordlist(raw):
    """Split a raw sentence into a list of words.

    Every character that is not an ASCII letter (punctuation, hyphens,
    digits, ...) is replaced by a space, then the text is split on
    whitespace.

    Args:
        raw: the sentence text to split.

    Returns:
        A list of the letter-only words, in their original order and case.
    """
    non_letters = re.compile("[^a-zA-Z]")
    letters_only = non_letters.sub(" ", raw)
    return letters_only.split()

In [9]:
# Tokenize every non-empty raw sentence into its list of words
sentences = [
    sentence_to_wordlist(raw_sentence)
    for raw_sentence in raw_sentences
    if len(raw_sentence) > 0
]

In [10]:
# Show one raw sentence next to its tokenized form
sample_sentence = raw_sentences[5]
print(sample_sentence)
print(sentence_to_wordlist(sample_sentence))

Heraldic crest by Virginia Norey.
[u'Heraldic', u'crest', u'by', u'Virginia', u'Norey']


In [11]:
# Total number of word tokens across all tokenized sentences
token_count = sum(map(len, sentences))
print("The book corpus contains {0:,} tokens".format(token_count))

The book corpus contains 1,818,103 tokens


# Train Word 2 Vec

In [12]:
# Step 2 - build the models
# Word vectors help with three tasks:
# distance, similarity, and ranking


# define hyperparameters
num_features = 300 # passed to Word2Vec as size; more features = more expensive to train, but more accurate
min_word_count = 3 # passed to Word2Vec as min_count
num_workers = multiprocessing.cpu_count() # one worker per CPU; more workers = faster training
context_size = 7 # passed to Word2Vec as window

# Downsample setting for frequent words (passed to Word2Vec as sample).
# Suggested range: between 0 and 1e-5
# NOTE(review): the value below (1e-3) lies outside that range - confirm which is intended.
downsampling = 1e-3

# Seed for random number generator
seed = 42

In [13]:
# Create the model from the hyperparameters defined above.
# No sentences are passed here; the vocabulary is built and the model trained in later cells.
thrones2vec = w2v.Word2Vec(
    sg = 1, # presumably selects skip-gram training - confirm against the gensim docs
    seed = seed,
    workers = num_workers,
    size = num_features,
    min_count = min_word_count,
    window = context_size,
    sample = downsampling,
)

In [14]:
# Build the model vocabulary from the tokenized sentences.
thrones2vec.build_vocab(sentences)

In [16]:
# Report how many distinct words the model's vocabulary holds
vocabulary_size = len(thrones2vec.wv.vocab)
print("Word2Vec vocabulary length:", vocabulary_size)

Word2Vec vocabulary length: 17277


In [17]:
# Train the model on the tokenized sentences.
# NOTE(review): newer gensim releases appear to require explicit total_examples
# and epochs arguments for train() - confirm against the installed version.
thrones2vec.train(sentences)

7021920

## Save the model to file

In [18]:
# Make sure the output directory for trained models exists
model_dir = "trained"
if not os.path.exists(model_dir):
    os.makedirs(model_dir)

In [19]:
# Persist the model inside the trained/ directory
model_path = os.path.join("trained", "thrones2vec.w2v")
thrones2vec.save(model_path)

## Load the trained model - Start here

In [5]:
# Reload the saved model so the cells below can run without retraining
saved_model_path = os.path.join("trained", "thrones2vec.w2v")
thrones2vec = w2v.Word2Vec.load(saved_model_path)

## SKIP
## Compress word vectors into 2D space and plot

**t-SNE has been giving me headaches on my Mac, so the plotting cells below are commented out and can be skipped.**

See https://github.com/scikit-learn/scikit-learn/issues/7089

In [5]:
#tsne = sklearn.manifold.TSNE(n_components=2, random_state=42)

In [6]:
#all_word_vectors_matrix = thrones2vec.wv.syn0

In [None]:
#all_word_vectors_matrix_2d = tsne.fit_transform(all_word_vectors_matrix)

## Explore semantic similarities between book characters

In [6]:
# Query the word vectors (.wv) directly; the model-level most_similar shortcut is deprecated in later gensim releases.
thrones2vec.wv.most_similar("Stark")

[(u'Eddard', 0.7412210702896118),
 (u'Winterfell', 0.6834571361541748),
 (u'Karstark', 0.6381300687789917),
 (u'Lyanna', 0.6353175044059753),
 (u'direwolf', 0.6259804964065552),
 (u'beheaded', 0.6252360343933105),
 (u'executed', 0.61602783203125),
 (u'Hornwood', 0.6131395697593689),
 (u'Benjen', 0.607450008392334),
 (u'Dustin', 0.6061956286430359)]

In [7]:
# Query the word vectors (.wv) directly; the model-level most_similar shortcut is deprecated in later gensim releases.
thrones2vec.wv.most_similar("Aerys")

[(u'Jaehaerys', 0.7929835915565491),
 (u'Daeron', 0.7739320993423462),
 (u'reign', 0.7709693908691406),
 (u'Mad', 0.7663239240646362),
 (u'Usurper', 0.7441689372062683),
 (u'Unworthy', 0.7324290871620178),
 (u'Elia', 0.7324237823486328),
 (u'appointment', 0.7277305126190186),
 (u'Conqueror', 0.7227622270584106),
 (u'Cruel', 0.717417299747467)]

In [8]:
# Query the word vectors (.wv) directly; the model-level most_similar shortcut is deprecated in later gensim releases.
thrones2vec.wv.most_similar("direwolf")

[(u'wolf', 0.6745015382766724),
 (u'Rickon', 0.6481422781944275),
 (u'Ghost', 0.6333140134811401),
 (u'Stark', 0.6259804964065552),
 (u'pup', 0.620887041091919),
 (u'SHAGGYDOG', 0.6180346608161926),
 (u'ranger', 0.6159267425537109),
 (u'GHOST', 0.6146520376205444),
 (u'wight', 0.5989304780960083),
 (u'standard', 0.5985522866249084)]

In [10]:
# Query the word vectors (.wv) directly; the model-level most_similar shortcut is deprecated in later gensim releases.
thrones2vec.wv.most_similar("four")

[(u'five', 0.7831581830978394),
 (u'thirty', 0.7753496170043945),
 (u'Six', 0.7512350678443909),
 (u'twenty', 0.7498271465301514),
 (u'forty', 0.7492700815200806),
 (u'six', 0.7487969994544983),
 (u'Two', 0.7451422214508057),
 (u'sixty', 0.7449865937232971),
 (u'eight', 0.7405045032501221),
 (u'Eight', 0.734088659286499)]

In [11]:
def nearest_similarity_cosmul(start1, end1, end2, model=None):
    """Complete the analogy "start1 is to end1 as ? is to end2".

    Calls ``most_similar_cosmul`` with ``end2`` and ``start1`` as positive
    terms and ``end1`` as the negative term, prints the resulting analogy,
    and returns the top-ranked word.

    Args:
        start1: first word of the known pair.
        end1: second word of the known pair.
        end2: second word of the pair to complete.
        model: optional object to query. It may expose the word vectors as
            a ``wv`` attribute or provide ``most_similar_cosmul`` itself.
            Defaults to the notebook-level ``thrones2vec`` model.

    Returns:
        The word from the first entry of the similarity results.
    """
    if model is None:
        model = thrones2vec
    # Use the word vectors when the object exposes them as .wv (as the
    # vocabulary lookup elsewhere in this notebook does); otherwise query
    # the object itself.
    vectors = getattr(model, "wv", model)
    similarities = vectors.most_similar_cosmul(
        positive=[end2, start1],
        negative=[end1]
    )
    start2 = similarities[0][0]
    # Explicit keyword arguments instead of **locals(), so the message does
    # not silently depend on the names of local variables.
    print("{start1} is related to {end1}, as {start2} is related to {end2}".format(
        start1=start1, end1=end1, start2=start2, end2=end2
    ))
    return start2

In [12]:
# Try the analogy helper on a few word pairs.
# Each call prints its analogy; the last call is left as a bare expression so its return value is displayed.
nearest_similarity_cosmul("Stark", "Winterfell", "Riverrun")
nearest_similarity_cosmul("Jaime", "sword", "wine")
nearest_similarity_cosmul("Arya", "Nymeria", "dragons")

Stark is related to Winterfell, as Tully is related to Riverrun
Jaime is related to sword, as dreamwine is related to wine
Arya is related to Nymeria, as Dany is related to dragons


u'Dany'