In [1]:
# Aim: create word vectors from a Game of Thrones dataset
# and analyze them to see semantic similarity

from __future__ import absolute_import, division, print_function

#for word encoding
import codecs
import glob
import multiprocessing
import os
import pprint
import re
import nltk
import sklearn.manifold
import numpy as np
import matplotlib.pyplot as plt
import gensim.models.word2vec as w2v
import pandas as pd
import seaborn as sns



In [2]:
# Fetch the NLTK resources this notebook relies on: the Punkt sentence
# tokenizer models (loaded further down via nltk.data.load) and the
# 'stopwords' package (not referenced in the cells visible here).
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/jarvis/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jarvis/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [4]:
# Configure root logging at INFO level with "<time> : <level> : <message>"
# formatting, so progress messages logged by the library calls below show up
# in the cell output.
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [5]:
# Directory holding the book text files (*.txt). The default is the original
# author's local checkout; set the THRONES_DATA_DIR environment variable to run
# the notebook on another machine without editing this cell.
data_dir = os.environ.get(
    "THRONES_DATA_DIR",
    "/home/jarvis/My projects/Machine Learning/Siraj_Akash/word_vectors_game_of_thrones-LIVE-master/data",
)

# sorted() makes the reading order deterministic regardless of how the
# filesystem happens to list the files.
book_filenames = sorted(glob.glob(os.path.join(data_dir, "*.txt")))

In [6]:
# Show which files matched, as a sanity check before reading them.
print ("Found books:", book_filenames)

Found books: ['/home/jarvis/My projects/Machine Learning/Siraj_Akash/word_vectors_game_of_thrones-LIVE-master/data/got1.txt', '/home/jarvis/My projects/Machine Learning/Siraj_Akash/word_vectors_game_of_thrones-LIVE-master/data/got2.txt', '/home/jarvis/My projects/Machine Learning/Siraj_Akash/word_vectors_game_of_thrones-LIVE-master/data/got3.txt', '/home/jarvis/My projects/Machine Learning/Siraj_Akash/word_vectors_game_of_thrones-LIVE-master/data/got4.txt', '/home/jarvis/My projects/Machine Learning/Siraj_Akash/word_vectors_game_of_thrones-LIVE-master/data/got5.txt']


In [7]:
# Read every book as UTF-8 text and concatenate them, in order, into one
# unicode string. The running character count is reported after each book.
corpus_parts = []
chars_so_far = 0
for book_filename in book_filenames:
    print("Reading '{0}'...".format(book_filename))
    with codecs.open(book_filename, "r", "utf-8") as handle:
        book_text = handle.read()
    corpus_parts.append(book_text)
    chars_so_far += len(book_text)
    print("Corpus is now {0} charachters loong".format(chars_so_far))
    print()
corpus_raw = u"".join(corpus_parts)

Reading '/home/jarvis/My projects/Machine Learning/Siraj_Akash/word_vectors_game_of_thrones-LIVE-master/data/got1.txt'...
Corpus is now 1770659 charachters loong

Reading '/home/jarvis/My projects/Machine Learning/Siraj_Akash/word_vectors_game_of_thrones-LIVE-master/data/got2.txt'...
Corpus is now 4071041 charachters loong

Reading '/home/jarvis/My projects/Machine Learning/Siraj_Akash/word_vectors_game_of_thrones-LIVE-master/data/got3.txt'...
Corpus is now 6391405 charachters loong

Reading '/home/jarvis/My projects/Machine Learning/Siraj_Akash/word_vectors_game_of_thrones-LIVE-master/data/got4.txt'...
Corpus is now 8107945 charachters loong

Reading '/home/jarvis/My projects/Machine Learning/Siraj_Akash/word_vectors_game_of_thrones-LIVE-master/data/got5.txt'...
Corpus is now 9719485 charachters loong



In [8]:
# Load NLTK's English Punkt sentence tokenizer from the downloaded 'punkt' data.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [9]:
# Split the whole corpus into a sequence of sentence strings.
raw_sentences = tokenizer.tokenize(corpus_raw)

In [10]:
def sentence_to_wordlist(raw):
    """Split a raw sentence into a list of purely alphabetic words.

    Every character outside ``a-z`` / ``A-Z`` (digits, punctuation, accented
    letters, ...) is replaced by a space before splitting on whitespace, so
    ``"don't"`` becomes ``["don", "t"]``. Case is preserved.

    Args:
        raw: the sentence text.

    Returns:
        A list of words; empty when ``raw`` contains no ASCII letters.
    """
    return re.sub("[^a-zA-Z]", " ", raw).split()

In [11]:
# Technically we tokenize sentences 
# text = u'This, is a sentence with weird» symbols… appearing everywhere¿' 
# print (mtokenizer.tokenize(text, return_str=True))
# u'This , is a sentence with weird » symbols … appearing everywhere ¿'
# After that, we clean each tokenized sentence, split it into words, and store the words in a list

# One word list per non-empty raw sentence, in corpus order.
sentences = [
    sentence_to_wordlist(raw_sentence)
    for raw_sentence in raw_sentences
    if len(raw_sentence) > 0
]

In [12]:
# Spot-check a single sentence before and after cleaning.
print(raw_sentences[5])
print(sentence_to_wordlist(raw_sentences[5]))

Heraldic crest by Virginia Norey.
[u'Heraldic', u'crest', u'by', u'Virginia', u'Norey']


In [13]:
# Total number of word tokens across all sentences.
# A materialised list (not a generator) is summed on purpose, so the result is
# the same whether `sum` is the builtin or a numpy version pulled in by a
# star-import.
sentence_lengths = [len(words) for words in sentences]
token_count = sum(sentence_lengths)
print(" The book corpus contains {0:,} tokens".format(token_count))

 The book corpus contains 1,818,103 tokens


In [14]:
# Hyperparameters handed to the Word2Vec constructor in the next cell.

# Passed as `size`: number of features (dimensions) of each word vector.
num_features = 300

# Passed as `min_count`.
# NOTE(review): presumably words seen fewer times than this are dropped from
# the vocabulary - confirm against the gensim documentation.
min_word_count = 3

# Passed as `workers`: one worker per CPU reported by multiprocessing.
num_workers = multiprocessing.cpu_count()

# Passed as `window` (context window size).
context_size = 7

# Passed as `sample` (downsampling setting).
downsampling = 1e-3

# Passed as `seed` for the model's random number generator.
seed = 1


In [16]:
# Build an untrained Word2Vec model from the hyperparameters defined above.
# sg=1 selects the skip-gram architecture (per gensim's documentation). No
# corpus is passed here, so the vocabulary scan and training are done
# explicitly in the following cells.
thrones2vec = w2v.Word2Vec(
    size=num_features,
    window=context_size,
    min_count=min_word_count,
    sample=downsampling,
    sg=1,
    workers=num_workers,
    seed=seed,
)

In [17]:
# Scan the tokenized sentences to collect words and their counts and build the
# model's vocabulary.
thrones2vec.build_vocab(sentences)

2017-03-06 17:03:28,279 : INFO : collecting all words and their counts
2017-03-06 17:03:28,280 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-03-06 17:03:28,339 : INFO : PROGRESS: at sentence #10000, processed 140984 words, keeping 10280 word types
2017-03-06 17:03:28,381 : INFO : PROGRESS: at sentence #20000, processed 279730 words, keeping 13558 word types
2017-03-06 17:03:28,426 : INFO : PROGRESS: at sentence #30000, processed 420336 words, keeping 16598 word types
2017-03-06 17:03:28,463 : INFO : PROGRESS: at sentence #40000, processed 556581 words, keeping 18324 word types
2017-03-06 17:03:28,500 : INFO : PROGRESS: at sentence #50000, processed 686247 words, keeping 19714 word types
2017-03-06 17:03:28,541 : INFO : PROGRESS: at sentence #60000, processed 828497 words, keeping 21672 word types
2017-03-06 17:03:28,584 : INFO : PROGRESS: at sentence #70000, processed 973830 words, keeping 23093 word types
2017-03-06 17:03:28,635 : INFO : PROGRESS: at 

In [18]:
# Number of distinct words kept in the vocabulary.
# The vocabulary is read through `thrones2vec.wv`, the same object the vector
# matrix is taken from later (`thrones2vec.wv.syn0`); the model-level `vocab`
# shortcut is not available in later gensim releases.
print("word2vec vocabulary length: ", len(thrones2vec.wv.vocab))



word2vec vocabulary length:  17277


In [19]:
# Train the word vectors on the tokenized sentences.
# NOTE(review): newer gensim releases are reported to require explicit
# total_examples / epochs arguments here - confirm against the installed version.
thrones2vec.train(sentences)

2017-03-06 17:03:33,685 : INFO : training model with 4 workers on 17277 vocabulary and 300 features, using sg=1 hs=0 sample=0.001 negative=5 window=7
2017-03-06 17:03:33,686 : INFO : expecting 128868 sentences, matching count from corpus used for vocabulary survey
2017-03-06 17:03:34,695 : INFO : PROGRESS: at 2.41% examples, 168999 words/s, in_qsize 8, out_qsize 0
2017-03-06 17:03:35,698 : INFO : PROGRESS: at 4.64% examples, 161371 words/s, in_qsize 8, out_qsize 0
2017-03-06 17:03:36,724 : INFO : PROGRESS: at 6.84% examples, 155334 words/s, in_qsize 8, out_qsize 0
2017-03-06 17:03:37,774 : INFO : PROGRESS: at 9.97% examples, 168393 words/s, in_qsize 8, out_qsize 0
2017-03-06 17:03:38,826 : INFO : PROGRESS: at 12.55% examples, 170080 words/s, in_qsize 8, out_qsize 0
2017-03-06 17:03:39,862 : INFO : PROGRESS: at 14.67% examples, 165215 words/s, in_qsize 8, out_qsize 0
2017-03-06 17:03:40,891 : INFO : PROGRESS: at 17.69% examples, 170442 words/s, in_qsize 8, out_qsize 0
2017-03-06 17:03:4

7021536

In [20]:
# Make sure the output directory for the trained model exists.
model_dir = "trained"
if not os.path.exists(model_dir):
    os.makedirs(model_dir)

In [21]:
# Persist the trained model under trained/ so it can be reloaded without
# retraining.
thrones2vec.save(os.path.join("trained","thrones2vec.w2v"))

2017-03-06 17:05:47,173 : INFO : saving Word2Vec object under trained/thrones2vec.w2v, separately None
2017-03-06 17:05:47,175 : INFO : not storing attribute syn0norm
2017-03-06 17:05:47,176 : INFO : not storing attribute cum_table
2017-03-06 17:05:47,724 : INFO : saved trained/thrones2vec.w2v


In [22]:
# Reload the saved model from disk, replacing the in-memory object; running the
# notebook from this cell on skips the training cells above.
thrones2vec = w2v.Word2Vec.load(os.path.join("trained", "thrones2vec.w2v"))

2017-03-06 17:05:53,313 : INFO : loading Word2Vec object from trained/thrones2vec.w2v
2017-03-06 17:05:53,479 : INFO : loading wv recursively from trained/thrones2vec.w2v.wv.* with mmap=None
2017-03-06 17:05:53,479 : INFO : setting ignored attribute syn0norm to None
2017-03-06 17:05:53,480 : INFO : setting ignored attribute cum_table to None
2017-03-06 17:05:53,481 : INFO : loaded trained/thrones2vec.w2v


In [23]:
# t-SNE reducer that projects to 2 components; random_state is pinned to 0.
tsne = sklearn.manifold.TSNE(n_components=2, random_state=0)

In [24]:
# Word-vector matrix of the trained model. Its rows are addressed by each
# vocabulary entry's `.index` when the points table is built below.
all_word_vectors_matrix = thrones2vec.wv.syn0

In [None]:
# Project every word vector down to 2 dimensions with t-SNE.
# NOTE(review): this cell has no recorded execution count; it is likely the
# slowest step of the notebook - confirm runtime before re-running.
all_word_vectors_matrix_2d = tsne.fit_transform(all_word_vectors_matrix)

In [None]:
# Table with one row per vocabulary word and its 2-D t-SNE coordinates.
# The vocabulary is read through `thrones2vec.wv` to stay consistent with the
# vector matrix (taken from `thrones2vec.wv.syn0`); each entry's `.index` is
# the row of that word in the matrix. Iterating over items() avoids a second
# dictionary lookup per word.
points = pd.DataFrame(
    [
        (word, coords[0], coords[1])
        for word, coords in (
            (word, all_word_vectors_matrix_2d[entry.index])
            for word, entry in thrones2vec.wv.vocab.items()
        )
    ],
    columns=["word", "x", "y"]
)