In [1]:
#natural language toolkit
import nltk
#word 2 vec
import gensim.models.word2vec as w2v
#dimensionality reduction
import sklearn.manifold
#math
import numpy as np
#plotting
import matplotlib.pyplot as plt
#parse dataset
import pandas as pd
#visualization
import seaborn as sns



In [2]:
#encoding. word encodig
import codecs
#finds all pathnames matching a pattern, like regex
import glob
#log events for libraries
import logging
#concurrency
import multiprocessing
#dealing with operating system , like reading file
import os
#pretty print, human readable
import pprint
#regular expressions
import re

In [3]:
from __future__ import absolute_import, division, print_function


In [5]:
nltk.download('punkt')
from nltk.corpus import stopwords

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\phani\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [8]:
book_filenames = sorted(glob.glob("*.txt"))
book_filenames

['alice.txt', 'book1.txt', 'book2.txt', 'shakeboys.txt', 'shakespeare.txt']

In [13]:
corpus_raw=""
for book_filename in book_filenames[1:3]:
    with codecs.open(book_filename,"r","utf-8") as fp:
        corpus_raw+=fp.read()
       

﻿This edition contains the complete text of the original hardcover edition.

NOT ONE WORD HAS BEEN


In [14]:
def sentence_to_wordlist(raw):
    clean = re.sub("[^a-zA-Z]"," ", raw)
    words = clean.split()
    return words 

In [15]:
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')


In [18]:
raw_sentences = tokenizer.tokenize(corpus_raw)


['\ufeffThis edition contains the complete text of the original hardcover edition.', 'NOT ONE WORD HAS BEEN OMITTED.', 'A CLASH OF KINGS\r\n\r\nA Bantam Spectra Book\r\n\r\nPUBLISHING HISTORY\r\n\r\nBantam Spectra hardcover edition published February 1999\r\n\r\nBantam Spectra paperback edition / September 2000\r\n\r\nSPECTRA and the portrayal of a boxed “s” are trademarks of Bantam Books, a division of Random House, Inc.\r\n\r\nAll rights reserved.', 'Copyright © 1999 by George R. R. Martin.', 'Maps by James Sinclair.', 'Heraldic crest by Virginia Norey.', 'Library of Congress Catalog Card Number: 98-37954.', 'No part of this book may be reproduced or transmitted in any form or by any means, electronic or mechanical, including photocopying, recording, or by any information storage and retrieval system, without permission in writing from the publisher.', 'Visit our website at www.bantamdell.com\r\n\r\nBantam Books, the rooster colophon, Spectra and the portrayal of a boxed “s” are regi

In [21]:
sentences=[]
for raw_sentence in raw_sentences:
    if len(raw_sentence)>0:
           sentences.append(sentence_to_wordlist(raw_sentence))
           
       

In [22]:
token_count = sum([len(sentence) for sentence in sentences])


In [25]:
num_features=300

min_word_count=3
num_workers=multiprocessing.cpu_count()

context_size=7

downsampling=1e-3
seed=1

In [26]:

thrones2vec = w2v.Word2Vec(
    sg=1,
    seed=seed,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context_size,
    sample=downsampling
)

In [29]:
thrones2vec.train(sentences, total_examples=thrones2vec.corpus_count, epochs=thrones2vec.iter)

if not os.path.exists("trained"):
    os.makedirs("trained")
thrones2vec.save(os.path.join("trained", "thrones2vec.w2v"))


In [30]:
thrones2vec = w2v.Word2Vec.load(os.path.join("trained", "thrones2vec.w2v"))


In [31]:
tsne = sklearn.manifold.TSNE(n_components=2, random_state=0)


In [33]:
all_word_vectors_matrix = thrones2vec.wv.syn0


In [34]:
all_word_vectors_matrix_2d = tsne.fit_transform(all_word_vectors_matrix)


In [37]:
points = pd.DataFrame(
    [
        (word, coords[0], coords[1])
        for word, coords in [
            (word, all_word_vectors_matrix_2d[thrones2vec.wv.vocab[word].index])
            for word in thrones2vec.wv.vocab
        ]
    ],
    columns=["word", "x", "y"]
)

In [38]:
points.head(10)


Unnamed: 0,word,x,y
0,moods,-1.616215,-6.849025
1,room,-1.680754,-5.192649
2,watching,-6.656103,3.101132
3,ceased,2.675542,-0.979468
4,spearwife,-3.228305,-0.018428
5,replace,-1.373847,2.740165
6,cleared,-5.682993,2.713599
7,whisperers,1.68994,-2.390344
8,Rushing,-3.678174,1.633046
9,fostered,6.494853,1.927716


In [39]:
sns.set_context("poster")


In [42]:
points.plot.scatter("x", "y", s=10, figsize=(20, 12))


<matplotlib.axes._subplots.AxesSubplot at 0x2b381bc5630>

In [43]:

def plot_region(x_bounds, y_bounds):
    slice = points[
        (x_bounds[0] <= points.x) &
        (points.x <= x_bounds[1]) & 
        (y_bounds[0] <= points.y) &
        (points.y <= y_bounds[1])
    ]
    
    ax = slice.plot.scatter("x", "y", s=35, figsize=(10, 8))
    for i, point in slice.iterrows():
        ax.text(point.x + 0.005, point.y + 0.005, point.word, fontsize=11)

In [44]:
plot_region(x_bounds=(4.0, 4.2), y_bounds=(-0.5, -0.1))
