In [1]:
# Import standard libraries
from __future__ import absolute_import, division, print_function

import codecs # reading the book files as UTF-8 text
import glob # filename pattern matching (shell-style wildcards)
import multiprocessing # cpu_count() for the number of training workers
import os # filesystem paths and directory creation
import pprint # pretty printing
import re # regular expressions


In [2]:
# Import external libraries
import nltk # natural language processing (sentence tokenizer)
import gensim.models.word2vec as w2v # word2vec implementation
import sklearn.manifold # dimensionality reduction
import numpy as np # math
import matplotlib.pyplot as plt # plotting
import pandas as pd # tabular data
import seaborn as sns # statistical plotting

In [3]:
# Step 1 - process the data
# Fetch the NLTK resources used while cleaning the text. NLTK reports the
# packages as "already up-to-date" when they have been downloaded before.

nltk.download('punkt') # pretrained sentence tokenizer
# NOTE(review): the stopword list is not referenced in the cells visible
# here - confirm it is still needed.
nltk.download('stopwords') # words like and, the, a, an, of

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/bergsfamily/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/bergsfamily/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# Collect the path of every book text file, in alphabetical order
book_filenames = glob.glob("data_war/*.txt")
book_filenames.sort()
book_filenames

['data_war/A Dangerous Path - Erin Hunter.txt',
 "data_war/Bluestar's Prophecy - Erin Hunter.txt",
 'data_war/Dark River - Erin Hunter.txt',
 'data_war/Dawn - Erin Hunter.txt',
 'data_war/Eclipse - Erin Hunter.txt',
 'data_war/Fading Echoes - Erin Hunter.txt',
 'data_war/Fire and Ice - Erin Hunter.txt',
 'data_war/Fire in the Sky - Erin Hunter.txt',
 'data_war/Forest of Secrets - Erin Hunter.txt',
 'data_war/Into the Wild - Erin Hunter.txt',
 'data_war/Long Shadows - Erin Hunter.txt',
 'data_war/Midnight - Erin Hunter.txt',
 'data_war/Moonrise - Erin Hunter.txt',
 'data_war/Night Whispers - Erin Hunter.txt',
 'data_war/Outcast - Erin Hunter.txt',
 'data_war/Rising Storm - Erin Hunter.txt',
 'data_war/Seekers #2_ Great Bear Lake - Erin Hunter.txt',
 'data_war/Seekers #3_ Smoke Mountain - Erin Hunter.txt',
 'data_war/Seekers - Erin Hunter.txt',
 'data_war/Starlight - Erin Hunter.txt',
 'data_war/Sunrise - Erin Hunter.txt',
 'data_war/Sunset - Erin Hunter.txt',
 'data_war/The Darkest Hour

In [5]:
# Read every book and concatenate the texts into one unicode corpus.
# The texts are collected in a list and joined once at the end: growing a
# multi-megabyte string with += may copy the whole corpus on every iteration.
book_texts = []
corpus_length = 0  # running character count, reported after each book

for book_filename in book_filenames:
    print("Reading '{0}'...".format(book_filename))
    with codecs.open(book_filename, "r", "utf-8") as book_file:
        book_texts.append(book_file.read())
    corpus_length += len(book_texts[-1])
    print("Corpus is now {0} characters long".format(corpus_length))
    print()

# Books are joined without a separator, exactly as the += version did.
corpus_raw = u"".join(book_texts)

Reading 'data_war/A Dangerous Path - Erin Hunter.txt'...
Corpus in now 462754 characters long

Reading 'data_war/Bluestar's Prophecy - Erin Hunter.txt'...
Corpus in now 1137899 characters long

Reading 'data_war/Dark River - Erin Hunter.txt'...
Corpus in now 1583229 characters long

Reading 'data_war/Dawn - Erin Hunter.txt'...
Corpus in now 2052737 characters long

Reading 'data_war/Eclipse - Erin Hunter.txt'...
Corpus in now 2492331 characters long

Reading 'data_war/Fading Echoes - Erin Hunter.txt'...
Corpus in now 2958458 characters long

Reading 'data_war/Fire and Ice - Erin Hunter.txt'...
Corpus in now 3424214 characters long

Reading 'data_war/Fire in the Sky - Erin Hunter.txt'...
Corpus in now 3780524 characters long

Reading 'data_war/Forest of Secrets - Erin Hunter.txt'...
Corpus in now 4242279 characters long

Reading 'data_war/Into the Wild - Erin Hunter.txt'...
Corpus in now 4641868 characters long

Reading 'data_war/Long Shadows - Erin Hunter.txt'...
Corpus in now 5107046 

In [6]:
# Load NLTK's pretrained English Punkt tokenizer (used below to split the corpus)
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [7]:
# Split the whole corpus into raw sentences
raw_sentences = tokenizer.tokenize(corpus_raw)

In [8]:
def sentence_to_wordlist(raw):
    """Split a raw sentence into a list of words.

    Every character that is not an ASCII letter (punctuation, hyphens,
    digits, ...) is replaced by a space before the text is split on
    whitespace, so none of those characters appear in the returned words.
    """
    return re.sub("[^a-zA-Z]"," ", raw).split()

In [9]:
# Tokenize every non-empty raw sentence into its list of words
sentences = [
    sentence_to_wordlist(raw_sentence)
    for raw_sentence in raw_sentences
    if len(raw_sentence) > 0
]

In [10]:
# Spot-check the cleaning step: show one raw sentence, then its word list
print(raw_sentences[5])
print(sentence_to_wordlist(raw_sentences[5]))

CHAPTER 18

As Fireheart wearily pushed his way through the entrance to…

CHAPTER 19

Next morning, Fireheart watched the dawn patrol leave before going…

CHAPTER 20

“Will she live?” Fireheart asked anxiously.
[u'CHAPTER', u'As', u'Fireheart', u'wearily', u'pushed', u'his', u'way', u'through', u'the', u'entrance', u'to', u'CHAPTER', u'Next', u'morning', u'Fireheart', u'watched', u'the', u'dawn', u'patrol', u'leave', u'before', u'going', u'CHAPTER', u'Will', u'she', u'live', u'Fireheart', u'asked', u'anxiously']


In [11]:
# Count the word tokens across all tokenized sentences
token_count = sum(len(words) for words in sentences)
print("The book corpus contains {0:,} tokens".format(token_count))

The book corpus contains 2,755,537 tokens


# Train Word 2 Vec

In [12]:
# Step 2 - build the model
# Word vectors help with three tasks: distance, similarity and ranking.

# --- Hyperparameters ---

# Dimensionality of each word vector
# (more features = more expensive to train, but more accurate).
num_features = 300

# Context window size, passed to Word2Vec as `window`.
context_size = 7

# Minimum word count, passed to Word2Vec as `min_count`.
min_word_count = 3

# Downsampling setting for frequent words, passed to Word2Vec as `sample`.
# NOTE(review): the original note gives a range of 0 to 1e-5, yet the value
# used here is 1e-3 - confirm which one is intended.
downsampling = 1e-3

# Seed for the random number generator.
seed = 42

# One worker per CPU core (more workers = faster training).
num_workers = multiprocessing.cpu_count()

In [13]:
# Create the (not yet trained) model from the hyperparameters defined above
warriors2vec = w2v.Word2Vec(
    size=num_features,
    window=context_size,
    min_count=min_word_count,
    sample=downsampling,
    sg=1,  # presumably selects skip-gram rather than CBOW - see the gensim docs
    workers=num_workers,
    seed=seed,
)

In [14]:
# Build the model's vocabulary from the tokenized sentences
warriors2vec.build_vocab(sentences)

In [15]:
# Report how many words ended up in the vocabulary
print("Word2Vec vocabulary length:", len(warriors2vec.wv.vocab))

Word2Vec vocabulary length: 12490


In [16]:
# Train the word vectors on the tokenized sentences.
# NOTE(review): newer gensim releases require explicit `total_examples` and
# `epochs` arguments to train() - confirm against the installed version.
warriors2vec.train(sentences)

10906774

## Save the model to file

In [17]:
# make sure the output directory for the trained model exists
if not os.path.exists("trained"):
    os.makedirs("trained")

In [18]:
# Write the trained model to trained/warriors2vec.w2v
warriors2vec.save(os.path.join("trained", "warriors2vec.w2v"))

## Load the trained model - Start here

In [3]:
# Load the saved model so the exploration cells can run without repeating
# the training cells.
# NOTE(review): gensim's load() presumably unpickles the file - only load
# model files you trust.
warriors2vec = w2v.Word2Vec.load(os.path.join("trained", "warriors2vec.w2v"))

## Explore semantic similarities between book characters

In [4]:
# Words whose vectors are closest to "Firestar". Queried through the model's
# `wv` keyed vectors; the model-level most_similar shortcut is deprecated.
warriors2vec.wv.most_similar("Firestar")

[(u'Sunstar', 0.569300651550293),
 (u'Sharpclaw', 0.5651351809501648),
 (u'Sunfall', 0.5621132850646973),
 (u'Bluestar', 0.5600012540817261),
 (u'Goldenstar', 0.5596264600753784),
 (u'Pinestar', 0.5545787215232849),
 (u'Onestar', 0.5454298257827759),
 (u'Stoneteller', 0.5450897216796875),
 (u'Leopardstar', 0.5334157943725586),
 (u'Squirrelflight', 0.5309216976165771)]

In [5]:
# Words whose vectors are closest to "Graystripe". Queried through the model's
# `wv` keyed vectors; the model-level most_similar shortcut is deprecated.
warriors2vec.wv.most_similar("Graystripe")

[(u'Sandstorm', 0.6043769121170044),
 (u'Tigerclaw', 0.6010367274284363),
 (u'Leopardfur', 0.5893921256065369),
 (u'Brackenfur', 0.5840722918510437),
 (u'Dustpelt', 0.5814228057861328),
 (u'Onewhisker', 0.5764490365982056),
 (u'Whitestorm', 0.5743676424026489),
 (u'Ravenpaw', 0.5655602216720581),
 (u'Stormfur', 0.5650084018707275),
 (u'Mistyfoot', 0.5626653432846069)]

In [6]:
# Words whose vectors are closest to "Twolegs". Queried through the model's
# `wv` keyed vectors; the model-level most_similar shortcut is deprecated.
warriors2vec.wv.most_similar("Twolegs")

[(u'monsters', 0.6408638954162598),
 (u'dogs', 0.6266584396362305),
 (u'badgers', 0.5993587970733643),
 (u'Upwalkers', 0.5861537456512451),
 (u'Twoleg', 0.5844410061836243),
 (u'sheep', 0.5735294818878174),
 (u'firebeasts', 0.5730717182159424),
 (u'predators', 0.5700822472572327),
 (u'Badgers', 0.5696913003921509),
 (u'faces', 0.5696393251419067)]

In [7]:
# Words whose vectors are closest to "mousebrain". Queried through the model's
# `wv` keyed vectors; the model-level most_similar shortcut is deprecated.
warriors2vec.wv.most_similar("mousebrain")

[(u'tartly', 0.8788039684295654),
 (u'peaceably', 0.8757885694503784),
 (u'meekly', 0.8730642795562744),
 (u'sobbed', 0.8711094856262207),
 (u'earnestly', 0.8709672689437866),
 (u'helpfully', 0.8688185811042786),
 (u'Ignore', 0.8685449361801147),
 (u'sourly', 0.8636438846588135),
 (u'sulkily', 0.863560676574707),
 (u'evasively', 0.8629686832427979)]

In [8]:
def nearest_similarity_cosmul(start1, end1, end2):
    """Complete the analogy "start1 is to end1 as ? is to end2".

    Queries the global ``warriors2vec`` model with ``most_similar_cosmul``,
    using ``end2`` and ``start1`` as positive words and ``end1`` as the
    negative word, then prints the resulting analogy.

    Args:
        start1: first word of the known pair.
        end1: second word of the known pair.
        end2: second word of the pair to complete.

    Returns:
        The top-ranked word of the query (the completed ``start2``).
    """
    # Query through `wv`, as the vocabulary lookup elsewhere in this notebook
    # does; the model-level shortcut is deprecated.
    similarities = warriors2vec.wv.most_similar_cosmul(
        positive=[end2, start1],
        negative=[end1],
    )
    start2 = similarities[0][0]
    # Explicit keyword arguments instead of **locals(), so the message does not
    # depend on whatever local names happen to exist.
    print(
        "{start1} is related to {end1}, as {start2} is related to {end2}".format(
            start1=start1, end1=end1, start2=start2, end2=end2
        )
    )
    return start2

In [9]:
# Analogy queries; the notebook displays the return value of the last call.
nearest_similarity_cosmul("Rusty", "Firestar", "Graystripe")
#nearest_similarity_cosmul("Thunderclan", "Riverclan", "Firestar")
# NOTE(review): tokens are case-sensitive and the books presumably spell the
# clan name "ThunderClan" - confirm the intended vocabulary entry.
nearest_similarity_cosmul("Thunderclan", "Bluestar", "Graystripe")

Rusty is related to Firestar, as guiltily is related to Graystripe
Thunderclan is related to Bluestar, as Traitors is related to Graystripe


u'Traitors'

In [10]:
# Words whose vectors are closest to "Vinestar". Queried through the model's
# `wv` keyed vectors; the model-level most_similar shortcut is deprecated.
warriors2vec.wv.most_similar("Vinestar")

[(u'Calling', 0.8825877904891968),
 (u'Revenge', 0.8794381022453308),
 (u'Loudbelly', 0.8752279281616211),
 (u'Lakestorm', 0.8745911121368408),
 (u'raids', 0.8724934458732605),
 (u'frequent', 0.8707389235496521),
 (u'Facing', 0.8685536980628967),
 (u'joins', 0.8669978380203247),
 (u'appointment', 0.8662803173065186),
 (u'messengers', 0.8659096956253052)]

In [11]:
# Words whose vectors are closest to "Rusty". Queried through the model's
# `wv` keyed vectors; the model-level most_similar shortcut is deprecated.
warriors2vec.wv.most_similar("Rusty")

[(u'Firepaw', 0.6736335158348083),
 (u'mystified', 0.6225589513778687),
 (u'apologetical', 0.6201215982437134),
 (u'reappear', 0.6137369871139526),
 (u'guiltily', 0.6133610010147095),
 (u'resentfully', 0.6122925281524658),
 (u'Raincloud', 0.611646294593811),
 (u'Understand', 0.6109017729759216),
 (u'Cinderkit', 0.609039306640625),
 (u'sarcastic', 0.6079747676849365)]

In [12]:
# Words whose vectors are closest to "Cinderheart". Queried through the model's
# `wv` keyed vectors; the model-level most_similar shortcut is deprecated.
warriors2vec.wv.most_similar("Cinderheart")

[(u'Bumblestripe', 0.6523749828338623),
 (u'earnestly', 0.6421167850494385),
 (u'Thrushpelt', 0.6377930641174316),
 (u'worriedly', 0.6376029849052429),
 (u'Oakpaw', 0.6358127593994141),
 (u'Sally', 0.6313471794128418),
 (u'Sorreltail', 0.6303678154945374),
 (u'Dapplepaw', 0.6263765096664429),
 (u'Honeypaw', 0.6238635778427124),
 (u'Blossomfal', 0.6223466992378235)]

In [13]:
# Words whose vectors are closest to "Moon". Queried through the model's
# `wv` keyed vectors; the model-level most_similar shortcut is deprecated.
warriors2vec.wv.most_similar("Moon")

[(u'Half', 0.8751037120819092),
 (u'Rising', 0.6436961889266968),
 (u'Shy', 0.6038526296615601),
 (u'Feather', 0.5966202616691589),
 (u'Wing', 0.5944815874099731),
 (u'Fawn', 0.5932095646858215),
 (u'ising', 0.588476300239563),
 (u'Leap', 0.5794602036476135),
 (u'Fish', 0.5789985656738281),
 (u'Snowdrop', 0.5766615867614746)]