In [16]:
# Import standard libraries
from __future__ import absolute_import, division, print_function

import codecs # for word encoding
import glob # for finding files by shell-style wildcard pattern
import multiprocessing # concurrency
import os # os stuff, like reading a file
import pprint # pretty printing
import re # regular expressions


In [17]:
# Import external libraries
import nltk # natural language processing
import gensim.models.word2vec as w2v # word 2 vec
import sklearn.manifold #dimensionality reduction
import numpy as np # math
import matplotlib.pyplot as plt # plotting
import pandas as pd
import seaborn as sns

In [18]:
# Step 1 - process the data
# clean data

# Fetch the NLTK resources referenced by this notebook. Each call reports its
# progress on stdout; the value of the last call is what the cell displays.
nltk.download('punkt') # pretrained tokenizer
nltk.download('stopwords') # words like and, the, a, an, of

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/bergsfamily/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/bergsfamily/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [19]:
# Collect every .txt file in data_alt_inv/ and put the paths in alphabetical
# order so the corpus is always assembled in the same sequence.
book_filenames = glob.glob("data_alt_inv/*.txt")
book_filenames.sort()
book_filenames

['data_alt_inv/alt_inv_ch01.txt']

In [31]:
# Build one unicode string holding the text of every file, in list order,
# printing the running corpus size after each file is appended.
corpus_raw = u""

for book_filename in book_filenames:
    print("Reading '{0}'...".format(book_filename))
    with codecs.open(book_filename, mode="r", encoding="utf-8") as book_file:
        chapter_text = book_file.read()
    corpus_raw = corpus_raw + chapter_text
    print("Corpus in now {0} characters long".format(len(corpus_raw)))
    print()

Reading 'data_alt_inv/alt_inv_ch01.txt'...
Corpus in now 60389 characters long



In [32]:
# Load the pretrained English Punkt sentence tokenizer from the NLTK data directory
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [33]:
# Split the raw corpus text into sentences using the tokenizer loaded above
raw_sentences = tokenizer.tokenize(corpus_raw)

In [34]:
def sentence_to_wordlist(raw):
    """Return the words of ``raw`` as a list of strings.

    A word is any unbroken run of ASCII letters (a-z, A-Z). Everything else
    (punctuation, hyphens, digits, whitespace) acts as a separator and is
    dropped, so "3rd" becomes "rd". Case is preserved. Text with no letters
    yields an empty list.
    """
    return re.findall("[a-zA-Z]+", raw)

In [35]:
# One word list per non-empty raw sentence, in the original sentence order
sentences = [
    sentence_to_wordlist(raw_sentence)
    for raw_sentence in raw_sentences
    if raw_sentence
]

In [36]:
# Sanity check: show the first raw sentence next to its tokenized form.
# NOTE(review): in the saved output the first "sentence" is website navigation
# text ("Skip to content", "Sign Out", ...), so the corpus appears to contain
# page boilerplate as well as book text - consider stripping it before training.
print(raw_sentences[0])
print(sentence_to_wordlist(raw_sentences[0]))

Skip to content
Safari Home
Recommended
Queue
History
Topics
Tutorials
Offers & Deals
Newsletters
Highlights
Settings
Support
Sign Out
Table of Contents for  Alternative Investments: CAIA Level I, 3rd Edition
CLOSE
Cover image for Alternative Investments: CAIA Level I, 3rd Edition
 publisher logo Alternative Investments: CAIA Level I, 3rd Edition
by Donald R. Chambers; Hossein Kazemi; CAIA Association; Mark J. P. Anson; Keith H. Black
Published by John Wiley & Sons, 2015
Preface (05:45 mins)
Acknowledgments (01:09 mins)
About the Authors (02:18 mins)
PART 1 Introduction to Alternative Investments
 CHAPTER 1 What Is an Alternative Investment?
[u'Skip', u'to', u'content', u'Safari', u'Home', u'Recommended', u'Queue', u'History', u'Topics', u'Tutorials', u'Offers', u'Deals', u'Newsletters', u'Highlights', u'Settings', u'Support', u'Sign', u'Out', u'Table', u'of', u'Contents', u'for', u'Alternative', u'Investments', u'CAIA', u'Level', u'I', u'rd', u'Edition', u'CLOSE', u'Cover', u'image', 

In [37]:
# Total number of word tokens across all tokenized sentences
token_count = sum(len(words) for words in sentences)
print("The book corpus contains {0:,} tokens".format(token_count))

The book corpus contains 8,749 tokens


# Train Word 2 Vec

In [38]:
# Step 2 - build the models
# 3 tasks vectors help with
# Distance, Similarity, Ranking


# define hyperparameters
# Dimensionality of each word vector (passed to Word2Vec as `size`).
num_features = 300 # more features = more expensive to train, but more accurate
# Minimum word frequency (passed to Word2Vec as `min_count`).
min_word_count = 3
# One worker thread per CPU core reported by the OS (passed as `workers`).
num_workers = multiprocessing.cpu_count() # more workers = faster training
# Context window size (passed to Word2Vec as `window`).
context_size = 7

# Downsample setting for frequent words (passed to Word2Vec as `sample`).
# gensim's documentation quotes a useful range of 0 to 1e-5.
# NOTE(review): the value used here (1e-3) lies outside that quoted range -
# confirm it is intentional.
downsampling = 1e-3

# Seed for random number generator
# NOTE(review): with more than one worker thread, training is presumably not
# fully reproducible even with a fixed seed - confirm against the gensim docs.
seed = 42

In [39]:
# Create the (still untrained) Word2Vec model from the hyperparameters defined
# in the previous cell. No sentences are passed here, so vocabulary building
# and training happen in separate, later steps.
altinv2vec = w2v.Word2Vec(
    size=num_features,
    window=context_size,
    min_count=min_word_count,
    sample=downsampling,
    sg=1,  # same flag value as before; selects the training algorithm
    seed=seed,
    workers=num_workers,
)

In [40]:
# Build the model's vocabulary from the tokenized sentences
altinv2vec.build_vocab(sentences)

In [41]:
# Report how many distinct words made it into the vocabulary
print("Word2Vec vocabulary length:", len(altinv2vec.wv.vocab))

Word2Vec vocabulary length: 545


In [42]:
# Train the model on the tokenized sentences; the call's return value is what
# the cell displays.
# NOTE(review): newer gensim releases presumably require explicit
# `total_examples` and `epochs` arguments here - confirm against the installed
# gensim version before re-running.
altinv2vec.train(sentences)



23751

## Save the model to file

In [43]:
# save the model
# Create the "trained" output directory on first run so the save below has
# somewhere to write to.
if not os.path.exists("trained"):
    os.makedirs("trained")

In [44]:
# Write the model to the file "altinv2vec.w2v" inside the "trained" directory
altinv2vec.save(os.path.join("trained", "altinv2vec.w2v"))

## Load the trained model - Start here

In [45]:
# load the model - in case this is re-run
# Reads back the file written by the save cell, replacing any in-memory model.
# NOTE(review): model files are presumably pickle-based - only load files you trust.
altinv2vec = w2v.Word2Vec.load(os.path.join("trained", "altinv2vec.w2v"))

## Explore semantic similarities between terms in the book

In [46]:
# Nearest neighbours of "Hedge" (lookups are case-sensitive: the tokenizer keeps case).
# NOTE(review): every score in the saved output is above 0.999, which suggests
# the vectors have barely separated - likely too little text (8,749 tokens) for
# 300-dimensional vectors. Treat the rankings with caution.
altinv2vec.most_similar("Hedge")

[(u'Funds', 0.9999245405197144),
 (u'Equity', 0.9998708963394165),
 (u'Real', 0.9997694492340088),
 (u'PART', 0.9997391700744629),
 (u'Foundations', 0.9996857643127441),
 (u'Risk', 0.9996431469917297),
 (u'Private', 0.9994067549705505),
 (u'Credit', 0.9992772936820984),
 (u'Investments', 0.9992425441741943),
 (u'Management', 0.9991354942321777)]

In [47]:
# Nearest neighbours of "Real"
altinv2vec.most_similar("Real")

[(u'PART', 0.9998748302459717),
 (u'Risk', 0.999846875667572),
 (u'Equity', 0.9998409152030945),
 (u'Foundations', 0.999832034111023),
 (u'Funds', 0.9998094439506531),
 (u'Hedge', 0.9997694492340088),
 (u'Private', 0.9996858835220337),
 (u'Credit', 0.9996574521064758),
 (u'Management', 0.9995602965354919),
 (u'Assets', 0.9995435476303101)]

In [52]:
# Nearest neighbours of "commodities"
altinv2vec.most_similar("commodities")

[(u'standard', 0.9998104572296143),
 (u'absolute', 0.9998100399971008),
 (u'rather', 0.9998042583465576),
 (u'category', 0.9998033046722412),
 (u'believed', 0.9998025298118591),
 (u'then', 0.9998018741607666),
 (u'arbitrage', 0.9998009204864502),
 (u'different', 0.9997997283935547),
 (u'there', 0.9997996687889099),
 (u'equities', 0.9997979402542114)]

In [54]:
# Nearest neighbours of "investment".
# NOTE(review): the saved output consists almost entirely of stop words
# ("not", "is", "to", "the", ...). The NLTK stopwords list is downloaded earlier
# but is not applied anywhere in the visible cells - consider filtering.
altinv2vec.most_similar("investment")

[(u'not', 0.9997887015342712),
 (u'is', 0.9997864961624146),
 (u'to', 0.9997845888137817),
 (u'the', 0.9997729659080505),
 (u'a', 0.99976646900177),
 (u'or', 0.9997627139091492),
 (u'on', 0.9997608661651611),
 (u'that', 0.9997460246086121),
 (u'through', 0.9997372031211853),
 (u'which', 0.9997333288192749)]

In [None]:
def nearest_similarity_cosmul(start1, end1, end2):
    """Complete the analogy "start1 is to end1 as ? is to end2".

    Queries the ``altinv2vec`` word vectors with ``most_similar_cosmul``,
    using ``end2`` and ``start1`` as positive terms and ``end1`` as the
    negative term, and takes the top-ranked word as the answer. The completed
    analogy is printed as a sentence.

    Parameters
    ----------
    start1, end1, end2 : str
        Words to look up in the model. Lookups are case-sensitive because the
        tokenizer in this notebook preserves case.

    Returns
    -------
    The top-ranked word returned by the query.
    """
    # Query through `.wv`, as the vocabulary lookup elsewhere in this notebook
    # does; calling the query methods on the model object itself is deprecated.
    similarities = altinv2vec.wv.most_similar_cosmul(
        positive=[end2, start1],
        negative=[end1],
    )
    # Results are (word, score) pairs; keep only the best word.
    start2 = similarities[0][0]
    # Name the fields explicitly rather than passing **locals(), so the
    # message does not depend on whatever locals happen to exist.
    print("{start1} is related to {end1}, as {start2} is related to {end2}".format(
        start1=start1, end1=end1, start2=start2, end2=end2))
    return start2

In [None]:
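# NOTE: the example calls below are left over from a different corpus; their
# words are presumably not in this model's vocabulary. Replace them with terms
# from this book (e.g. "Hedge", "Funds", "Equity") before uncommenting.
#nearest_similarity_cosmul("Rusty", "Firestar", "Graystripe")
#nearest_similarity_cosmul("Thunderclan", "Riverclan", "Firestar")
#nearest_similarity_cosmul("Thunderclan", "Bluestar", "Graystripe")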