In [1]:
#  word to vector in python

import codecs
import glob
import logging
import multiprocessing
import os
import pprint
import re

In [4]:
import nltk
import gensim.models.word2vec as w2v
import sklearn.manifold
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [5]:
logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s',level= logging.INFO)

In [6]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Akkash\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [7]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Akkash\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
corpus_raw =u""
with codecs.open('kafka.txt',"r","utf-8") as book_file:
    corpus_raw+= book_file.read()
    print('corpus is {0} characters long'.format(len(corpus_raw)))

corpus is 137628 characters long


In [9]:
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
raw_sentence = tokenizer.tokenize(corpus_raw)

In [10]:
def sentence_to_word(raw):
    clean = re.sub("[^a-zA-Z]"," ",raw)
    words = clean.split()
    return words

In [13]:
sentence = []
for raw_sent in raw_sentence:
    if len(raw_sent) > 0:
        sentence.append(sentence_to_word(raw_sent))

In [16]:
print raw_sentence[6]

It wasn't a dream.


In [17]:
print sentence_to_word(raw_sentence[6])

[u'It', u'wasn', u't', u'a', u'dream']


In [18]:
#Word 2 Vec

num_features = 300
min_word_count =3
num_workers = multiprocessing.cpu_count()

context_size = 7

downsampling = 1e-3

seed=1

In [28]:
kafka2vec = w2v.Word2Vec(sg=1,seed=seed,workers=num_workers,size=num_features,min_count=min_word_count,
                         window=context_size,sample=downsampling)

In [29]:
kafka2vec.build_vocab(sentence)

2018-02-02 12:13:15,338: INFO: collecting all words and their counts
2018-02-02 12:13:15,339: INFO: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-02-02 12:13:15,354: INFO: collected 3229 word types from a corpus of 25415 raw words and 917 sentences
2018-02-02 12:13:15,355: INFO: Loading a fresh vocabulary
2018-02-02 12:13:15,362: INFO: min_count=3 retains 1118 unique words (34% of original 3229, drops 2111)
2018-02-02 12:13:15,362: INFO: min_count=3 leaves 22814 word corpus (89% of original 25415, drops 2601)
2018-02-02 12:13:15,374: INFO: deleting the raw counts dictionary of 3229 items
2018-02-02 12:13:15,375: INFO: sample=0.001 downsamples 72 most-common words
2018-02-02 12:13:15,375: INFO: downsampling leaves estimated 15494 word corpus (67.9% of prior 22814)
2018-02-02 12:13:15,375: INFO: estimated required memory for 1118 words and 300 dimensions: 3242200 bytes
2018-02-02 12:13:15,385: INFO: resetting layer weights


In [30]:
kafka2vec.vector_size

300

In [31]:
if not os.path.exists("trained"):
    os.makedirs("trained")

In [33]:
kafka2vec.save(os.path.join("trained", "kafka2v.w2v"))

2018-02-02 12:13:47,756: INFO: saving Word2Vec object under trained\kafka2v.w2v, separately None
2018-02-02 12:13:47,759: INFO: not storing attribute syn0norm
2018-02-02 12:13:47,762: INFO: not storing attribute cum_table
2018-02-02 12:13:47,788: INFO: saved trained\kafka2v.w2v


In [39]:
kafka2vec = w2v.Word2Vec.load(os.path.join("trained", "kafka2v.w2v"))

2018-02-02 12:17:48,016: INFO: loading Word2Vec object from trained\kafka2v.w2v
2018-02-02 12:17:48,035: INFO: loading wv recursively from trained\kafka2v.w2v.wv.* with mmap=None
2018-02-02 12:17:48,035: INFO: setting ignored attribute syn0norm to None
2018-02-02 12:17:48,038: INFO: setting ignored attribute cum_table to None
2018-02-02 12:17:48,040: INFO: loaded trained\kafka2v.w2v


In [40]:
tsne = sklearn.manifold.TSNE(n_components=2, random_state=0)

In [43]:
all_vec_word = kafka2vec.syn0_lockf

In [44]:
all_vec_word

array([ 1.,  1.,  1., ...,  1.,  1.,  1.], dtype=float32)

In [55]:
kafka2vec.wv.most_similar('strain')

[(u'forced', 0.17502176761627197),
 (u'www', 0.1726372092962265),
 (u'official', 0.16279907524585724),
 (u'their', 0.1627403199672699),
 (u'these', 0.16127660870552063),
 (u'come', 0.1602317839860916),
 (u'behind', 0.15903502702713013),
 (u'current', 0.15684396028518677),
 (u'applicable', 0.15366685390472412),
 (u'eBooks', 0.15197433531284332)]