## Word Vector

In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as  plt

%matplotlib inline

In [8]:
import logging
import multiprocessing
from gensim.corpora import WikiCorpus
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

```
def logging(name, level):
    logger = logging.getLogger(name)
    logging.basicConfig(format="%(asctime)s : %(levelname)s : %(message)s")
    logging.root.setLevel(level=level)
```

In [17]:
def train_model(inp, out):
    logger = logging.getLogger("word2vec-training")
    logging.basicConfig(format="%(asctime)s : %(levelname)s : %(message)s")
    logging.root.setLevel(level=logging.INFO)
    #logging("word2vec-training", "logging.INFO")
    
    model = Word2Vec(LineSentence(inp), size=100, window=5,
                    min_count=5, workers=multiprocessing.cpu_count())
    
    model.init_sims(replace=True)
    model.save(out)

In [18]:
train_model("data/reddit-small.txt", "word2vec.model")

2017-09-23 12:47:41,576 : INFO : collecting all words and their counts
2017-09-23 12:47:41,597 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-09-23 12:47:41,680 : INFO : collected 11440 word types from a corpus of 105198 raw words and 5000 sentences
2017-09-23 12:47:41,682 : INFO : Loading a fresh vocabulary
2017-09-23 12:47:41,702 : INFO : min_count=5 retains 2362 unique words (20% of original 11440, drops 9078)
2017-09-23 12:47:41,703 : INFO : min_count=5 leaves 90968 word corpus (86% of original 105198, drops 14230)
2017-09-23 12:47:41,722 : INFO : deleting the raw counts dictionary of 11440 items
2017-09-23 12:47:41,728 : INFO : sample=0.001 downsamples 55 most-common words
2017-09-23 12:47:41,731 : INFO : downsampling leaves estimated 70796 word corpus (77.8% of prior 90968)
2017-09-23 12:47:41,735 : INFO : estimated required memory for 2362 words and 100 dimensions: 3070600 bytes
2017-09-23 12:47:41,753 : INFO : resetting layer weights
2017-09-23 

In [19]:
model = Word2Vec.load("word2vec.model")

2017-09-23 12:58:34,361 : INFO : loading Word2Vec object from word2vec.model
2017-09-23 12:58:34,394 : INFO : loading wv recursively from word2vec.model.wv.* with mmap=None
2017-09-23 12:58:34,396 : INFO : setting ignored attribute syn0norm to None
2017-09-23 12:58:34,404 : INFO : setting ignored attribute cum_table to None
2017-09-23 12:58:34,406 : INFO : loaded word2vec.model


In [20]:
model['money'] # model will throw the error if the word is not existing in the corpus

array([ 0.04299842, -0.00210287, -0.00880035, -0.08522378, -0.08817672,
        0.24061784, -0.0367457 , -0.06310907, -0.03914295, -0.02903455,
        0.06758396, -0.10153999,  0.09833147, -0.23931736,  0.18460944,
        0.00940851,  0.03951518, -0.00881359, -0.16723104, -0.00763027,
       -0.09344571, -0.02004526, -0.11682951,  0.00800047, -0.14296281,
        0.19845377, -0.10867716,  0.03706482, -0.10137745, -0.1381615 ,
        0.06025601,  0.07049808,  0.1427194 , -0.13049455, -0.00603143,
        0.02047752, -0.02973599,  0.13603571,  0.05601817,  0.09427408,
       -0.00966859, -0.04366736,  0.07928373,  0.02855669, -0.18951257,
        0.06192216, -0.06663023, -0.02274433, -0.02263743, -0.07514915,
        0.06249481, -0.09202767,  0.01306248,  0.00564152,  0.04090673,
        0.08512877, -0.10707233,  0.02166586,  0.01318651,  0.0954906 ,
        0.05085073, -0.1216623 ,  0.19160366, -0.0363052 ,  0.13064745,
       -0.08109435,  0.19012621,  0.09782867,  0.24658716,  0.02

In [25]:
model.most_similar('money')

2017-09-23 13:09:01,528 : INFO : precomputing L2-norms of word weight vectors


[('new', 0.9998865127563477),
 ('though', 0.9998791217803955),
 ('makes', 0.9998786449432373),
 ('which', 0.9998775720596313),
 ('without', 0.9998770356178284),
 ('different', 0.9998747706413269),
 ('end', 0.9998742341995239),
 ('usually', 0.9998738765716553),
 ('yeah', 0.9998735785484314),
 ('either', 0.9998730421066284)]

In [27]:
model.similarity('money', 'amount')

0.99980049492176148

In [29]:
model.accuracy('money')

FileNotFoundError: [Errno 2] No such file or directory: 'money'