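## Word Vectors

Train a small Word2Vec model with gensim, explore word and sentence similarity, then inspect spaCy's pretrained vectors and annotations.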

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as  plt

%matplotlib inline

In [2]:
import logging
import multiprocessing
from gensim.corpora import WikiCorpus
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

Using Theano backend.


```
def logging(name, level):
    """Configure root logging and return the logger called ``name``.

    NOTE(review): this function's own name rebinds the global ``logging``, so
    once it is defined any later ``logging.getLogger(...)`` resolves to this
    function rather than the standard-library module. The module is therefore
    imported locally under an alias. Consider renaming the function before
    promoting this snippet to an executable cell.

    Args:
        name: Name of the logger to fetch.
        level: Level applied to the root logger (a ``logging`` level constant
            such as ``logging.INFO``, not its name as a string).

    Returns:
        The ``logging.Logger`` registered under ``name``.
    """
    import logging as _logging  # local alias: the global name is this function

    logger = _logging.getLogger(name)
    _logging.basicConfig(format="%(asctime)s : %(levelname)s : %(message)s")
    _logging.root.setLevel(level=level)
    return logger
```

In [3]:
def train_model(inp, out, size=100, window=5, min_count=5, workers=None):
    """Train a Word2Vec model on a line-per-sentence text file and save it.

    Args:
        inp: Path to the corpus; it is read through ``LineSentence``.
        out: Path the trained model is saved to.
        size: Dimensionality of the word vectors.
        window: Context window size forwarded to ``Word2Vec``.
        min_count: Words occurring fewer times than this are dropped from the
            vocabulary.
        workers: Number of training workers; ``None`` (the default) uses
            ``multiprocessing.cpu_count()``.

    Side effects:
        Sets the root logger to INFO so gensim's progress messages are shown,
        and writes the model to ``out``.
    """
    logger = logging.getLogger("word2vec-training")
    logging.basicConfig(format="%(asctime)s : %(levelname)s : %(message)s")
    logging.root.setLevel(level=logging.INFO)

    if workers is None:
        workers = multiprocessing.cpu_count()

    logger.info("training Word2Vec on %s", inp)
    model = Word2Vec(LineSentence(inp), size=size, window=window,
                     min_count=min_count, workers=workers)

    # NOTE(review): per the gensim docs, replace=True keeps only the normalised
    # vectors, so the saved model cannot be trained further — confirm that is
    # acceptable for downstream use.
    model.init_sims(replace=True)
    model.save(out)
    logger.info("saved model to %s", out)

In [4]:
# Train on data/reddit-small.txt and save the result as word2vec.model
# (both paths are relative to the notebook's working directory).
train_model("data/reddit-small.txt", "word2vec.model")

2017-09-23 15:57:05,594 : INFO : collecting all words and their counts
2017-09-23 15:57:05,615 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-09-23 15:57:05,774 : INFO : collected 11440 word types from a corpus of 105198 raw words and 5000 sentences
2017-09-23 15:57:05,775 : INFO : Loading a fresh vocabulary
2017-09-23 15:57:05,794 : INFO : min_count=5 retains 2362 unique words (20% of original 11440, drops 9078)
2017-09-23 15:57:05,797 : INFO : min_count=5 leaves 90968 word corpus (86% of original 105198, drops 14230)
2017-09-23 15:57:05,814 : INFO : deleting the raw counts dictionary of 11440 items
2017-09-23 15:57:05,819 : INFO : sample=0.001 downsamples 55 most-common words
2017-09-23 15:57:05,822 : INFO : downsampling leaves estimated 70796 word corpus (77.8% of prior 90968)
2017-09-23 15:57:05,824 : INFO : estimated required memory for 2362 words and 100 dimensions: 3070600 bytes
2017-09-23 15:57:05,842 : INFO : resetting layer weights
2017-09-23 

In [5]:
# Reload the model that train_model() saved, so the cells below query the
# persisted artifact rather than an in-memory object.
model = Word2Vec.load("word2vec.model")

2017-09-23 15:57:06,581 : INFO : loading Word2Vec object from word2vec.model
2017-09-23 15:57:06,665 : INFO : loading wv recursively from word2vec.model.wv.* with mmap=None
2017-09-23 15:57:06,666 : INFO : setting ignored attribute syn0norm to None
2017-09-23 15:57:06,668 : INFO : setting ignored attribute cum_table to None
2017-09-23 15:57:06,671 : INFO : loaded word2vec.model


In [6]:
# Look up the learned vector for a single word. Indexing raises an error when
# the word is not in the model's vocabulary, e.g. it never occurred in the
# corpus or was dropped by min_count during training.
model['money']

array([ -4.71556745e-03,   4.69302833e-02,   1.38287038e-01,
        -3.66218458e-03,   7.15303123e-02,   5.62157221e-02,
        -8.60737562e-02,   6.27359748e-02,  -7.36661442e-03,
        -1.58841148e-01,   4.23208512e-02,   1.38949985e-02,
         1.83057617e-02,   2.88402699e-02,   6.04900308e-02,
         1.83993295e-01,   1.73676297e-01,  -2.65602320e-02,
        -1.46508232e-01,  -1.07726209e-01,   9.79714096e-02,
         5.27226329e-02,   2.91530229e-03,   6.25990033e-02,
        -6.74230307e-02,  -1.66072801e-01,  -4.92809936e-02,
        -7.08616152e-03,  -1.45713642e-01,   1.00677744e-01,
        -8.28253180e-02,  -8.98007676e-02,  -2.96194553e-02,
        -8.60425120e-04,   7.13192075e-02,  -4.98724245e-02,
        -6.44445866e-02,   6.31915778e-02,   2.86446977e-03,
         1.22054495e-01,   4.06177305e-02,  -7.61726648e-02,
         8.48585963e-02,   9.49178115e-02,  -1.41638592e-01,
        -1.10547736e-01,   1.49406374e-01,   6.58446923e-02,
         8.06896761e-02,

In [7]:
# Nearest neighbours of 'money' in the embedding space.
# NOTE(review): every score in the recorded output is ~0.9999 and the neighbours
# are unrelated common words, so the vectors look barely differentiated —
# presumably the training corpus is too small; treat this as illustrative only.
model.most_similar('money')

2017-09-23 15:57:06,716 : INFO : precomputing L2-norms of word weight vectors


[('which', 0.9999130964279175),
 ('long', 0.9998975396156311),
 ('better', 0.9998939037322998),
 ('point', 0.9998928904533386),
 ('though', 0.9998907446861267),
 ('around', 0.9998847246170044),
 ('most', 0.9998843669891357),
 ('come', 0.9998841285705566),
 ('take', 0.9998818039894104),
 ('god', 0.9998811483383179)]

In [8]:
# Similarity between the vectors of two individual words.
# NOTE(review): the recorded ~0.9997 is no higher than the scores of the
# unrelated words returned by most_similar, so it says little about relatedness.
model.similarity('money', 'amount')

0.99973765497596645

In [9]:
def cosine_similarity(inp1, inp2):
    """Return the cosine of the angle between two vectors.

    Args:
        inp1: First vector (anything ``np.dot`` / ``np.linalg.norm`` accept).
        inp2: Second vector, with the same length as ``inp1``.

    Returns:
        ``dot(inp1, inp2) / (norm(inp1) * norm(inp2))``, or ``0.0`` when either
        vector has zero norm.
    """
    # Compute the dot product first so mismatched shapes still raise as before.
    dot_product = np.dot(inp1, inp2)
    denominator = np.linalg.norm(inp1) * np.linalg.norm(inp2)
    if denominator == 0:
        # A zero vector has no direction; return 0.0 instead of dividing by
        # zero, which would yield NaN plus a RuntimeWarning.
        return 0.0
    return dot_product / denominator
    

In [10]:
# Sanity check on two nearly parallel vectors: the result should be just below 1.
cosine_similarity([1,2,3], [1,2,4])

0.9914601339836675

In [11]:
def average_similarity(text1, text2):
    """Compare two texts by the cosine similarity of their mean word vectors.

    Each text is lower-cased and split on whitespace, every token is looked up
    in the global ``model``, and the per-text vectors are averaged before being
    passed to ``cosine_similarity``. A token that ``model`` cannot index makes
    the lookup fail.
    """
    tokens_a = text1.lower().split()
    tokens_b = text2.lower().split()

    # One matrix per text, one row per token (built in the order a, then b).
    matrix_a, matrix_b = (
        np.array([model[token] for token in tokens])
        for tokens in (tokens_a, tokens_b)
    )

    centroid_a = np.mean(matrix_a, axis=0)
    centroid_b = np.mean(matrix_b, axis=0)
    return cosine_similarity(centroid_a, centroid_b)

In [12]:
# Two short texts to compare. average_similarity looks every word up in `model`,
# so each word here must be in the trained vocabulary.
sentences = ["the king money", "happy about long story"]

In [13]:
# Cosine similarity between the averaged word vectors of the two texts.
average_similarity(sentences[0], sentences[1])

0.99981719

In [14]:
import spacy
import en_core_web_md

In [15]:
# Load the en_core_web_md spaCy pipeline; calling `nlp` on raw text returns a
# spacy Doc (see the type(doc) cell below).
nlp = en_core_web_md.load()

In [20]:
# Example sentence analysed by the spaCy cells below.
sentence = 'Ramanujan was the great mathematician.'

In [23]:
# Run the pipeline on the sentence; the last line displays the type of the result.
doc = nlp(sentence)
type(doc)

spacy.tokens.doc.Doc

In [28]:
# token.vector holds the word vector; it is all zeros for tokens the model has
# no vector for, which is presumably why "Ramanujan" scores 0.0 below.
math_doc = nlp("math")  # parse the reference word once, not once per token
for token in doc:
    # Columns: text, coarse POS, fine-grained tag, entity type, similarity to "math".
    print(token, token.pos_, token.tag_, token.ent_type_, token.similarity(math_doc))

Ramanujan PROPN NNP PERSON 0.0
was VERB VBD  0.199599749025
the DET DT  0.190436797865
great ADJ JJ  0.228298441797
mathematician NOUN NN  0.382748813508
. PUNCT .  0.209577768661


In [29]:
# Named entities recognised in the doc (displayed as a tuple).
doc.ents

(Ramanujan,)

In [34]:
# NOTE(review): this only inspects the attribute's type and never calls it —
# presumably doc.print_tree() was intended; confirm.
type(doc.print_tree)

builtin_function_or_method

In [35]:
# Noun phrases found in the doc, wrapped in list() so they are displayed.
list(doc.noun_chunks)

[Ramanujan, the great mathematician]

In [37]:
# Sentences detected in the doc, wrapped in list() so they are displayed.
list(doc.sents)

[Ramanujan was the great mathematician.]