Importing Libraries

In [6]:
import logging
import multiprocessing
import re
from gensim.corpora import WikiCorpus
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# Base directory for the corpus file and the saved model. Later cells build
# paths with plain string concatenation (dirpath + filename), so the value
# must end with a trailing slash.
# NOTE(review): hard-coded absolute, user-specific path — adjust before
# running on another machine.
dirpath = 'C:/Users/ankit.bhatia/Documents/GitHub/PythonScripts/data/'

Import data

In [7]:
# Load the raw corpus for a quick visual check.
# The separator has to be the tab escape "\t"; the earlier "/t" was a literal
# slash followed by "t", which is not a tab.
dataset = pd.read_csv(dirpath + 'reddit-small.txt', delimiter="\t")
dataset.head()

  if __name__ == '__main__':


Unnamed: 0,one has european accent either because doesn exist there are accents from europe but not european accent
0,mid twenties male rocking skinny jeans pants h...
1,honestly wouldn have believed didn live she ma...
2,money just driver license credit cards and sub...
3,smoking tobacco went from shitty pall malls ma...
4,that one reason for but not the only one the o...


Define Model

In [8]:
def train_model(inp, out):
    """Train a Word2Vec model on a text corpus and save it to disk.

    Parameters
    ----------
    inp : str
        Path of the corpus file; it is wrapped in gensim's ``LineSentence``.
    out : str
        Path the trained model is written to with ``model.save``.

    Returns
    -------
    None
        The model is only persisted to ``out``; reload it with
        ``Word2Vec.load(out)``.

    Notes
    -----
    As a side effect the root logger gets a timestamped format and is set to
    INFO level.
    """
    logging.basicConfig(format="%(asctime)s:%(levelname)s:%(message)s")
    logging.root.setLevel(level=logging.INFO)

    # NOTE(review): `size` is the pre-4.0 gensim name of this parameter
    # (renamed `vector_size` in gensim 4) — confirm the installed version.
    model = Word2Vec(
        LineSentence(inp),
        size=100,
        window=5,
        min_count=5,
        workers=multiprocessing.cpu_count(),
    )
    # Per the gensim docs, replace=True keeps only the L2-normalised vectors,
    # which saves memory but means the model cannot be trained further.
    model.init_sims(replace=True)
    model.save(out)

Train Model

In [9]:
# Train on reddit-small.txt and store the resulting model as word-vec_out,
# both located under dirpath.
train_model(
    inp=dirpath + "reddit-small.txt",
    out=dirpath + "word-vec_out",
)

2017-09-23 21:49:15,622:INFO:collecting all words and their counts
2017-09-23 21:49:15,622:INFO:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-09-23 21:49:15,714:INFO:collected 11440 word types from a corpus of 105198 raw words and 5000 sentences
2017-09-23 21:49:15,714:INFO:Loading a fresh vocabulary
2017-09-23 21:49:15,734:INFO:min_count=5 retains 2362 unique words (20% of original 11440, drops 9078)
2017-09-23 21:49:15,735:INFO:min_count=5 leaves 90968 word corpus (86% of original 105198, drops 14230)
2017-09-23 21:49:15,758:INFO:deleting the raw counts dictionary of 11440 items
2017-09-23 21:49:15,762:INFO:sample=0.001 downsamples 55 most-common words
2017-09-23 21:49:15,763:INFO:downsampling leaves estimated 70796 word corpus (77.8% of prior 90968)
2017-09-23 21:49:15,765:INFO:estimated required memory for 2362 words and 100 dimensions: 3070600 bytes
2017-09-23 21:49:15,799:INFO:resetting layer weights
2017-09-23 21:49:15,884:INFO:training model with 4 work

Load Model

In [10]:
# Reload the model that train_model() saved; the cells below use this
# notebook-level `model`.
model = Word2Vec.load(dirpath + "word-vec_out")

2017-09-23 21:49:16,906:INFO:loading Word2Vec object from C:/Users/ankit.bhatia/Documents/GitHub/PythonScripts/data/word-vec_out
2017-09-23 21:49:17,085:INFO:loading wv recursively from C:/Users/ankit.bhatia/Documents/GitHub/PythonScripts/data/word-vec_out.wv.* with mmap=None
2017-09-23 21:49:17,085:INFO:setting ignored attribute syn0norm to None
2017-09-23 21:49:17,085:INFO:setting ignored attribute cum_table to None
2017-09-23 21:49:17,085:INFO:loaded C:/Users/ankit.bhatia/Documents/GitHub/PythonScripts/data/word-vec_out


In [11]:
# Look up the embedding of 'money' through the model's KeyedVectors (`wv`).
# Indexing the Word2Vec object directly is deprecated in gensim and was
# removed in gensim 4.
model.wv['money']

array([  3.36690596e-03,   2.07509920e-01,  -8.03962126e-02,
        -4.00607176e-02,   3.82987433e-03,   1.32872388e-01,
         2.72845477e-02,  -6.84864298e-02,   1.25237852e-01,
        -1.82130691e-02,   9.53656137e-02,  -1.11005373e-01,
         3.17790452e-03,  -1.90762263e-02,   4.00985964e-03,
         1.20315850e-01,  -4.44382019e-02,   1.79904819e-01,
         6.70349672e-02,   1.06361829e-01,   8.10712054e-02,
        -1.30286310e-02,   7.17504248e-02,  -1.99396256e-02,
        -1.38991237e-01,  -1.35300159e-01,  -1.00590408e-01,
        -7.17621669e-03,   9.60225761e-02,  -9.65964049e-02,
        -7.48647600e-02,  -3.74641307e-02,  -4.20694910e-02,
         2.92405076e-02,   1.37861311e-01,   7.33111799e-02,
        -1.16696060e-01,  -6.89079911e-02,   1.31329358e-01,
         1.67849213e-02,   6.30585402e-02,   7.59773180e-02,
         4.54270057e-02,  -1.08169194e-03,   1.23673931e-01,
         7.61288330e-02,   1.20582528e-01,   1.23724587e-01,
        -7.75826499e-02,

In [12]:
# Nearest neighbours of 'money' by cosine similarity. Query through
# `model.wv`: calling most_similar on the Word2Vec object itself is
# deprecated in gensim and was removed in gensim 4.
model.wv.most_similar('money')

2017-09-23 21:49:17,337:INFO:precomputing L2-norms of word weight vectors


[('better', 0.9999033808708191),
 ('which', 0.99989914894104),
 ('some', 0.9998878240585327),
 ('long', 0.9998822212219238),
 ('different', 0.9998811483383179),
 ('enough', 0.999880313873291),
 ('makes', 0.9998777508735657),
 ('work', 0.9998776316642761),
 ('use', 0.9998761415481567),
 ('find', 0.9998738169670105)]

In [13]:
# Cosine similarity between two single words, queried through `model.wv`
# (the method on the Word2Vec object itself is deprecated in gensim and was
# removed in gensim 4).
model.wv.similarity('money', 'credit')

0.99957431960014764

## Semantic similarity

In [14]:
def cosine_similarity(inp1, inp2):
    """Return the cosine of the angle between two vectors.

    Computed as ``dot(inp1, inp2) / (||inp1|| * ||inp2||)``. Both arguments
    may be anything ``np.dot`` and ``np.linalg.norm`` accept (lists or
    arrays). No guard is applied for zero-length vectors.
    """
    dot_product = np.dot(inp1, inp2)
    norm_product = np.linalg.norm(inp1) * np.linalg.norm(inp2)
    return dot_product / norm_product

In [15]:
# Sanity check: 35 / (sqrt(26) * sqrt(61)) ≈ 0.8789
cosine_similarity([1, 5], [5, 6])

0.87885343166569463

In [16]:
def average_similarity(text1, text2, vectors=None):
    """Cosine similarity between the averaged word vectors of two texts.

    Each text is lower-cased and split on whitespace. The vectors of the
    words found in ``vectors`` are averaged, and the two averages are
    compared with ``cosine_similarity``. Words that are not in the
    vocabulary are skipped instead of aborting the whole comparison.

    Parameters
    ----------
    text1, text2 : str
        The texts to compare.
    vectors : optional
        Word-to-vector lookup supporting ``word in vectors`` and
        ``vectors[word]``. Defaults to ``model.wv`` of the notebook-level
        ``model``.

    Returns
    -------
    The value returned by ``cosine_similarity`` for the two mean vectors.

    Raises
    ------
    KeyError
        If a text contains no in-vocabulary word (including empty text), so
        no mean vector can be formed.
    """
    if vectors is None:
        # Go through `wv`; indexing the Word2Vec object directly is
        # deprecated in gensim and removed in gensim 4.
        vectors = model.wv

    def mean_vector(text):
        # Average the vectors of the in-vocabulary words of one text.
        known = [vectors[word] for word in text.lower().split() if word in vectors]
        if not known:
            raise KeyError("no in-vocabulary words in %r" % (text,))
        return np.mean(np.array(known), axis=0)

    return cosine_similarity(mean_vector(text1), mean_vector(text2))

In [17]:
# Compare two single-word "texts" through their averaged vectors.
average_similarity('money', 'love')

0.99986523