In [1]:
__author__ = 'Ksenia Voronaya'

from sklearn.datasets import fetch_20newsgroups
from gensim.models import Word2Vec

import nltk
import string

In [2]:
corpus_data = fetch_20newsgroups(categories=['rec.motorcycles', 'rec.autos', 
                                             'rec.sport.hockey', 'soc.religion.christian', 
                                             'alt.atheism', 'sci.electronics'], subset='all')

In [3]:
print("List of topics ({} categories):".format(len(list(corpus_data.target_names))))
print(list(corpus_data.target_names))
print("Size of corpus is {} documents".format(len(corpus_data.data)))

List of topics (6 categories):
['alt.atheism', 'rec.autos', 'rec.motorcycles', 'rec.sport.hockey', 'sci.electronics', 'soc.religion.christian']
Size of corpus is 5765 documents


In [4]:
# tokenization
tokenized_data = [nltk.word_tokenize(text) for text in corpus_data.data]

In [5]:
corpus_texts = []
for text in tokenized_data:
    lemms = []
    for word in text:
        if word not in string.punctuation:
            lemms.append(word.lower())
    corpus_texts.append(lemms)
    
print("Size of tokenized corpus is {} documents".format(len(corpus_texts)))

Size of tokenized corpus is 5765 documents


In [6]:
# train our model by using Word2Vec
import gensim

w2v_model = gensim.models.Word2Vec(corpus_texts)

In [7]:
# similarity of words
words = [('motorcycle', 'car'), ('phone', 'gadget'), ('christianity', 'buddhism'), 
         ('hockey', 'football'), ('computer', 'hockey'), 
         ('motorcycle', 'computer'), ('motorcycle', 'christianity')]

for word_pair in words:
    print("******* Similarity between words {} & {} *******".format(word_pair[0], word_pair[1]))
    print(w2v_model.similarity(word_pair[0], word_pair[1]))

******* Similarity between words motorcycle & car *******
0.712350297086
******* Similarity between words phone & gadget *******
0.349898904959
******* Similarity between words christianity & buddhism *******
0.421998909873
******* Similarity between words hockey & football *******
0.557867952211
******* Similarity between words computer & hockey *******
0.259170065077
******* Similarity between words motorcycle & computer *******
0.0926919409609
******* Similarity between words motorcycle & christianity *******
0.0593047661766


In [8]:
# word doesn't match the list
words_list = ["motorcycle car buddhism", "hockey football computer", "computer phone gadget"]

for w in words_list:
    print("******* Word which doesn't match the list [{}] *******".format(w))
    print(w2v_model.doesnt_match(w.split()))

******* Word which doesn't match the list [motorcycle car buddhism] *******
buddhism
******* Word which doesn't match the list [hockey football computer] *******
computer
******* Word which doesn't match the list [computer phone gadget] *******
gadget


In [9]:
# addition and subtraction of words
words_list = [['king', 'queen', 'woman'], ['city', 'russia', 'moscow'], 
              ['apple', 'computer', 'jobs']]

for words in words_list:
    print("******* Add up the two words '{} & {}' *******".format(words[0], words[1]))
    print(w2v_model.most_similar(positive=[words[0], words[1]]))

******* Add up the two words 'king & queen' *******
[(u'saint', 0.9126960039138794), (u'lamb', 0.8956121206283569), (u'abraham', 0.8902089595794678), (u'gabriel', 0.8830323815345764), (u'hippo', 0.8762622475624084), (u'actor', 0.8716279864311218), (u'3:15', 0.8711950182914734), (u'tibi', 0.8706649541854858), (u'collapsed', 0.868289053440094), (u'jacob', 0.8666414022445679)]
******* Add up the two words 'city & russia' *******
[(u'slovakia', 0.9462153315544128), (u'sweep', 0.9222856163978577), (u'district', 0.9178619384765625), (u'standings', 0.912844181060791), (u'annual', 0.9102997779846191), (u'minnesota', 0.9098118543624878), (u'friday', 0.909060001373291), (u'alberta', 0.9087628722190857), (u'gulls', 0.908035397529602), (u'dallas', 0.9067976474761963)]
******* Add up the two words 'apple & computer' *******
[(u'engineering', 0.9446059465408325), (u'computing', 0.9366177320480347), (u'services', 0.9297630190849304), (u'dept', 0.9272782802581787), (u'lab', 0.921724796295166), (u'depa

In [10]:
for words in words_list:
    print("******* Add up and subtraction of words '{} & {} & {}' *******".format(words[0], words[1], words[2]))
    print(w2v_model.most_similar(positive=[words[0], words[1]], negative=[words[2]]))

******* Add up and subtraction of words 'king & queen & woman' *******
******* Add up and subtraction of words 'city & russia & moscow' *******
[(u'april', 0.8264734745025635), (u'division', 0.8034517765045166), (u'1993', 0.7940775156021118), (u'minnesota', 0.7915362119674683), (u'sweden', 0.7791882753372192), (u'standings', 0.7762460708618164), (u'finland', 0.7669036984443665), (u'friday', 0.766826868057251), (u'chicago', 0.7652980089187622), (u'alberta', 0.765078067779541)]
******* Add up and subtraction of words 'apple & computer & jobs' *******
[(u'engineering', 0.8233916759490967), (u'ibm', 0.8131964206695557), (u'dept', 0.807506799697876), (u'relva.rchland.ibm.com', 0.8060644268989563), (u'department', 0.8012591600418091), (u'uk', 0.7872180938720703), (u'technology', 0.7854435443878174), (u'institute', 0.78508460521698), (u'research', 0.7807159423828125), (u'services', 0.7723796963691711)]
