# Configuration

In [None]:
# import os
# os.system('pip install --upgrade gensim') # if Gensim is not installed

In [3]:
import gensim
import nltk
import json
from glob import glob
import logging
from nltk.tokenize import word_tokenize
from pprint import pprint # pretty print | https://docs.python.org/ko/3/library/pprint.html

print(gensim.__version__)

4.0.0


In [4]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /data/sech/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Word2vec from Scratch

## Load Corpus

In [6]:
def load_json_corpus(corpus_dir):
    fpaths = glob(corpus_dir + '/*')
    corpus = []
    for path in fpaths:
        with open(path, 'r') as f:
            doc = json.load(f)
            content = doc['content']
            doc_text = word_tokenize(content)
            corpus.append(doc_text)
    
    return corpus

In [7]:
""" It will take a minute """
corpus_dir = '/data/sech/workspace/text_mining_seminar/20210330_word2vec/WorldBankNews/'
corpus = load_json_corpus(corpus_dir)

In [8]:
print('Number of documents: %d' % len(corpus))

Number of documents: 9169


## Train Word2vec Model

In [9]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [10]:
""" It will take a minute """
from gensim.models.word2vec import Word2Vec
wb_w2v = Word2Vec(corpus)

2021-03-29 14:23:50,656 : INFO : collecting all words and their counts
2021-03-29 14:23:50,657 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-03-29 14:23:51,903 : INFO : collected 106683 word types from a corpus of 8194685 raw words and 9169 sentences
2021-03-29 14:23:51,904 : INFO : Creating a fresh vocabulary
2021-03-29 14:23:52,044 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 retains 29017 unique words (27.19927261138138%% of original 106683, drops 77666)', 'datetime': '2021-03-29T14:23:52.023110', 'gensim': '4.0.0', 'python': '3.8.5 (default, Sep  4 2020, 07:30:14) \n[GCC 7.3.0]', 'platform': 'Linux-4.15.0-107-generic-x86_64-with-glibc2.10', 'event': 'prepare_vocab'}
2021-03-29 14:23:52,045 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 leaves 8073911 word corpus (98.52619106164545%% of original 8194685, drops 120774)', 'datetime': '2021-03-29T14:23:52.045543', 'gensim': '4.0.0', 'python': '3.8.5 (default, Sep  

2021-03-29 14:24:31,348 : INFO : EPOCH - 4 : training on 8194685 raw words (5958674 effective words) took 9.6s, 620240 effective words/s
2021-03-29 14:24:32,353 : INFO : EPOCH 5 - PROGRESS: at 10.48% examples, 602092 words/s, in_qsize 5, out_qsize 0
2021-03-29 14:24:33,358 : INFO : EPOCH 5 - PROGRESS: at 20.99% examples, 613069 words/s, in_qsize 5, out_qsize 0
2021-03-29 14:24:34,358 : INFO : EPOCH 5 - PROGRESS: at 31.18% examples, 616756 words/s, in_qsize 5, out_qsize 0
2021-03-29 14:24:35,359 : INFO : EPOCH 5 - PROGRESS: at 41.30% examples, 614256 words/s, in_qsize 5, out_qsize 0
2021-03-29 14:24:36,368 : INFO : EPOCH 5 - PROGRESS: at 51.65% examples, 613584 words/s, in_qsize 5, out_qsize 0
2021-03-29 14:24:37,391 : INFO : EPOCH 5 - PROGRESS: at 61.88% examples, 613498 words/s, in_qsize 5, out_qsize 0
2021-03-29 14:24:38,406 : INFO : EPOCH 5 - PROGRESS: at 72.58% examples, 616029 words/s, in_qsize 5, out_qsize 0
2021-03-29 14:24:39,424 : INFO : EPOCH 5 - PROGRESS: at 83.29% examples,

In [11]:
print(type(w2v))

<class 'gensim.models.word2vec.Word2Vec'>


## Sanity Check

In [14]:
wb_w2v.wv.most_similar('bank')

[('banks', 0.6523056030273438),
 ('transaction', 0.5722414255142212),
 ('commercial', 0.5686595439910889),
 ('borrowers', 0.5343312621116638),
 ('rating', 0.5328773856163025),
 ('sovereign', 0.520230233669281),
 ('company', 0.5194247364997864),
 ('collateral', 0.5083614587783813),
 ('currency', 0.5027228593826294),
 ('borrower', 0.4958907961845398)]

## Usage Demo

In [15]:
# Get word vector
print('word vector for "bank": (size: %d)' % len(wb_w2v.wv['bank']))
pprint(wb_w2v.wv['bank'])

word vector for "bank": (size: 100)
array([-1.462332  , -1.8022327 ,  0.53372693,  1.8523256 , -0.84932065,
       -1.4591262 ,  1.7800521 ,  0.29778385,  0.61090356,  0.41010308,
        0.2648589 , -0.8836652 , -0.23509975, -0.72303736,  0.37825194,
        0.9098913 ,  0.15957332, -0.5573116 , -0.16678242, -1.3305517 ,
        0.03870054,  1.2751849 , -0.9958456 , -0.34555736, -0.810182  ,
       -0.55675745,  1.4750186 ,  0.3988165 , -0.41113296, -0.6253789 ,
       -0.73357266, -0.93823725, -0.05289098, -0.2394051 ,  2.6977818 ,
        1.0480872 , -1.5274101 , -0.4036461 ,  0.07298851,  2.477908  ,
        0.1703974 ,  0.15330207,  0.06536277, -0.8249808 ,  2.2873003 ,
       -0.6959409 ,  1.1990219 ,  0.8184906 ,  0.10711139,  1.7417552 ,
       -0.34934676,  1.5469879 , -0.91393465,  1.7290417 , -0.9967765 ,
        0.48209578, -1.8724929 , -0.02577546, -0.04203504,  0.06223132,
        0.92172766,  0.99148655,  0.3224332 , -0.21204974,  2.059879  ,
        0.05276413, -0.74877

In [20]:
# Similarity b/w 2 words
from scipy.spatial.distance import cosine
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.cosine.html

query1 = 'world'
query2 = 'bank'
wv1 = wb_w2v.wv[query1]
wv2 = wb_w2v.wv[query2]
print('cosine distance b/w "%s" and "%s"' % (query1, query2))
print(cosine(wv1, wv2))
print()
print('cosine similarity b/w "%s" and "%s"' % (query1, query2))
print(1 - cosine(wv1, wv2))

cosine distance b/w "world" and "bank"
0.843868300318718

cosine similarity b/w "world" and "bank"
0.15613169968128204


In [21]:
# Get most similar words
query1 = 'world'
query2 = 'bank'
print('Most similar words with "%s"' % query1)
pprint(wb_w2v.wv.most_similar('world'))
print()
print('Most 5 similar words with "%s"' % query2)
pprint(wb_w2v.wv.most_similar('bank', topn=5))

Most similar words with "world"
[('globe', 0.7241352200508118),
 ('region', 0.6937957406044006),
 ('country', 0.6821999549865723),
 ('continent', 0.6172592043876648),
 ('nation', 0.5616614818572998),
 ('global', 0.5160314440727234),
 ('planet', 0.4878930151462555),
 ('clock', 0.48342350125312805),
 ('China', 0.47411882877349854),
 ('Europe', 0.4573015868663788)]

Most 5 similar words with "bank"
[('banks', 0.6523056030273438),
 ('transaction', 0.5722414255142212),
 ('commercial', 0.5686595439910889),
 ('borrowers', 0.5343312621116638),
 ('rating', 0.5328773856163025)]


In [24]:
# Additive Composition
"""
Usage:
Positive word 1 - Negative word 1 + Positive word 2 = Result
Same as Pos1 : Neg1 = Result : Pos2
(e.g. "Korea" - "Seoul" + "Tokyo" = ? ; i.e. Korea:Seoul = ?:Tokyo)
"""

pos1 = 'Korea'
neg1 = 'Seoul'
pos2 = 'Tokyo'
# pos1 : neg1 = (result) : pos2
k = 5
print('%d candidate words for the nation whose capital city is %s:' % (k, pos2))
pprint(wb_w2v.wv.most_similar(positive=[pos1, pos2], negative=[neg1], topn=k)) # Expecting "Japan"

5 candidate words for the nation whose capital city is Tokyo:
[('Japan', 0.6795076131820679),
 ('Australia', 0.48621612787246704),
 ('WTO', 0.4786176383495331),
 ('IMF', 0.47635480761528015),
 ('Poland', 0.47211363911628723)]


# Pretrained Word2vec Model

## Model Download

In [5]:
import gensim.downloader as api

info = api.info() # https://github.com/RaRe-Technologies/gensim-data
pprint(info)

{'corpora': {'20-newsgroups': {'checksum': 'c92fd4f6640a86d5ba89eaad818a9891',
                               'description': 'The notorious collection of '
                                              'approximately 20,000 newsgroup '
                                              'posts, partitioned (nearly) '
                                              'evenly across 20 different '
                                              'newsgroups.',
                               'fields': {'data': '',
                                          'id': 'original id inferred from '
                                                'folder name',
                                          'set': 'marker of original split '
                                                 "(possible values 'train' and "
                                                 "'test')",
                                          'topic': 'name of topic (20 variant '
                                                   'of pos

In [26]:
""" It will take a few minutes """
google_w2v = api.load('word2vec-google-news-300')

2021-03-29 14:33:19,385 : INFO : loading projection weights from /data/sech/gensim-data/word2vec-google-news-300/word2vec-google-news-300.gz
2021-03-29 14:34:00,090 : INFO : KeyedVectors lifecycle event {'msg': 'loaded (3000000, 300) matrix of type float32 from /data/sech/gensim-data/word2vec-google-news-300/word2vec-google-news-300.gz', 'binary': True, 'encoding': 'utf8', 'datetime': '2021-03-29T14:34:00.090172', 'gensim': '4.0.0', 'python': '3.8.5 (default, Sep  4 2020, 07:30:14) \n[GCC 7.3.0]', 'platform': 'Linux-4.15.0-107-generic-x86_64-with-glibc2.10', 'event': 'load_word2vec_format'}


In [28]:
print(type(google_w2v))

<class 'gensim.models.keyedvectors.KeyedVectors'>


In [29]:
# Sanity Check
google_w2v.most_similar('bank')

[('banks', 0.7440759539604187),
 ('banking', 0.690161406993866),
 ('Bank', 0.6698699593544006),
 ('lender', 0.6342284083366394),
 ('banker', 0.6092954277992249),
 ('depositors', 0.6031532287597656),
 ('mortgage_lender', 0.5797975659370422),
 ('depositor', 0.5716428756713867),
 ('BofA', 0.5714625120162964),
 ('Citibank', 0.5589520931243896)]

## Usage Demo

In [30]:
# Get word vector
def word_vector(query, model):
    if isinstance(model, gensim.models.word2vec.Word2Vec):
        result = model.wv[query]
    elif isinstance(model, gensim.models.keyedvectors.KeyedVectors):
        result = model[query]
    else:
        print('No Word2vec model was provided.')
    
    return result

In [31]:
pprint(word_vector('world', google_w2v)[:10])
pprint(word_vector('bank', google_w2v)[:10])

array([-0.06396484,  0.06835938,  0.22460938,  0.13183594, -0.05957031,
        0.03881836,  0.07568359, -0.14160156,  0.07080078,  0.15136719],
      dtype=float32)
array([ 0.02197266,  0.13476562, -0.05786133,  0.05566406,  0.09912109,
       -0.140625  , -0.0030365 ,  0.01879883,  0.25390625, -0.04882812],
      dtype=float32)


In [32]:
# Similarity b/w 2 words
def word_similarity(query1, query2, model):
    from scipy.spatial.distance import cosine
    # https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.cosine.html
    wv1 = word_vector(query1, model)
    wv2 = word_vector(query2, model)
    sim = 1 - cosine(wv1, wv2)
    return sim

In [33]:
query1 = 'world'
query2 = 'bank'
print(word_similarity(query1, query2, google_w2v))

-0.01078906748443842


In [34]:
# Get most similar words
def similar_words(query, model, k):
    if isinstance(model, gensim.models.word2vec.Word2Vec):
        return model.wv.most_similar(query, topn=k)
    elif isinstance(model, gensim.models.keyedvectors.KeyedVectors):
        return model.most_similar(query, topn=k)
    else:
        print('No Word2vec model was provided.')

In [35]:
query = 'world'
k = 15
print('Most %d similar words with "%s"' % (k, query))
pprint(similar_words(query, google_w2v, k))

Most 15 similar words with "world"
[('globe', 0.6945997476577759),
 ('theworld', 0.6902236342430115),
 ('country', 0.5980385541915894),
 ('continent', 0.5966995358467102),
 ('worldâ_€_™', 0.5897718071937561),
 ('nation', 0.5760580897331238),
 ('global', 0.5744006037712097),
 ('worldwide', 0.5641196966171265),
 ('United_States', 0.544048011302948),
 ('globally', 0.5411289930343628),
 ('worlds', 0.5359798669815063),
 ('worldís', 0.5294704437255859),
 ('America', 0.5279377102851868),
 ('Europe', 0.5250756740570068),
 ('planet', 0.5170571804046631)]


In [36]:
# Additive Composition
def add_comp(pos1, neg1, pos2, model, k):
    """
    Usage:
    Positive word 1 - Negative word 1 + Positive word 2 = Result
    Same as Pos1 : Neg1 = Result : Pos2
    (e.g. "Korea" - "Seoul" + "Tokyo" = ? ; i.e. Korea:Seoul = ?:Tokyo)
    """
    if isinstance(model, gensim.models.word2vec.Word2Vec):
        res = model.wv.most_similar(positive=[pos1, pos2], negative=[neg1], topn=k)
    elif isinstance(model, gensim.models.keyedvectors.KeyedVectors):
        res = model.most_similar(positive=[pos1, pos2], negative=[neg1], topn=k)
    else:
        print('No Word2vec model was provided.')
        res = None
    
    return res

In [37]:
pos1 = 'Korea'
neg1 = 'Seoul'
pos2 = 'Tokyo'
# pos1 : neg1 = (result) : pos2
k = 5
print('%d candidate words for the nation whose capital city is %s:' % (k, pos2))
pprint(add_comp(pos1, neg1, pos2, google_w2v, k)) # Expecting "Japan"

5 candidate words for the nation whose capital city is Tokyo:
[('Japan', 0.8032678365707397),
 ('Japanese', 0.6562737822532654),
 ('Japans', 0.6080487370491028),
 ('Nippon', 0.5482696890830994),
 ('Toshiya', 0.5421562790870667)]
