# Configuration

In [None]:
# import os
# os.system('pip install --upgrade gensim') # if Gensim is not installed

In [1]:
import gensim
import nltk
import json
from glob import glob
import logging
from nltk.tokenize import word_tokenize
from pprint import pprint # pretty print | https://docs.python.org/ko/3/library/pprint.html

print(gensim.__version__)



4.0.0


In [2]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /data/blank54/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Word2vec from Scratch

## Load Corpus

In [3]:
def load_json_corpus(corpus_dir):
    fpaths = glob(corpus_dir + '/*')
    corpus = []
    for path in fpaths:
        with open(path, 'r') as f:
            doc = json.load(f)
            content = doc['content']
            doc_text = word_tokenize(content)
            corpus.append(doc_text)
    
    return corpus

In [11]:
""" It will take a minute """
corpus_dir = '/data/sech/workspace/text_mining_seminar/20210330_word2vec/WorldBankNews/'
corpus = load_json_corpus(corpus_dir)

In [6]:
print('Number of documents: %d' % len(corpus))

Number of documents: 9169


## Train Word2vec Model

In [7]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [8]:
""" It will take a minute """
from gensim.models.word2vec import Word2Vec
wb_w2v = Word2Vec(corpus)

2021-03-30 14:19:56,140 : INFO : collecting all words and their counts
2021-03-30 14:19:56,141 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-03-30 14:19:57,368 : INFO : collected 106683 word types from a corpus of 8194685 raw words and 9169 sentences
2021-03-30 14:19:57,369 : INFO : Creating a fresh vocabulary
2021-03-30 14:19:57,501 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 retains 29017 unique words (27.19927261138138%% of original 106683, drops 77666)', 'datetime': '2021-03-30T14:19:57.499799', 'gensim': '4.0.0', 'python': '3.6.8 |Anaconda, Inc.| (default, Dec 30 2018, 01:22:34) \n[GCC 7.3.0]', 'platform': 'Linux-4.15.0-107-generic-x86_64-with-debian-stretch-sid', 'event': 'prepare_vocab'}
2021-03-30 14:19:57,501 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 leaves 8073911 word corpus (98.52619106164545%% of original 8194685, drops 120774)', 'datetime': '2021-03-30T14:19:57.501889', 'gensim': '4.0.0', 'pytho

In [10]:
print(type(wb_w2v))

<class 'gensim.models.word2vec.Word2Vec'>


## Sanity Check

In [None]:
wb_w2v.wv.most_similar('bank')

## Usage Demo

In [None]:
# Get word vector
print('word vector for "bank": (size: %d)' % len(wb_w2v.wv['bank']))
pprint(wb_w2v.wv['bank'])

In [12]:
# Similarity b/w 2 words
from scipy.spatial.distance import cosine
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.cosine.html

query1 = 'world'
query2 = 'bank'
wv1 = wb_w2v.wv[query1]
wv2 = wb_w2v.wv[query2]
print('cosine distance b/w "%s" and "%s"' % (query1, query2))
print(cosine(wv1, wv2))
print()
print('cosine similarity b/w "%s" and "%s"' % (query1, query2))
print(1 - cosine(wv1, wv2))

cosine distance b/w "world" and "bank"
0.8492620140314102

cosine similarity b/w "world" and "bank"
0.15073798596858978


In [None]:
# Get most similar words
query1 = 'world'
query2 = 'bank'
print('Most similar words with "%s"' % query1)
pprint(wb_w2v.wv.most_similar('world'))
print()
print('Most 5 similar words with "%s"' % query2)
pprint(wb_w2v.wv.most_similar('bank', topn=5))

In [None]:
# Additive Composition
"""
Usage:
Positive word 1 - Negative word 1 + Positive word 2 = Result
Same as Pos1 : Neg1 = Result : Pos2
(e.g. "Korea" - "Seoul" + "Tokyo" = ? ; i.e. Korea:Seoul = ?:Tokyo)
"""

pos1 = 'Korea'
neg1 = 'Seoul'
pos2 = 'Tokyo'
# pos1 : neg1 = (result) : pos2
k = 5
print('%d candidate words for the nation whose capital city is %s:' % (k, pos2))
pprint(wb_w2v.wv.most_similar(positive=[pos1, pos2], negative=[neg1], topn=k)) # Expecting "Japan"

# Pretrained Word2vec Model

## Model Download

In [15]:
import gensim.downloader as api

info = api.info() # https://github.com/RaRe-Technologies/gensim-data
pprint(info)

2021-03-30 14:30:20,581 : INFO : Creating /data/blank54/gensim-data


{'corpora': {'20-newsgroups': {'checksum': 'c92fd4f6640a86d5ba89eaad818a9891',
                               'description': 'The notorious collection of '
                                              'approximately 20,000 newsgroup '
                                              'posts, partitioned (nearly) '
                                              'evenly across 20 different '
                                              'newsgroups.',
                               'fields': {'data': '',
                                          'id': 'original id inferred from '
                                                'folder name',
                                          'set': 'marker of original split '
                                                 "(possible values 'train' and "
                                                 "'test')",
                                          'topic': 'name of topic (20 variant '
                                                   'of pos

In [16]:
""" It will take a few minutes """
google_w2v = api.load('word2vec-google-news-300')



IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





2021-03-30 14:36:29,413 : INFO : word2vec-google-news-300 downloaded
2021-03-30 14:36:31,552 : INFO : loading projection weights from /data/blank54/gensim-data/word2vec-google-news-300/word2vec-google-news-300.gz
2021-03-30 14:37:27,918 : INFO : KeyedVectors lifecycle event {'msg': 'loaded (3000000, 300) matrix of type float32 from /data/blank54/gensim-data/word2vec-google-news-300/word2vec-google-news-300.gz', 'binary': True, 'encoding': 'utf8', 'datetime': '2021-03-30T14:37:27.918461', 'gensim': '4.0.0', 'python': '3.6.8 |Anaconda, Inc.| (default, Dec 30 2018, 01:22:34) \n[GCC 7.3.0]', 'platform': 'Linux-4.15.0-107-generic-x86_64-with-debian-stretch-sid', 'event': 'load_word2vec_format'}


In [17]:
print(type(google_w2v))

<class 'gensim.models.keyedvectors.KeyedVectors'>


In [20]:
type(wb_w2v.wv)

gensim.models.keyedvectors.KeyedVectors

In [18]:
# Sanity Check
google_w2v.most_similar('bank')

[('banks', 0.7440759539604187),
 ('banking', 0.690161406993866),
 ('Bank', 0.6698698401451111),
 ('lender', 0.6342284679412842),
 ('banker', 0.6092953085899353),
 ('depositors', 0.6031531691551208),
 ('mortgage_lender', 0.5797975659370422),
 ('depositor', 0.5716427564620972),
 ('BofA', 0.5714625120162964),
 ('Citibank', 0.5589520335197449)]

## Usage Demo

In [19]:
# Get word vector
def word_vector(query, model):
    if isinstance(model, gensim.models.word2vec.Word2Vec):
        result = model.wv[query]
    elif isinstance(model, gensim.models.keyedvectors.KeyedVectors):
        result = model[query]
    else:
        print('No Word2vec model was provided.')
    
    return result

In [14]:
pprint(word_vector('world', google_w2v)[:10])
pprint(word_vector('bank', google_w2v)[:10])

NameError: name 'google_w2v' is not defined

In [None]:
# Similarity b/w 2 words
def word_similarity(query1, query2, model):
    from scipy.spatial.distance import cosine
    # https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.cosine.html
    wv1 = word_vector(query1, model)
    wv2 = word_vector(query2, model)
    sim = 1 - cosine(wv1, wv2)
    return sim

In [None]:
query1 = 'world'
query2 = 'bank'
print(word_similarity(query1, query2, google_w2v))

In [None]:
# Get most similar words
def similar_words(query, model, k):
    if isinstance(model, gensim.models.word2vec.Word2Vec):
        return model.wv.most_similar(query, topn=k)
    elif isinstance(model, gensim.models.keyedvectors.KeyedVectors):
        return model.most_similar(query, topn=k)
    else:
        print('No Word2vec model was provided.')

In [None]:
query = 'world'
k = 15
print('Most %d similar words with "%s"' % (k, query))
pprint(similar_words(query, google_w2v, k))

In [None]:
# Additive Composition
def add_comp(pos1, neg1, pos2, model, k):
    """
    Usage:
    Positive word 1 - Negative word 1 + Positive word 2 = Result
    Same as Pos1 : Neg1 = Result : Pos2
    (e.g. "Korea" - "Seoul" + "Tokyo" = ? ; i.e. Korea:Seoul = ?:Tokyo)
    """
    if isinstance(model, gensim.models.word2vec.Word2Vec):
        res = model.wv.most_similar(positive=[pos1, pos2], negative=[neg1], topn=k)
    elif isinstance(model, gensim.models.keyedvectors.KeyedVectors):
        res = model.most_similar(positive=[pos1, pos2], negative=[neg1], topn=k)
    else:
        print('No Word2vec model was provided.')
        res = None
    
    return res

In [None]:
pos1 = 'Korea'
neg1 = 'Seoul'
pos2 = 'Tokyo'
# pos1 : neg1 = (result) : pos2
k = 5
print('%d candidate words for the nation whose capital city is %s:' % (k, pos2))
pprint(add_comp(pos1, neg1, pos2, google_w2v, k)) # Expecting "Japan"