# The goal of this notebook is to cluster pretrained word embeddings and assign a cluster number to each word, so that the cluster numbers can be used as features

In [1]:
import os
import json
import random
import pickle

import numpy as np
import pandas as pd

In [2]:
import gensim
from gensim.models.keyedvectors import KeyedVectors

gensim.__version__

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


'2.3.0'

In [3]:
# The anago package can be found here:
# https://github.com/Hironsan/anago
# After cloning the repository, it can be installed with the usual command:
# python setup.py install

import anago

In [4]:
import tensorflow as tf

import sklearn
from sklearn.cluster import KMeans, MiniBatchKMeans
print(sklearn.__version__)

import keras
print(keras.__version__)

# importing a CRF layer (originally from Keras-contrib)
from keras_contrib.layers.crf import CRF

import keras.backend as K
from keras.layers import Dense, LSTM, GRU, Bidirectional, Embedding, Input, Dropout, Lambda
from keras.layers.merge import Concatenate
from keras.models import Model

0.19.1
2.1.2


In [5]:
import basic
from basic.nlp.tokenizers import clinical_tokenizers
from basic.nlp.annotation.annotation import Annotation, AnnotatedDocument
from basic.MADE.madetokenizer import build_made_tokenizer
from basic.nlp.sequenceutils import get_sentence_bio_tagged_tokens
from basic.MADE.madeutils import read_made_data, train_default_anago_model, get_all_sentence_tokens_and_tags, create_model, gather_validation_metrics

print('Imported custom BASIC modules')

Imported custom BASIC modules


In [6]:
# Directory that holds the pretrained embedding files. It can be overridden by
# setting the EMBEDDINGS_BASE_DIR environment variable, so the notebook is not
# tied to one machine's layout; when unset, the original path is used.
EMBEDDINGS_BASE_DIR = os.environ.get('EMBEDDINGS_BASE_DIR', r'c:\temp_embeddings')

# Name of the word2vec binary file inside EMBEDDINGS_BASE_DIR.
# Alternative embeddings file, kept for reference:
#PRETRAINED_EMBEDDINGS_FILENAME = r'wikipedia-pubmed-and-PMC-w2v.bin'
PRETRAINED_EMBEDDINGS_FILENAME = r'pubmed+wiki+pitts-nopunct-lower-cbow-n10.bin'

# Number of clusters requested from k-means (passed as n_clusters).
K_CLUSTERS = 500
# True selects MiniBatchKMeans, False selects the regular KMeans estimator.
ENABLED_BATCH_KMEANS = True
# Passed as batch_size to MiniBatchKMeans; unused when ENABLED_BATCH_KMEANS is False.
KMEANS_BATCH_SIZE = 500000

# Load our embeddings

In [7]:
# Load the pretrained word embeddings from disk.
#
# NOTE: these embeddings are made available here:
# http://evexdb.org/pmresources/vec-space-models/

# Full path to the embeddings file, built from the configured directory and filename.
embeddings_path = os.path.join(EMBEDDINGS_BASE_DIR, PRETRAINED_EMBEDDINGS_FILENAME)

# binary=True because the file is in the word2vec C binary format.
pretrained_word_vectors = KeyedVectors.load_word2vec_format(embeddings_path, binary=True)

print(pretrained_word_vectors)

<gensim.models.keyedvectors.KeyedVectors object at 0x0000021CD5874D68>


In [8]:
# Probe a single word's vector to find out how many dimensions the embeddings have.
probe_vector_shape = pretrained_word_vectors['the'].shape
print(probe_vector_shape)

# The first (and only) entry of the shape is the embedding dimensionality.
pretrained_embeddings_dimensions = probe_vector_shape[0]
print(pretrained_embeddings_dimensions)

(200,)
200


In [9]:
# Shorter aliases for the loaded vectors and their dimensionality
# (neither alias is referenced again in the cells visible here).
embeddings, embeddings_dimensions = pretrained_word_vectors, pretrained_embeddings_dimensions

In [10]:
%%time

# Raw embedding matrix taken from the loaded model. The following cell pairs
# the resulting labels with index2word, so rows are assumed to be in
# vocabulary order.
word_vectors = pretrained_word_vectors.syn0

# Fixed seed so that the cluster numbers assigned to each word are
# reproducible across runs; with no random_state the initialisation (and
# therefore the cluster numbering) can differ on every execution.
KMEANS_RANDOM_STATE = 42

# Initialize a k-means object and use it to extract centroids
print('Running K means')

if ENABLED_BATCH_KMEANS:
    print('Using batch KMeans')
    # MiniBatchKMeans does not take an n_jobs argument, so none is passed here.
    kmeans = MiniBatchKMeans(n_clusters=K_CLUSTERS,
                             batch_size=KMEANS_BATCH_SIZE,
                             random_state=KMEANS_RANDOM_STATE)
else:
    print('Using original recipe KMeans')
    kmeans = KMeans(n_clusters=K_CLUSTERS,
                    n_jobs=-2,
                    random_state=KMEANS_RANDOM_STATE)

# Fit the model and get one cluster index per row of word_vectors.
cluster_idx = kmeans.fit_predict(word_vectors)

print('K means trained')

Running K means
Using batch KMeans
K means trained
Wall time: 17min 45s


In [11]:
# Create a Word / Index dictionary, mapping each vocabulary word to
# a cluster number.

# pretrained_word_vectors is already a KeyedVectors object, which exposes
# index2word directly, so no extra `.wv` hop is needed.
vocabulary_words = pretrained_word_vectors.index2word

# zip() silently stops at the shorter input; fail loudly instead of writing a
# partial or misaligned mapping if the two sequences ever disagree in length.
assert len(vocabulary_words) == len(cluster_idx), (
    'vocabulary size {} does not match number of cluster labels {}'.format(
        len(vocabulary_words), len(cluster_idx)))

word_cluster_map = dict(zip(vocabulary_words, cluster_idx))

# Show the first 50 (word, cluster) pairs as a sanity check.
print(list(word_cluster_map.items())[:50])

[('</s>', 346), ('the', 62), ('of', 323), ('and', 5), ('in', 323), ('to', 406), ('a', 329), ('with', 88), ('was', 461), ('for', 16), ('is', 71), ('were', 15), ('by', 251), ('that', 182), ('as', 71), ('on', 5), ('from', 143), ('at', 75), ('or', 236), ('this', 71), ('are', 71), ('an', 329), ('be', 182), ('patients', 432), ('not', 182), ('which', 329), ('it', 182), ('these', 229), ('we', 86), ('have', 71), ('after', 436), ('p', 216), ('cells', 266), ('has', 71), ('but', 182), ('had', 436), ('also', 477), ('than', 252), ('s', 407), ('two', 199), ('he', 254), ('been', 173), ('between', 252), ('their', 286), ('one', 465), ('his', 20), ('study', 86), ('all', 287), ('may', 229), ('no', 182)]


In [12]:
# Number of words that received a cluster assignment.
mapped_word_count = len(word_cluster_map)
print(mapped_word_count)

1352549


In [13]:
# Tag the output file with the clustering estimator that produced the map.
typename = 'BatchKmeans' if ENABLED_BATCH_KMEANS else 'KMeans'

# Strip only the final extension. Splitting on the first '.' would truncate
# any embeddings filename whose stem itself contains a dot.
embeddings_stem = os.path.splitext(PRETRAINED_EMBEDDINGS_FILENAME)[0]

# Relative filename, so the pickle is written to the current working directory.
map_pickle_file_name = 'WordClusters_K{0}_{1}_{2}.pickle'.format(K_CLUSTERS, typename, embeddings_stem)

print('Writing cluster map pickle to : {}'.format(map_pickle_file_name))

# Serialize the word -> cluster dictionary with the newest pickle protocol available.
with open(map_pickle_file_name, 'wb') as handle:
    pickle.dump(word_cluster_map, handle, protocol=pickle.HIGHEST_PROTOCOL)

print('DONE writing cluster map pickle')

Writing cluster map pickle to : WordClusters_K500_BatchKmeans_pubmed+wiki+pitts-nopunct-lower-cbow-n10.pickle
DONE writing cluster map pickle
