# The goal of this notebook is to cluster the pretrained word embeddings and assign each word a cluster number, so that the cluster numbers can be used as features

In [None]:
import os
import json
import random
import pickle

import numpy as np
import pandas as pd

In [None]:
import gensim
from gensim.models.keyedvectors import KeyedVectors

gensim.__version__

In [None]:
import sklearn
from sklearn.cluster import KMeans, MiniBatchKMeans
print(sklearn.__version__)

In [None]:
import basic
from basic.nlp.tokenizers import clinical_tokenizers
from basic.nlp.annotation.annotation import Annotation, AnnotatedDocument
from basic.nlp.sequenceutils import get_sentence_bio_tagged_tokens

from madetokenizer import build_made_tokenizer
from madeutils import read_made_data, get_all_sentence_tokens_and_tags, gather_validation_metrics

print('Imported custom BASIC modules')

In [None]:
# --- Input: pretrained embeddings -------------------------------------------
# Directory containing the embedding binaries, and the file loaded from it.
EMBEDDINGS_BASE_DIR = r'c:\embeddings'
PRETRAINED_EMBEDDINGS_FILENAME = 'wikipedia-pubmed-and-PMC-w2v.bin'
# Alternative embeddings file (swap with the assignment above to use it):
#PRETRAINED_EMBEDDINGS_FILENAME = r'pubmed+wiki+pitts-nopunct-lower-cbow-n10.bin'

# --- Output -----------------------------------------------------------------
# Directory the word -> cluster pickle is written into.
CLUSTERS_BASE_DIR = 'resources/clusters'

# --- Clustering -------------------------------------------------------------
# Number of clusters requested from k-means.
K_CLUSTERS = 500
# True selects MiniBatchKMeans, False selects plain KMeans.
ENABLED_BATCH_KMEANS = True
# Mini-batch size; only read when ENABLED_BATCH_KMEANS is True.
KMEANS_BATCH_SIZE = 500000

# Load our embeddings

In [None]:
# Read the pretrained word vectors from disk.
#
# NOTE: these embeddings are made available at
# http://evexdb.org/pmresources/vec-space-models/
pretrained_word_vectors = KeyedVectors.load_word2vec_format(
    os.path.join(EMBEDDINGS_BASE_DIR, PRETRAINED_EMBEDDINGS_FILENAME),
    binary=True,  # the file is in the word2vec C binary format
)

# Echo the loaded object as a quick confirmation that loading succeeded.
print(pretrained_word_vectors)

In [None]:
# Probe a single word's vector to find out how many dimensions the
# embeddings have; the first (only) axis length is the dimensionality.
_probe_shape = pretrained_word_vectors['the'].shape
print(_probe_shape)

pretrained_embeddings_dimensions = _probe_shape[0]
print(pretrained_embeddings_dimensions)

In [None]:
# Source-agnostic aliases for the loaded vectors and their dimensionality.
embeddings_dimensions = pretrained_embeddings_dimensions
embeddings = pretrained_word_vectors

In [None]:
%%time

# Pull out the raw embedding matrix that k-means will cluster.
# `vectors` is the current gensim attribute; `syn0` is the legacy alias
# (deprecated in gensim 3.x, removed in 4.0), kept here only as a fallback
# for old installations.
word_vectors = getattr(pretrained_word_vectors, 'vectors', None)
if word_vectors is None:
    word_vectors = pretrained_word_vectors.syn0

# Initialize a k-means object and use it to extract centroids
print('Running K means')

if ENABLED_BATCH_KMEANS:
    print('Using batch KMeans')
    kmeans = MiniBatchKMeans(n_clusters=K_CLUSTERS,
                             batch_size=KMEANS_BATCH_SIZE)
else:
    print('Using original recipe KMeans')
    try:
        # Older scikit-learn: use all cores but one.
        kmeans = KMeans(n_clusters=K_CLUSTERS, n_jobs=-2)
    except TypeError:
        # scikit-learn >= 1.0 no longer accepts `n_jobs` for KMeans.
        kmeans = KMeans(n_clusters=K_CLUSTERS)

# Fit the model and get one cluster label per row of `word_vectors`.
# NOTE(review): no random_state is passed, so cluster numbering is expected
# to differ between runs -- confirm whether downstream features need a seed.
cluster_idx = kmeans.fit_predict(word_vectors)

print('K means trained')

In [None]:
# Create a word -> cluster-number dictionary by pairing each vocabulary word
# (in index order) with the cluster label computed for it.
from itertools import islice

# `index_to_key` is the gensim >= 4.0 name of the index-ordered vocabulary
# list; older releases expose it as `index2word`. The previous `.wv` hop was
# a deprecated self-alias on KeyedVectors and is not needed.
vocabulary = getattr(pretrained_word_vectors, 'index_to_key', None)
if vocabulary is None:
    vocabulary = pretrained_word_vectors.index2word

word_cluster_map = dict(zip(vocabulary, cluster_idx))

# Preview the first 50 entries without copying the whole mapping into a list.
print(list(islice(word_cluster_map.items(), 50)))

In [None]:
# Sanity check: number of distinct words that received a cluster number.
print(len(word_cluster_map))

In [None]:
# Persist the word -> cluster mapping as a pickle whose file name records the
# cluster count, the k-means variant and the embeddings it was derived from.
typename = 'BatchKmeans' if ENABLED_BATCH_KMEANS else 'KMeans'

# Strip only the final extension; splitting on the first '.' would truncate
# embedding file names that contain additional dots.
embeddings_name = os.path.splitext(PRETRAINED_EMBEDDINGS_FILENAME)[0]

map_pickle_file_name = '{base_dir}/WordClusters_K{k}_{kind}_{name}.pickle'.format(
    base_dir=CLUSTERS_BASE_DIR,
    k=K_CLUSTERS,
    kind=typename,
    name=embeddings_name)

# Make sure the target directory exists so the write cannot fail (and throw
# away the clustering result) just because the folder is missing.
os.makedirs(CLUSTERS_BASE_DIR, exist_ok=True)

print('Writing cluster map pickle to : {}'.format(map_pickle_file_name))

with open(map_pickle_file_name, 'wb') as handle:
    pickle.dump(word_cluster_map, handle, protocol=pickle.HIGHEST_PROTOCOL)

print('DONE writing cluster map pickle')