# The purpose of this notebook is to train Smooth Inverse Frequency (SIF) sentence embeddings interactively, so that slow steps such as loading word embeddings or training the model only have to be run once. The end goal is to wrap the trained pieces into a class that can be serialized.

In [None]:
import os
import sys

import pickle

import pandas as pd

import numpy as np

from sklearn.decomposition import TruncatedSVD
from sklearn.externals import joblib

import gensim
from gensim.models import KeyedVectors
print('gensim version : {}'.format(gensim.__version__))

import spacy
print('Spacy version : {}'.format(spacy.__version__))

In [None]:
sys.path.append('../src')
import data_io, params, SIF_embedding
from SIF_embedding import get_weighted_average

In [None]:
from data_io_w2v import load_w2v_word_map

In [None]:
# input
# Path to the word-vector file. The GloVe file (downloadable from the GloVe
# website) is kept as the commented-out alternative:
#wordfile = '../data/glove.840B.300d.txt'
wordfile = r'C:/temp_embeddings/pubmed+wiki+pitts-nopunct-lower-cbow-n10.bin'

# Format of `wordfile`; selects which loader is used when the vectors are read.
# Values handled by the loading cell: 'GLOVE', 'WORD2VEC_BIN', 'WORD2VEC_TXT'.
#EMBEDDINGS_FORMAT = 'GLOVE'
EMBEDDINGS_FORMAT = 'WORD2VEC_BIN'

weightfile = '../auxiliary_data/enwiki_vocab_min200.txt' # each line is a word and its frequency

weightpara = 1e-3 # the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
rmpc = 1 # number of principal components to remove in SIF weighting scheme
# Two toy sentences used to smoke-test the pipeline before the real corpus is loaded.
sentences = ['this is an example sentence', 'this is another sentence that is slightly longer']

In [None]:
%%time

# load word vectors
#(words, We) = data_io.getWordmap(wordfile)

# Reset first so a failed or re-run load never leaves stale vectors from a
# previous execution of this cell in the notebook namespace.
words = None
We = None

# Dispatch on the configured embedding format. Each loader returns a
# (words, We) pair: the word map and the embedding matrix used downstream.
if EMBEDDINGS_FORMAT == 'GLOVE':
    print('Loading embeddings as GLOVE')
    words, We = data_io.load_glove_word_map(wordfile)
elif EMBEDDINGS_FORMAT == 'WORD2VEC_BIN':
    words, We = load_w2v_word_map(wordfile, binary=True)
elif EMBEDDINGS_FORMAT == 'WORD2VEC_TXT':
    words, We = load_w2v_word_map(wordfile, binary=False)
else:
    # Fail here, with a clear message, rather than leaving words/We as None
    # and letting a later cell crash with an unrelated-looking error.
    raise ValueError(
        'Unsupported EMBEDDINGS_FORMAT: {!r}'.format(EMBEDDINGS_FORMAT))

In [None]:
# Load word weights.
# getWordWeight reads the word-frequency file and applies the SIF weighting
# parameter `weightpara` (presumably a / (a + p(w)) as in the SIF paper --
# TODO confirm in data_io). getWeight re-keys those weights by word index.
word2weight = data_io.getWordWeight(weightfile, weightpara) # word2weight['str'] is the weight for the word 'str'
weight4ind = data_io.getWeight(words, word2weight) # weight4ind[i] is the weight for the i-th word

In [None]:
# Convert the toy example sentences into word-index / mask arrays and look up
# the per-token SIF weights for them.
x, m = data_io.sentences2idx(sentences, words) # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
w = data_io.seq2weight(x, m, weight4ind) # get word weights

In [None]:
# Set parameters: start from the project's default parameter object and
# override only the number of principal components to remove.
sif_params = params.params()
sif_params.rmpc = rmpc
# Get the SIF embedding of the example sentences.
embedding = SIF_embedding.SIF_embedding(We, x, w, sif_params) # embedding[i,:] is the embedding for sentence i

In [None]:
# Sanity check: one row per example sentence (embedding[i,:] is sentence i).
print(embedding.shape)

In [None]:
# Dump the raw example embeddings for a visual check.
print(embedding)

# Now let's load some other sentences from a pickle file

In [None]:
MIMIC_PICKLE_PATH = '../data/MIMIC_DISCHARGE_SUMMARIES.pickle'
# pd.read_pickle replaces the manual open() / pickle.load() / close() sequence
# that was previously used here.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle
# files that come from a trusted source.

mimic_df = pd.read_pickle(MIMIC_PICKLE_PATH)

# Confirm what kind of object the pickle actually contained.
print(type(mimic_df))

In [None]:
# Number of entries in the loaded object (rows, if it is a DataFrame).
print(len(mimic_df))

In [None]:
# Peek at the first few records to see which columns are available.
print(mimic_df.head())

In [None]:
# Cap on how many unique documents are used for training. The small value
# keeps notebook iterations fast; the larger one is kept as the alternative:
#MAX_DOCUMENTS_FOR_TRAINING = 10000
MAX_DOCUMENTS_FOR_TRAINING = 100
# Sentences with a token count outside [MIN, MAX] are skipped when the
# training sentences are gathered.
MAX_TOKENS_PER_SENTENCE = 30
MIN_TOKENS_PER_SENTENCE = 8

In [None]:
%%time

# De-duplicate the TEXT column, then keep at most MAX_DOCUMENTS_FOR_TRAINING
# of the unique documents.
mimic_texts = mimic_df.TEXT.unique()[:MAX_DOCUMENTS_FOR_TRAINING]

In [None]:
# Number of documents actually selected (at most MAX_DOCUMENTS_FOR_TRAINING).
print(len(mimic_texts))

In [None]:
#print(mimic_texts[0])

# Let's load spacy and get ready to tokenize for sentences

In [None]:
# Load the small English pipeline with the tagger switched off. The
# sentence-gathering cell relies on doc.sents, so whichever component provides
# sentence boundaries must stay enabled (presumably the parser -- TODO confirm
# for the installed spaCy version).
nlp = spacy.load("en_core_web_sm", disable=["tagger"])

In [None]:
%%time

print('Processing and gathering sentences...')

# Run every document through spaCy and keep only the sentences whose token
# count lies within [MIN_TOKENS_PER_SENTENCE, MAX_TOKENS_PER_SENTENCE].
# Each kept sentence is stored as its tokens joined by single spaces.
training_sentences = []
for document_text in mimic_texts:
    for span in nlp(document_text).sents:
        if MIN_TOKENS_PER_SENTENCE <= len(span) <= MAX_TOKENS_PER_SENTENCE:
            # No truncation needed: the span is already within the maximum.
            training_sentences.append(' '.join(tok.text for tok in span))

In [None]:
# How many sentences survived the token-length filter.
print('Total training sentences : {}'.format(len(training_sentences)))

In [None]:
# Show the first few gathered sentences to check the tokenization.
print(training_sentences[:5])

# Now let's compute the SIF embeddings again, this time on the MIMIC sentences

In [None]:
# Same conversion as for the toy sentences, now applied to the sentences
# gathered from the MIMIC documents. This overwrites the earlier x, m and w.
x, m = data_io.sentences2idx(training_sentences, words) # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
w = data_io.seq2weight(x, m, weight4ind) # get word weights

In [None]:
%%time

# Set parameters: fresh default parameter object with only rmpc overridden.
sif_params = params.params()
sif_params.rmpc = rmpc
# Get the SIF embedding of the MIMIC training sentences (replaces the
# `embedding` computed earlier for the toy sentences).
embedding = SIF_embedding.SIF_embedding(We, x, w, sif_params) # embedding[i,:] is the embedding for sentence i

In [None]:
# Sanity check: one row per training sentence (embedding[i,:] is sentence i).
print(embedding.shape)

In [None]:
# Embedding of the first training sentence; kept as a reference value to
# compare against the output of the SIFModel class.
print(embedding[0,])

In [None]:
class SIFModel(object):
    """Trainable, serializable Smooth Inverse Frequency (SIF) sentence embedder.

    ``fit`` computes weighted-average sentence embeddings for a training
    corpus, fits a truncated SVD on them and removes the resulting principal
    component(s).  ``transform`` applies the same weighting to new sentences
    and removes the stored component(s).

    The word-embedding matrix ``We`` is never stored on the model and is not
    written by ``save``; it has to be passed to ``fit`` and ``transform``.
    """

    def __init__(self):
        # Everything below is populated by fit() or load().
        self.trained = False
        self.svd = None          # fitted TruncatedSVD holding the components
        self.word_map = None     # word map handed to data_io.sentences2idx
        self.weight4ind = None   # per-word-index SIF weights
        self.params = None       # parameter object exposing ``rmpc``

    def _require_trained(self):
        """Raise ``RuntimeError`` unless fit() or load() has completed."""
        if not self.trained:
            raise RuntimeError(
                'SIFModel is not trained; call fit() or load() first')

    def _weighted_average(self, We, sentences):
        """Return the SIF weighted-average embedding of each sentence."""
        # x is the array of word indices, m is the binary mask indicating
        # whether there is a word in that location
        x, m = data_io.sentences2idx(sentences, self.word_map)
        w = data_io.seq2weight(x, m, self.weight4ind)  # per-token weights
        return get_weighted_average(We, x, w)

    def save(self, filename):
        """Serialize the trained model to ``filename`` with joblib.

        The on-disk layout is the list
        ``[word_map, weight4ind, params, svd]``.

        Raises:
            RuntimeError: if the model has not been trained or loaded.
        """
        self._require_trained()
        components = [self.word_map, self.weight4ind, self.params, self.svd]
        joblib.dump(components, filename)

    def load(self, filename):
        """Restore a model previously written by ``save``.

        NOTE(review): joblib files are pickle-based and can execute arbitrary
        code when loaded -- only load files from a trusted source.
        """
        components = joblib.load(filename)
        (self.word_map, self.weight4ind,
         self.params, self.svd) = components[:4]
        # Mark as trained only once every component has been restored.
        self.trained = True

    def transform(self, We, sentences):
        """Embed ``sentences`` with the already-fitted model.

        Args:
            We: word-embedding matrix (presumably the one used in ``fit`` --
                it is not stored on the model).
            sentences: list of sentence strings.

        Returns:
            The weighted-average sentence embeddings with the stored
            principal component(s) removed.

        Raises:
            RuntimeError: if the model has not been trained or loaded.
        """
        self._require_trained()
        weighted_emb = self._weighted_average(We, sentences)
        # now use the model we've already loaded
        return self.remove_pc(weighted_emb)

    def compute_pc(self, X):
        """Fit the truncated SVD whose components are later removed.

        This is what happens in compute_pc() in src/SIF_embedding.py, except
        that the fitted SVD is kept on the instance.
        """
        self.svd = TruncatedSVD(
            n_components=self.params.rmpc, n_iter=7, random_state=0)
        self.svd.fit(X)

    def remove_pc(self, X):
        """Return ``X`` with its projection onto the stored components removed."""
        pc = self.svd.components_

        if self.params.rmpc == 1:
            # Single component: (n, 1) projection broadcast against (1, d).
            XX = X - X.dot(pc.transpose()) * pc
        else:
            XX = X - X.dot(pc.transpose()).dot(pc)

        return XX

    def fit(self, sentences, We, params, word_map, weight4ind):
        """Train on ``sentences`` and return their SIF embeddings.

        Args:
            sentences: list of training sentence strings.
            We: word-embedding matrix.
            params: parameter object; ``params.rmpc`` is the number of
                principal components to remove.
            word_map: word map passed to ``data_io.sentences2idx``.
            weight4ind: per-word-index weights passed to
                ``data_io.seq2weight``.

        Returns:
            The weighted-average embeddings of ``sentences`` with the fitted
            principal component(s) removed.
        """
        # store these off for pickling or extra transforms
        self.word_map = word_map
        self.weight4ind = weight4ind
        self.params = params

        # Use the ``sentences`` argument, not a notebook-level global.
        weighted_emb = self._weighted_average(We, sentences)

        self.compute_pc(weighted_emb)
        self.trained = True

        return self.remove_pc(weighted_emb)

In [None]:
# Name the serialized model after the embedding file it was built from:
# strip the directory and the extension from `wordfile`, then wrap the stem.
embedding_stem = os.path.splitext(os.path.basename(wordfile))[0]
SIF_JOBLIB_FILE_NAME = 'SIF_{0}.joblib'.format(embedding_stem)

print('Preparing to train a model to be stored at: {}'.format(SIF_JOBLIB_FILE_NAME))

In [None]:
sif_model = SIFModel()

# Train on the gathered MIMIC sentences, reusing the word map, weights and
# parameters prepared in the earlier cells.
model_embeddings = sif_model.fit(training_sentences, We, sif_params, words, weight4ind)
# First sentence's embedding, for comparison with the value printed earlier.
print(model_embeddings[0,])

In [None]:
# Persist the trained model. The embedding matrix We is not part of what
# gets saved.
sif_model.save(SIF_JOBLIB_FILE_NAME)

In [None]:
# Round-trip check: rebuild a model purely from the file written above.
loaded_sif_model = SIFModel()
loaded_sif_model.load(SIF_JOBLIB_FILE_NAME)

In [None]:
# Embed the first training sentence with the reloaded model. The result is
# expected to match the first row printed after fit() -- verify by eye.
loaded_embeddings = loaded_sif_model.transform(We, [training_sentences[0]])
print(loaded_embeddings)