# The purpose of this notebook is to train Smooth Inverse Frequency (SIF) embeddings interactively, so that long-running tasks like loading embeddings or training models only need to be run once and can be simplified, with the goal of wrapping the workflow into a class that can be serialized

In [1]:
import os
import sys

import pickle

import pandas as pd

import numpy as np

import sklearn
from sklearn.decomposition import TruncatedSVD
# give this a different alias so that it does not conflict with SPACY
from sklearn.externals import joblib as sklearn_joblib

import gensim
from gensim.models import KeyedVectors
print('gensim version : {}'.format(gensim.__version__))

import spacy
print('Spacy version : {}'.format(spacy.__version__))



gensim version : 3.4.0
Spacy version : 2.0.18


In [2]:
# Extend the import path so the project-local modules imported below can be
# found (presumably they live in ../src relative to this notebook - TODO confirm).
sys.path.append('../src')
import data_io, params, SIF_embedding
from SIF_embedding import get_weighted_average
from sif_model import SIFModel

In [3]:
# input
# Word vector file. The default below is a machine-specific absolute path, so
# it can be overridden with the SIF_WORDFILE environment variable instead of
# editing the notebook. EMBEDDINGS_FORMAT must match whichever file is used.
#wordfile = '../data/glove.840B.300d.txt' # word vector file, can be downloaded from GloVe website
wordfile = os.environ.get('SIF_WORDFILE',
                          r'C:/temp_embeddings/pubmed+wiki+pitts-nopunct-lower-cbow-n10.bin')

#EMBEDDINGS_FORMAT = 'GLOVE'
EMBEDDINGS_FORMAT = 'WORD2VEC_BIN'

# this behavior may change between sets of embeddings since tokens may be all lowercased or case may be intact
LOWERCASE_TOKENS = True

weightfile = '../auxiliary_data/enwiki_vocab_min200.txt' # each line is a word and its frequency

weightpara = 1e-3 # the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
rmpc = 1 # number of principal components to remove in SIF weighting scheme
# two toy sentences used for a quick smoke test of the pipeline
sentences = ['this is an example sentence', 'this is another sentence that is slightly longer']

In [4]:
%%time

# Load the word vectors. This is one of the slow steps of the notebook (about
# 20 s in the recorded run), which is why the cell is timed.
# Earlier loader, left for reference:
#(words, We) = data_io.getWordmap(wordfile)

# `words` is passed on as the word map and `We` as the embeddings argument to
# the data_io / SIF_embedding / SIFModel calls in the cells that follow.
words, We = SIFModel.embedding_loading_helper(wordfile, EMBEDDINGS_FORMAT)

Loading word2vec formatted embeddings from [C:/temp_embeddings/pubmed+wiki+pitts-nopunct-lower-cbow-n10.bin] with binary=True
Wall time: 19.8 s


In [5]:
# Load the word weights in two steps.
# 1) word -> weight: word2weight['str'] is the weight for the word 'str'
word2weight = data_io.getWordWeight(weightfile, weightpara)
# 2) index -> weight: weight4ind[i] is the weight for the i-th word
weight4ind = data_io.getWeight(words, word2weight)

In [6]:
# Convert the two example sentences into index form.
# x is the array of word indices, m is the binary mask indicating whether
# there is a word in that location.
x, m = data_io.sentences2idx(sentences, words)
# w: the word weights for the indexed sentences, computed from x, m and weight4ind
w = data_io.seq2weight(x, m, weight4ind)

In [7]:
# Set parameters: rmpc (from the config cell) is the number of principal
# components to remove in the SIF weighting scheme.
sif_params = params.params()
sif_params.rmpc = rmpc
# Get the SIF embedding: embedding[i,:] is the embedding for sentence i.
embedding = SIF_embedding.SIF_embedding(We, x, w, sif_params)

In [8]:
print(embedding.shape)

(2, 200)


In [9]:
print(embedding)

[[ 3.11440651e-01  4.49017632e-01  3.75954538e-02  1.43402284e-01
  -1.66645969e-01  3.02967699e-03  4.92990278e-02 -3.07121700e-01
   1.42688153e-01  2.70598224e-01  3.17469827e-01 -8.91577591e-02
  -2.74349888e-01 -1.06764611e-01 -4.45135944e-02  1.57391157e-01
  -3.42903763e-02 -1.84726822e-01  6.22194217e-02  3.21345152e-01
   1.34034157e-02  2.27410475e-02  1.78331967e-01 -1.08391695e-01
  -1.25103904e-01  6.54320606e-02  1.92067426e-01 -3.23984721e-02
   2.28680329e-01  4.44462385e-02  1.75134811e-01 -8.90729775e-02
  -1.47496216e-01 -6.12644060e-03  1.43976993e-01  1.64085490e-01
  -3.21689767e-01 -1.04377982e-01  7.43296531e-02  2.71703210e-01
   3.29077012e-01  1.04614446e-01  5.07722957e-02  3.89147501e-01
  -2.16672833e-03 -3.00373809e-01  2.09553575e-01  4.37169072e-02
  -1.92826213e-02 -7.68033348e-02 -5.68043184e-02  2.93115966e-01
  -2.44672315e-02  1.64577363e-01 -1.06934870e-01  1.67254890e-01
   1.16924524e-01 -1.42886363e-01 -8.39806456e-02  1.22895712e-01
   3.44653

# Now let's load some other sentences from a PICKLE

In [10]:
MIMIC_PICKLE_PATH = '../data/MIMIC_DISCHARGE_SUMMARIES.pickle'
# Manual equivalent using the pickle module, left for reference:
#mimic_file = open(MIMIC_PICKLE_PATH, 'rb')
#mimic_df = pickle.load(mimic_file)
#mimic_file.close()

# NOTE(review): unpickling can execute arbitrary code, so this file must come
# from a trusted source.
mimic_df = pd.read_pickle(MIMIC_PICKLE_PATH)

# sanity check that the pickle holds what we expect (a DataFrame in the recorded run)
print(type(mimic_df))

<class 'pandas.core.frame.DataFrame'>


In [11]:
print(len(mimic_df))

55177


In [12]:
print(mimic_df.head())

   ROW_ID                                               TEXT
0       1  Admission Date:  [**2823-9-29**]              ...
1       2  Admission Date:  [**2830-3-12**]              ...
2       3  Admission Date:  [**2714-3-12**]              ...
3       4  Admission Date:  [**2678-10-4**]              ...
4       5  Admission Date:  [**2936-5-6**]     Discharge ...


In [13]:
# Upper bound on the number of unique MIMIC texts used for training.
# The larger setting is left commented out; 100 keeps the notebook fast.
#MAX_MIMIC_DOCUMENTS_FOR_TRAINING = 10000
MAX_MIMIC_DOCUMENTS_FOR_TRAINING = 100
# Sentences with more than MAX_TOKENS_PER_SENTENCE or fewer than
# MIN_TOKENS_PER_SENTENCE spaCy tokens are skipped when gathering sentences.
MAX_TOKENS_PER_SENTENCE = 30
MIN_TOKENS_PER_SENTENCE = 8

In [14]:
%%time

mimic_texts = mimic_df.TEXT.unique()[:MAX_MIMIC_DOCUMENTS_FOR_TRAINING]

Wall time: 1.88 s


In [15]:
print(len(mimic_texts))

100


In [16]:
#print(mimic_texts[0])

# Let's load spacy and get ready to tokenize for sentences

In [17]:
# Alternative that keeps the dependency parser enabled, left for reference:
#nlp = spacy.load("en_core_web_sm", disable=["tagger"])

# let's do a fast sentence breaking WITHOUT a dependency parse...
nlp = spacy.load("en_core_web_sm", disable=["parser", "tagger"])

# per spacy docs, this is a "rule-based sentence segmentation without the dependency parse."
# NOTE(review): create_pipe + add_pipe(component) is the spaCy 2.x style (the
# recorded run used 2.0.18); spaCy 3 expects add_pipe("sentencizer") instead.
sentencizer = nlp.create_pipe("sentencizer")
nlp.add_pipe(sentencizer)

In [18]:
%%time

print('Processing and gathering sentences...')

ENABLE_SPACY_MULTIPROCESSING = False

def get_whitespace_sentences(batch_id, texts):
    """Split a batch of documents into whitespace-joined sentence strings.

    The function builds its own spaCy pipeline rather than using the
    notebook-level ``nlp`` object; it is the callable handed to joblib in the
    multiprocessing branch of this cell.

    Parameters
    ----------
    batch_id : int
        Index of the batch being processed. Not used by the function; kept so
        the call signature used by the multiprocessing driver is unchanged.
    texts : iterable of str
        Raw document texts to segment.

    Returns
    -------
    list of str
        One string per kept sentence, tokens joined by single spaces.
        Sentences with fewer than MIN_TOKENS_PER_SENTENCE or more than
        MAX_TOKENS_PER_SENTENCE tokens are skipped. The string is lowercased
        when LOWERCASE_TOKENS is true, so this path produces the same text as
        the single-process loop.
    """
    # let's do a fast sentence breaking WITHOUT a dependency parse...
    nlp = spacy.load("en_core_web_sm", disable=["parser", "tagger"])

    # per spacy docs, this is a "rule-based sentence segmentation without the dependency parse."
    nlp.add_pipe(nlp.create_pipe("sentencizer"))

    whitespace_sentences = []
    for doc in nlp.pipe(texts):
        for sent in doc.sents:
            # keep only sentences whose token count is inside the configured window
            if not MIN_TOKENS_PER_SENTENCE <= len(sent) <= MAX_TOKENS_PER_SENTENCE:
                continue

            sentence_str = ' '.join(token.text for token in sent)

            # match the casing of the embeddings vocabulary
            if LOWERCASE_TOKENS:
                sentence_str = sentence_str.lower()

            whitespace_sentences.append(sentence_str)

    return whitespace_sentences

# Gather `training_sentences` (a flat list of sentence strings) either with a
# pool of worker processes or with the notebook-level `nlp` pipeline.
if ENABLE_SPACY_MULTIPROCESSING:

    from joblib import Parallel, delayed
    from functools import partial
    from spacy.util import minibatch
    import multiprocessing

    # Leave two cores free, but never ask for fewer than one worker: joblib
    # rejects n_jobs=0, and negative values are interpreted relative to the
    # CPU count rather than as "few workers".
    SPACY_JOBS = max(1, multiprocessing.cpu_count() - 2)
    SPACY_BATCH_SIZE = 10

    print('Starting multiprocessing text processing...')

    partitions = minibatch(mimic_texts, size=SPACY_BATCH_SIZE)
    executor = Parallel(n_jobs=SPACY_JOBS, backend="multiprocessing", prefer="processes")
    do = delayed(get_whitespace_sentences)
    tasks = (do(i, batch) for i, batch in enumerate(partitions))

    # The executor returns one list of sentences per batch; flatten them so
    # the result has the same shape as in the single-process branch. The
    # lowercasing is applied here as well so both branches yield the same text
    # (it is a no-op if the worker already lowercased).
    training_sentences = [
        sentence.lower() if LOWERCASE_TOKENS else sentence
        for batch_sentences in executor(tasks)
        for sentence in batch_sentences
    ]

    print('Total training sentences : {}'.format(len(training_sentences)))

else:
    training_sentences = []
    for doc in nlp.pipe(mimic_texts, batch_size=100):
        for sent in doc.sents:
            # skip sentences whose token count is outside the configured window
            if len(sent) < MIN_TOKENS_PER_SENTENCE or len(sent) > MAX_TOKENS_PER_SENTENCE:
                continue

            sentence_str = ' '.join(token.text for token in sent)

            # match the casing of the embeddings vocabulary
            if LOWERCASE_TOKENS:
                sentence_str = sentence_str.lower()

            training_sentences.append(sentence_str)

Processing and gathering sentences...
Wall time: 21.5 s


In [19]:
print('Total training sentences : {}'.format(len(training_sentences)))

Total training sentences : 6846


In [20]:
print(training_sentences[:5])

['the patient is \n in distress , rigoring and has aphasia and only limited history \n is obtained .', 'she \n states that headaches are unusual for her .', 'head ct was done and relealved attenuation \n within the subcortical white matter of the right medial frontal \n lobe .', 'lp was performed showing opening pressure 24 cm h2o wbc of \n 316 , protein 152 , glucose 16 .', '  the patient was evaluated by neuro in the \n ed .']


# Now let's try to train again

In [21]:
# Convert the MIMIC training sentences into index form (same calls as for the
# example sentences; x, m and w are overwritten here).
# x is the array of word indices, m is the binary mask indicating whether
# there is a word in that location.
x, m = data_io.sentences2idx(training_sentences, words)
# w: the word weights for the indexed sentences, computed from x, m and weight4ind
w = data_io.seq2weight(x, m, weight4ind)

In [22]:
%%time

# Set parameters: rmpc (from the config cell) is the number of principal
# components to remove. sif_params is also handed to SIFModel.fit later on.
sif_params = params.params()
sif_params.rmpc = rmpc
# Get the SIF embedding: embedding[i,:] is the embedding for sentence i.
# This overwrites the `embedding` computed for the example sentences.
embedding = SIF_embedding.SIF_embedding(We, x, w, sif_params)

Wall time: 159 ms


In [23]:
print(embedding.shape)

(6846, 200)


In [24]:
print(embedding[0,])

[ 0.17626167  0.23711565  0.08537466 -0.01195529  0.12861061 -0.13272128
 -0.19920165  0.08112462  0.05974769 -0.01452875  0.44087513 -0.43310128
 -0.05488625 -0.02537395  0.19309921 -0.13808011  0.31721966 -0.53458463
  0.05382784 -0.32087492 -0.23657945  0.18079587  0.17358089 -0.24244937
  0.01254969  0.1371328   0.22613152  0.07706336  0.11525739  0.10930948
 -0.59057764  0.37499111  0.30563124 -0.30139809 -0.25504244  0.18107983
 -0.25743415  0.46330456  0.22119576 -0.18447893  0.46111501  0.30915154
  0.03020038  0.19233093 -0.0119602  -0.05628163 -0.10377176 -0.21680597
 -0.28717111  0.19528575 -0.26908734  0.16856631 -0.14822798  0.00387527
 -0.29648486  0.10642594 -0.01181775  0.23163724 -0.07085339 -0.06140921
  0.09122611 -0.09666857  0.22963035 -0.23812956  0.41410267  0.27602303
  0.06187889  0.13667936  0.43869799  0.22540717 -0.24832362 -0.05650999
  0.09237753 -0.3220208  -0.06862474  0.17165615  0.01730826 -0.02327888
  0.23032499 -0.11491307 -0.09993472 -0.36805464 -0

In [25]:
# Output file name for the serialized model: built from the embeddings file's
# base name (extension dropped) and the training document limit.
_embeddings_stem = os.path.splitext(os.path.basename(wordfile))[0]
SIF_JOBLIB_FILE_NAME = 'SIF_{0}_MIMIC_{1}.joblib'.format(_embeddings_stem, MAX_MIMIC_DOCUMENTS_FOR_TRAINING)

print('Preparing to train a model to be stored at: {}'.format(SIF_JOBLIB_FILE_NAME))

Preparing to train a model to be stored at: SIF_pubmed+wiki+pitts-nopunct-lower-cbow-n10_MIMIC_100.joblib


In [27]:
# Repeat the training through the SIFModel wrapper class.
sif_model = SIFModel()

# now let's train it...
# Signature, for reference when reading the positional arguments below:
# def fit (self, sentences, We, lowercase_tokens, embeddings_type, embeddings_filepath, params, word_map, weight4ind)
model_embeddings = sif_model.fit(training_sentences, We, LOWERCASE_TOKENS, EMBEDDINGS_FORMAT, wordfile,
                                 sif_params, words, weight4ind)
# In the recorded run this first row is identical to embedding[0,] from the
# step-by-step pipeline above.
print(model_embeddings[0,])

[ 0.17626167  0.23711565  0.08537466 -0.01195529  0.12861061 -0.13272128
 -0.19920165  0.08112462  0.05974769 -0.01452875  0.44087513 -0.43310128
 -0.05488625 -0.02537395  0.19309921 -0.13808011  0.31721966 -0.53458463
  0.05382784 -0.32087492 -0.23657945  0.18079587  0.17358089 -0.24244937
  0.01254969  0.1371328   0.22613152  0.07706336  0.11525739  0.10930948
 -0.59057764  0.37499111  0.30563124 -0.30139809 -0.25504244  0.18107983
 -0.25743415  0.46330456  0.22119576 -0.18447893  0.46111501  0.30915154
  0.03020038  0.19233093 -0.0119602  -0.05628163 -0.10377176 -0.21680597
 -0.28717111  0.19528575 -0.26908734  0.16856631 -0.14822798  0.00387527
 -0.29648486  0.10642594 -0.01181775  0.23163724 -0.07085339 -0.06140921
  0.09122611 -0.09666857  0.22963035 -0.23812956  0.41410267  0.27602303
  0.06187889  0.13667936  0.43869799  0.22540717 -0.24832362 -0.05650999
  0.09237753 -0.3220208  -0.06862474  0.17165615  0.01730826 -0.02327888
  0.23032499 -0.11491307 -0.09993472 -0.36805464 -0

In [28]:
# Serialize the fitted model with joblib; the SIFModel.save alternative is
# left commented out.
# NOTE(review): sklearn_joblib comes from sklearn.externals.joblib, which was
# deprecated in scikit-learn 0.21 and removed in 0.23 - newer environments
# need the standalone `joblib` package instead.
#sif_model.save(SIF_JOBLIB_FILE_NAME)
sklearn_joblib.dump(sif_model, SIF_JOBLIB_FILE_NAME)

['SIF_pubmed+wiki+pitts-nopunct-lower-cbow-n10_MIMIC_100.joblib']

In [29]:
# Read the model back from disk; the SIFModel.load alternative is left
# commented out.
# NOTE(review): joblib.load unpickles the file, so only load trusted files.
#loaded_sif_model = SIFModel()
#loaded_sif_model.load(SIF_JOBLIB_FILE_NAME)
loaded_sif_model = sklearn_joblib.load(SIF_JOBLIB_FILE_NAME)

In [30]:
# Round-trip check: embed the first training sentence with the reloaded model.
# The word vectors (We) are passed in again here. In the recorded run the
# result matches model_embeddings[0,] printed above.
loaded_embeddings = loaded_sif_model.transform(We, [training_sentences[0]])
print(loaded_embeddings)

[[ 0.17626167  0.23711565  0.08537466 -0.01195529  0.12861061 -0.13272128
  -0.19920165  0.08112462  0.05974769 -0.01452875  0.44087513 -0.43310128
  -0.05488625 -0.02537395  0.19309921 -0.13808011  0.31721966 -0.53458463
   0.05382784 -0.32087492 -0.23657945  0.18079587  0.17358089 -0.24244937
   0.01254969  0.1371328   0.22613152  0.07706336  0.11525739  0.10930948
  -0.59057764  0.37499111  0.30563124 -0.30139809 -0.25504244  0.18107983
  -0.25743415  0.46330456  0.22119576 -0.18447893  0.46111501  0.30915154
   0.03020038  0.19233093 -0.0119602  -0.05628163 -0.10377176 -0.21680597
  -0.28717111  0.19528575 -0.26908734  0.16856631 -0.14822798  0.00387527
  -0.29648486  0.10642594 -0.01181775  0.23163724 -0.07085339 -0.06140921
   0.09122611 -0.09666857  0.22963035 -0.23812956  0.41410267  0.27602303
   0.06187889  0.13667936  0.43869799  0.22540717 -0.24832362 -0.05650999
   0.09237753 -0.3220208  -0.06862474  0.17165615  0.01730826 -0.02327888
   0.23032499 -0.11491307 -0.09993472 