# Purpose: train Smooth Inverse Frequency (SIF) sentence embeddings in a notebook, so that long-running steps (loading the word embeddings, training the model) only have to run once, and then wrap the trained model in a class that can be serialized

In [5]:
import os
import sys

import pickle

import pandas as pd

import numpy as np

from sklearn.decomposition import TruncatedSVD
from sklearn.externals import joblib

import spacy
print('Spacy version : {}'.format(spacy.__version__))

Spacy version : 2.0.18


In [6]:
sys.path.append('../src')
import data_io, params, SIF_embedding
from SIF_embedding import get_weighted_average

In [7]:
# --- inputs ---------------------------------------------------------------

# word vector file, can be downloaded from GloVe website
wordfile = '../data/glove.840B.300d.txt'

# each line is a word and its frequency
weightfile = '../auxiliary_data/enwiki_vocab_min200.txt'

# the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
weightpara = 1e-3

# number of principal components to remove in SIF weighting scheme
rmpc = 1

# two toy sentences used to smoke-test the pipeline before the real corpus
sentences = [
    'this is an example sentence',
    'this is another sentence that is slightly longer',
]

In [8]:
# Load the word vectors from `wordfile` with the project's GloVe loader
# (alternative loader: data_io.getWordmap(wordfile)).
# `words` is later used as the word lookup for sentences2idx, `We` as the word-vector matrix.
words, We = data_io.load_glove_word_map(wordfile)

Loading Glove Model
Current line count : 0
Current line count : 10000
Current line count : 20000
Current line count : 30000
Current line count : 40000
Current line count : 50000
Current line count : 60000
Current line count : 70000
Current line count : 80000
Current line count : 90000
Current line count : 100000
Current line count : 110000
Current line count : 120000
Current line count : 130000
Current line count : 140000
Current line count : 150000
Current line count : 160000
Current line count : 170000
Current line count : 180000
Current line count : 190000
Current line count : 200000
Current line count : 210000
Current line count : 220000
Current line count : 230000
Current line count : 240000
Current line count : 250000
Current line count : 260000
Current line count : 270000
Current line count : 280000
Current line count : 290000
Current line count : 300000
Current line count : 310000
Current line count : 320000
Current line count : 330000
Current line count : 340000
Current line c

In [9]:
# Load the SIF word weights.
# word2weight['str'] is the weight for the word 'str'; it is built from `weightfile` using `weightpara`.
word2weight = data_io.getWordWeight(weightfile, weightpara)
# weight4ind[i] is the weight for the i-th word of `words`.
weight4ind = data_io.getWeight(words, word2weight)

In [10]:
# Encode the toy `sentences` for the SIF code.
# x is the array of word indices, m is the binary mask indicating whether there is a word in that location.
x, m = data_io.sentences2idx(sentences, words)
# w holds the word weights that go with x (looked up through weight4ind).
w = data_io.seq2weight(x, m, weight4ind)

In [11]:
# Create the SIF parameter object and tell it how many principal components to remove.
sif_params = params.params()
sif_params.rmpc = rmpc
# Compute the SIF embeddings: embedding[i,:] is the embedding for sentence i.
embedding = SIF_embedding.SIF_embedding(We, x, w, sif_params)

In [12]:
print(embedding.shape)

(2, 300)


In [13]:
print(embedding)

[[-0.02397412  0.04764011  0.01670638 -0.01021727 -0.00139526  0.05780546
   0.02932482  0.02521947 -0.00411831 -0.0497154  -0.01200984  0.02978773
   0.02579444  0.0648637  -0.01721727  0.0088077  -0.00324565 -0.00423109
   0.02092886 -0.0288103   0.01772368  0.00952553  0.01083998  0.03672469
   0.02482178 -0.01848137  0.01709322 -0.0010324  -0.01724248  0.02142519
   0.06448491  0.01068625  0.04189265  0.06044579  0.01613084 -0.03517724
   0.04446857 -0.0635548  -0.04702112  0.001705   -0.02767853 -0.03828304
  -0.00997419 -0.05627686 -0.0539105  -0.03421345 -0.0210843   0.04780176
   0.06440688 -0.00873629 -0.03890336  0.06306987  0.02305344  0.04054183
   0.01818783 -0.0345772  -0.00531466  0.03098978 -0.04894507  0.03326195
  -0.01061555  0.02434095 -0.01672525  0.0040396  -0.00448458  0.01285247
  -0.00156122  0.02135667 -0.03233538  0.01347072 -0.05490188 -0.03487819
  -0.03027378 -0.04119349 -0.00896852 -0.03933423 -0.02704299 -0.00429622
  -0.02346152  0.00888304 -0.00354414 

# Now let's load a larger set of sentences from a pickle file

In [14]:
MIMIC_PICKLE_PATH = '../data/MIMIC_DISCHARGE_SUMMARIES.pickle'

# Read the pickled object with pandas (used here instead of a manual
# open() / pickle.load() / close() sequence).
# NOTE(review): unpickling can execute arbitrary code - only load pickle files from a trusted source.
mimic_df = pd.read_pickle(MIMIC_PICKLE_PATH)

# Sanity check: show what kind of object came out of the pickle.
print(type(mimic_df))

<class 'pandas.core.frame.DataFrame'>


In [15]:
print(len(mimic_df))

55177


In [16]:
print(mimic_df.head())

   ROW_ID                                               TEXT
0       1  Admission Date:  [**2823-9-29**]              ...
1       2  Admission Date:  [**2830-3-12**]              ...
2       3  Admission Date:  [**2714-3-12**]              ...
3       4  Admission Date:  [**2678-10-4**]              ...
4       5  Admission Date:  [**2936-5-6**]     Discharge ...


In [17]:
# Sentence-length window, counted in spaCy tokens: sentences outside
# [MIN_TOKENS_PER_SENTENCE, MAX_TOKENS_PER_SENTENCE] are not used for training.
MIN_TOKENS_PER_SENTENCE = 8
MAX_TOKENS_PER_SENTENCE = 30

# Upper bound on how many unique documents are taken from the corpus.
MAX_DOCUMENTS_FOR_TRAINING = 10000

In [18]:
%%time

mimic_texts = mimic_df.TEXT.unique()[:MAX_DOCUMENTS_FOR_TRAINING]

Wall time: 1.86 s


In [None]:
print(len(mimic_texts))

In [19]:
#print(mimic_texts[0])

# Let's load spacy and get ready to tokenize for sentences

In [20]:
nlp = spacy.load("en_core_web_sm", disable=["tagger"])

In [21]:
%%time

print('Processing and gathering sentences...')

# Collect every sentence whose token count lies inside
# [MIN_TOKENS_PER_SENTENCE, MAX_TOKENS_PER_SENTENCE] as a space-joined string.
# Sentences longer than the maximum are skipped outright, so no truncation
# of the kept sentences is needed.
# NOTE(review): len(sent) counts every spaCy token, including the whitespace /
# newline tokens visible in the sample output - decide whether those should be
# filtered before counting and joining.
training_sentences = []

# nlp.pipe() streams the documents through the pipeline in batches, which is
# spaCy's recommended way to process many texts (instead of one nlp(text)
# call per document).
for doc in nlp.pipe(mimic_texts):
    for sent in doc.sents:
        if not (MIN_TOKENS_PER_SENTENCE <= len(sent) <= MAX_TOKENS_PER_SENTENCE):
            continue
        training_sentences.append(' '.join(token.text for token in sent))

Wall time: 1h 13min 27s


In [22]:
print('Total training sentences : {}'.format(len(training_sentences)))

Total training sentences : 1126346


In [23]:
print(training_sentences[:5])

['* * ]              Sex :    F \n\n Service : SURGERY \n\n', 'Patient recorded as having No Known Allergies to Drugs \n\n Attending:[**First Name3 ( LF )', '* * ] \n Chief Complaint : \n', 'headache and neck stiffness \n\n Major Surgical or Invasive Procedure : \n', 'The patient is \n in distress , rigoring and has aphasia and only limited history \n is obtained .']


# Now let's try to train again

In [24]:
# Encode the sentences gathered from the corpus; this overwrites the x, m and w
# computed earlier for the two toy sentences.
# x is the array of word indices, m is the binary mask indicating whether there is a word in that location.
x, m = data_io.sentences2idx(training_sentences, words)
# w holds the word weights that go with x (looked up through weight4ind).
w = data_io.seq2weight(x, m, weight4ind)

In [25]:
%%time

# Re-create the SIF parameter object and set how many principal components to remove.
sif_params = params.params()
sif_params.rmpc = rmpc
# Compute the SIF embeddings for the corpus sentences (replaces the toy `embedding`):
# embedding[i,:] is the embedding for sentence i.
embedding = SIF_embedding.SIF_embedding(We, x, w, sif_params)

Wall time: 36 s


In [26]:
print(embedding.shape)

(1126346, 300)


In [27]:
print(embedding[0,])

[-1.76219580e-02  1.61995946e-02 -7.19719637e-02 -4.06783682e-03
  5.23425440e-02 -9.11286660e-02  8.00928662e-02 -4.98964288e-02
  7.54627182e-03 -2.33041924e-02 -1.91686445e-01 -1.87458424e-01
  7.92256573e-03 -1.15552549e-01 -9.54133804e-02 -2.40817803e-02
 -9.30613580e-02  5.85403618e-02 -4.53826036e-02  1.04779905e-01
  1.53054677e-02  3.70770784e-02 -5.04296015e-02 -5.27354980e-02
 -5.21344360e-02  1.55661385e-01  5.84484537e-02  5.44221380e-02
 -1.04250684e-01 -3.45987554e-02  2.05650576e-02  3.99680617e-02
  9.44325013e-02 -7.12544733e-02  2.27614427e-01 -8.87598875e-02
 -8.60411401e-02 -1.26490241e-02 -8.22869311e-02  4.60043930e-02
  5.40581098e-02  5.34578384e-02 -1.02580432e-01  1.73888276e-01
 -2.02854465e-01  2.29170101e-01 -2.26779076e-03  1.63016751e-01
 -5.47874516e-03  1.33322946e-01  5.06870408e-02 -2.01443644e-02
 -1.02761369e-01 -4.69683740e-02 -1.05394108e-01  6.49192622e-02
 -8.30736386e-02  6.59382513e-02 -4.66816308e-02  9.43523174e-03
  2.58029216e-02  2.98281

In [28]:
class SIFModel(object):
    """Smooth Inverse Frequency (SIF) sentence-embedding model that can be saved and reloaded.

    ``fit`` builds weighted-average sentence embeddings, fits a ``TruncatedSVD``
    on them and returns the embeddings with the projection onto the fitted
    components removed.  ``transform`` applies the stored components to new
    sentences.  ``save`` / ``load`` persist the word map, the word weights,
    the params object and the fitted SVD with joblib.

    The word-vector matrix ``We`` is not stored on the model; it has to be
    passed to both ``fit`` and ``transform``.
    """

    def __init__(self):
        """Create an empty, untrained model; every component starts as ``None``."""
        self.trained = False
        self.svd = None
        self.word_map = None
        self.weight4ind = None
        self.params = None

    def _require(self, *attribute_names):
        """Raise ``RuntimeError`` if any of the named attributes is still ``None``."""
        missing = [name for name in attribute_names if getattr(self, name, None) is None]
        if missing:
            raise RuntimeError(
                'SIFModel is not ready ({} not set); call fit() or load() first'.format(', '.join(missing))
            )

    def save(self, filename):
        """Write the model components to ``filename`` with joblib.

        The file holds the list ``[word_map, weight4ind, params, svd]``.

        Raises:
            RuntimeError: if the model has not been fitted or loaded yet.
        """
        self._require('word_map', 'weight4ind', 'params', 'svd')
        components = [self.word_map, self.weight4ind, self.params, self.svd]
        joblib.dump(components, filename)

    def load(self, filename):
        """Restore the components written by ``save`` and mark the model as trained.

        NOTE: joblib files are pickle-based, so only load files from a trusted source.
        """
        components = joblib.load(filename)
        self.trained = True
        self.word_map = components[0]
        self.weight4ind = components[1]
        self.params = components[2]
        self.svd = components[3]

    def transform(self, We, sentences):
        """Return SIF embeddings for ``sentences`` using the already fitted components.

        Args:
            We: word-vector matrix, passed through to ``get_weighted_average``.
            sentences: sentence strings to embed.

        Raises:
            RuntimeError: if the model has not been fitted or loaded yet.
        """
        self._require('word_map', 'weight4ind', 'params', 'svd')
        # x is the array of word indices, m is the binary mask indicating
        # whether there is a word in that location
        x, m = data_io.sentences2idx(sentences, self.word_map)
        w = data_io.seq2weight(x, m, self.weight4ind)  # get word weights
        weighted_emb = get_weighted_average(We, x, w)
        # now use the model we've already loaded
        return self.remove_pc(weighted_emb)

    def compute_pc(self, X):
        """Fit a ``TruncatedSVD`` with ``self.params.rmpc`` components on ``X`` and keep it in ``self.svd``."""
        # this is what happens in compute_pc() in src/SIF_embedding.py
        self.svd = TruncatedSVD(n_components=self.params.rmpc, n_iter=7, random_state=0)
        self.svd.fit(X)

    def remove_pc(self, X):
        """Return ``X`` with its projection onto the fitted SVD components subtracted.

        Raises:
            RuntimeError: if no SVD has been fitted or loaded yet.
        """
        self._require('params', 'svd')
        pc = self.svd.components_

        if self.params.rmpc == 1:
            # single component: broadcast the per-row projection against pc
            XX = X - X.dot(pc.transpose()) * pc
        else:
            XX = X - X.dot(pc.transpose()).dot(pc)

        return XX

    def fit(self, sentences, We, params, word_map, weight4ind):
        """Train the model on ``sentences`` and return their SIF embeddings.

        Args:
            sentences: sentence strings to train on.
            We: word-vector matrix, passed through to ``get_weighted_average``.
            params: parameter object; its ``rmpc`` attribute sets how many
                components are removed.
            word_map: word lookup handed to ``data_io.sentences2idx``.
            weight4ind: per-word weights handed to ``data_io.seq2weight``.

        Returns:
            The weighted-average embeddings of ``sentences`` after ``remove_pc``.
        """
        # store these off for pickling or extra transforms
        self.word_map = word_map
        self.weight4ind = weight4ind
        self.params = params

        # Encode the sentences that were passed in, not a notebook global.
        # x is the array of word indices, m is the binary mask indicating
        # whether there is a word in that location
        x, m = data_io.sentences2idx(sentences, self.word_map)
        w = data_io.seq2weight(x, m, self.weight4ind)  # get word weights

        # now let's do some of what happens in src/SIF_embedding.py
        # but also keep some pieces along the way
        weighted_emb = get_weighted_average(We, x, w)

        self.compute_pc(weighted_emb)

        self.trained = True

        return self.remove_pc(weighted_emb)

In [29]:
# Name the serialized model after the word-vector file: 'SIF_' + the file's
# base name without its last extension + '.joblib'.
SIF_JOBLIB_FILE_NAME = 'SIF_{0}.joblib'.format(
    os.path.splitext(os.path.basename(wordfile))[0]
)

print(
    'Preparing to train a model to be stored at: {}'.format(SIF_JOBLIB_FILE_NAME)
)

Preparing to train a model to be stored at: SIF_glove.840B.300d.joblib


In [30]:
# Train the class-based model on the corpus sentences.
sif_model = SIFModel()
model_embeddings = sif_model.fit(
    sentences=training_sentences,
    We=We,
    params=sif_params,
    word_map=words,
    weight4ind=weight4ind,
)

# Show the embedding of the first sentence so it can be compared with the
# embedding[0,] printed earlier.
print(model_embeddings[0, :])

[-1.76219580e-02  1.61995946e-02 -7.19719637e-02 -4.06783682e-03
  5.23425440e-02 -9.11286660e-02  8.00928662e-02 -4.98964288e-02
  7.54627182e-03 -2.33041924e-02 -1.91686445e-01 -1.87458424e-01
  7.92256573e-03 -1.15552549e-01 -9.54133804e-02 -2.40817803e-02
 -9.30613580e-02  5.85403618e-02 -4.53826036e-02  1.04779905e-01
  1.53054677e-02  3.70770784e-02 -5.04296015e-02 -5.27354980e-02
 -5.21344360e-02  1.55661385e-01  5.84484537e-02  5.44221380e-02
 -1.04250684e-01 -3.45987554e-02  2.05650576e-02  3.99680617e-02
  9.44325013e-02 -7.12544733e-02  2.27614427e-01 -8.87598875e-02
 -8.60411401e-02 -1.26490241e-02 -8.22869311e-02  4.60043930e-02
  5.40581098e-02  5.34578384e-02 -1.02580432e-01  1.73888276e-01
 -2.02854465e-01  2.29170101e-01 -2.26779076e-03  1.63016751e-01
 -5.47874516e-03  1.33322946e-01  5.06870408e-02 -2.01443644e-02
 -1.02761369e-01 -4.69683740e-02 -1.05394108e-01  6.49192622e-02
 -8.30736386e-02  6.59382513e-02 -4.66816308e-02  9.43523174e-03
  2.58029216e-02  2.98281

In [31]:
sif_model.save(SIF_JOBLIB_FILE_NAME)

In [32]:
# Restore the saved components into a fresh, empty model to check the save/load round trip.
loaded_sif_model = SIFModel()
loaded_sif_model.load(SIF_JOBLIB_FILE_NAME)

In [33]:
# Embed the first training sentence with the reloaded model. We is passed in
# because the saved components do not include the word vectors. The printed
# row can be compared with model_embeddings[0,] shown earlier.
loaded_embeddings = loaded_sif_model.transform(We, [training_sentences[0]])
print(loaded_embeddings)

[[-1.76219580e-02  1.61995946e-02 -7.19719637e-02 -4.06783682e-03
   5.23425440e-02 -9.11286660e-02  8.00928662e-02 -4.98964288e-02
   7.54627182e-03 -2.33041924e-02 -1.91686445e-01 -1.87458424e-01
   7.92256573e-03 -1.15552549e-01 -9.54133804e-02 -2.40817803e-02
  -9.30613580e-02  5.85403618e-02 -4.53826036e-02  1.04779905e-01
   1.53054677e-02  3.70770784e-02 -5.04296015e-02 -5.27354980e-02
  -5.21344360e-02  1.55661385e-01  5.84484537e-02  5.44221380e-02
  -1.04250684e-01 -3.45987554e-02  2.05650576e-02  3.99680617e-02
   9.44325013e-02 -7.12544733e-02  2.27614427e-01 -8.87598875e-02
  -8.60411401e-02 -1.26490241e-02 -8.22869311e-02  4.60043930e-02
   5.40581098e-02  5.34578384e-02 -1.02580432e-01  1.73888276e-01
  -2.02854465e-01  2.29170101e-01 -2.26779076e-03  1.63016751e-01
  -5.47874516e-03  1.33322946e-01  5.06870408e-02 -2.01443644e-02
  -1.02761369e-01 -4.69683740e-02 -1.05394108e-01  6.49192622e-02
  -8.30736386e-02  6.59382513e-02 -4.66816308e-02  9.43523174e-03
   2.58029