# The purpose of this notebook is to train Smooth Inverse Frequency (SIF) embeddings step by step in a notebook, so that long-running tasks such as loading embeddings or training models are easier to manage. The eventual goal is to wrap this workflow in a class and serialize it.

In [1]:
import os
import sys

In [2]:
# Make the project's source directory importable. The membership check keeps
# re-running this cell from appending duplicate entries to sys.path.
# NOTE: the path is relative, so it resolves against the kernel's working directory.
if '../src' not in sys.path:
    sys.path.append('../src')

# Project-local modules (presumably located in ../src, given the path added above).
import data_io
import params
import SIF_embedding

In [3]:
# --- Inputs and SIF settings ---

# Word vector file; can be downloaded from the GloVe website.
wordfile = '../data/glove.840B.300d.txt'

# Word frequency file: each line is a word and its frequency.
weightfile = '../auxiliary_data/enwiki_vocab_min200.txt'

# The parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3].
weightpara = 1e-3

# Number of principal components to remove in the SIF weighting scheme.
rmpc = 1

# Example sentences to embed.
sentences = [
    'this is an example sentence',
    'this is another sentence that is slightly longer',
]

In [4]:
# Load the word vectors from `wordfile`; the loader's result is unpacked
# into `words` and `We`.
# An alternative loader call, kept for reference: data_io.getWordmap(wordfile)
words, We = data_io.load_glove_word_map(wordfile)

Loading Glove Model
Current line count : 0
Current line count : 10000
Current line count : 20000
Current line count : 30000
Current line count : 40000
Current line count : 50000
Current line count : 60000
Current line count : 70000
Current line count : 80000
Current line count : 90000
Current line count : 100000
Current line count : 110000
Current line count : 120000
Current line count : 130000
Current line count : 140000
Current line count : 150000
Current line count : 160000
Current line count : 170000
Current line count : 180000
Current line count : 190000
Current line count : 200000
Current line count : 210000
Current line count : 220000
Current line count : 230000
Current line count : 240000
Current line count : 250000
Current line count : 260000
Current line count : 270000
Current line count : 280000
Current line count : 290000
Current line count : 300000
Current line count : 310000
Current line count : 320000
Current line count : 330000
Current line count : 340000
Current line c

In [5]:
# Load the word weights from the frequency file.
# word2weight['str'] is the weight for the word 'str'.
word2weight = data_io.getWordWeight(weightfile, weightpara)
# weight4ind[i] is the weight for the i-th word.
weight4ind = data_io.getWeight(words, word2weight)

In [6]:
# Convert the sentences into index form.
#   x: the array of word indices
#   m: the binary mask indicating whether there is a word in that location
x, m = data_io.sentences2idx(sentences, words)
# Look up the word weights for the indexed sentences.
w = data_io.seq2weight(x, m, weight4ind)

In [7]:
# set parameters
# Bind the settings object to its own name. Rebinding `params` (the imported
# module) to an instance would break re-running this cell, because the name
# would then refer to the instance rather than the module.
sif_params = params.params()
sif_params.rmpc = rmpc  # number of principal components to remove

# get SIF embedding
# embedding[i,:] is the embedding for sentence i
embedding = SIF_embedding.SIF_embedding(We, x, w, sif_params)

In [9]:
# Check the dimensions of the embedding result.
print(embedding.shape)

(2, 300)


In [8]:
# Print the embedding values.
# NOTE(review): this dumps every value; printing a slice (e.g. embedding[:, :5]) would keep the output short.
print(embedding)

[[-0.02397412  0.04764011  0.01670638 -0.01021727 -0.00139526  0.05780546
   0.02932482  0.02521947 -0.00411831 -0.0497154  -0.01200984  0.02978773
   0.02579444  0.0648637  -0.01721727  0.0088077  -0.00324565 -0.00423109
   0.02092886 -0.0288103   0.01772368  0.00952553  0.01083998  0.03672469
   0.02482178 -0.01848137  0.01709322 -0.0010324  -0.01724248  0.02142519
   0.06448491  0.01068625  0.04189265  0.06044579  0.01613084 -0.03517724
   0.04446857 -0.0635548  -0.04702112  0.001705   -0.02767853 -0.03828304
  -0.00997419 -0.05627686 -0.0539105  -0.03421345 -0.0210843   0.04780176
   0.06440688 -0.00873629 -0.03890336  0.06306987  0.02305344  0.04054183
   0.01818783 -0.0345772  -0.00531466  0.03098978 -0.04894507  0.03326195
  -0.01061555  0.02434095 -0.01672525  0.0040396  -0.00448458  0.01285247
  -0.00156122  0.02135667 -0.03233538  0.01347072 -0.05490188 -0.03487819
  -0.03027378 -0.04119349 -0.00896852 -0.03933423 -0.02704299 -0.00429622
  -0.02346152  0.00888304 -0.00354414 