In [1]:
# Make my plots pretty!
from __future__ import print_function
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
# Render both saved figures and on-screen figures at 100 dpi.
mpl.rcParams['savefig.dpi'] = 100
mpl.rcParams['figure.dpi'] = 100

# IPython magics: ask the inline backend for 'retina' figures and draw plots inside the notebook.
%config InlineBackend.figure_format = 'retina'
%matplotlib inline

# Feature Extraction Part 3

Yesterday's experiments seem to have yielded good features. I have one more trick up my sleeve as far as feature generation is concerned: combining tf-idf and word embeddings.

Then we'll try some other classification methods like random forests and XGBoost.

I'll also refactor some of my code into Python functions that can be reused.

In [2]:
# Load only the small training/validation sets made earlier; the whole dataset is not needed here.
# Pickles are binary data, so open them in 'rb' mode, and use `with` so the handles are closed.
# NOTE: unpickling can execute arbitrary code -- only load pickles produced by this project.
import pickle
with open('../data/train_small.pickle', 'rb') as train_file:
    train_set = pickle.load(train_file)
with open('../data/valid_small.pickle', 'rb') as valid_file:
    valid_set = pickle.load(valid_file)
print(len(train_set))
print(len(valid_set))

20000
5000


In [3]:
import torch

In [4]:
# What each data point looks like. Pretty basic: a 3-tuple of
# (question-1 tokens, question-2 tokens, integer label).
# NOTE(review): the label is presumably 1 for duplicate pairs and 0 otherwise -- confirm.
train_set[1]

(['is', 'michael', 'phelps', 'the', 'greatest', 'swimmer', 'ever', '?'],
 ['is',
  'michael',
  'phelps',
  'the',
  'greatest',
  'swimmer',
  'of',
  'all',
  'time',
  '?'],
 0)

In [5]:
# Load word embeddings. 100d to be quick
import sys
sys.path.append('../models/')  # so the `data` module can be found on the import path
import data
reload(data)  # re-execute the module so recent edits to it are picked up (Python 2 builtin)

# load_embeddings returns two objects; the second is a tensor whose size is printed below.
# NOTE(review): `dictionary` presumably maps words to rows of `embed` -- confirm in data.load_embeddings.
dictionary, embed = data.load_embeddings('../data/glove.6B.100d.txt')
print(embed.size())

torch.Size([400000, 100])


In [6]:
# Sentence to word embedding.
reload(data)
# Zeroed 8x100 buffer; only its first four rows are handed to embed_words for the four words.
embed_out = torch.zeros((8,100))
# The call's return value is shown as the cell output.
# NOTE(review): presumably the number of words not found in `dictionary` -- confirm in data.embed_words.
data.embed_words(dictionary, embed, ['the', 'quick', 'brown', 'fox'], embed_out[:4])

0

In [7]:
# Check the results: first 10 dimensions of the first 6 rows.
# Rows 0-3 were passed to embed_words above; rows 4-5 were not, so they should still be zero.
embed_out[:6,:10]


-0.0382 -0.2449  0.7281 -0.3996  0.0832  0.0440 -0.3914  0.3344 -0.5755  0.0875
-0.4315 -0.2204 -0.2268 -0.1022 -0.3186 -0.1181 -0.0934 -0.0698 -0.2903 -0.3401
-0.4381 -0.0994 -0.2604 -1.1084  0.1055 -0.0545  0.4487  0.0617 -0.5880 -0.2174
 0.1692 -0.9978  0.2443 -0.7969  0.0364 -0.5613  0.1731  0.2929 -0.4329 -0.8227
 0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000
 0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000
[torch.FloatTensor of size 6x10]

In [8]:
# If we load an unknown word, embedding should be zeroed out
reload(data)
# Start from random noise so that a zeroed row is distinguishable from an untouched one.
embed_out = torch.randn((8,100))
# Only row 2 is passed in as the output slot for the single (nonsense) word.
# NOTE(review): the printed return value is presumably the count of unknown words -- confirm.
print(data.embed_words(dictionary, embed, ['kkkkzkzkkzkz'], embed_out[2:3]))
embed_out[:4,:10]

1



-0.4866 -1.3207 -0.8360 -0.6047 -0.1985 -0.8930  1.3665  0.6613  0.8760 -0.7007
-0.1408 -0.0129 -0.6157  1.5378 -0.6237  0.7065  0.8764 -0.4265  0.2084  0.7650
 0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000
-1.3122  0.7493 -0.6223 -1.3783  0.3999 -0.0679 -1.3426 -1.5309 -0.4686  2.0079
[torch.FloatTensor of size 4x10]

# TF-IDF

Let's compute TF-IDF statistics over the whole training set.

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Re-execute the `data` module to pick up edits; as the last expression, the module repr is displayed.
reload(data)

<module 'data' from 'data.pyc'>

In [22]:
# Load the tokenized questions stored in train_q1.txt.
# NOTE(review): presumably one token sequence per question, for the full training set -- confirm in data.load_tokenized.
q1_word_tokenized = data.load_tokenized('../data/train_q1.txt')

In [24]:
# Load the tokenized questions stored in train_q2.txt, then sanity-check the count and one entry.
q2_word_tokenized = data.load_tokenized('../data/train_q2.txt')
print('len:', len(q2_word_tokenized))
print(q2_word_tokenized[1234])

len: 404288
('what', 'is', 'one', 'key', 'criteria', 'or', 'computer', 'part', 'that', 'makes', 'a', 'computer', 'fast', 'for', 'gaming', '?')


In [26]:
import itertools
import pandas as pd
# Vectorizer with sklearn's default settings. Per the repr printed after fitting, it lowercases
# and uses token_pattern '(?u)\b\w\w+\b', i.e. only tokens of two or more word characters are
# counted, so single-character tokens and punctuation such as '?' are ignored.
tf = TfidfVectorizer()

In [28]:
%time tf.fit(itertools.imap(lambda x: ' '.join(x), itertools.chain(q1_word_tokenized, q2_word_tokenized)))

CPU times: user 13.6 s, sys: 2.81 s, total: 16.4 s
Wall time: 16.3 s


TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [30]:
# Save the fitted vectorizer so it doesn't have to be refit.
# It is written to ../data/ because that is the path the next cell loads it from; previously
# it was written to the working directory, so a fresh run could not find the file.
# `with` guarantees the file is flushed and closed.
with open('../data/tfidf.pickle', 'wb') as tfidf_file:
    pickle.dump(tf, tfidf_file)

In [10]:
# Reload the previously fitted TF-IDF vectorizer instead of refitting it.
# NOTE: unpickling can execute arbitrary code -- only load files produced by this project.
with open('../data/tfidf.pickle', 'rb') as tfidf_file:
    tf = pickle.load(tfidf_file)

What does one of these tf-idf vectors look like?

In [11]:
# Take the first question of one training example and look at its tf-idf vector.
sentence = train_set[2525][0]
print(sentence)
# transform expects raw strings, so the tokens are re-joined with spaces first.
sentence_sparse = tf.transform([' '.join(sentence)])
# Printed as (row, vocabulary index) -> weight. The sentence has 12 tokens but only 11 entries
# are printed: the '?' token does not match the vectorizer's token pattern.
print(sentence_sparse)

['what', 'are', 'some', 'of', 'the', 'most', 'epic', 'dialogues', 'in', 'hollywood', 'movies', '?']
  (0, 58228)	0.102677315476
  (0, 53417)	0.104353842032
  (0, 49758)	0.210069623875
  (0, 37679)	0.144075882446
  (0, 35383)	0.322653773137
  (0, 35253)	0.251826328892
  (0, 26566)	0.132532742595
  (0, 25140)	0.402384762401
  (0, 18537)	0.506381771442
  (0, 15611)	0.537732845086
  (0, 4424)	0.145261985783


In [12]:
# We can precompute tfidf-reweighted sentence embeddings
reload(data)
# Zeroed 16x100 output buffer, larger than the sentence needs.
tfidf_embed_out = torch.zeros((16, 100))
# The return value (11 here) is shown as the cell output.
# NOTE(review): presumably the number of rows written, one per token with a tf-idf weight -- confirm.
data.get_reweighted_embeddings(tf, dictionary, embed, sentence, tfidf_embed_out)

11

In [13]:
# Show the first 11 rows of the buffer, matching the 11 returned by the call above.
tfidf_embed_out[:11]


-0.0156  0.0394  0.0917  ...  -0.0278  0.0228  0.0946
 0.1278  0.0215 -0.3566  ...  -0.3855 -0.0945  0.5576
 0.0656 -0.0124  0.1186  ...  -0.0196  0.2303  0.0804
          ...             ⋱             ...          
-0.0040 -0.0256  0.0760  ...  -0.0152  0.0864  0.0282
 0.0895  0.2036  0.2927  ...  -0.0610  0.0177 -0.0318
 0.3734 -0.1171  0.3375  ...   0.0275  0.3118  0.2352
[torch.FloatTensor of size 11x100]

In [14]:
# Compare with: the plain (unweighted) embeddings of the same sentence.
# One 100-d row per token of `sentence`, including the final '?'.
embed_out = torch.zeros((len(sentence), 100))
print(data.embed_words(dictionary, embed, sentence, embed_out))
# In the outputs shown, the first tf-idf row is roughly 0.10x the first row here, which matches
# the tf-idf weight (~0.103) printed for this sentence earlier.
embed_out[:11]

0



-0.1518  0.3841  0.8934  ...  -0.2712  0.2216  0.9211
-0.5153  0.8319  0.2246  ...  -1.2024  1.1304  0.3479
-0.1445  0.5602  0.2054  ...  -0.3640  0.9161  0.8209
          ...             ⋱             ...          
 0.0857 -0.2220  0.1657  ...  -0.0743  0.7581 -0.3424
 0.9280 -0.2910  0.8388  ...   0.0685  0.7749  0.5844
 0.2032 -0.0385  0.3675  ...  -0.0609  0.7138  0.2492
[torch.FloatTensor of size 11x100]

In [16]:
# Zero-filled buffers for the training questions, shaped
# (number of examples, word slots per question, embedding dimension).
max_word_len = 30
embed_size = 100
train_shape = (len(train_set), max_word_len, embed_size)
q1_train_all = torch.zeros(train_shape)
q2_train_all = torch.zeros(train_shape)

In [56]:
reload(data)
# Vectorize the training pairs, passing the fitted tf-idf model and the two preallocated buffers.
# NOTE(review): presumably vectorize fills q1_train_all / q2_train_all in place and also returns
# per-example tensors (vectorized_train is indexed as [i][0] / [i][1] later) -- confirm in data.vectorize.
vectorized_train = data.vectorize(dictionary, embed, train_set, q1_train_all, q2_train_all, tf=tf)

In [57]:
# Same treatment for the validation split: zero-filled buffers of shape
# (number of examples, word slots per question, embedding dimension), then vectorize.
valid_shape = (len(valid_set), max_word_len, embed_size)
q1_valid_all = torch.zeros(valid_shape)
q2_valid_all = torch.zeros(valid_shape)
vectorized_valid = data.vectorize(
    dictionary, embed, valid_set, q1_valid_all, q2_valid_all, tf=tf)

# Feature Vectors

Now each sentence is encoded as a 30x100 matrix (`max_word_len` x `embed_size`). Let's apply some vectorization techniques to turn these into feature vectors. The result of each technique should be an NxD matrix, one row for each training example.

In [60]:
import features
reload(features)

# The simplest thing would be averaging all the per-word vectors to get a single sentence embedding.
# This is a mean of the tfidf-weighted GloVE vectors.
# mean(dim=1) averages over the max_word_len axis of each (N, max_word_len, embed_size) buffer;
# squeeze() then drops any size-1 axis that is left behind.
# NOTE(review): the divisor is always max_word_len (30), not the real question length. If
# data.vectorize leaves unused word slots as zeros, shorter questions get proportionally
# smaller vectors -- confirm that this is intended.
q1_train_mean = q1_train_all.mean(dim=1).squeeze()
q2_train_mean = q2_train_all.mean(dim=1).squeeze()
q1_valid_mean = q1_valid_all.mean(dim=1).squeeze()
q2_valid_mean = q2_valid_all.mean(dim=1).squeeze()

In [257]:
# Permutation test
# Runs features.permute on the first 100 vectorized training pairs with 200 trials and shows
# entries 10..19 of row 0 of the result.
# NOTE(review): presumably each entry is the fraction of random re-partitions whose per-dimension
# absolute mean difference exceeds the observed one -- confirm in features.permute.
reload(features)
features.permute(vectorized_train[:100], distance='absolute', n_trials=200)[0, 10:20]


 0.4750
 0.5100
 0.5350
 0.6400
 0.6250
 0.5850
 0.7550
 0.6900
 0.5250
 0.4350
[torch.FloatTensor of size 10]

In [255]:
# Verify implementation: brute-force permutation test on the first training pair, written out
# step by step so the result can be compared with features.permute(..., distance='absolute').

n_trials = 20000  # number of random re-partitions of the pooled word vectors

q1 = vectorized_train[0][0]
q2 = vectorized_train[0][1]
len_q1 = q1.size(0)
len_q2 = q2.size(0)
sum_len = len_q1 + len_q2

# Pool the word vectors of both questions, one list entry per row. split(1) yields single-row
# pieces for any question length; the previous hard-coded chunk(3) / chunk(4) only produced
# one piece per row when the questions had exactly 3 and 4 rows.
q1_vs = list(q1.split(1)) + list(q2.split(1))
assert len(q1_vs) == sum_len

# Observed statistic: per-dimension absolute difference between the two mean vectors.
q1_mean = q1.mean(dim=0)
q2_mean = q2.mean(dim=0)
base_diff = (q1_mean - q2_mean).abs()
print(base_diff[0, 30:40])

running_diff = torch.zeros(100)

for t in xrange(n_trials):
    # Shuffle the pooled rows in place and re-split them into groups of the original sizes.
    # (np.random is not seeded here, so the estimate varies slightly between runs.)
    np.random.shuffle(q1_vs)
    qs1 = torch.stack(q1_vs[:len_q1])
    qs2 = torch.stack(q1_vs[len_q1:])
    qs1_mean = qs1.mean(dim=0)
    qs2_mean = qs2.mean(dim=0)
    diff = (qs1_mean - qs2_mean).abs()
    # Count, per dimension, the trials whose difference exceeds the observed one.
    running_diff.add_((diff > base_diff).squeeze().float())

# Per-dimension fraction of trials that exceeded the observed difference.
val = (running_diff / n_trials)
val[10:20]


 0.1103
 0.0540
 0.1230
 0.0292
 0.0065
 0.0028
 0.0036
 0.0204
 0.0218
 0.0187
[torch.FloatTensor of size 10]




 0.4562
 0.5696
 0.5389
 0.6754
 0.6581
 0.5696
 0.7164
 0.6837
 0.5438
 0.4431
[torch.FloatTensor of size 10]

In [150]:
# Column means of the random matrix j.
# NOTE(review): j is only assigned in a later cell of this notebook, so on a fresh
# top-to-bottom run this cell raises NameError; it relies on out-of-order execution.
j.mean(dim=0)


 0.5199  0.3491  0.6506  0.4932  0.5075  0.5595  0.5885
[torch.FloatTensor of size 1x7]

In [69]:
# Set 21 distinct, randomly chosen positions (out of indices 0..33) of s[3] to 1.
# NOTE(review): `s` is not defined in any visible cell -- leftover scratch state; define it or drop this cell.
s[3][torch.randperm(34)[:21]] = 1

In [90]:
# Random 10x7 matrix used as toy data for the thresholding experiments in the following cells.
# NOTE(review): no random seed is set, so the values differ on every run.
j = torch.rand((10, 7))
j


 0.7309  0.9930  0.8515  0.2611  0.9239  0.0191  0.3806
 0.0496  0.1212  0.4117  0.8901  0.2001  0.9738  0.0644
 0.4557  0.8290  0.9228  0.0063  0.4309  0.8844  0.4808
 0.6261  0.0879  0.8273  0.5520  0.2190  0.4143  0.7219
 0.1603  0.0601  0.4650  0.5642  0.0548  0.3870  0.2903
 0.5868  0.6631  0.4622  0.1865  0.2093  0.6016  0.5310
 0.4925  0.1365  0.7357  0.8064  0.9292  0.8576  0.9671
 0.7286  0.4199  0.7516  0.9416  0.6415  0.5607  0.5838
 0.7414  0.0938  0.1671  0.2931  0.6318  0.5355  0.8770
 0.6271  0.0864  0.9116  0.4303  0.8347  0.3608  0.9880
[torch.FloatTensor of size 10x7]

In [83]:
# 10x7 matrix of zeros.
# NOTE(review): `counts` is not used by any other visible cell.
counts = torch.zeros((10,7))
counts


    0     0     0     0     0     0     0
    0     0     0     0     0     0     0
    0     0     0     0     0     0     0
    0     0     0     0     0     0     0
    0     0     0     0     0     0     0
    0     0     0     0     0     0     0
    0     0     0     0     0     0     0
    0     0     0     0     0     0     0
    0     0     0     0     0     0     0
    0     0     0     0     0     0     0
[torch.FloatTensor of size 10x7]

In [108]:
# Per-column fraction of the entries of j that are greater than 0.6.
above_cutoff = (j > .6).float()
b = above_cutoff.mean(dim=0)
b


 0.5000  0.3000  0.6000  0.3000  0.5000  0.4000  0.4000
[torch.FloatTensor of size 1x7]

In [113]:
# Per-column fraction of the entries of j that are greater than that column's value in b.
threshold = b.repeat(10, 1)
(j > threshold).float().mean(dim=0)


 0.6000  0.4000  0.6000  0.6000  0.5000  0.7000  0.7000
[torch.FloatTensor of size 1x7]