## LM Evaluation

In [1]:
from fastai.text import *
import html

In [2]:
PATH=Path('data/aclImdb/')
CLAS_PATH=Path('data/imdb_clas/')
LM_PATH=Path('data/imdb_lm/')

In [3]:
trn_lm = np.load(LM_PATH/'tmp'/'trn_ids.npy')
val_lm = np.load(LM_PATH/'tmp'/'val_ids.npy')
itos = pickle.load(open(LM_PATH/'tmp'/'itos.pkl', 'rb'))

In [4]:
max_vocab = 60000
min_freq = 2

In [5]:
stoi = collections.defaultdict(lambda:0, {v:k for k,v in enumerate(itos)})
len(itos)

60002

In [6]:
vs=len(itos)
vs,len(trn_lm)

(60002, 90000)

In [7]:
em_sz,nh,nl = 400,1150,3

## Load existing LM model and weights

In [8]:
PRE_PATH = PATH/'models'/'wt103'
PRE_LM_PATH = PRE_PATH/'fwd_wt103.h5'

In [9]:
wgts = torch.load(PRE_LM_PATH, map_location=lambda storage, loc: storage)

In [10]:
wd=1e-7
bptt=70
bs=250
opt_fn = partial(optim.Adam, betas=(0.8, 0.99))

In [11]:
trn_dl = LanguageModelLoader(np.concatenate(trn_lm), bs, bptt)
val_dl = LanguageModelLoader(np.concatenate(val_lm), bs, bptt)
md = LanguageModelData(PATH, 1, vs, trn_dl, val_dl, bs=bs, bptt=bptt)

In [12]:
drops = np.array([0.25, 0.1, 0.2, 0.02, 0.15])*0.7

In [13]:
learner= md.get_model(opt_fn, em_sz, nh, nl, 
    dropouti=drops[0], dropout=drops[1], wdrop=drops[2], dropoute=drops[3], dropouth=drops[4])

learner.metrics = [accuracy]
learner.unfreeze()

## Replace weights from classifier-encoder (not LM)

In [14]:
learner.load('lm1')
#learner.load_encoder('lm1_enc')
learner.load_encoder('clas_2_enc')

## Let's evaluate

In [15]:
# cosine similarity - to check quality of our sentence encoder
def cos_sim(v1,v2):
    return F.cosine_similarity(T(v1).unsqueeze(0),T(v2).unsqueeze(0)).mean()

### Round 1 - simple sentences

In [16]:
x_inp = ["i like apples",
         "i want to buy some apples",
         "where is your cell phone"]

In [17]:
tok = Tokenizer().proc_all_mp(partition_by_cores(x_inp))

In [18]:
tok

[['i', 'like', 'apples'],
 ['i', 'want', 'to', 'buy', 'some', 'apples'],
 ['where', 'is', 'your', 'cell', 'phone']]

In [19]:
X = [[stoi[o1] for o1 in o] for o in tok]; X

[[12, 52, 13154], [12, 203, 8, 808, 64, 13154], [134, 9, 146, 2739, 1668]]

In [20]:
m = learner.model[0]
kk=m(V(T([X[2]])));kk #last sentence in X - word level encoding....10 words 400 dim vecs



([Variable containing:
  ( 0  ,.,.) = 
    6.7115e-02 -1.3268e-03 -3.3807e-02  ...  -2.1760e-03 -1.7191e-03 -6.0852e-01
   -4.6452e-04  1.6626e-03  8.8850e-02  ...  -1.8400e-04 -2.5501e-04 -4.1492e-04
   -1.7479e-03  1.8946e-01  5.1639e-03  ...   1.3672e-01 -1.3180e-02  4.6813e-02
    4.5800e-01  2.7667e-02 -7.9878e-02  ...   2.4866e-01 -1.9402e-01  4.1230e-01
    5.2607e-01  1.8060e-02 -1.0414e-01  ...   1.4630e-01 -9.0713e-02  1.8234e-01
  [torch.FloatTensor of size 1x5x1150], Variable containing:
  ( 0  ,.,.) = 
   -2.6255e-02  1.1901e-02  2.0462e-03  ...  -3.1233e-02 -2.8014e-01  6.2324e-04
   -3.3963e-02 -2.3060e-02  2.3785e-03  ...   5.5123e-03 -2.6828e-02  2.5454e-04
   -1.8081e-03  7.7851e-03  1.5832e-02  ...  -1.0883e-02 -4.6499e-02 -1.2019e-04
   -1.9474e-02 -1.4098e-02  8.2105e-03  ...  -5.3911e-02 -2.8161e-02  4.9225e-04
   -2.3617e-02 -4.9448e-03  1.8774e-02  ...  -3.8561e-02  9.5322e-03 -1.4935e-03
  [torch.FloatTensor of size 1x5x1150], Variable containing:
  ( 0 ,.,.) =

In [21]:
m = learner.model

# Set batch size to 1
#m[0].bs=1
# Turn off dropout
#m.eval()
# Reset hidden state
#m.reset()

kk0=m[0](V(T([X[0]]))) #first sentence in X - word level encoding....10 words 400 dim vecs
kk1=m[0](V(T([X[1]]))) #second sentence in X - word level encoding....10 words 400 dim vecs
kk2=m[0](V(T([X[2]]))) #third sentence in X - word level encoding....10 words 400 dim vecs


kk0=to_np(kk0)
kk1=to_np(kk1)
kk2=to_np(kk2)


kk0 = (kk0[0][2][0][-1]) # 1st sentence encoding 400 dims. -1 is the last element that's supposed to have the final encoded state
kk1 = (kk1[0][2][0][-1]) # 2nd sentence encoding 400 dims
kk2 = (kk2[0][2][0][-1]) # 3rd sentence encoding 400 dims


kk1.shape

(400,)

In [22]:
x_inp

['i like apples', 'i want to buy some apples', 'where is your cell phone']

In [23]:
cos_sim(kk0,kk1), cos_sim(kk1,kk2), cos_sim(kk0,kk2)

(0.8836122751235962, 0.17896205186843872, 0.23436345160007477)

In [24]:
np.inner(kk0,kk1)

2.230761

In [25]:
np.inner(kk1,kk2)

0.83972144

In [26]:
np.inner(kk0,kk2)

1.0898541

### Round 2 - increase sentence complexity

In [27]:
x_inp = ["i like apples and oranges",
         "i hate all fruits especially apples and oranges",
         "i am going to buy some apples and oranges"]

In [28]:
tok = Tokenizer().proc_all_mp(partition_by_cores(x_inp))

In [29]:
tok

[['i', 'like', 'apples', 'and', 'oranges'],
 ['i', 'hate', 'all', 'fruits', 'especially', 'apples', 'and', 'oranges'],
 ['i', 'am', 'going', 'to', 'buy', 'some', 'apples', 'and', 'oranges']]

In [30]:
X = [[stoi[o1] for o1 in o] for o in tok]; X

[[12, 52, 13154, 5, 20864],
 [12, 738, 43, 22144, 280, 13154, 5, 20864],
 [12, 261, 182, 8, 808, 64, 13154, 5, 20864]]

In [31]:
m = learner.model

# Set batch size to 1
#m[0].bs=1
# Turn off dropout
#m.eval()
# Reset hidden state
#m.reset()

kk0=m[0](V(T([X[0]]))) #first sentence in X - word level encoding....10 words 400 dim vecs
kk1=m[0](V(T([X[1]]))) #second sentence in X - word level encoding....10 words 400 dim vecs
kk2=m[0](V(T([X[2]]))) #third sentence in X - word level encoding....10 words 400 dim vecs


kk0=to_np(kk0)
kk1=to_np(kk1)
kk2=to_np(kk2)


kk0 = (kk0[0][2][0][-1]) # 1st sentence encoding 400 dims. -1 is the last element that's supposed to have the final encoded state
kk1 = (kk1[0][2][0][-1]) # 2nd sentence encoding 400 dims
kk2 = (kk2[0][2][0][-1]) # 3rd sentence encoding 400 dims


kk1.shape

(400,)

In [32]:
x_inp

['i like apples and oranges',
 'i hate all fruits especially apples and oranges',
 'i am going to buy some apples and oranges']

In [33]:
cos_sim(kk0,kk1), cos_sim(kk1,kk2), cos_sim(kk0,kk2)

(0.503886342048645, 0.8975854516029358, 0.46116968989372253)

In [34]:
np.inner(kk0,kk1)

0.89525676

In [35]:
np.inner(kk1,kk2)

2.8896785

In [36]:
np.inner(kk0,kk2)

0.98151237

### Round 3 - more complex!

In [37]:
x_inp = ["let's talk about fruits for a second. Apples are nice. Oranges too. I kinda like them.",
         "i compared the prices of apples and oranges at walmart and kroger stores",
         "oh you wanna talk about apples. sure. i am not sure if i have said this before but i do like them and oranges."]

In [38]:
tok = Tokenizer().proc_all_mp(partition_by_cores(x_inp))

In [39]:
tok

[['let',
  "'s",
  'talk',
  'about',
  'fruits',
  'for',
  'a',
  'second',
  '.',
  'apples',
  'are',
  'nice',
  '.',
  'oranges',
  'too',
  '.',
  'i',
  'kinda',
  'like',
  'them',
  '.'],
 ['i',
  'compared',
  'the',
  'prices',
  'of',
  'apples',
  'and',
  'oranges',
  'at',
  'walmart',
  'and',
  'kroger',
  'stores'],
 ['oh',
  'you',
  'wanna',
  'talk',
  'about',
  'apples',
  '.',
  'sure',
  '.',
  'i',
  'am',
  'not',
  'sure',
  'if',
  'i',
  'have',
  'said',
  'this',
  'before',
  'but',
  'i',
  'do',
  'like',
  'them',
  'and',
  'oranges',
  '.']]

In [40]:
X = [[stoi[o1] for o1 in o] for o in tok]; X

[[302,
  16,
  713,
  58,
  22144,
  22,
  6,
  349,
  3,
  13154,
  33,
  358,
  3,
  20864,
  116,
  3,
  12,
  2040,
  52,
  110,
  3],
 [12, 1128, 2, 12023, 7, 13154, 5, 20864, 44, 17680, 5, 0, 5400],
 [452,
  26,
  2890,
  713,
  58,
  13154,
  3,
  273,
  3,
  12,
  261,
  32,
  273,
  62,
  12,
  36,
  326,
  13,
  176,
  24,
  12,
  57,
  52,
  110,
  5,
  20864,
  3]]

In [41]:
m = learner.model

# Set batch size to 1
#m[0].bs=1
# Turn off dropout
#m.eval()
# Reset hidden state
#m.reset()

kk0=m[0](V(T([X[0]]))) #first sentence in X - word level encoding....10 words 400 dim vecs
kk1=m[0](V(T([X[1]]))) #second sentence in X - word level encoding....10 words 400 dim vecs
kk2=m[0](V(T([X[2]]))) #third sentence in X - word level encoding....10 words 400 dim vecs


kk0=to_np(kk0)
kk1=to_np(kk1)
kk2=to_np(kk2)


kk0 = (kk0[0][2][0][-1]) # 1st sentence encoding 400 dims. -1 is the last element that's supposed to have the final encoded state
kk1 = (kk1[0][2][0][-1]) # 2nd sentence encoding 400 dims
kk2 = (kk2[0][2][0][-1]) # 3rd sentence encoding 400 dims


kk1.shape

(400,)

In [42]:
x_inp

["let's talk about fruits for a second. Apples are nice. Oranges too. I kinda like them.",
 'i compared the prices of apples and oranges at walmart and kroger stores',
 'oh you wanna talk about apples. sure. i am not sure if i have said this before but i do like them and oranges.']

In [43]:
cos_sim(kk0,kk1), cos_sim(kk1,kk2), cos_sim(kk0,kk2)

(0.2883625328540802, 0.2486269772052765, 0.9720225930213928)

In [44]:
np.inner(kk0,kk1)

0.5486556

In [45]:
np.inner(kk1,kk2)

0.49776977

In [46]:
np.inner(kk0,kk2)

1.5571241

### Round 4 - really complex

In [47]:
x_inp = ["there is no comparison here. you are comparing apples to oranges",
         "i compared the prices of apples and oranges at walmart and kroger stores",
         "i don't see anything common between these two categories."]

In [48]:
tok = Tokenizer().proc_all_mp(partition_by_cores(x_inp))

In [49]:
tok

[['there',
  'is',
  'no',
  'comparison',
  'here',
  '.',
  'you',
  'are',
  'comparing',
  'apples',
  'to',
  'oranges'],
 ['i',
  'compared',
  'the',
  'prices',
  'of',
  'apples',
  'and',
  'oranges',
  'at',
  'walmart',
  'and',
  'kroger',
  'stores'],
 ['i',
  'do',
  "n't",
  'see',
  'anything',
  'common',
  'between',
  'these',
  'two',
  'categories',
  '.']]

In [50]:
X = [[stoi[o1] for o1 in o] for o in tok]; X

[[53, 9, 73, 1884, 148, 3, 26, 33, 4324, 13154, 8, 20864],
 [12, 1128, 2, 12023, 7, 13154, 5, 20864, 44, 17680, 5, 0, 5400],
 [12, 57, 29, 83, 255, 1116, 222, 150, 126, 9281, 3]]

In [51]:
m = learner.model

# Set batch size to 1
#m[0].bs=1
# Turn off dropout
#m.eval()
# Reset hidden state
#m.reset()

kk0=m[0](V(T([X[0]]))) #first sentence in X - word level encoding....10 words 400 dim vecs
kk1=m[0](V(T([X[1]]))) #second sentence in X - word level encoding....10 words 400 dim vecs
kk2=m[0](V(T([X[2]]))) #third sentence in X - word level encoding....10 words 400 dim vecs


kk0=to_np(kk0)
kk1=to_np(kk1)
kk2=to_np(kk2)


kk0 = (kk0[0][2][0][-1]) # 1st sentence encoding 400 dims. -1 is the last element that's supposed to have the final encoded state
kk1 = (kk1[0][2][0][-1]) # 2nd sentence encoding 400 dims
kk2 = (kk2[0][2][0][-1]) # 3rd sentence encoding 400 dims


kk1.shape

(400,)

In [52]:
x_inp

['there is no comparison here. you are comparing apples to oranges',
 'i compared the prices of apples and oranges at walmart and kroger stores',
 "i don't see anything common between these two categories."]

In [53]:
cos_sim(kk0,kk1), cos_sim(kk1,kk2), cos_sim(kk0,kk2)

(0.57721346616745, 0.2024388462305069, 0.22252123057842255)

In [54]:
np.inner(kk0,kk1)

0.97971964

In [55]:
np.inner(kk1,kk2)

0.37234193

In [56]:
np.inner(kk0,kk2)

0.34369394