## LM Evaluation

In [1]:
from fastai.text import *
import html

In [2]:
PATH=Path('data/aclImdb/')
CLAS_PATH=Path('data/imdb_clas/')
LM_PATH=Path('data/imdb_lm/')

In [3]:
trn_lm = np.load(LM_PATH/'tmp'/'trn_ids.npy')
val_lm = np.load(LM_PATH/'tmp'/'val_ids.npy')
itos = pickle.load(open(LM_PATH/'tmp'/'itos.pkl', 'rb'))

In [4]:
max_vocab = 60000
min_freq = 2

In [5]:
stoi = collections.defaultdict(lambda:0, {v:k for k,v in enumerate(itos)})
len(itos)

60002

In [6]:
vs=len(itos)
vs,len(trn_lm)

(60002, 90000)

In [7]:
em_sz,nh,nl = 400,1150,3

## Load existing LM model and weights

In [8]:
PRE_PATH = PATH/'models'/'wt103'
PRE_LM_PATH = PRE_PATH/'fwd_wt103.h5'

In [9]:
wgts = torch.load(PRE_LM_PATH, map_location=lambda storage, loc: storage)

In [10]:
wd=1e-7
bptt=70
bs=250
opt_fn = partial(optim.Adam, betas=(0.8, 0.99))

In [11]:
trn_dl = LanguageModelLoader(np.concatenate(trn_lm), bs, bptt)
val_dl = LanguageModelLoader(np.concatenate(val_lm), bs, bptt)
md = LanguageModelData(PATH, 1, vs, trn_dl, val_dl, bs=bs, bptt=bptt)

In [12]:
drops = np.array([0.25, 0.1, 0.2, 0.02, 0.15])*0.7

In [13]:
learner= md.get_model(opt_fn, em_sz, nh, nl, 
    dropouti=drops[0], dropout=drops[1], wdrop=drops[2], dropoute=drops[3], dropouth=drops[4])

learner.metrics = [accuracy]
learner.unfreeze()

## Replace weights from classifier-encoder (not LM)

In [14]:
learner.load('lm1')
#learner.load_encoder('lm1_enc')
learner.load_encoder('clas_2_enc')

## Let's evaluate

Our goal: Encode 3 sentences using a pre-trained encoder and check the similarity scores between each pair of sentences. We use 2 methods to calculate semantic similarity: cosine similarity and inner product of encodings.

In [15]:
# cosine similarity - to check quality of our sentence encoder
def cos_sim(v1,v2):
    return F.cosine_similarity(T(v1).unsqueeze(0),T(v2).unsqueeze(0)).mean()

### Round 1 - simple sentences

In [16]:
x_inp = ["i like apples",
         "i want to buy some apples",
         "where is your cell phone"]

In [17]:
tok = Tokenizer().proc_all_mp(partition_by_cores(x_inp))

In [18]:
tok

[['i', 'like', 'apples'],
 ['i', 'want', 'to', 'buy', 'some', 'apples'],
 ['where', 'is', 'your', 'cell', 'phone']]

In [19]:
X = [[stoi[o1] for o1 in o] for o in tok]; X

[[12, 52, 13154], [12, 203, 8, 808, 64, 13154], [134, 9, 146, 2739, 1668]]

In [20]:
m = learner.model

# Set batch size to 1
#m[0].bs=1
# Turn off dropout
#m.eval()
# Reset hidden state
#m.reset()

kk0=m[0](V(T([X[0]]))) #first sentence in X - sentence level encoding....10 words 400 dim vecs
kk1=m[0](V(T([X[1]]))) #second sentence in X - sentence level encoding....10 words 400 dim vecs
kk2=m[0](V(T([X[2]]))) #third sentence in X - sentence level encoding....10 words 400 dim vecs


kk0=to_np(kk0)
kk1=to_np(kk1)
kk2=to_np(kk2)


kk0 = (kk0[0][2][0][-1]) # 1st sentence encoding 400 dims. -1 is the last element that's supposed to have the final encoded state
kk1 = (kk1[0][2][0][-1]) # 2nd sentence encoding 400 dims
kk2 = (kk2[0][2][0][-1]) # 3rd sentence encoding 400 dims


kk1.shape

(400,)

In [21]:
x_inp

['i like apples', 'i want to buy some apples', 'where is your cell phone']

In [22]:
cos_sim(kk0,kk1), cos_sim(kk1,kk2), cos_sim(kk0,kk2)

(0.9108594655990601, 0.1530112475156784, 0.16146282851696014)

In [23]:
np.inner(kk0,kk1)

2.4458122

In [24]:
np.inner(kk1,kk2)

0.6681246

In [25]:
np.inner(kk0,kk2)

0.670863

### Round 2 - increase sentence complexity

In [26]:
x_inp = ["i like apples and oranges",
         "i hate all fruits especially apples and oranges",
         "i am going to buy some apples and oranges"]

In [27]:
tok = Tokenizer().proc_all_mp(partition_by_cores(x_inp))

In [28]:
tok

[['i', 'like', 'apples', 'and', 'oranges'],
 ['i', 'hate', 'all', 'fruits', 'especially', 'apples', 'and', 'oranges'],
 ['i', 'am', 'going', 'to', 'buy', 'some', 'apples', 'and', 'oranges']]

In [29]:
X = [[stoi[o1] for o1 in o] for o in tok]; X

[[12, 52, 13154, 5, 20864],
 [12, 738, 43, 22144, 280, 13154, 5, 20864],
 [12, 261, 182, 8, 808, 64, 13154, 5, 20864]]

In [30]:
m = learner.model

# Set batch size to 1
#m[0].bs=1
# Turn off dropout
#m.eval()
# Reset hidden state
#m.reset()

kk0=m[0](V(T([X[0]]))) #first sentence in X - sentence level encoding....400 dim vecs
kk1=m[0](V(T([X[1]]))) #second sentence in X - sentence level encoding....400 dim vecs
kk2=m[0](V(T([X[2]]))) #third sentence in X - sentence level encoding....400 dim vecs


kk0=to_np(kk0)
kk1=to_np(kk1)
kk2=to_np(kk2)


kk0 = (kk0[0][2][0][-1]) # 1st sentence encoding 400 dims. -1 is the last element that's supposed to have the final encoded state
kk1 = (kk1[0][2][0][-1]) # 2nd sentence encoding 400 dims
kk2 = (kk2[0][2][0][-1]) # 3rd sentence encoding 400 dims


kk1.shape

(400,)

In [31]:
x_inp

['i like apples and oranges',
 'i hate all fruits especially apples and oranges',
 'i am going to buy some apples and oranges']

In [32]:
cos_sim(kk0,kk1), cos_sim(kk1,kk2), cos_sim(kk0,kk2)

(0.5731163620948792, 0.9138398170471191, 0.5365544557571411)

In [33]:
np.inner(kk0,kk1)

0.85583323

In [34]:
np.inner(kk1,kk2)

2.142747

In [35]:
np.inner(kk0,kk2)

0.8784741

### Round 3 - more complex!

In [36]:
x_inp = ["let's talk about fruits for a second. Apples are nice. Oranges too. I kinda like them.",
         "i compared the prices of apples and oranges at walmart and kroger stores",
         "oh you wanna talk about apples. sure. i am not sure if i have said this before but i do like them and oranges."]

In [37]:
tok = Tokenizer().proc_all_mp(partition_by_cores(x_inp))

In [38]:
tok

[['let',
  "'s",
  'talk',
  'about',
  'fruits',
  'for',
  'a',
  'second',
  '.',
  'apples',
  'are',
  'nice',
  '.',
  'oranges',
  'too',
  '.',
  'i',
  'kinda',
  'like',
  'them',
  '.'],
 ['i',
  'compared',
  'the',
  'prices',
  'of',
  'apples',
  'and',
  'oranges',
  'at',
  'walmart',
  'and',
  'kroger',
  'stores'],
 ['oh',
  'you',
  'wanna',
  'talk',
  'about',
  'apples',
  '.',
  'sure',
  '.',
  'i',
  'am',
  'not',
  'sure',
  'if',
  'i',
  'have',
  'said',
  'this',
  'before',
  'but',
  'i',
  'do',
  'like',
  'them',
  'and',
  'oranges',
  '.']]

In [39]:
X = [[stoi[o1] for o1 in o] for o in tok]; X

[[302,
  16,
  713,
  58,
  22144,
  22,
  6,
  349,
  3,
  13154,
  33,
  358,
  3,
  20864,
  116,
  3,
  12,
  2040,
  52,
  110,
  3],
 [12, 1128, 2, 12023, 7, 13154, 5, 20864, 44, 17680, 5, 0, 5400],
 [452,
  26,
  2890,
  713,
  58,
  13154,
  3,
  273,
  3,
  12,
  261,
  32,
  273,
  62,
  12,
  36,
  326,
  13,
  176,
  24,
  12,
  57,
  52,
  110,
  5,
  20864,
  3]]

In [40]:
m = learner.model

# Set batch size to 1
#m[0].bs=1
# Turn off dropout
#m.eval()
# Reset hidden state
#m.reset()

kk0=m[0](V(T([X[0]]))) #first sentence in X - sentence level encoding....400 dim vecs
kk1=m[0](V(T([X[1]]))) #second sentence in X - sentence level encoding....400 dim vecs
kk2=m[0](V(T([X[2]]))) #third sentence in X - sentence level encoding....400 dim vecs


kk0=to_np(kk0)
kk1=to_np(kk1)
kk2=to_np(kk2)


kk0 = (kk0[0][2][0][-1]) # 1st sentence encoding 400 dims. -1 is the last element that's supposed to have the final encoded state
kk1 = (kk1[0][2][0][-1]) # 2nd sentence encoding 400 dims
kk2 = (kk2[0][2][0][-1]) # 3rd sentence encoding 400 dims


kk1.shape

(400,)

In [41]:
x_inp

["let's talk about fruits for a second. Apples are nice. Oranges too. I kinda like them.",
 'i compared the prices of apples and oranges at walmart and kroger stores',
 'oh you wanna talk about apples. sure. i am not sure if i have said this before but i do like them and oranges.']

In [42]:
cos_sim(kk0,kk1), cos_sim(kk1,kk2), cos_sim(kk0,kk2)

(0.30107757449150085, 0.28530433773994446, 0.9661124348640442)

In [43]:
np.inner(kk0,kk1)

0.5728831

In [44]:
np.inner(kk1,kk2)

0.55668664

In [45]:
np.inner(kk0,kk2)

1.6137743

### Round 4 - really complex

In [46]:
x_inp = ["there is no comparison here. you are comparing apples to oranges",
         "i compared the prices of apples and oranges at walmart and kroger stores",
         "i don't see anything common between these two categories."]

In [47]:
tok = Tokenizer().proc_all_mp(partition_by_cores(x_inp))

In [48]:
tok

[['there',
  'is',
  'no',
  'comparison',
  'here',
  '.',
  'you',
  'are',
  'comparing',
  'apples',
  'to',
  'oranges'],
 ['i',
  'compared',
  'the',
  'prices',
  'of',
  'apples',
  'and',
  'oranges',
  'at',
  'walmart',
  'and',
  'kroger',
  'stores'],
 ['i',
  'do',
  "n't",
  'see',
  'anything',
  'common',
  'between',
  'these',
  'two',
  'categories',
  '.']]

In [49]:
X = [[stoi[o1] for o1 in o] for o in tok]; X

[[53, 9, 73, 1884, 148, 3, 26, 33, 4324, 13154, 8, 20864],
 [12, 1128, 2, 12023, 7, 13154, 5, 20864, 44, 17680, 5, 0, 5400],
 [12, 57, 29, 83, 255, 1116, 222, 150, 126, 9281, 3]]

In [50]:
m = learner.model

# Set batch size to 1
#m[0].bs=1
# Turn off dropout
#m.eval()
# Reset hidden state
#m.reset()

kk0=m[0](V(T([X[0]]))) #first sentence in X - sentence level encoding....400 dim vecs
kk1=m[0](V(T([X[1]]))) #second sentence in X - sentence level encoding....400 dim vecs
kk2=m[0](V(T([X[2]]))) #third sentence in X - sentence level encoding....400 dim vecs


kk0=to_np(kk0)
kk1=to_np(kk1)
kk2=to_np(kk2)


kk0 = (kk0[0][2][0][-1]) # 1st sentence encoding 400 dims. -1 is the last element that's supposed to have the final encoded state
kk1 = (kk1[0][2][0][-1]) # 2nd sentence encoding 400 dims
kk2 = (kk2[0][2][0][-1]) # 3rd sentence encoding 400 dims


kk1.shape

(400,)

In [51]:
x_inp

['there is no comparison here. you are comparing apples to oranges',
 'i compared the prices of apples and oranges at walmart and kroger stores',
 "i don't see anything common between these two categories."]

In [52]:
cos_sim(kk0,kk1), cos_sim(kk1,kk2), cos_sim(kk0,kk2)

(0.42863762378692627, 0.25566914677619934, 0.11218404769897461)

In [53]:
np.inner(kk0,kk1)

1.0221705

In [54]:
np.inner(kk1,kk2)

0.4332598

In [55]:
np.inner(kk0,kk2)

0.25929877