## Semantic Similarity Evaluation - using pre-trained weights from the LM/Classifier

In [372]:
from fastai.text import *
import html

In [373]:
PATH=Path('data/aclImdb/')
CLAS_PATH=Path('data/imdb_clas/')
LM_PATH=Path('data/imdb_lm/')

In [374]:
trn_lm = np.load(LM_PATH/'tmp'/'trn_ids.npy')[:10]
val_lm = np.load(LM_PATH/'tmp'/'val_ids.npy')[:10]
itos = pickle.load(open(LM_PATH/'tmp'/'itos.pkl', 'rb'))

In [375]:
stoi = collections.defaultdict(lambda:0, {v:k for k,v in enumerate(itos)})
len(itos)

60002

In [376]:
max_vocab = 60000
min_freq = 2

In [377]:
vs=len(itos)
vs,len(trn_lm)

(60002, 10)

In [378]:
em_sz,nh,nl = 400,1150,3

In [379]:
#pickle.dump(itos, open('itos_41k.pkl', 'wb'))

In [380]:
len(itos)

60002

## Load existing LM model and weights

In [381]:
PRE_PATH = PATH/'models'/'wt103'
PRE_LM_PATH = PRE_PATH/'fwd_wt103.h5'

In [382]:
wgts = torch.load(PRE_LM_PATH, map_location=lambda storage, loc: storage)

In [383]:
wd=1e-7
bptt=70
bs=250
opt_fn = partial(optim.Adam, betas=(0.8, 0.99))

In [384]:
trn_dl = LanguageModelLoader(np.concatenate(trn_lm), bs, bptt)
val_dl = LanguageModelLoader(np.concatenate(val_lm), bs, bptt)
md = LanguageModelData(PATH, 1, vs, trn_dl, val_dl, bs=bs, bptt=bptt)

In [385]:
drops = np.array([0.25, 0.1, 0.2, 0.02, 0.15])*0.7

In [386]:
learner= md.get_model(opt_fn, em_sz, nh, nl, 
    dropouti=drops[0], dropout=drops[1], wdrop=drops[2], dropoute=drops[3], dropouth=drops[4])

learner.metrics = [accuracy]
learner.unfreeze()

## Replace weights from classifier-encoder (not LM)

In [401]:
learner.load('lm1')
#learner.load_encoder('lm1_enc')
learner.load_encoder('clas_2_enc')

## Let's evaluate

Our goal: Encode 3 sentences using a pre-trained encoder and check the similarity scores between each pair of sentences. We use 2 methods to calculate semantic similarity: cosine similarity and inner product of encodings.

In [418]:
# cosine similarity - to check quality of our sentence encoder
def cos_sim(v1,v2):
    return F.cosine_similarity(T(v1).unsqueeze(0),T(v2).unsqueeze(0)).mean()

In [419]:
itos = pickle.load(open('itos_41k.pkl', 'rb'))
stoi = collections.defaultdict(lambda:0, {v:k for k,v in enumerate(itos)})

# Create data for evaluation

In [587]:
x1_inp = ["i like apples",
         "i want to buy some apples",
         "where is your cell phone"]

x2_inp = ["i like apples and oranges",
         "i love all fruits especially apples and oranges",
         "where is the new movie showing?"]

x3_inp = ["let's talk about fruits for a second. Apples are nice. Oranges too. I kinda like them.",
         "i compared the prices of apples and oranges at walmart and kroger stores",
         "oh you wanna talk about apples. sure. i am not sure if i have said this before but i do like them and oranges."]

x4_inp = ["there is no comparison here. you are comparing apples to oranges",
         "i compared the prices of apples and oranges at walmart and kroger stores",
         "i don't see anything common between these two categories."]

x5_inp = ["i would love to own a nice boat and go sailing in the pacific ocean",
         "i'm thinking of getting a fancy boat and set sail into the south pacific",
         "i wish to own a small house and live there without any worries"]

In [588]:
tok1 = Tokenizer().proc_all(x1_inp,'en')
tok2 = Tokenizer().proc_all(x2_inp,'en')
tok3 = Tokenizer().proc_all(x3_inp,'en')
tok4 = Tokenizer().proc_all(x4_inp,'en')
tok5 = Tokenizer().proc_all(x5_inp,'en')

In [589]:
X1 = [[stoi[o1] for o1 in o] for o in tok1]
X2 = [[stoi[o1] for o1 in o] for o in tok2]
X3 = [[stoi[o1] for o1 in o] for o in tok3]
X4 = [[stoi[o1] for o1 in o] for o in tok4]
X5 = [[stoi[o1] for o1 in o] for o in tok5]

In [590]:
m = learner.model
# Turn off dropout
m.eval()

SequentialRNN(
  (0): RNN_Encoder(
    (encoder): Embedding(60002, 400, padding_idx=1)
    (encoder_with_dropout): EmbeddingDropout(
      (embed): Embedding(60002, 400, padding_idx=1)
    )
    (rnns): ModuleList(
      (0): WeightDrop(
        (module): LSTM(400, 1150, dropout=0.105)
      )
      (1): WeightDrop(
        (module): LSTM(1150, 1150, dropout=0.105)
      )
      (2): WeightDrop(
        (module): LSTM(1150, 400, dropout=0.105)
      )
    )
    (dropouti): LockedDropout(
    )
    (dropouths): ModuleList(
      (0): LockedDropout(
      )
      (1): LockedDropout(
      )
      (2): LockedDropout(
      )
    )
  )
  (1): LinearDecoder(
    (decoder): Linear(in_features=400, out_features=60002, bias=False)
    (dropout): LockedDropout(
    )
  )
)

In [591]:
#Create reusable func for inference
#Laid out for readability - will refactor later
def run_model(X):
    kk0=m[0](V(T([X[0]]))) #first sentence in X - sentence level encoding....10 words 400 dim vecs
    kk1=m[0](V(T([X[1]]))) #second sentence in X - sentence level encoding....10 words 400 dim vecs
    kk2=m[0](V(T([X[2]]))) #third sentence in X - sentence level encoding....10 words 400 dim vecs

    kk0=to_np(kk0)
    kk1=to_np(kk1)
    kk2=to_np(kk2)

    kk0 = (kk0[0][2][0][-1]) # 1st sentence encoding 400 dims. -1 is the last element that's supposed to have the final encoded state
    kk1 = (kk1[0][2][0][-1]) # 2nd sentence encoding 400 dims
    kk2 = (kk2[0][2][0][-1]) # 3rd sentence encoding 400 dims
    
    return kk0,kk1,kk2

### Round 1 - simple sentences

In [592]:
kk0,kk1,kk2 = run_model(X1)
kk1.shape

(400,)

In [593]:
x1_inp

['i like apples', 'i want to buy some apples', 'where is your cell phone']

In [594]:
cos_sim(kk0,kk1), cos_sim(kk1,kk2), cos_sim(kk0,kk2)

(0.9999998807907104, 0.06893263012170792, 0.06893263757228851)

In [595]:
np.inner(kk0,kk1),np.inner(kk1,kk2),np.inner(kk0,kk2)

(5.2282677, 0.16939801, 0.169398)

### Round 2 - increase sentence complexity

In [596]:
kk0,kk1,kk2 = run_model(X2)
kk1.shape

(400,)

In [597]:
x2_inp

['i like apples and oranges',
 'i love all fruits especially apples and oranges',
 'where is the new movie showing?']

In [598]:
cos_sim(kk0,kk1), cos_sim(kk1,kk2), cos_sim(kk0,kk2)

(0.8213068842887878, 0.22487039864063263, 0.2921583652496338)

In [599]:
np.inner(kk0,kk1),np.inner(kk1,kk2),np.inner(kk0,kk2)

(3.1879294, 0.73501706, 0.5341433)

### Round 3 - more complex!

In [600]:
kk0,kk1,kk2 = run_model(X3)
kk1.shape

(400,)

In [601]:
x3_inp

["let's talk about fruits for a second. Apples are nice. Oranges too. I kinda like them.",
 'i compared the prices of apples and oranges at walmart and kroger stores',
 'oh you wanna talk about apples. sure. i am not sure if i have said this before but i do like them and oranges.']

In [602]:
cos_sim(kk0,kk1), cos_sim(kk1,kk2), cos_sim(kk0,kk2)

(0.043320432305336, 0.043320432305336, 1.0000001192092896)

In [603]:
np.inner(kk0,kk1),np.inner(kk1,kk2),np.inner(kk0,kk2)

(0.14767489, 0.14767489, 1.7979703)

### Round 4 - really complex

In [604]:
kk0,kk1,kk2 = run_model(X4)
kk1.shape

(400,)

In [605]:
x4_inp

['there is no comparison here. you are comparing apples to oranges',
 'i compared the prices of apples and oranges at walmart and kroger stores',
 "i don't see anything common between these two categories."]

In [606]:
cos_sim(kk0,kk1), cos_sim(kk1,kk2), cos_sim(kk0,kk2)

(0.0017274579731747508, 0.043320432305336, 0.02612287364900112)

In [462]:
np.inner(kk0,kk1),np.inner(kk1,kk2),np.inner(kk0,kk2)

(0.0115689635, 0.14767492, 0.09227343)

**As you can see, the model is not able to get the nuance in Round 4.**


### Round 5 - nuance check

In [608]:
kk0,kk1,kk2 = run_model(X5)
kk1.shape

(400,)

In [611]:
x5_inp

['i would love to own a nice boat and go sailing in the pacific ocean',
 "i'm thinking of getting a fancy boat and set sail into the south pacific",
 'i wish to own a small house and live there without any worries']

In [612]:
cos_sim(kk0,kk1), cos_sim(kk1,kk2), cos_sim(kk0,kk2)

(0.5177718997001648, 0.14863775670528412, 0.046706970781087875)

In [613]:
np.inner(kk0,kk1),np.inner(kk1,kk2),np.inner(kk0,kk2)

(0.7448063, 0.67609465, 0.48657796)

**As you can see, the model did ok-ish but is not able to get the nuance. 0 and 2 should have scored higher**

We have been using the encoder from the IMDB classifier for comparing semantic similarity. Let's see if we can do better by creating a new model.

# Quora dataset

We need a standard dataset specifically meant for the semantic similarity task.
Let's use the quora kaggle dataset that contains pairs of english sentences and the goal is to predict if a given pair of sentences are semantically similar(meaning).

y=1 indicates they have the same meaning  
y=0 means the pair differ in meaning

In [487]:
#QUESTION_PAIRS_FILE = '/datasets/quora_duplicate_questions.tsv'
#print("Processing", QUESTION_PAIRS_FILE)

question1 = []
question2 = []
is_duplicate = []
with open(QUESTION_PAIRS_FILE, encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile, delimiter='\t')
    for row in reader:
        question1.append(row['question1'])
        question2.append(row['question2'])
        is_duplicate.append(int(row['is_duplicate']))

print('Question pairs: %d' % len(question1))

Question pairs: 404290


In [488]:
question1[:10]

['What is the step by step guide to invest in share market in india?',
 'What is the story of Kohinoor (Koh-i-Noor) Diamond?',
 'How can I increase the speed of my internet connection while using a VPN?',
 'Why am I mentally very lonely? How can I solve it?',
 'Which one dissolve in water quikly sugar, salt, methane and carbon di oxide?',
 'Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me?',
 'Should I buy tiago?',
 'How can I be a good geologist?',
 'When do you use シ instead of し?',
 'Motorola (company): Can I hack my Charter Motorolla DCX3400?']

In [489]:
question2[:10]

['What is the step by step guide to invest in share market?',
 'What would happen if the Indian government stole the Kohinoor (Koh-i-Noor) diamond back?',
 'How can Internet speed be increased by hacking through DNS?',
 'Find the remainder when [math]23^{24}[/math] is divided by 24,23?',
 'Which fish would survive in salt water?',
 "I'm a triple Capricorn (Sun, Moon and ascendant in Capricorn) What does this say about me?",
 'What keeps childern active and far from phone and video games?',
 'What should I do to be a great geologist?',
 'When do you use "&" instead of "and"?',
 'How do I hack Motorola DCX3400 for free internet?']

In [490]:
len(question1),len(question2),len(is_duplicate)

(404290, 404290, 404290)

In [491]:
is_duplicate[:10]

[0, 0, 0, 0, 0, 1, 0, 1, 0, 0]

In [492]:
chunksize=64000

In [493]:
tok_q1 = Tokenizer.proc_all_mp(partition_by_cores(question1))

In [494]:
tok_q2 = Tokenizer.proc_all_mp(partition_by_cores(question2))

In [495]:
tok_q2[:10]

[['what',
  'is',
  'the',
  'step',
  'by',
  'step',
  'guide',
  'to',
  'invest',
  'in',
  'share',
  'market',
  '?'],
 ['what',
  'would',
  'happen',
  'if',
  'the',
  'indian',
  'government',
  'stole',
  'the',
  'kohinoor',
  '(',
  'koh',
  '-',
  'i',
  '-',
  'noor',
  ')',
  'diamond',
  'back',
  '?'],
 ['how',
  'can',
  'internet',
  'speed',
  'be',
  'increased',
  'by',
  'hacking',
  'through',
  't_up',
  'dns',
  '?'],
 ['find',
  'the',
  'remainder',
  'when',
  '[',
  'math]23^{24',
  '}',
  '[',
  '/',
  'math',
  ']',
  'is',
  'divided',
  'by',
  '24,23',
  '?'],
 ['which', 'fish', 'would', 'survive', 'in', 'salt', 'water', '?'],
 ['i',
  "'m",
  'a',
  'triple',
  'capricorn',
  '(',
  'sun',
  ',',
  'moon',
  'and',
  'ascendant',
  'in',
  'capricorn',
  ')',
  'what',
  'does',
  'this',
  'say',
  'about',
  'me',
  '?'],
 ['what',
  'keeps',
  'childern',
  'active',
  'and',
  'far',
  'from',
  'phone',
  'and',
  'video',
  'games',
  '?'],
 [

In [496]:
ques = tok_q1 + tok_q2

In [497]:
freq = Counter(p for o in ques for p in o)

In [498]:
freq.most_common(25)

[('?', 852054),
 ('the', 377634),
 ('what', 324433),
 ('is', 271122),
 ('i', 223363),
 ('how', 220656),
 ('a', 211277),
 ('to', 205717),
 ('in', 196940),
 ('do', 169773),
 ('of', 159862),
 ('are', 146580),
 ('and', 133925),
 ('can', 114550),
 ('for', 104498),
 (',', 98321),
 ('t_up', 97217),
 ('you', 93102),
 ('why', 84030),
 ('it', 71057),
 ('my', 70930),
 ('best', 70596),
 ('on', 60715),
 ('does', 59502),
 ('.', 49499)]

In [499]:
max_vocab = 60000
min_freq = 2

In [500]:
itos = [o for o,c in freq.most_common(max_vocab) if c>min_freq]
itos.insert(0, '_pad_')
itos.insert(0, '_unk_')

In [501]:
itos[:10]

['_unk_', '_pad_', '?', 'the', 'what', 'is', 'i', 'how', 'a', 'to']

In [502]:
len(itos)

41665

In [503]:
stoi = collections.defaultdict(lambda:0, {v:k for k,v in enumerate(itos)})
list(stoi)[:10]

['_unk_', '_pad_', '?', 'the', 'what', 'is', 'i', 'how', 'a', 'to']

In [504]:
q1 = np.array([[stoi[o] for o in p] for p in tok_q1])
q2 = np.array([[stoi[o] for o in p] for p in tok_q2])

In [505]:
q1.shape,q2.shape

((404290,), (404290,))

In [506]:
str(q1[0])

'[4, 5, 3, 1254, 69, 1254, 2576, 9, 589, 10, 773, 390, 10, 43, 2]'

In [507]:
itos_arr = np.array(itos)
itos_arr[q1[0]]

array(['what', 'is', 'the', 'step', 'by', 'step', 'guide', 'to', 'invest', 'in', 'share', 'market', 'in',
       'india', '?'], dtype='<U65')

In [508]:
# np.save('q1.npy', q1)
# np.save('q2.npy', q2)
# pickle.dump(itos, open('itos.pkl', 'wb'))

In [509]:
q1 = np.load('q1.npy')
q2 = np.load('q2.npy')
itos = pickle.load(open('itos_41k.pkl', 'rb'))
stoi = collections.defaultdict(lambda:0, {v:k for k,v in enumerate(itos)})

In [510]:
vs=len(itos) #vocab size
vs,len(q1)

(41665, 404290)

In [511]:
itos2 = pickle.load((PRE_PATH/'itos_wt103.pkl').open('rb'))
stoi2 = collections.defaultdict(lambda:-1, {v:k for k,v in enumerate(itos2)})

In [512]:
wgts = torch.load(PRE_LM_PATH, map_location=lambda storage, loc: storage)

In [513]:
wgts.keys()

odict_keys(['0.encoder.weight', '0.encoder_with_dropout.embed.weight', '0.rnns.0.module.weight_ih_l0', '0.rnns.0.module.bias_ih_l0', '0.rnns.0.module.bias_hh_l0', '0.rnns.0.module.weight_hh_l0_raw', '0.rnns.1.module.weight_ih_l0', '0.rnns.1.module.bias_ih_l0', '0.rnns.1.module.bias_hh_l0', '0.rnns.1.module.weight_hh_l0_raw', '0.rnns.2.module.weight_ih_l0', '0.rnns.2.module.bias_ih_l0', '0.rnns.2.module.bias_hh_l0', '0.rnns.2.module.weight_hh_l0_raw', '1.decoder.weight'])

In [514]:
enc_wgts = to_np(wgts['0.encoder.weight'])
row_m = enc_wgts.mean(0) 
row_m.shape, row_m[:10]

((400,),
 array([-0.0183 , -0.13826,  0.01438, -0.01285,  0.00407,  0.01944,  0.01149, -0.13282, -0.02295, -0.01722],
       dtype=float32))

In [515]:
# Create embedding matrix and take token weights from wikitext103 if available
# Use 60002 instead of 41665 for future embedding matrix where backbone encoder needs to be loaded.
# not needed for simple model
new_w = np.zeros((len(itos), em_sz), dtype=np.float32)
for i,w in enumerate(itos):
    r = stoi2[w]
    new_w[i] = enc_wgts[r] if r>=0 else row_m

In [516]:
new_w.shape

(41665, 400)

In [517]:
wgts['0.encoder.weight'] = T(new_w)
wgts['0.encoder_with_dropout.embed.weight'] = T(np.copy(new_w))
wgts['1.decoder.weight'] = T(np.copy(new_w))

In [518]:
wgts['1.decoder.weight'].shape

torch.Size([41665, 400])

In [519]:
trn_keep = np.random.rand(len(q1))>0.1

In [520]:
q1_trn = q1[trn_keep]
q2_trn = q2[trn_keep]
lbl_trn = np.asarray(is_duplicate)[trn_keep]

In [521]:
np.asarray([lbl_trn]).T

array([[0],
       [0],
       [0],
       ...,
       [0],
       [0],
       [0]])

In [522]:
(T(np.array([lbl_trn[101]]).T)).float()


 0
[torch.FloatTensor of size 1]

In [523]:
q1_trn.shape

(363796,)

In [524]:
q1_val = q1[~trn_keep]
q2_val = q2[~trn_keep]
lbl_val = np.asarray(is_duplicate)[~trn_keep]

In [525]:
lbl_val = np.asarray([lbl_val]).T
lbl_val.shape

(40494, 1)

In [526]:
lbl_trn = np.asarray([lbl_trn]).T
lbl_trn.shape

(363796, 1)

In [527]:
lbl_val = lbl_val.T
lbl_val.shape

(1, 40494)

In [528]:
lbl_trn = lbl_trn.T
lbl_trn.shape

(1, 363796)

In [529]:
vs,em_sz

(41665, 400)

In [530]:
class PairDataset(Dataset):
    def __init__(self, X, y): self.x1,self.x2,self.y = X[0],X[1],y
    def __getitem__(self, idx): return A(self.x1[idx], self.x2[idx], (T(self.y[idx]).float()))
    def __len__(self): return len(self.x1)

In [531]:
trn_ds = PairDataset(X=[q1_trn[:1000],q2_trn[:1000]],y=(lbl_trn[:1000]).T)
val_ds = PairDataset(X=[q1_val[:100],q2_val[:100]],y=(lbl_val[:100]).T)

In [532]:
trn_ds = PairDataset(X=[q1_trn,q2_trn],y=(lbl_trn).T)
val_ds = PairDataset(X=[q1_val,q2_val],y=(lbl_val).T)

In [533]:
trn_ds.__getitem__(15)

[array([    4,    25, 11486,   113,     2]),
 array([    4,    25, 11486,  1370,     2]),
 array([1.], dtype=float32)]

In [534]:
# Unable to run with larger bs because of DataLoader transpose issue
bs=48
#bs=1

In [535]:
#??DataLoader

In [536]:
trn_dl = DataLoader(trn_ds, bs, transpose=True, transpose_y=True, num_workers=1, 
                    pad_idx=1, pre_pad=False) #, sampler=trn_samp)
val_dl = DataLoader(val_ds, bs, transpose=True, transpose_y=True, num_workers=1, 
                    pad_idx=1, pre_pad=False) #, sampler=val_samp)
md = ModelData(PATH, trn_dl, val_dl)

In [537]:
it = iter(trn_dl)
its = [next(it) for i in range(5)]
[(len(x1),len(x2),len(y)) for x1,x2,y in its]
#[((y)) for x1,x2,y in its]
#next(it)

[(31, 40, 1), (33, 34, 1), (24, 36, 1), (32, 43, 1), (44, 37, 1)]

# Create model - simple

In [538]:
def create_emb(vecs, itos, em_sz):
    emb = nn.Embedding(len(itos), em_sz, padding_idx=1)
    wgts = emb.weight.data
    miss = []
    for i,w in enumerate(itos):
        try: wgts[i] = torch.from_numpy(vecs[i])
        except: miss.append(w)
    print(len(miss),miss[5:10])
    return emb

In [539]:
nh,nl = 256,2

In [747]:
class PairRNN(nn.Module):
    #def __init__(self, vecs_enc, itos_enc, em_sz_enc, vecs_dec, itos_dec, em_sz_dec, nh, out_sl, nl=2):
    def __init__(self, vecs, itos, em_sz, nh, out_sl=75, nl=2, bs=100):
        super().__init__()
        self.nl,self.nh,self.out_sl,self.bs = nl,nh,out_sl,bs
        self.emb = create_emb(vecs, itos, em_sz)
        self.emb_drop = nn.Dropout(0.15)
        self.gru = nn.GRU(em_sz, nh, num_layers=nl, dropout=0.25)
        self.out = nn.Linear(nh, em_sz, bias=False)
        
    def forward(self, inp1, inp2):
        sl,bs = inp1.size()
        h = self.initHidden(bs)
        emb1 = self.emb_drop(self.emb(inp1))
        emb2 = self.emb_drop(self.emb(inp2))
        out_1, h1 = self.gru(emb1, h)
        out_2, h2 = self.gru(emb2, h)
        h1 = self.out(h1)
        h2 = self.out(h2)
        return F.cosine_similarity(h1[1],h2[1],dim=1)#[0].mean(dim=1)
        #return h1[1]
    
    def initHidden(self, bs): return V(torch.zeros(self.nl, bs, self.nh))

In [748]:
opt_fn = partial(optim.Adam, betas=(0.8, 0.99))

In [749]:
rnn = PairRNN(new_w, itos, em_sz, nh, bs=bs)
learn = RNN_Learner(md, SingleModel(to_gpu(rnn)), opt_fn=opt_fn)
learn.crit = nn.L1Loss()
#learn.metrics = [accuracy]
#nn.CosineEmbeddingLoss()

0 []


In [750]:
#learn.lr_find()
#learn.fit(lr, 1, cycle_len=12, use_clr=(20,10))

In [751]:
learn.load('quora2')

In [752]:
learn

PairRNN(
  (emb): Embedding(41665, 400, padding_idx=1)
  (emb_drop): Dropout(p=0.15)
  (gru): GRU(400, 256, num_layers=2, dropout=0.25)
  (out): Linear(in_features=256, out_features=400, bias=False)
)

## Create data for evaluation

In [835]:
x1_inp = ["i like apples",
         "i want to buy some apples",
         "where is your cell phone"]

x2_inp = ["i like apples and oranges",
         "i love all fruits especially apples and oranges",
         "where is the new movie showing?"]

x3_inp = ["let's talk about fruits for a second. Apples are nice. Oranges too. I kinda like them.",
         "i compared the prices of apples and oranges at walmart and kroger stores",
         "oh you wanna talk about apples. sure. i am not sure if i have said this before but i do like them and oranges."]

x4_inp = ["there is no comparison here. you are comparing apples to oranges",
         "i compared the prices of apples and oranges at walmart and kroger stores",
         "i don't see anything common between these two categories."]

x5_inp = ["i would love to own a nice boat and go sailing in the pacific ocean",
         "i'm thinking of getting a fancy boat and set sail into the south pacific",
         "i wish to own a small house and live there without any worries"]

In [836]:
tok1 = Tokenizer().proc_all(x1_inp,'en')
tok2 = Tokenizer().proc_all(x2_inp,'en')
tok3 = Tokenizer().proc_all(x3_inp,'en')
tok4 = Tokenizer().proc_all(x4_inp,'en')
tok5 = Tokenizer().proc_all(x5_inp,'en')

In [837]:
X1 = [[stoi[o1] for o1 in o] for o in tok1]
X2 = [[stoi[o1] for o1 in o] for o in tok2]
X3 = [[stoi[o1] for o1 in o] for o in tok3]
X4 = [[stoi[o1] for o1 in o] for o in tok4]
X5 = [[stoi[o1] for o1 in o] for o in tok5]

In [855]:
#learn = RNN_Learner(md, SingleModel(to_gpu(rnn)), opt_fn=opt_fn)
def predict_similarities(m,sent0,sent1,sent2):
    m.eval()
    cc0 = learn.model((V(T([sent0]).permute(1,0))),(V(T([sent1]).permute(1,0))))
    cc1 = learn.model((V(T([sent1]).permute(1,0))),(V(T([sent2]).permute(1,0))))
    cc2 = learn.model((V(T([sent0]).permute(1,0))),(V(T([sent2]).permute(1,0))))
    return cc0,cc1,cc2

### Round 1:

In [873]:
x1_inp

['i like apples', 'i want to buy some apples', 'where is your cell phone']

In [857]:
sent0 = X1[0]; sent1 = X1[1]; sent2 = X1[2] #round 1

In [858]:
print (predict_similarities(learn.model,sent0,sent1,sent2))

(Variable containing:
 0.6741
[torch.FloatTensor of size 1]
, Variable containing:
 0.4813
[torch.FloatTensor of size 1]
, Variable containing:
 0.5396
[torch.FloatTensor of size 1]
)


In [713]:
# cos_sim results from previous model
# 1: (0.9999998807907104, 0.06893263012170792, 0.06893263757228851)

### Round 2:

In [874]:
x2_inp

['i like apples and oranges',
 'i love all fruits especially apples and oranges',
 'where is the new movie showing?']

In [859]:
sent0 = X2[0]; sent1 = X2[1]; sent2 = X2[2] #round 2

In [860]:
print (predict_similarities(learn.model,sent0,sent1,sent2))

(Variable containing:
 0.9236
[torch.FloatTensor of size 1]
, Variable containing:
 0.3989
[torch.FloatTensor of size 1]
, Variable containing:
 0.3701
[torch.FloatTensor of size 1]
)


In [861]:
# cos_sim results from previous model
# 2: (0.8213068842887878, 0.22487039864063263, 0.2921583652496338)

### Round 3:

In [875]:
x3_inp

["let's talk about fruits for a second. Apples are nice. Oranges too. I kinda like them.",
 'i compared the prices of apples and oranges at walmart and kroger stores',
 'oh you wanna talk about apples. sure. i am not sure if i have said this before but i do like them and oranges.']

In [862]:
sent0 = X3[0]; sent1 = X3[1]; sent2 = X3[2] #round 3

In [863]:
print (predict_similarities(learn.model,sent0,sent1,sent2))

(Variable containing:
 0.4416
[torch.FloatTensor of size 1]
, Variable containing:
 0.7028
[torch.FloatTensor of size 1]
, Variable containing:
 0.8711
[torch.FloatTensor of size 1]
)


In [864]:
# cos_sim results from previous model
# 3: (0.043320432305336, 0.043320432305336, 1.0000001192092896)

### Round 4:

In [876]:
x4_inp

['there is no comparison here. you are comparing apples to oranges',
 'i compared the prices of apples and oranges at walmart and kroger stores',
 "i don't see anything common between these two categories."]

In [865]:
sent0 = X4[0]; sent1 = X4[1]; sent2 = X4[2] #round 4

In [866]:
print (predict_similarities(learn.model,sent0,sent1,sent2))

(Variable containing:
 0.4978
[torch.FloatTensor of size 1]
, Variable containing:
 0.4850
[torch.FloatTensor of size 1]
, Variable containing:
 0.6487
[torch.FloatTensor of size 1]
)


In [867]:
# cos_sim results from previous model
# 4: (0.0017274579731747508, 0.043320432305336, 0.02612287364900112)

### Round 5:

In [877]:
x5_inp

['i would love to own a nice boat and go sailing in the pacific ocean',
 "i'm thinking of getting a fancy boat and set sail into the south pacific",
 'i wish to own a small house and live there without any worries']

In [868]:
sent0 = X5[0]; sent1 = X5[1]; sent2 = X5[2] #round 5

In [869]:
print (predict_similarities(learn.model,sent0,sent1,sent2))

(Variable containing:
 0.8350
[torch.FloatTensor of size 1]
, Variable containing:
 0.5553
[torch.FloatTensor of size 1]
, Variable containing:
 0.6052
[torch.FloatTensor of size 1]
)


In [870]:
# cos_sim results from previous model
# 5: (0.5177718997001648, 0.14863775670528412, 0.046706970781087875)

In [871]:
#learn.load('quora2')

#### Since sim(A,B) = sim (B,A) we can double the dataset.
TODO: Train model with twice the data after commutating.