## Semantic Similarity Evaluation - using pre-trained weights from the LM/Classifier

In [1]:
from fastai.text import *
import html

In [4]:
PATH=Path('data/aclImdb/')
CLAS_PATH=Path('data/imdb_clas/')
LM_PATH=Path('data/imdb_lm/')

In [5]:
PRE_PATH = PATH/'models'/'wt103'
PRE_LM_PATH = PRE_PATH/'fwd_wt103.h5'

In [6]:
wgts = torch.load(PRE_LM_PATH, map_location=lambda storage, loc: storage)

In [7]:
em_sz=400

# Quora dataset

We need a standard dataset specifically meant for the semantic similarity task.
Let's use the quora kaggle dataset that contains pairs of english sentences and the goal is to predict if a given pair of sentences are semantically similar(meaning).

y=1 indicates they have the same meaning  
y=0 means the pair differ in meaning

In [109]:
QUESTION_PAIRS_FILE = 'data/quora_duplicate_questions.tsv'
#print("Processing", QUESTION_PAIRS_FILE)

question1 = []
question2 = []
is_duplicate = []
with open(QUESTION_PAIRS_FILE, encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile, delimiter='\t')
    for row in reader:
        question1.append(row['question1'])
        question2.append(row['question2'])
        is_duplicate.append(int(row['is_duplicate']))

print('Question pairs: %d' % len(question1))

Question pairs: 404290


In [110]:
question1[:10]

['What is the step by step guide to invest in share market in india?',
 'What is the story of Kohinoor (Koh-i-Noor) Diamond?',
 'How can I increase the speed of my internet connection while using a VPN?',
 'Why am I mentally very lonely? How can I solve it?',
 'Which one dissolve in water quikly sugar, salt, methane and carbon di oxide?',
 'Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me?',
 'Should I buy tiago?',
 'How can I be a good geologist?',
 'When do you use シ instead of し?',
 'Motorola (company): Can I hack my Charter Motorolla DCX3400?']

In [111]:
question2[:10]

['What is the step by step guide to invest in share market?',
 'What would happen if the Indian government stole the Kohinoor (Koh-i-Noor) diamond back?',
 'How can Internet speed be increased by hacking through DNS?',
 'Find the remainder when [math]23^{24}[/math] is divided by 24,23?',
 'Which fish would survive in salt water?',
 "I'm a triple Capricorn (Sun, Moon and ascendant in Capricorn) What does this say about me?",
 'What keeps childern active and far from phone and video games?',
 'What should I do to be a great geologist?',
 'When do you use "&" instead of "and"?',
 'How do I hack Motorola DCX3400 for free internet?']

In [112]:
len(question1),len(question2),len(is_duplicate)

(404290, 404290, 404290)

In [113]:
is_duplicate[:10]

[0, 0, 0, 0, 0, 1, 0, 1, 0, 0]

#### Since sim(A,B) = sim (B,A) we can double the dataset.

In [114]:
ques = question1 + question2
question2 = question2 + question1
question1 = ques
is_duplicate = is_duplicate + is_duplicate

In [115]:
chunksize=80000

In [116]:
tok_q1 = Tokenizer.proc_all_mp(partition_by_cores(question1))

In [117]:
tok_q2 = Tokenizer.proc_all_mp(partition_by_cores(question2))

In [118]:
tok_q2[:10]

[['what',
  'is',
  'the',
  'step',
  'by',
  'step',
  'guide',
  'to',
  'invest',
  'in',
  'share',
  'market',
  '?'],
 ['what',
  'would',
  'happen',
  'if',
  'the',
  'indian',
  'government',
  'stole',
  'the',
  'kohinoor',
  '(',
  'koh',
  '-',
  'i',
  '-',
  'noor',
  ')',
  'diamond',
  'back',
  '?'],
 ['how',
  'can',
  'internet',
  'speed',
  'be',
  'increased',
  'by',
  'hacking',
  'through',
  't_up',
  'dns',
  '?'],
 ['find',
  'the',
  'remainder',
  'when',
  '[',
  'math]23^{24',
  '}',
  '[',
  '/',
  'math',
  ']',
  'is',
  'divided',
  'by',
  '24,23',
  '?'],
 ['which', 'fish', 'would', 'survive', 'in', 'salt', 'water', '?'],
 ['i',
  "'m",
  'a',
  'triple',
  'capricorn',
  '(',
  'sun',
  ',',
  'moon',
  'and',
  'ascendant',
  'in',
  'capricorn',
  ')',
  'what',
  'does',
  'this',
  'say',
  'about',
  'me',
  '?'],
 ['what',
  'keeps',
  'childern',
  'active',
  'and',
  'far',
  'from',
  'phone',
  'and',
  'video',
  'games',
  '?'],
 [

In [123]:
ques = sorted(set(map(tuple, tok_q1+tok_q2)), reverse=True)
len(ques)

537054

In [124]:
freq = Counter(p for o in ques for p in o)

In [125]:
freq.most_common(25)

[('?', 567912),
 ('the', 251908),
 ('what', 214480),
 ('is', 186191),
 ('a', 154911),
 ('i', 149562),
 ('to', 141677),
 ('in', 139553),
 ('how', 135432),
 ('of', 111924),
 ('do', 110223),
 ('are', 98555),
 ('and', 89368),
 ('for', 74763),
 ('t_up', 73392),
 (',', 72193),
 ('can', 71248),
 ('you', 60716),
 ('why', 56154),
 ('it', 52359),
 ('my', 45133),
 ('does', 41961),
 ('best', 40964),
 ('.', 39769),
 ('on', 37977)]

In [126]:
max_vocab = 60000
min_freq = 2

In [127]:
itos = [o for o,c in freq.most_common(max_vocab) if c>min_freq]
itos.insert(0, '_pad_')
itos.insert(0, '_unk_')

In [128]:
itos[:10]

['_unk_', '_pad_', '?', 'the', 'what', 'is', 'a', 'i', 'to', 'in']

In [129]:
len(itos)

37597

In [130]:
stoi = collections.defaultdict(lambda:0, {v:k for k,v in enumerate(itos)})
list(stoi)[:10]

['_unk_', '_pad_', '?', 'the', 'what', 'is', 'a', 'i', 'to', 'in']

In [131]:
q1 = np.array([[stoi[o] for o in p] for p in tok_q1])
q2 = np.array([[stoi[o] for o in p] for p in tok_q2])

In [132]:
q1.shape,q2.shape

((808580,), (808580,))

In [133]:
str(q1[0])

'[4, 5, 3, 1312, 69, 1312, 2275, 8, 552, 9, 684, 347, 9, 47, 2]'

In [134]:
itos_arr = np.array(itos)
itos_arr[q1[0]]

array(['what', 'is', 'the', 'step', 'by', 'step', 'guide', 'to', 'invest', 'in', 'share', 'market', 'in',
       'india', '?'], dtype='<U33')

In [135]:
np.save('q1_dbl.npy', q1)
np.save('q2_dbl.npy', q2)
pickle.dump(itos, open('itos.pkl', 'wb'))

In [136]:
q1 = np.load('q1_dbl.npy')
q2 = np.load('q2_dbl.npy')
itos = pickle.load(open('itos.pkl', 'rb'))
stoi = collections.defaultdict(lambda:0, {v:k for k,v in enumerate(itos)})

In [137]:
vs=len(itos) #vocab size
vs,len(q1)

(37597, 808580)

In [138]:
itos2 = pickle.load((PRE_PATH/'itos_wt103.pkl').open('rb'))
stoi2 = collections.defaultdict(lambda:-1, {v:k for k,v in enumerate(itos2)})

In [139]:
wgts = torch.load(PRE_LM_PATH, map_location=lambda storage, loc: storage)

In [140]:
wgts.keys()

odict_keys(['0.encoder.weight', '0.encoder_with_dropout.embed.weight', '0.rnns.0.module.weight_ih_l0', '0.rnns.0.module.bias_ih_l0', '0.rnns.0.module.bias_hh_l0', '0.rnns.0.module.weight_hh_l0_raw', '0.rnns.1.module.weight_ih_l0', '0.rnns.1.module.bias_ih_l0', '0.rnns.1.module.bias_hh_l0', '0.rnns.1.module.weight_hh_l0_raw', '0.rnns.2.module.weight_ih_l0', '0.rnns.2.module.bias_ih_l0', '0.rnns.2.module.bias_hh_l0', '0.rnns.2.module.weight_hh_l0_raw', '1.decoder.weight'])

In [141]:
enc_wgts = to_np(wgts['0.encoder.weight'])
row_m = enc_wgts.mean(0) 
row_m.shape, row_m[:10]

((400,),
 array([-0.0183 , -0.13826,  0.01438, -0.01285,  0.00407,  0.01944,  0.01149, -0.13282, -0.02295, -0.01722],
       dtype=float32))

In [142]:
# Create embedding matrix and take token weights from wikitext103 if available
# Use 60002 instead of 41665 for future embedding matrix where backbone encoder needs to be loaded.
# not needed for simple model
new_w = np.zeros((len(itos), em_sz), dtype=np.float32)
for i,w in enumerate(itos):
    r = stoi2[w]
    new_w[i] = enc_wgts[r] if r>=0 else row_m

In [143]:
new_w.shape

(37597, 400)

In [144]:
wgts['0.encoder.weight'] = T(new_w)
wgts['0.encoder_with_dropout.embed.weight'] = T(np.copy(new_w))
wgts['1.decoder.weight'] = T(np.copy(new_w))

In [145]:
wgts['1.decoder.weight'].shape

torch.Size([37597, 400])

In [146]:
trn_keep = np.random.rand(len(q1))>0.1

In [147]:
q1_trn = q1[trn_keep]
q2_trn = q2[trn_keep]
lbl_trn = np.asarray(is_duplicate)[trn_keep]

In [148]:
np.asarray([lbl_trn]).T

array([[0],
       [0],
       [0],
       ...,
       [0],
       [0],
       [0]])

In [149]:
(T(np.array([lbl_trn[101]]).T)).float()


 0
[torch.FloatTensor of size 1]

In [150]:
q1_trn.shape

(727447,)

In [151]:
q1_val = q1[~trn_keep]
q2_val = q2[~trn_keep]
lbl_val = np.asarray(is_duplicate)[~trn_keep]

In [152]:
lbl_val = np.asarray([lbl_val]).T
lbl_val.shape

(81133, 1)

In [153]:
lbl_trn = np.asarray([lbl_trn]).T
lbl_trn.shape

(727447, 1)

In [154]:
lbl_val = lbl_val.T
lbl_val.shape

(1, 81133)

In [155]:
lbl_trn = lbl_trn.T
lbl_trn.shape

(1, 727447)

In [156]:
vs,em_sz

(37597, 400)

In [157]:
class PairDataset(Dataset):
    def __init__(self, X, y): self.x1,self.x2,self.y = X[0],X[1],y
    def __getitem__(self, idx): return A(self.x1[idx], self.x2[idx], (T(self.y[idx]).float()))
    def __len__(self): return len(self.x1)

In [158]:
trn_ds = PairDataset(X=[q1_trn[:1000],q2_trn[:1000]],y=(lbl_trn[:1000]).T)
val_ds = PairDataset(X=[q1_val[:100],q2_val[:100]],y=(lbl_val[:100]).T)

In [159]:
trn_ds = PairDataset(X=[q1_trn,q2_trn],y=(lbl_trn).T)
val_ds = PairDataset(X=[q1_val,q2_val],y=(lbl_val).T)

In [160]:
trn_ds.__getitem__(15)

[array([  20,   12,  282,   92,    8,   30,  259,   31,    3,  264,   72, 4428,    2]),
 array([   10,    12,   598,   118,    76, 11955,     6,   142,     2]),
 array([0.], dtype=float32)]

In [161]:
# Unable to run with larger bs because of DataLoader transpose issue
bs=300
#bs=1

In [162]:
#??DataLoader

In [163]:
trn_dl = DataLoader(trn_ds, bs, transpose=True, transpose_y=True, num_workers=1, 
                    pad_idx=1, pre_pad=False) #, sampler=trn_samp)
val_dl = DataLoader(val_ds, bs, transpose=True, transpose_y=True, num_workers=1, 
                    pad_idx=1, pre_pad=False) #, sampler=val_samp)
md = ModelData(PATH, trn_dl, val_dl)

In [164]:
it = iter(trn_dl)
its = [next(it) for i in range(5)]
[(len(x1),len(x2),len(y)) for x1,x2,y in its]
#[((y)) for x1,x2,y in its]
#next(it)

[(51, 43, 300), (37, 46, 300), (34, 48, 300), (43, 60, 300), (76, 41, 300)]

# Create model - simple

In [165]:
def create_emb(vecs, itos, em_sz):
    emb = nn.Embedding(len(itos), em_sz, padding_idx=1)
    wgts = emb.weight.data
    miss = []
    for i,w in enumerate(itos):
        try: wgts[i] = torch.from_numpy(vecs[i])
        except: miss.append(w)
    print(len(miss),miss[5:10])
    return emb

In [166]:
nh,nl = 256,2

Let's add another layer of GRUs (not bi directional) and see if we can provide alternate pathways (_ala resnet_) for the gradients to flow.

In [167]:
class PairRNN(nn.Module):
    #def __init__(self, vecs_enc, itos_enc, em_sz_enc, vecs_dec, itos_dec, em_sz_dec, nh, out_sl, nl=2):
    def __init__(self, vecs, itos, em_sz, nh, out_sl=75, nl=2, bs=100):
        super().__init__()
        self.nl,self.nh,self.out_sl,self.bs = nl,nh,out_sl,bs
        self.emb = create_emb(vecs, itos, em_sz)
        self.emb_drop = nn.Dropout(0.15)
        self.gru1 = nn.GRU(em_sz, nh, num_layers=nl, dropout=0.25)
        self.gru2 = nn.GRU(em_sz, nh, num_layers=nl, dropout=0.25)
        self.out = nn.Linear(2*nh, em_sz, bias=False)
        
    def forward(self, inp1, inp2):
        sl,bs = inp1.size()
        h = self.initHidden(bs)

        emb1 = self.emb_drop(self.emb(inp1))
        emb2 = self.emb_drop(self.emb(inp2))
        
        out_1, h1 = self.gru1(emb1, h) #Gru layer 1
        out_2, h2 = self.gru1(emb2, h) #Gru layer 1
        out_3, h3 = self.gru2(emb1, h) #Gru layer 2
        out_4, h4 = self.gru2(emb2, h) #Gru layer 2
        
        h5 = self.out(torch.cat((h1[1],h3[1]),1))
        h6 = self.out(torch.cat((h2[1],h4[1]),1))
        
        return F.cosine_similarity(h5,h6)
    
    def initHidden(self, bs): return V(torch.zeros(self.nl, bs, self.nh))

In [168]:
opt_fn = partial(optim.Adam, betas=(0.8, 0.99))

#### Note: We have opted to use the L1 loss to optimize. This means we are trying to get our cosine sim to predict values very close to 0 or 1 or penalize if it drifts away too much.

The quora kaggle competition uses a BCE loss(negative log likelihood) to evaluate

In [169]:
rnn = PairRNN(new_w, itos, em_sz, nh, bs=bs)
learn = RNN_Learner(md, SingleModel(to_gpu(rnn)), opt_fn=opt_fn)
learn.crit = nn.L1Loss()
#nn.CosineEmbeddingLoss()

0 []


In [262]:
learn.model

PairRNN(
  (emb): Embedding(37597, 400, padding_idx=1)
  (emb_drop): Dropout(p=0.15)
  (gru1): GRU(400, 256, num_layers=2, dropout=0.25)
  (gru2): GRU(400, 256, num_layers=2, dropout=0.25)
  (out): Linear(in_features=512, out_features=400, bias=False)
)

In [170]:
learn.unfreeze()
learn[0].trainable = False
learn[0].trainable

False

In [171]:
lr = 1e-3
learn.fit(lr, 1, cycle_len=5, use_clr=(32,10))

HBox(children=(IntProgress(value=0, description='Epoch', max=5), HTML(value='')))

epoch      trn_loss   val_loss                                     
    0      0.286228   0.410427  
    1      0.257659   0.301446                                     
    2      0.253119   0.331992                                     
    3      0.221312   0.282643                                     
    4      0.212439   0.264009                                     



[array([0.26401])]

In [282]:
learn.save('quora_db1')

In [261]:
lr = 1e-3
learn.fit(lr, 1, cycle_len=5, use_clr=(32,10))

HBox(children=(IntProgress(value=0, description='Epoch', max=5), HTML(value='')))

epoch      trn_loss   val_loss                                     
    0      0.228651   0.266893  
    1      0.210128   0.234721                                     
    2      0.196772   0.230007                                     
    3      0.18365    0.219447                                     
    4      0.17546    0.220643                                     



[array([0.22064])]

In [283]:
learn.save('quora_db2')

In [284]:
learn.unfreeze()
learn[0].trainable

True

In [285]:
lr = 1e-3
learn.fit(lr, 1, cycle_len=5, use_clr=(32,10))

HBox(children=(IntProgress(value=0, description='Epoch', max=5), HTML(value='')))

epoch      trn_loss   val_loss                                     
    0      0.19798    0.226221  
    1      0.190937   0.222594                                     
    2      0.181158   0.213164                                     
    3      0.172567   0.207104                                     
    4      0.16399    0.202823                                     



[array([0.20282])]

In [286]:
learn.save('quora_db3')

In [317]:
val = learn.predict()

In [318]:
len(val),len(lbl_val[0])

(81133, 81133)

In [352]:
print (np.sum(  (np.asarray(val,dtype=float) >= 0.7611) == (lbl_val[0])  ) / len(val) * 100, "% accuracy")

82.38694489295355 % accuracy


In [353]:
lr = 1e-3
learn.fit(lr, 1, cycle_len=5, use_clr=(32,10))

HBox(children=(IntProgress(value=0, description='Epoch', max=5), HTML(value='')))

epoch      trn_loss   val_loss                                     
    0      0.190384   0.216909  
    1      0.182725   0.217011                                     
    2      0.176167   0.204406                                     
    3      0.168004   0.197842                                     
    4      0.161245   0.191268                                     



[array([0.19127])]

## Check log loss

We used the L1loss function to optimize the learner. This gave us good semantic similarity scores in general. Let's now evaluate if this translates to better semantic similatity specifically for the quora dataset. The quora kaggle challenge uses negative log likelihood to evaluate.

In [377]:
val = learn.predict() # get evaluation from our L1loss trained model

In [378]:
len(val),len(lbl_val[0])

(81133, 81133)

In [379]:
# calculate negative log likelihood (BCE LOSS) from predictions
def binary_loss(y, p):
    return np.mean(-(y * np.log(p) + (1-y)*np.log(1-p)))

In [374]:
binary_loss(lbl_val[0],np.clip(val,0.06,0.907))

0.49602077407368567

We get 0.496 which is quite poor for the quora set. The competition leaderboard shows 0.1 and 0.2 log loss scores. Please see this [github page](https://github.com/lemuriandezapada/quora_test) for a much better solution.

This probably means that we have captured the essence of semantic similarity using L1 loss + cosine similarity - but we have failed to tune it to the quora dataset specifically.

### Let's try to switch from L1 loss to neg log loss and tune some more

In [363]:
learn.save('quora_db4')

In [367]:
def binary_loss_crit(y, p):
    p = p.clamp(min=0.06,max=0.907)
    return torch.mean(-(y * torch.log(p) + (1-y)*torch.log(1-p)))

In [375]:
learn.crit=binary_loss_crit

In [376]:
lr = 1e-3
learn.fit(lr, 1, cycle_len=1, use_clr=(32,10))

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss                                     
    0      0.864667   0.953047  



[array([0.95305])]

In [390]:
learn.save('quora_db5')

Looks like it has a long way to go as 0.953 is a terrible score. Let's abandon this approach for now.

## Create data for evaluation

In [436]:
x1_inp = ["i like apples",
         "i want to buy some apples",
         "where is your cell phone"]

x2_inp = ["i like apples and oranges",
         "i love all fruits especially apples and oranges",
         "where is the new movie showing?"]

x3_inp = ["let's talk about fruits for a second. Apples are nice. Oranges too. I kinda like them.",
         "i compared the prices of apples and oranges at walmart and kroger stores",
         "oh you wanna talk about apples. sure. i am not sure if i have said this before but i do like them and oranges."]

x4_inp = ["there is no comparison here. you are comparing apples to oranges",
         "i compared the prices of apples and oranges at walmart and kroger stores",
         "i don't see anything common between these two categories."]

x5_inp = ["i would love to own a nice boat and go sailing in the pacific ocean",
         "i'm thinking of getting a fancy boat and set sail into the south pacific",
         "i wish to own a small house and live there without any worries"]

In [437]:
tok1 = Tokenizer().proc_all(x1_inp,'en')
tok2 = Tokenizer().proc_all(x2_inp,'en')
tok3 = Tokenizer().proc_all(x3_inp,'en')
tok4 = Tokenizer().proc_all(x4_inp,'en')
tok5 = Tokenizer().proc_all(x5_inp,'en')

In [438]:
X1 = [[stoi[o1] for o1 in o] for o in tok1]
X2 = [[stoi[o1] for o1 in o] for o in tok2]
X3 = [[stoi[o1] for o1 in o] for o in tok3]
X4 = [[stoi[o1] for o1 in o] for o in tok4]
X5 = [[stoi[o1] for o1 in o] for o in tok5]

In [439]:
#learn = RNN_Learner(md, SingleModel(to_gpu(rnn)), opt_fn=opt_fn)
def predict_similarities(m,sent0,sent1,sent2):
    m.eval()
    cc0 = m((V(T([sent0]).permute(1,0))),(V(T([sent1]).permute(1,0))))
    cc1 = m((V(T([sent1]).permute(1,0))),(V(T([sent2]).permute(1,0))))
    cc2 = m((V(T([sent0]).permute(1,0))),(V(T([sent2]).permute(1,0))))
    return cc0.data[0],cc1.data[0],cc2.data[0]

In [440]:
learn.load('quora_db2')

### Round 1:

In [441]:
x1_inp

['i like apples', 'i want to buy some apples', 'where is your cell phone']

In [442]:
sent0 = X1[0]; sent1 = X1[1]; sent2 = X1[2] #round 1

In [443]:
print (predict_similarities(learn.model,sent0,sent1,sent2))

(0.8084962964057922, 0.5435537695884705, 0.16693493723869324)


In [444]:
# cos_sim results from previous model
# 1: (0.9999998807907104, 0.06893263012170792, 0.06893263757228851)

### Round 2:

In [445]:
x2_inp

['i like apples and oranges',
 'i love all fruits especially apples and oranges',
 'where is the new movie showing?']

In [446]:
sent0 = X2[0]; sent1 = X2[1]; sent2 = X2[2] #round 2

In [447]:
print (predict_similarities(learn.model,sent0,sent1,sent2))

(0.2680668234825134, 0.11690233647823334, -0.020731007680296898)


In [448]:
# cos_sim results from previous model
# 2: (0.8213068842887878, 0.22487039864063263, 0.2921583652496338)

### Round 3:

In [449]:
x3_inp

["let's talk about fruits for a second. Apples are nice. Oranges too. I kinda like them.",
 'i compared the prices of apples and oranges at walmart and kroger stores',
 'oh you wanna talk about apples. sure. i am not sure if i have said this before but i do like them and oranges.']

In [450]:
sent0 = X3[0]; sent1 = X3[1]; sent2 = X3[2] #round 3

In [451]:
print (predict_similarities(learn.model,sent0,sent1,sent2))

(0.007769078016281128, 0.2540293335914612, -0.027798207476735115)


In [452]:
# cos_sim results from previous model
# 3: (0.043320432305336, 0.043320432305336, 1.0000001192092896)

### Round 4:

In [453]:
x4_inp

['there is no comparison here. you are comparing apples to oranges',
 'i compared the prices of apples and oranges at walmart and kroger stores',
 "i don't see anything common between these two categories."]

In [454]:
sent0 = X4[0]; sent1 = X4[1]; sent2 = X4[2] #round 4

In [455]:
print (predict_similarities(learn.model,sent0,sent1,sent2))

(0.7291907072067261, 0.02016669511795044, 0.009204450994729996)


In [456]:
# cos_sim results from previous model
# 4: (0.0017274579731747508, 0.043320432305336, 0.02612287364900112)

### Round 5:

In [457]:
x5_inp

['i would love to own a nice boat and go sailing in the pacific ocean',
 "i'm thinking of getting a fancy boat and set sail into the south pacific",
 'i wish to own a small house and live there without any worries']

In [458]:
sent0 = X5[0]; sent1 = X5[1]; sent2 = X5[2] #round 5

In [459]:
print (predict_similarities(learn.model,sent0,sent1,sent2))

(0.6918005347251892, 0.2942836582660675, 0.17552269995212555)


In [460]:
# cos_sim results from previous model
# 5: (0.5177718997001648, 0.14863775670528412, 0.046706970781087875)