# **LAB-2: Scalable Machine Learning and Deep Learning**

## **Paolo Teta & Ralfs Zangis**
---
**TASK:** Implement **S-BERT** model

**Outline:**
- Load the dataset
- Regression
- Classification
- Evaluation with STS benchmark dataset (cosine similarity and Spearmean correlation)
- Semantic search
---


## **Requirements**

In [None]:
"""
!pip install sentence_transformers
!pip install transformers
!pip install tokenizers
!pip install wget

import os
import re
import csv
import wget
import json
import math
import scipy
import torch
import string
import sklearn

import numpy as np
import pandas as pd
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import Input

from sentence_transformers import SentenceTransformer
from sentence_transformers import LoggingHandler
from sentence_transformers import models, losses, util
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import InputExample

from transformers import BertTokenizer, TFBertModel, BertConfig
# from transformers import DistilBertTokenizer, DistilBertModel # smaller model

from tokenizers import BertWordPieceTokenizer

from torch.utils.data import DataLoader

from datetime import datetime
"""

**Mount Google Drive to load saved models**

In [19]:
# from google.colab import drive
# drive.mount('/content/drive')

## **REGRESSION**

Pre-trained model "*bert-base-uncased*" and word embedding model

In [None]:
model_name = 'bert-base-uncased'
word_embedding_model = models.Transformer(model_name)

In [5]:
## BERT -> original model
# model = TFBertModel.from_pretrained('bert-base-uncased')
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

## DistilBERT -> smaller model
# model = DistilBertModel.from_pretrained('distilbert-base-uncased')
# tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

Loading the datasets

In [6]:
columns = ['title', 'type', 'year', 'id', 'score', 'sentence_1', 'sentence_2']

In [8]:
train_path = '/content/sts-train.csv'
os.path.isfile(train_path)

train_samples = []

with open(train_path, newline='') as train:
    df_train = csv.DictReader(train, delimiter='\t', fieldnames=columns, quoting=csv.QUOTE_NONE)
    for row in df_train:
        score = float(row['score']) / 2.5 - 1 # range -1 ... 1
        input_example = InputExample(texts=[row['sentence_1'], row['sentence_2']], label=score)
        train_samples.append(input_example)
        # print(input_example)

In [9]:
test_path = '/content/sts-test.csv'
os.path.isfile(test_path)

test_samples = []

with open(test_path, newline='') as test:
    df_test = csv.DictReader(test, delimiter='\t', fieldnames=columns, quoting=csv.QUOTE_NONE)
    for row in df_test:
        score = float(row['score']) / 2.5 - 1 # range -1 ... 1
        input_example = InputExample(texts=[row['sentence_1'], row['sentence_2']], label=score)
        test_samples.append(input_example)
        # print(input_example)

In [10]:
dev_path = '/content/sts-dev.csv'
os.path.isfile(dev_path)

dev_samples = []

with open(dev_path, newline='') as dev:
    df_dev = csv.DictReader(dev, delimiter='\t', fieldnames=columns, quoting=csv.QUOTE_NONE)
    for row in df_dev:
        score = float(row['score']) / 2.5 - 1 # range -1 ... 1
        input_example = InputExample(texts=[row['sentence_1'], row['sentence_2']], label=score)
        dev_samples.append(input_example)
        # print(input_example)

Considering the given paper "*Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks*"

In [11]:
train_batch_size = 16
# train_batch_size = 32 # try to speed up the training

learn_rate = 2e-5
num_epochs = 1

Mean-pooling strategy

In [12]:
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)

Define the model

In [13]:
# custom model using mean pooling of the word embeddings given as input
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

Load the training set and define the loss function as the cosine similarity

In [14]:
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)

Define the evaluator for the sentence embeddings

In [15]:
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name='sts-dev')

10% of train dataset for warm-up

In [16]:
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)

**Training**

In [17]:
save_path = './training_sts_reg_'+model_name+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

In [18]:
model.fit(train_objectives=[(train_dataloader, train_loss)],
            optimizer_class=torch.optim.Adam,
            optimizer_params={'lr': learn_rate},
            evaluator=evaluator,
            epochs=num_epochs,
            evaluation_steps=1000,
            warmup_steps=warmup_steps,
            output_path=save_path)

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/360 [00:00<?, ?it/s]

**Evaluation on STS benchmark dataset**

Mathematical relationship: *cosine_similarity = 1 - cosine_distance*

In [None]:
print('Loading the stored model ...')
model = SentenceTransformer(save_path)

In [32]:
test_eval = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name='sts-test')
c_s = test_eval(model, output_path=save_path)
print('Cosine similarity with the sentence_transformers library = ', c_s)

Cosine similarity with the sentence_transformers library =  0.5415700812711168


Usually the result is between 0.7 and 0.8

Embedding sentences

In [None]:
df_test = pd.read_csv(test_path, sep='\t', header=None, error_bad_lines=False, quoting=csv.QUOTE_NONE)
df_test.columns = columns

In [27]:
embed_1 = model.encode(df_test['sentence_1'], convert_to_numpy=True, batch_size=train_batch_size)
embed_2 = model.encode(df_test['sentence_2'], convert_to_numpy=True, batch_size=train_batch_size)

Compute the cosine similarity

In [33]:
cos_sim = 1 - sklearn.metrics.pairwise.paired_cosine_distances(embed_1, embed_2)
print('Cosine similarity = ', cos_sim)

Cosine similarity =  [0.24992722 0.9093471  0.65500164 ... 0.75315464 0.8264359  0.93877596]


Spearmean correlation coefficient

In [37]:
spr_corr = scipy.stats.spearmanr(cos_sim, df_test['score'])
print('Spearmean correlation coefficient = ', spr_corr[0])

Spearmean correlation coefficient =  0.5566412903311987


**Comment:** the two results match each other

---

## **CLASSIFICATION**

In [None]:
print('***** Downloading dataset ...')

# The URL for the dataset zip file.
url = 'https://nlp.stanford.edu/projects/snli/snli_1.0.zip'

# Download the file (if we haven't already)
if not os.path.exists('./snli_1.0.zip'):
    wget.download(url, './snli_1.0.zip')

***** Downloading dataset ...


In [None]:
!unzip snli_1.0.zip

Archive:  snli_1.0.zip
   creating: snli_1.0/
  inflating: snli_1.0/.DS_Store      
   creating: __MACOSX/
   creating: __MACOSX/snli_1.0/
  inflating: __MACOSX/snli_1.0/._.DS_Store  
 extracting: snli_1.0/Icon           
  inflating: __MACOSX/snli_1.0/._Icon  
  inflating: snli_1.0/README.txt     
  inflating: __MACOSX/snli_1.0/._README.txt  
  inflating: snli_1.0/snli_1.0_dev.jsonl  
  inflating: snli_1.0/snli_1.0_dev.txt  
  inflating: snli_1.0/snli_1.0_test.jsonl  
  inflating: snli_1.0/snli_1.0_test.txt  
  inflating: snli_1.0/snli_1.0_train.jsonl  
  inflating: snli_1.0/snli_1.0_train.txt  
  inflating: __MACOSX/._snli_1.0     


In [None]:
# import pandas as pd
# df_train = pd.read_json(r'/content/snli_1.0/snli_1.0_train.jsonl', lines=True)
# df_train.to_csv(r'/content/snli_1.0_train.csv', index = None)

# df_test = pd.read_json(r'/content/snli_1.0/snli_1.0_test.jsonl', lines=True)
# df_test.to_csv(r'/content/snli_1.0_test.csv', index = None)

# df_dev = pd.read_json(r'/content/snli_1.0/snli_1.0_dev.jsonl', lines=True)
# df_dev.to_csv(r'/content/snli_1.0_dev.csv', index = None)

In [None]:
label2int = {"contradiction": 0,
             "entailment": 1,
             "neutral": 2}

In [None]:
train_path = '/content/snli_1.0_train.jsonl'
train_samples=[]

df_class = pd.read_json("snli_1.0/snli_1.0_train.jsonl", lines=True)
df_class

# index = df_class[df_class['gold_label'] == '-'].index
# df_class.drop(index, inplace=True)

Unnamed: 0,annotator_labels,captionID,gold_label,pairID,sentence1,sentence1_binary_parse,sentence1_parse,sentence2,sentence2_binary_parse,sentence2_parse
0,[neutral],3416050480.jpg#4,neutral,3416050480.jpg#4r1n,A person on a horse jumps over a broken down a...,( ( ( A person ) ( on ( a horse ) ) ) ( ( jump...,(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN o...,A person is training his horse for a competition.,( ( A person ) ( ( is ( ( training ( his horse...,(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) ...
1,[contradiction],3416050480.jpg#4,contradiction,3416050480.jpg#4r1c,A person on a horse jumps over a broken down a...,( ( ( A person ) ( on ( a horse ) ) ) ( ( jump...,(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN o...,"A person is at a diner, ordering an omelette.",( ( A person ) ( ( ( ( is ( at ( a diner ) ) )...,(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) ...
2,[entailment],3416050480.jpg#4,entailment,3416050480.jpg#4r1e,A person on a horse jumps over a broken down a...,( ( ( A person ) ( on ( a horse ) ) ) ( ( jump...,(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN o...,"A person is outdoors, on a horse.","( ( A person ) ( ( ( ( is outdoors ) , ) ( on ...",(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) ...
3,[neutral],2267923837.jpg#2,neutral,2267923837.jpg#2r1n,Children smiling and waving at camera,( Children ( ( ( smiling and ) waving ) ( at c...,(ROOT (NP (S (NP (NNP Children)) (VP (VBG smil...,They are smiling at their parents,( They ( are ( smiling ( at ( their parents ) ...,(ROOT (S (NP (PRP They)) (VP (VBP are) (VP (VB...
4,[entailment],2267923837.jpg#2,entailment,2267923837.jpg#2r1e,Children smiling and waving at camera,( Children ( ( ( smiling and ) waving ) ( at c...,(ROOT (NP (S (NP (NNP Children)) (VP (VBG smil...,There are children present,( There ( ( are children ) present ) ),(ROOT (S (NP (EX There)) (VP (VBP are) (NP (NN...
...,...,...,...,...,...,...,...,...,...,...
550147,[contradiction],2267923837.jpg#3,contradiction,2267923837.jpg#3r1c,Four dirty and barefooted children.,( ( ( ( Four dirty ) and ) ( barefooted childr...,(ROOT (NP (NP (CD Four) (NNS dirty)) (CC and) ...,four kids won awards for 'cleanest feet',( ( four kids ) ( ( won awards ) ( ( ( for ` )...,(ROOT (S (NP (CD four) (NNS kids)) (VP (VBD wo...
550148,[neutral],2267923837.jpg#3,neutral,2267923837.jpg#3r1n,Four dirty and barefooted children.,( ( ( ( Four dirty ) and ) ( barefooted childr...,(ROOT (NP (NP (CD Four) (NNS dirty)) (CC and) ...,"four homeless children had their shoes stolen,...",( ( ( ( ( ( four ( homeless children ) ) ( had...,(ROOT (S (S (NP (CD four) (JJ homeless) (NNS c...
550149,[neutral],7979219683.jpg#2,neutral,7979219683.jpg#2r1n,A man is surfing in a bodysuit in beautiful bl...,( ( A man ) ( ( is ( surfing ( in ( ( a bodysu...,(ROOT (S (NP (DT A) (NN man)) (VP (VBZ is) (VP...,A man in a bodysuit is competing in a surfing ...,( ( ( A man ) ( in ( a bodysuit ) ) ) ( ( is (...,(ROOT (S (NP (NP (DT A) (NN man)) (PP (IN in) ...
550150,[contradiction],7979219683.jpg#2,contradiction,7979219683.jpg#2r1c,A man is surfing in a bodysuit in beautiful bl...,( ( A man ) ( ( is ( surfing ( in ( ( a bodysu...,(ROOT (S (NP (DT A) (NN man)) (VP (VBZ is) (VP...,A man in a business suit is heading to a board...,( ( ( A man ) ( in ( a ( business suit ) ) ) )...,(ROOT (S (NP (NP (DT A) (NN man)) (PP (IN in) ...


In [None]:
print(df_class['gold_label'])

0               neutral
1         contradiction
2            entailment
3               neutral
4            entailment
              ...      
550147    contradiction
550148          neutral
550149          neutral
550150    contradiction
550151       entailment
Name: gold_label, Length: 550152, dtype: object


In [None]:
df_class.gold_label.value_counts()

entailment       183416
contradiction    183187
neutral          182764
-                   785
Name: gold_label, dtype: int64

In [None]:
print(len(df_class.gold_label.index))

550152


In [None]:
# train_class = []
# delete = []

# for i in df_class.index:
#   print(i)
  
#   if df_class['gold_label'][i] == 'contradiction':
#     label_id = 0
#     inp_sample = InputExample(texts=[df_class['sentence1'][i], df_class['sentence2'][i]], label=label_id)
#     train_class.append(inp_sample)
#   elif df_class['gold_label'][i] == 'entailment':
#     label_id = 1
#     inp_sample = InputExample(texts=[df_class['sentence1'][i], df_class['sentence2'][i]], label=label_id)
#     train_class.append(inp_sample)
#   elif df_class['gold_label'][i] == 'neutral':
#     label_id = 2
#     inp_sample = InputExample(texts=[df_class['sentence1'][i], df_class['sentence2'][i]], label=label_id)
#     train_class.append(inp_sample)
#   else:
#     delete_id = 3
#     inp_sample =  InputExample(texts=[df_class['sentence1'][i], df_class['sentence2'][i]], label=delete_id)
#     delete.append(inp_sample)    

In [None]:
## here we get rid of the "-" label

train_class = []

for i in df_class.index:
  #print(i)
  
  if df_class['gold_label'][i] == 'contradiction':
    label_id = 0
    
  elif df_class['gold_label'][i] == 'entailment':
    label_id = 1
    
  elif df_class['gold_label'][i] == 'neutral':
    label_id = 2
  
  inp_sample = InputExample(texts=[df_class['sentence1'][i], df_class['sentence2'][i]], label=label_id)
  train_class.append(inp_sample)
  #print(inp_sample)

In [None]:
## previous try

# train_path = '/content/snli_1.0_dev.csv'
# train_samples=[]
# with open(train_path, newline='') as train:
#     columns=['annotator_labels',
#              'captionID',
#              'gold_label',
#              'pairID',
#              'sentence1', 'sentence1_binary_parse', 'sentence1_parse',
#              'sentence2', 'sentence2_binary_parse', 'sentence2_parse']
#     df = csv.DictReader(train, delimiter='|', fieldnames=columns, quoting=csv.QUOTE_NONE)
#     for row in df:
#         inp_example = InputExample(texts=[row['sentence1'], row['sentence2']], label=row['gold_label'])
#         train_samples.append(inp_example)
#         #print(inp_example)

In [None]:
test_path = 'snli_1.0/snli_1.0_test.jsonl'
df_class_test = pd.read_json(test_path, lines=True)

test_class=[]

for i in df_class_test.index:
  if df_class_test['gold_label'][i] == 'contradiction':
    label_id = 0
  elif df_class_test['gold_label'][i] == 'entailment':
    label_id = 1
  elif df_class_test['gold_label'][i] == 'neutral':
    label_id = 2
  inp_sample = InputExample(texts=[df_class_test['sentence1'][i], df_class_test['sentence2'][i]], label=label_id)
  test_class.append(inp_sample)
  #print(inp_sample)

In [None]:
dev_path = 'snli_1.0/snli_1.0_dev.jsonl'

df_class_dev = pd.read_json(dev_path, lines=True)

dev_class = []

for i in df_class_dev.index:
  if df_class_dev['gold_label'][i] == 'contradiction':
    label_id = 0
  elif df_class_dev['gold_label'][i] == 'entailment':
    label_id  = 1
  elif df_class_dev['gold_label'][i] == 'neutral':
    label_id = 2
  inp_sample = InputExample(texts=[df_class_dev['sentence1'][i], df_class_dev['sentence2'][i]], label=label_id)
  dev_class.append(inp_sample)
  #print(inp_sample)

In [None]:
# Read the dataset
train_batch_size = 16

model_save_path = './training_nli_'+model_name.replace("/", "-")+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

In [None]:
## DOING IT WITH ALL THE DATASET
#train_dataloader = DataLoader(train_class, shuffle=True, batch_size=train_batch_size)

## DOING IT WITH A SUBSET
train_dataloader = DataLoader(train_class[0:200000], shuffle=True, batch_size=train_batch_size)

#     :param model: SentenceTransformer model => BERT base
#     :param sentence_embedding_dimension: Dimension of your sentence embeddings => shape
#     :param num_labels: Number of different labels => 3
#     :param concatenation_sent_rep: Concatenate vectors u,v for the softmax classifier? => T
#     :param concatenation_sent_difference: Add abs(u-v) for the softmax classifier? => T
#     :param concatenation_sent_multiplication: Add u*v for the softmax classifier? => F
#     :param loss_fct: Optional: Custom pytorch loss function. If not set, uses nn.CrossEntropyLoss() => T

################################################################################
# creating custom model, by using mean pooling of the word embeddings given as input
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
################################################################################

train_loss = losses.SoftmaxLoss(model=model, sentence_embedding_dimension=model.get_sentence_embedding_dimension(), num_labels=3)

In [None]:
dev_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_class, batch_size=train_batch_size, name='snli-dev')

In [None]:
# Configure the training
num_epochs = 1

warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) # 10% of train data for warm-up

In [None]:
# Train the model
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=dev_evaluator,
          epochs=num_epochs,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          output_path=model_save_path
          )

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/34385 [00:00<?, ?it/s]

## Evaluation with snli-test

In [None]:
model = SentenceTransformer(model_save_path)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_class, name='snli-test')
test_evaluator(model, output_path=model_save_path)

Result with 200000 -> 0.3378714236743856

## Evaluation with STS-benchmark test and dev for eval

In [None]:
#dev_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, batch_size=train_batch_size, main_similarity='Cosine', name='sts-dev')
dev_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, batch_size=train_batch_size, name='sts-dev')

## to be implemented!!!
dev_evaluator = scipy.stats.spearmanr(a, b=None, axis=0, nan_policy='propagate', alternative='two-sided')

In [None]:
# Train the model
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=dev_evaluator,
          epochs=num_epochs,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          output_path=model_save_path
          )


In [None]:
model = SentenceTransformer(model_save_path)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name='sts-test')
test_evaluator(model, output_path=model_save_path)

Result with 200000 -> 0.7167475547347155

## Semantic search

In [None]:
## LINK
# https://www.kaggle.com/rmisra/news-category-dataset/download

In [None]:
## DOESN'T WORK WITH LINK

# print('Downloading dataset...')

# # The URL for the dataset zip file.
# url = 'https://www.kaggle.com/rmisra/news-category-dataset?select=News_Category_Dataset_v2.json'

# # Download the file (if we haven't already)
# if not os.path.exists('./news.zip'):
#     wget.download(url,'./news.zip')

print('Uploading dataset to directory...')

Uploading dataset to directory...


In [None]:
!unzip news.zip

Archive:  news.zip
replace News_Category_Dataset_v2.json? [y]es, [n]o, [A]ll, [N]one, [r]ename: n


In [None]:
# We use the best model to encode all passages, so that we can use it with sematic search
#model_name = '/content/best_models/2.3-task'
model_name = '/content/drive/MyDrive/Colab Notebooks/training_nli_'
encoder = SentenceTransformer(model_name)
top_k = 5  # number of passages we want to retrieve with the bi-encoder

In [None]:
# import gzip

# news = []
# with gzip.open(url, 'rt', encoding='utf8') as fIn:
#     for line in fIn:
#         data = json.loads(line.strip())
#         for paragraph in data['short_description']:
#             # We encode the passages as [title, text]
#             news.append([data['headlines'], paragraph])

In [None]:
# news = []

# with open('/content/News_Category_Dataset_v2.json') as fIn:
#     for line in fIn:
#         data = json.loads(line.strip())
#         #print(data)
#         for paragraph in data['short_description']:
#             # We encode the passages as [title, text]
#             news.append([data['headline'], paragraph]) # --> wrong

In [None]:
news = []

with open('/content/News_Category_Dataset_v2.json') as fIn:
    for line in fIn:
        data = json.loads(line.strip())
        #print(data)
        # We encode the passages as [title, text]
        news.append([data['headline'], data['short_description']])

In [None]:
print(news[-1])
print(data)

['Dwight Howard Rips Teammates After Magic Loss To Hornets', 'The five-time all-star center tore into his teammates Friday night after Orlando committed 23 turnovers en route to losing']
{'category': 'SPORTS', 'headline': 'Dwight Howard Rips Teammates After Magic Loss To Hornets', 'authors': '', 'link': 'https://www.huffingtonpost.com/entry/dwight-howard-rips-teammates-magic-hornets_us_5bb69b24e4b097869fd1b331', 'short_description': 'The five-time all-star center tore into his teammates Friday night after Orlando committed 23 turnovers en route to losing', 'date': '2012-01-28'}


In [None]:
#corpus_embeddings = encoder.encode(passages, convert_to_tensor=True, show_progress_bar=True)
corpus_embeddings = encoder.encode(news, convert_to_tensor=True, show_progress_bar=True)

Batches:   0%|          | 0/6277 [00:00<?, ?it/s]

In [None]:
## COMMENT THIS

# while True:
#     input = input("Please enter a question: ")

#     # Encode the query using the encoder and find potentially relevant passages
#     start_time = time.time()
#     question_embedding = encoder.encode(input, convert_to_tensor=True)
#     hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=top_k)
#     hits = hits[0]  # Get the hits for the first query

#     end_time = time.time()

#     # Output of top-k hits
#     print("Input question:", query)
#     print("Results (after {:.3f} seconds):".format(end_time - start_time))
#     for hit in hits:
#         print("\t{:.3f}\t{}".format(hit['score'], passages[hit['corpus_id']]))

#     print("\n\n========\n")

In [None]:
top_k = min(5, len(corpus))

input = input("Please enter a question: ")

query_embedding = encoder.encode(input, convert_to_tensor=True)

# We use cosine-similarity and torch.topk to find the highest 5 scores
cos_scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings)[0]
top_results = torch.topk(cos_scores, k=top_k)

print("\n======================\n\n")
print("Input:", input)
print("\nTop 5 most similar sentences in corpus:")

for score, idx in zip(top_results[0], top_results[1]):
      print(news[idx], "(Score: {:.4f})".format(score))