# **LAB-2: Scalable Machine Learning and Deep Learning**

## **Paolo Teta & Ralfs Zangis**
---
**TASK:** Implement **S-BERT** model

**Outline:**
- Load the dataset
- Regression
- Classification
- Evaluation with STS benchmark dataset (cosine similarity and Spearman correlation)
- Semantic search
---


## **Requirements**

### Install dependencies

In [None]:
!pip install sentence_transformers
!pip install transformers
!pip install tokenizers
!pip install wget
!pip install torch

### Spark

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Reuse the active Spark session if there is one, otherwise create it.
spark = SparkSession.builder.getOrCreate()

### ML

In [None]:
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import Input

from sentence_transformers import SentenceTransformer
from sentence_transformers import LoggingHandler
from sentence_transformers import models, losses, util
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import InputExample

from transformers import BertTokenizer, TFBertModel, BertConfig
# from transformers import DistilBertTokenizer, DistilBertModel # smaller model

### Other

In [None]:
import os
import re
import csv
import wget
import json
import math
import scipy
import torch
import string
import sklearn

import numpy as np
import pandas as pd

from tokenizers import BertWordPieceTokenizer
from torch.utils.data import DataLoader
from datetime import datetime

**Mount Google Drive to load saved models**

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

## **REGRESSION**

### Loading the datasets

In [None]:
# Column layout of the tab-separated STS benchmark files, which are read without a header row.
schema = StructType([
    StructField("genre", StringType(), True),
    StructField("filename", StringType(), True),
    StructField("year", StringType(), True),
    StructField("year_id", IntegerType(), True),
    StructField("score", FloatType(), True),
    StructField("sentence1", StringType(), True),
    StructField("sentence2", StringType(), True)])

# Reader options shared by the three splits.
# quote='' switches quote handling off: the sentences contain literal, unbalanced
# double quotes, and with the default quote character a field that starts with '"'
# can swallow the following tab and merge sentence1 with sentence2.
sts_csv_options = dict(sep='\t', header=False, schema=schema, quote='')

train = spark.read.csv("stsbenchmark/sts-train.csv", **sts_csv_options)
test = spark.read.csv("stsbenchmark/sts-test.csv", **sts_csv_options)
dev = spark.read.csv("stsbenchmark/sts-dev.csv", **sts_csv_options)

train.show()

### Normalize

In [None]:
def _rescale_score(split):
    """Return `split` with its "score" column replaced by score / 2.5 - 1."""
    return split.withColumn("score", col("score") / 2.5 - 1)


# Each split is overwritten by its rescaled version, so run this cell only once
# per load of the data.
train = _rescale_score(train)
test = _rescale_score(test)
dev = _rescale_score(dev)

# Summary statistics of the rescaled dev scores as a quick sanity check.
dev.select("score").describe().show()

### Create samples

In [None]:
# Bring the three columns needed for training into pandas.
df_train = train.select("sentence1", "sentence2", "score").toPandas()

# One InputExample per sentence pair, labelled with its rescaled similarity score.
train_samples = [
    InputExample(texts=[first, second], label=gold_score)
    for first, second, gold_score in zip(
        df_train['sentence1'], df_train['sentence2'], df_train['score']
    )
]

In [None]:
# Bring the three columns needed for the final evaluation into pandas.
df_test = test.select("sentence1", "sentence2", "score").toPandas()

# One InputExample per sentence pair, labelled with its rescaled similarity score.
test_samples = [
    InputExample(texts=[first, second], label=gold_score)
    for first, second, gold_score in zip(
        df_test['sentence1'], df_test['sentence2'], df_test['score']
    )
]

In [None]:
# Bring the three columns needed for in-training evaluation into pandas.
df_dev = dev.select("sentence1", "sentence2", "score").toPandas()

# One InputExample per sentence pair, labelled with its rescaled similarity score.
dev_samples = [
    InputExample(texts=[first, second], label=gold_score)
    for first, second, gold_score in zip(
        df_dev['sentence1'], df_dev['sentence2'], df_dev['score']
    )
]

## Considering the given paper "*Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks*"

### Get pre-trained model "*bert-base-uncased*" and word embedding model

In [None]:
# Name of the pre-trained checkpoint that the sentence encoder starts from.
model_name = 'bert-base-uncased'
# Transformer module built from that checkpoint.
word_embedding_model = models.Transformer(model_name)

### Set mean-pooling strategy

In [None]:
# The pooling layer is sized to the transformer's word-embedding dimension.
embedding_dimension = word_embedding_model.get_word_embedding_dimension()

# Only mean pooling is switched on; CLS-token and max pooling stay disabled.
pooling_modes = {
    'pooling_mode_mean_tokens': True,
    'pooling_mode_cls_token': False,
    'pooling_mode_max_tokens': False,
}
pooling_model = models.Pooling(embedding_dimension, **pooling_modes)

## Define the model

In [None]:
# Chain the transformer and the pooling layer into one sentence encoder.
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

### Load the training set and define the loss function as the cosine-similarity loss

In [None]:
# Mini-batch size; 32 is an alternative to try for faster training.
train_batch_size = 16

# Batches of STS training pairs, reshuffled on every pass.
train_dataloader = DataLoader(
    train_samples,
    shuffle=True,
    batch_size=train_batch_size,
)

# Loss object bound to the encoder that is trained with it.
train_loss = losses.CosineSimilarityLoss(model=model)

### Define the evaluator for the sentence embeddings

In [None]:
# Evaluator built from the STS dev pairs; it is handed to model.fit to score the model during training.
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name='sts-dev')

Use 10% of the training steps for learning-rate warm-up

In [None]:
num_epochs = 1

# len(train_dataloader) is the number of batches per epoch; a tenth of all
# training steps (rounded up) is reserved for warm-up.
total_training_steps = len(train_dataloader) * num_epochs
warmup_steps = math.ceil(total_training_steps * 0.1)

**Training**

In [None]:
# Output directory name made of the base model name and the current time.
run_timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
save_path = ''.join(['./training_sts_reg_', model_name, '-', run_timestamp])

In [None]:
learn_rate = 2e-5

# Training configuration for the regression objective: Adam with the learning
# rate above, the dev evaluator every 1000 steps, output under save_path.
fit_options = dict(
    optimizer_class=torch.optim.Adam,
    optimizer_params={'lr': learn_rate},
    evaluator=evaluator,
    epochs=num_epochs,
    evaluation_steps=1000,
    warmup_steps=warmup_steps,
    output_path=save_path,
)

model.fit(train_objectives=[(train_dataloader, train_loss)], **fit_options)

**Evaluation on STS benchmark dataset**

Mathematical relationship: *cosine_similarity = 1 - cosine_distance*

In [None]:
# Replace the in-memory model with the copy stored under save_path.
print('Loading the stored model ...')
model = SentenceTransformer(save_path)

In [None]:
# Score the model on the held-out STS test pairs; results are also written under save_path.
test_eval = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name='sts-test')
c_s = test_eval(model, output_path=save_path)
# The evaluator's return value is a Spearman correlation between the gold scores and
# the embedding similarities, not a cosine similarity, so it is labelled as such.
print('Spearman correlation with the sentence_transformers library = ', c_s)

Usually the result is between 0.7 and 0.8

Embedding sentences

In [None]:
# encode() is given plain lists of strings. Passing the pandas Series directly only
# works while the frame keeps its default 0..n-1 index, because the sentences are
# then looked up by integer label.
first_sentences = df_test['sentence1'].tolist()
second_sentences = df_test['sentence2'].tolist()

# Row i of embed_1 and row i of embed_2 belong to the same test pair.
embed_1 = model.encode(first_sentences, convert_to_numpy=True, batch_size=train_batch_size)
embed_2 = model.encode(second_sentences, convert_to_numpy=True, batch_size=train_batch_size)

Compute the cosine similarity

In [None]:
# Import the submodule explicitly: a bare `import sklearn` does not guarantee that
# `sklearn.metrics.pairwise` is loaded, so the attribute lookup could fail.
from sklearn.metrics.pairwise import paired_cosine_distances

# Pair-wise cosine similarity between matching rows (similarity = 1 - distance).
cos_sim = 1 - paired_cosine_distances(embed_1, embed_2)
print('Cosine similarity = ', cos_sim)

Spearman correlation coefficient

In [None]:
# Import the function explicitly: a bare `import scipy` does not guarantee that the
# `scipy.stats` submodule is loaded.
from scipy.stats import spearmanr

# Rank correlation between the predicted similarities and the gold scores;
# element 0 of the result is the correlation coefficient.
spr_corr = spearmanr(cos_sim, df_test['score'])
print('Spearmean correlation coefficient = ', spr_corr[0])

**Comment:** the two results match each other

---

## **CLASSIFICATION**

In [None]:
print('***** Downloading dataset ...')

# Location of the SNLI archive on the Stanford NLP site.
url = 'https://nlp.stanford.edu/projects/snli/snli_1.0.zip'

# Fetch the archive only when no local copy exists yet.
archive_already_present = os.path.exists('./snli_1.0.zip')
if not archive_already_present:
    wget.download(url, './snli_1.0.zip')

In [None]:
!unzip snli_1.0.zip

In [None]:
# Integer id for every value of the SNLI "gold_label" field; the position in the
# tuple is the id.
_snli_labels = ("contradiction", "entailment", "neutral", "-")
label2int = {label: class_id for class_id, label in enumerate(_snli_labels)}

### Create samples

In [None]:
train_path = 'snli_1.0/snli_1.0_train.jsonl'
df_class_train = pd.read_json(train_path, lines=True)

# To inspect the label distribution: df_class_train.gold_label.value_counts()

# Rows whose gold_label is "-" are dropped. label2int maps "-" to 3, which is
# outside the valid class range 0..2 of the SoftmaxLoss built with num_labels=3.
labelled_train = df_class_train[df_class_train['gold_label'] != '-']

# One InputExample per remaining sentence pair, labelled with its class id.
train_class = [
    InputExample(texts=[premise, hypothesis], label=label2int[gold_label])
    for premise, hypothesis, gold_label in zip(
        labelled_train['sentence1'],
        labelled_train['sentence2'],
        labelled_train['gold_label'],
    )
]

In [None]:
test_path = 'snli_1.0/snli_1.0_test.jsonl'
df_class_test = pd.read_json(test_path, lines=True)

# Rows whose gold_label is "-" are dropped: label2int maps "-" to 3, which is not
# one of the three classes (0..2) the classifier is trained on.
labelled_test = df_class_test[df_class_test['gold_label'] != '-']

# One InputExample per remaining sentence pair, labelled with its class id.
test_class = [
    InputExample(texts=[premise, hypothesis], label=label2int[gold_label])
    for premise, hypothesis, gold_label in zip(
        labelled_test['sentence1'],
        labelled_test['sentence2'],
        labelled_test['gold_label'],
    )
]

In [None]:
dev_path = 'snli_1.0/snli_1.0_dev.jsonl'
df_class_dev = pd.read_json(dev_path, lines=True)

# Rows whose gold_label is "-" are dropped: label2int maps "-" to 3, which is not
# one of the three classes (0..2) the classifier is trained on.
labelled_dev = df_class_dev[df_class_dev['gold_label'] != '-']

# One InputExample per remaining sentence pair, labelled with its class id.
dev_class = [
    InputExample(texts=[premise, hypothesis], label=label2int[gold_label])
    for premise, hypothesis, gold_label in zip(
        labelled_dev['sentence1'],
        labelled_dev['sentence2'],
        labelled_dev['gold_label'],
    )
]

In [None]:
# Output directory name made of the base model name (with "/" replaced by "-")
# and the current time.
path_safe_model_name = model_name.replace("/", "-")
nli_run_timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
model_save_path = ''.join(['./training_nli_', path_safe_model_name, '-', nli_run_timestamp])

In [None]:
train_batch_size = 16

# Number of SNLI training pairs to use. Set it to None to train on the whole set
# (train_class[:None] is the full list).
train_subset_size = 200000
train_dataloader = DataLoader(train_class[:train_subset_size], shuffle=True, batch_size=train_batch_size)

# Build the classification model from freshly loaded pre-trained weights.
# `word_embedding_model` is the very module object that the regression fit already
# fine-tuned in place on STS, so reusing it would start NLI training from the
# STS-tuned weights instead of the plain checkpoint.
nli_word_embedding_model = models.Transformer(model_name)
nli_pooling_model = models.Pooling(nli_word_embedding_model.get_word_embedding_dimension(),
                                   pooling_mode_mean_tokens=True,
                                   pooling_mode_cls_token=False,
                                   pooling_mode_max_tokens=False)
model = SentenceTransformer(modules=[nli_word_embedding_model, nli_pooling_model])

# Three-way softmax classifier (contradiction / entailment / neutral) on top of the encoder.
train_loss = losses.SoftmaxLoss(model=model, sentence_embedding_dimension=model.get_sentence_embedding_dimension(), num_labels=3)

In [None]:
# Evaluator built from the SNLI dev pairs, used while fitting on SNLI.
# NOTE(review): the labels in dev_class are the integer class ids from label2int,
# not similarity scores — confirm that a similarity evaluator is the intended
# metric for this data.
dev_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    dev_class,
    batch_size=train_batch_size,
    name='snli-dev',
)

In [None]:
# Configure the training
num_epochs = 1

warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) # 10% of train data for warm-up

In [None]:
# Train the model
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=dev_evaluator,
          epochs=num_epochs,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          output_path=model_save_path
          )

## Evaluation with snli-test

In [None]:
# Reload the model stored by the fit above and score it on the SNLI test pairs.
model = SentenceTransformer(model_save_path)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    test_class,
    name='snli-test',
)
# Kept as the last expression so the notebook displays the returned score.
test_evaluator(model, output_path=model_save_path)

Result with 200000 -> 0.3378714236743856

## Evaluation with STS-benchmark test and dev for eval

In [None]:
# Evaluator built from the STS dev pairs, to be passed to model.fit.
# No separate scipy.stats.spearmanr call is made here: that function returns a
# correlation result, not an evaluator object, so it cannot stand in for
# dev_evaluator. The Spearman correlation is computed explicitly in the regression
# section, after the embeddings have been produced.
dev_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, batch_size=train_batch_size, name='sts-dev')

In [None]:
# Train the model
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=dev_evaluator,
          epochs=num_epochs,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          output_path=model_save_path
          )


In [None]:
# Reload the model stored under model_save_path and score it on the STS test pairs.
model = SentenceTransformer(model_save_path)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    test_samples,
    name='sts-test',
)
# Kept as the last expression so the notebook displays the returned score.
test_evaluator(model, output_path=model_save_path)

Result with 200000 -> 0.7167475547347155

## Semantic search

In [None]:
## LINK
# https://www.kaggle.com/rmisra/news-category-dataset/download

In [None]:
## Downloading straight from the Kaggle link does not work, so the download code
## below stays disabled. Put news.zip into the working directory by hand instead.

# print('Downloading dataset...')

# # The URL for the dataset zip file.
# url = 'https://www.kaggle.com/rmisra/news-category-dataset?select=News_Category_Dataset_v2.json'

# # Download the file (if we haven't already)
# if not os.path.exists('./news.zip'):
#     wget.download(url,'./news.zip')

# This statement only prints a reminder; it does not transfer any file itself.
print('Uploading dataset to directory...')

In [None]:
!unzip news.zip

In [None]:
# We use the best model to encode all passages, so that we can use it with semantic search
#model_name = '/content/best_models/2.3-task'
# NOTE(review): this rebinds `model_name`, which the earlier cells use as the base
# checkpoint name when they build their save paths.
# NOTE(review): the path ends in 'training_nli_' with no timestamp suffix, unlike
# `model_save_path` — confirm it points at an existing model directory.
model_name = '/content/drive/MyDrive/Colab Notebooks/training_nli_'
encoder = SentenceTransformer(model_name)
top_k = 5  # number of passages we want to retrieve with the bi-encoder

In [None]:
# import gzip

# news = []
# with gzip.open(url, 'rt', encoding='utf8') as fIn:
#     for line in fIn:
#         data = json.loads(line.strip())
#         for paragraph in data['short_description']:
#             # We encode the passages as [title, text]
#             news.append([data['headlines'], paragraph])

In [None]:
# news = []

# with open('/content/News_Category_Dataset_v2.json') as fIn:
#     for line in fIn:
#         data = json.loads(line.strip())
#         #print(data)
#         for paragraph in data['short_description']:
#             # We encode the passages as [title, text]
#             news.append([data['headline'], paragraph]) # --> wrong

In [None]:
news = []

# The file holds one JSON record per line. The encoding is given explicitly so the
# result does not depend on the platform's default text encoding.
with open('/content/News_Category_Dataset_v2.json', encoding='utf-8') as fIn:
    for line in fIn:
        line = line.strip()
        if not line:
            # A blank (e.g. trailing) line is not a record; json.loads would raise on it.
            continue
        data = json.loads(line)
        # We encode the passages as [title, text]
        news.append([data['headline'], data['short_description']])

In [None]:
# Sanity check: show the last [headline, short_description] entry and the last
# raw record that was parsed from the file.
print(news[-1])
print(data)

In [None]:
#corpus_embeddings = encoder.encode(passages, convert_to_tensor=True, show_progress_bar=True)
corpus_embeddings = encoder.encode(news, convert_to_tensor=True, show_progress_bar=True)

In [None]:
## COMMENT THIS

# while True:
#     input = input("Please enter a question: ")

#     # Encode the query using the encoder and find potentially relevant passages
#     start_time = time.time()
#     question_embedding = encoder.encode(input, convert_to_tensor=True)
#     hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=top_k)
#     hits = hits[0]  # Get the hits for the first query

#     end_time = time.time()

#     # Output of top-k hits
#     print("Input question:", query)
#     print("Results (after {:.3f} seconds):".format(end_time - start_time))
#     for hit in hits:
#         print("\t{:.3f}\t{}".format(hit['score'], passages[hit['corpus_id']]))

#     print("\n\n========\n")

In [None]:
# Never ask for more hits than there are entries in the corpus (`news`).
top_k = min(5, len(news))

# The answer is stored under its own name so the built-in input() is not
# overwritten and the cell can be run again.
query = input("Please enter a question: ")

query_embedding = encoder.encode(query, convert_to_tensor=True)

# We use cosine-similarity and torch.topk to find the top_k highest scores
cos_scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings)[0]
top_results = torch.topk(cos_scores, k=top_k)

print("\n======================\n\n")
print("Input:", query)
print("\nTop {} most similar sentences in corpus:".format(top_k))

# top_results[0] holds the scores and top_results[1] the matching corpus positions;
# both are converted to plain Python numbers before indexing and formatting.
for score, idx in zip(top_results[0], top_results[1]):
    print(news[int(idx)], "(Score: {:.4f})".format(float(score)))