<a href="https://colab.research.google.com/github/bubriks/ID2223/blob/main/Lab2/lab-2_colab-version.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **LAB-2: Scalable Machine Learning and Deep Learning**

## **Paolo Teta & Ralfs Zangis**
---
**TASK:** Implement **S-BERT** model

**Outline:**
- Load the dataset
- Regression
- Classification
- Evaluation with STS benchmark dataset (cosine similarity and Spearman correlation)
- Semantic search
---


**REMEMBER:** UPLOAD DATA TO SESSION STORAGE (*sts-benchmark* and *news*)

## **Requirements**

Install dependencies

In [None]:
!pip install sentence_transformers
!pip install transformers
!pip install tokenizers
!pip install torch
!pip install wget

# !pip install pyspark
# from pyspark.sql import SparkSession
# from pyspark.sql.functions import *
# from pyspark.sql.types import *
# spark = SparkSession.builder.getOrCreate()

import os
import re
import csv
import wget
import json
import math
import scipy
import torch
import string
import sklearn

import numpy as np
import pandas as pd
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import Input

from sentence_transformers import SentenceTransformer
from sentence_transformers import LoggingHandler
from sentence_transformers import models, losses, util
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import InputExample

from transformers import BertTokenizer, TFBertModel, BertConfig
# from transformers import DistilBertTokenizer, DistilBertModel # -> smaller model

from tokenizers import BertWordPieceTokenizer

from torch.utils.data import DataLoader

from datetime import datetime

**Mount Google Drive to load saved models**

In [None]:
# Mount Google Drive at /content/drive; a later cell loads a saved model
# from a folder under that mount point.
from google.colab import drive
drive.mount('/content/drive')

## **REGRESSION**

Pre-trained model "*bert-base-uncased*" and word embedding model

In [None]:
# Checkpoint identifier; it is also embedded in the output folder names below.
model_name = 'bert-base-uncased'

# Transformer module that yields the token embeddings fed to the pooling layer.
word_embedding_model = models.Transformer(model_name_or_path=model_name)

In [None]:
## BERT -> original model
# model = TFBertModel.from_pretrained('bert-base-uncased')
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

## DistilBERT -> smaller model
# model = DistilBertModel.from_pretrained('distilbert-base-uncased')
# tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

Load the datasets

In [None]:
# Field names handed to the CSV readers for the tab-separated STS files,
# listed in positional order.
columns = [
    'title',
    'type',
    'year',
    'id',
    'score',
    'sentence_1',
    'sentence_2',
]

In [None]:
print('Loading train dataset ...')

train_path = '/content/sts-train.csv'
# Act on the existence check so a missing upload produces an actionable error.
if not os.path.isfile(train_path):
    raise FileNotFoundError(train_path + ' not found: upload the sts-benchmark files to the session storage')

train_samples = []

# Explicit encoding so decoding does not depend on the platform default.
# NOTE(review): assumes the STS files are UTF-8 - confirm.
with open(train_path, newline='', encoding='utf-8') as train:
    df_train = csv.DictReader(train, delimiter='\t', fieldnames=columns, quoting=csv.QUOTE_NONE)
    for row in df_train:
        # x / 2.5 - 1 maps a score of 0 to -1 and a score of 5 to 1.
        score = float(row['score']) / 2.5 - 1
        train_samples.append(InputExample(texts=[row['sentence_1'], row['sentence_2']], label=score))

In [None]:
print('Loading test dataset ...')

test_path = '/content/sts-test.csv'
# Act on the existence check so a missing upload produces an actionable error.
if not os.path.isfile(test_path):
    raise FileNotFoundError(test_path + ' not found: upload the sts-benchmark files to the session storage')

test_samples = []

# Explicit encoding so decoding does not depend on the platform default.
# NOTE(review): assumes the STS files are UTF-8 - confirm.
with open(test_path, newline='', encoding='utf-8') as test:
    df_test = csv.DictReader(test, delimiter='\t', fieldnames=columns, quoting=csv.QUOTE_NONE)
    for row in df_test:
        # x / 2.5 - 1 maps a score of 0 to -1 and a score of 5 to 1.
        score = float(row['score']) / 2.5 - 1
        test_samples.append(InputExample(texts=[row['sentence_1'], row['sentence_2']], label=score))

In [None]:
print('Loading evaluation dataset ...')

dev_path = '/content/sts-dev.csv'
# Act on the existence check so a missing upload produces an actionable error.
if not os.path.isfile(dev_path):
    raise FileNotFoundError(dev_path + ' not found: upload the sts-benchmark files to the session storage')

dev_samples = []

# Explicit encoding so decoding does not depend on the platform default.
# NOTE(review): assumes the STS files are UTF-8 - confirm.
with open(dev_path, newline='', encoding='utf-8') as dev:
    df_dev = csv.DictReader(dev, delimiter='\t', fieldnames=columns, quoting=csv.QUOTE_NONE)
    for row in df_dev:
        # x / 2.5 - 1 maps a score of 0 to -1 and a score of 5 to 1.
        score = float(row['score']) / 2.5 - 1
        dev_samples.append(InputExample(texts=[row['sentence_1'], row['sentence_2']], label=score))

Considering the given paper "*Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks*"

In [None]:
# Training hyper-parameters shared by the regression and classification runs:
# batch size, optimizer learning rate and number of passes over the data.
# (A batch size of 32 was considered to speed up the training.)
train_batch_size, learn_rate, num_epochs = 16, 2e-5, 1

Mean-pooling strategy

In [None]:
# Pooling layer configured for mean pooling only; CLS-token and max pooling
# are switched off.
embedding_dim = word_embedding_model.get_word_embedding_dimension()
pooling_model = models.Pooling(
    embedding_dim,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False,
    pooling_mode_mean_tokens=True,
)

Define the model

In [None]:
# Sentence encoder assembled from two stages: the transformer producing
# word embeddings, followed by the mean-pooling layer.
sbert_modules = [word_embedding_model, pooling_model]
model = SentenceTransformer(modules=sbert_modules)

Load the training set and define the cosine-similarity loss (`CosineSimilarityLoss`)

In [None]:
# Shuffled mini-batches over the STS training pairs.
train_dataloader = DataLoader(
    dataset=train_samples,
    batch_size=train_batch_size,
    shuffle=True,
)
# Regression objective bound to the model being trained.
train_loss = losses.CosineSimilarityLoss(model=model)

Define the evaluator for the sentence embeddings

In [None]:
# Evaluator built from the STS dev pairs; it is passed to model.fit below.
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    dev_samples,
    batch_size=train_batch_size,
    name='sts-dev',
)

Use 10% of the total training steps for learning-rate warm-up

In [None]:
# Total number of batches processed during training, then one tenth of it
# (rounded up) for the warm-up phase.
total_steps = len(train_dataloader) * num_epochs
warmup_steps = math.ceil(total_steps * 0.1)

**Training**

In [None]:
# Output folder for the regression run, made unique with a timestamp.
run_stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
save_path = './training_sts_reg_{}-{}'.format(model_name, run_stamp)

In [None]:
# Fine-tune on STS with Adam at the configured learning rate, evaluating on
# the dev set every 1000 steps and writing the result to save_path.
fit_config = dict(
    train_objectives=[(train_dataloader, train_loss)],
    optimizer_class=torch.optim.Adam,
    optimizer_params={'lr': learn_rate},
    evaluator=evaluator,
    epochs=num_epochs,
    evaluation_steps=1000,
    warmup_steps=warmup_steps,
    output_path=save_path,
)
model.fit(**fit_config)

**Evaluation on STS benchmark dataset**

Mathematical relationship: *cosine_similarity = 1 - cosine_distance*

In [None]:
# Replace the in-memory model with the copy written to save_path by training.
print('Loading the stored model ...')
model = SentenceTransformer(model_name_or_path=save_path)

In [None]:
# Evaluator over the STS test pairs; calling it scores `model` and writes its
# report under save_path.
test_eval = EmbeddingSimilarityEvaluator.from_input_examples(
    test_samples,
    batch_size=train_batch_size,
    name='sts-test',
)
c_s = test_eval(model, output_path=save_path)
print('Cosine similarity with the sentence_transformers library = ', c_s)

# sometimes the result is between 0.7 and 0.8

Embedding sentences

In [None]:
# `error_bad_lines` was removed in pandas 2.0; `on_bad_lines='warn'` keeps the
# same behaviour (malformed rows are reported and skipped).
# NOTE(review): rows with more fields than the first row are dropped here, so
# df_test may contain fewer pairs than test_samples - confirm this is intended.
df_test = pd.read_csv(test_path, sep='\t', header=None, on_bad_lines='warn', quoting=csv.QUOTE_NONE)
df_test.columns = columns

In [None]:
# Encode both sentence columns of the STS test frame with identical settings.
encode_opts = dict(convert_to_numpy=True, batch_size=train_batch_size)
embed_1 = model.encode(df_test['sentence_1'], **encode_opts)
embed_2 = model.encode(df_test['sentence_2'], **encode_opts)

Compute the cosine similarity

In [None]:
# Import the function from its submodule explicitly: a bare `import sklearn`
# does not guarantee that `sklearn.metrics.pairwise` has been loaded.
from sklearn.metrics.pairwise import paired_cosine_distances

# Row-wise cosine similarity between the two embedding matrices
# (similarity = 1 - distance).
cos_sim = 1 - paired_cosine_distances(embed_1, embed_2)
print('Cosine similarity = ', cos_sim)

Spearman correlation coefficient

In [None]:
# Import the function from its submodule explicitly: a bare `import scipy`
# does not guarantee that `scipy.stats` has been loaded.
from scipy.stats import spearmanr

# Rank correlation between the predicted similarities and the gold scores;
# element 0 of the result is the correlation coefficient.
spr_corr = spearmanr(cos_sim, df_test['score'])
print('Spearmean correlation coefficient = ', spr_corr[0])

**Comment:** the two results match each other

---

## **CLASSIFICATION**

Download and unzip the dataset

In [None]:
print('Downloading dataset from web ...')

url = 'https://nlp.stanford.edu/projects/snli/snli_1.0.zip'

# Fetch the archive only when it is not already in the working directory.
archive_missing = not os.path.exists('./snli_1.0.zip')
if archive_missing:
    wget.download(url, './snli_1.0.zip')

In [None]:
# Shell command: extract the downloaded SNLI archive into the working directory.
!unzip snli_1.0.zip

In [None]:
# Disabled experiments kept inside a bare triple-quoted string, so none of it
# executes: a JSONL-to-CSV conversion of the three SNLI splits and a
# csv.DictReader-based loader for the train split.
'''
# convert .json file to .csv file

df_train_cl = pd.read_json(r'/content/snli_1.0/snli_1.0_train.jsonl', lines=True)
df_train_cl.to_csv(r'/content/snli_1.0_train.csv', index=None)

df_test_cl = pd.read_json(r'/content/snli_1.0/snli_1.0_test.jsonl', lines=True)
df_test_cl.to_csv(r'/content/snli_1.0_test.csv', index=None)

df_dev_cl = pd.read_json(r'/content/snli_1.0/snli_1.0_dev.jsonl', lines=True)
df_dev_cl.to_csv(r'/content/snli_1.0_dev.csv', index=None)



train_path = '/content/snli_1.0/snli_1.0_train.jsonl'
train_samples = []
with open(train_path, newline='') as train:
    columns = ['annotator_labels',
               'captionID',
               'gold_label',
               'pairID',
               'sentence1', 'sentence1_binary_parse', 'sentence1_parse',
               'sentence2', 'sentence2_binary_parse', 'sentence2_parse']
    df_train_cl = csv.DictReader(train, delimiter='|', fieldnames=columns, quoting=csv.QUOTE_NONE)
    for row in df_train_cl:
        inp_example = InputExample(texts=[row['sentence1'], row['sentence2']], label=row['gold_label'])
        train_samples.append(inp_example)
'''

Load the datasets

In [None]:
# Read the three SNLI splits (JSON-lines files) into DataFrames.
# Note: train_path and test_path are re-bound here, so from this cell on they
# refer to the SNLI files rather than the STS ones.
print('Loading train dataset ...')
train_path = '/content/snli_1.0/snli_1.0_train.jsonl'
df_train_cl = pd.read_json(path_or_buf=train_path, lines=True)

print('Loading test dataset ...')
test_path = '/content/snli_1.0/snli_1.0_test.jsonl'
df_test_cl = pd.read_json(path_or_buf=test_path, lines=True)

print('Loading evaluation dataset ...')
dev_path = '/content/snli_1.0/snli_1.0_dev.jsonl'
df_dev_cl = pd.read_json(path_or_buf=dev_path, lines=True)

In [None]:
print("Labels in the dataset:\n")
# Bare expression: the gold_label column of the train split, shown as the
# cell's output when run in a notebook.
df_train_cl['gold_label']

In [None]:
# Number of training rows per distinct gold_label value.
df_train_cl['gold_label'].value_counts()

Convert labels to numerical values

In [None]:
# Class name -> integer class id for the three SNLI classes.
label_map = dict(
    contradiction=0,
    entailment=1,
    neutral=2,
)

In [None]:
## get rid of the "-" label
# idx = df_train_cl[df_train_cl['gold_label'] == '-'].index
# df_train_cl.drop(idx, inplace=True)

In [None]:
# TRAIN SET
# Build one InputExample per sentence pair, labelled through label_map.
# Rows whose gold_label is not a key of label_map (e.g. the '-' placeholder)
# are skipped instead of being given the class id of a neighbouring row.
# Iterating the columns with zip avoids a per-row indexed lookup.
train_smpls = [
    InputExample(texts=[premise, hypothesis], label=label_map[gold])
    for premise, hypothesis, gold in zip(
        df_train_cl['sentence1'], df_train_cl['sentence2'], df_train_cl['gold_label']
    )
    if gold in label_map
]

In [None]:
# TEST SET
# Drop rows whose gold_label is not a key of label_map (e.g. the '-'
# placeholder) from the frame itself, and renumber the index. Filtering
# df_test_cl, rather than only the sample list, keeps `ids` the same length
# and order as any embeddings computed later from df_test_cl's sentence
# columns.
known_label = df_test_cl['gold_label'].isin(list(label_map))
df_test_cl = df_test_cl[known_label].reset_index(drop=True)

# Integer class id of every remaining test pair, in frame order.
ids = [label_map[gold] for gold in df_test_cl['gold_label']]

test_smpls = [
    InputExample(texts=[premise, hypothesis], label=class_id)
    for premise, hypothesis, class_id in zip(
        df_test_cl['sentence1'], df_test_cl['sentence2'], ids
    )
]

In [None]:
# DEV SET
# Build one InputExample per sentence pair, labelled through label_map.
# Rows whose gold_label is not a key of label_map (e.g. the '-' placeholder)
# are skipped instead of being given the class id of a neighbouring row.
dev_smpls = [
    InputExample(texts=[premise, hypothesis], label=label_map[gold])
    for premise, hypothesis, gold in zip(
        df_dev_cl['sentence1'], df_dev_cl['sentence2'], df_dev_cl['gold_label']
    )
    if gold in label_map
]

Using the previous model

In [None]:
# Bare expression: displays the current `model` (last assigned in the
# regression section), which the classification cells below reuse.
model

Load the training set and define the softmax classification loss (`SoftmaxLoss`)

In [None]:
# DOING IT WITH ALL THE DATASET
train_dataloader_cl = DataLoader(
    dataset=train_smpls,
    batch_size=train_batch_size,
    shuffle=True,
)

# DOING IT WITH A SUBSET OF THE DATASET
# train_dataloader_cl = DataLoader(train_smpls[0:200000], shuffle=True, batch_size=train_batch_size)

# Classification objective: one output per entry of label_map, fed by the
# model's sentence embeddings.
sentence_dim = model.get_sentence_embedding_dimension()
train_loss_cl = losses.SoftmaxLoss(
    model=model,
    sentence_embedding_dimension=sentence_dim,
    num_labels=len(label_map),
)

Define the evaluator for the sentence embeddings

In [None]:
# Evaluator built from the SNLI dev pairs.
# NOTE(review): the labels of dev_smpls are integer class ids (0/1/2), not a
# graded similarity score - confirm a similarity evaluator is the intended
# metric for this task.
evaluator_cl = EmbeddingSimilarityEvaluator.from_input_examples(
    dev_smpls,
    batch_size=train_batch_size,
    name='snli-dev',
)

Use 10% of the total training steps for learning-rate warm-up

In [None]:
# Total number of batches for the classification run, then one tenth of it
# (rounded up) for the warm-up phase.
total_steps_cl = len(train_dataloader_cl) * num_epochs
warmup_steps_cl = math.ceil(total_steps_cl * 0.1)

**Training**

In [None]:
# Output folder for the classification run, made unique with a timestamp.
run_stamp_cl = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
save_path_cl = './training_snli_class_{}-{}'.format(model_name, run_stamp_cl)

In [None]:
# model.fit(train_objectives=[(train_dataloader_cl, train_loss_cl)],
#             evaluator=evaluator_cl,
#             epochs=num_epochs,
#             evaluation_steps=1000,
#             warmup_steps=warmup_steps_cl,
#             output_path=save_path_cl
#             )

Load the saved model folder

In [None]:
print('Loading the stored model from Google Drive ...')

path = '/content/drive/MyDrive/Colab Notebooks/training_snli'

if os.path.exists(path):
    model = SentenceTransformer(path)
else:
    # Say so explicitly: otherwise the following cells would silently score
    # whatever model is already in memory as if it were the SNLI-trained one.
    print('WARNING: ' + path + ' not found; keeping the model currently in memory')

# model = SentenceTransformer(save_path_cl) # if run the training

**Evaluation on SNLI dataset with library**

In [None]:
# Evaluator over the SNLI test pairs; calling it scores `model` and writes its
# report under `path`.
test_eval_cl = EmbeddingSimilarityEvaluator.from_input_examples(
    test_smpls,
    batch_size=train_batch_size,
    name='snli-test',
)
c_s_cl = test_eval_cl(model, output_path=path) # or save_path_cl
print('Cosine similarity with the sentence_transformers library = ', c_s_cl)

# result with 200000 -> 0.3378714236743856

**Evaluation on STS benchmark dataset with library**

In [None]:
# Re-run the STS test evaluator created in the regression section against the
# model now bound to `model`; the report is written under `path`.
c_s_sts = test_eval(model, output_path=path) # from regression task
print('Cosine similarity with the sentence_transformers library = ', c_s_sts)

# result with 200000 -> 0.7167475547347155

**Evaluation on SNLI and STS benchmark datasets** (no library)

Embedding sentences

In [None]:
# Encode the sentence columns of both test sets with identical settings.
encode_opts = dict(convert_to_numpy=True, batch_size=train_batch_size)

# SNLI test pairs
embed_1_snli = model.encode(df_test_cl['sentence1'], **encode_opts)
embed_2_snli = model.encode(df_test_cl['sentence2'], **encode_opts)

# STS benchmark test pairs
embed_1 = model.encode(df_test['sentence_1'], **encode_opts)
embed_2 = model.encode(df_test['sentence_2'], **encode_opts)

Compute the cosine similarity

In [None]:
# Import the function from its submodule explicitly: a bare `import sklearn`
# does not guarantee that `sklearn.metrics.pairwise` has been loaded.
from sklearn.metrics.pairwise import paired_cosine_distances

# Row-wise cosine similarity of the SNLI test embeddings
# (similarity = 1 - distance).
cos_sim_cl = 1 - paired_cosine_distances(embed_1_snli, embed_2_snli)
print('SNLI-test: cosine similarity = ', cos_sim_cl)

In [None]:
# Import the function from its submodule explicitly: a bare `import sklearn`
# does not guarantee that `sklearn.metrics.pairwise` has been loaded.
from sklearn.metrics.pairwise import paired_cosine_distances

# Row-wise cosine similarity of the STS test embeddings
# (similarity = 1 - distance).
cos_sim_sts = 1 - paired_cosine_distances(embed_1, embed_2)
print('STS benchmark: cosine similarity = ', cos_sim_sts)

Spearman correlation coefficient

In [None]:
# Import the function from its submodule explicitly: a bare `import scipy`
# does not guarantee that `scipy.stats` has been loaded.
from scipy.stats import spearmanr

# Rank correlation between the SNLI similarities and the integer class ids;
# element 0 of the result is the correlation coefficient.
spr_corr_cl = spearmanr(cos_sim_cl, ids)
print('SNLI-test: Spearmean correlation coefficient = ', spr_corr_cl[0])

In [None]:
# Import the function from its submodule explicitly: a bare `import scipy`
# does not guarantee that `scipy.stats` has been loaded.
from scipy.stats import spearmanr

# Rank correlation between the STS similarities and the gold scores;
# element 0 of the result is the correlation coefficient.
spr_corr_sts = spearmanr(cos_sim_sts, df_test['score'])
print('STS benchmark: Spearmean correlation coefficient = ', spr_corr_sts[0])

**Comment:** All the results match each other

---

## **SEMANTIC SEARCH**

**Link to dataset:** https://www.kaggle.com/rmisra/news-category-dataset

In [None]:
print('Uploading dataset to the session storage ...')

file_path = '/content/news.zip'
# Bare expression: True/False for whether the uploaded archive is present,
# shown as the cell's output when run in a notebook.
os.path.isfile(file_path)

In [None]:
# Shell command: extract the uploaded news archive into the working directory.
!unzip news.zip

In [None]:
news_path = '/content/News_Category_Dataset_v2.json'
# The result of this check is not used; a missing file surfaces as an error
# from read_json below.
os.path.isfile(news_path)

# Whole dataset as a DataFrame (JSON-lines file, one record per line).
news_set = pd.read_json(news_path, lines=True)

In [None]:
# Collect [headline, short_description] pairs from the JSON-lines news file.
# The encoding is explicit so decoding does not depend on the platform
# default (NOTE(review): assumes the file is UTF-8 - confirm), and blank
# lines are skipped because json.loads would raise on them.
with open(news_path, encoding='utf-8') as f:
    records = (json.loads(line) for line in f if line.strip())
    # encoding as [headline, short_description]
    news = [[record['headline'], record['short_description']] for record in records]

Using the previous saved model to encode the text

In [None]:
# Load the saved model from `path` and embed every news record as a tensor,
# showing a progress bar while encoding.
encoder = SentenceTransformer(model_name_or_path=path)
embed_news = encoder.encode(
    news,
    show_progress_bar=True,
    convert_to_tensor=True,
)

In [None]:
# Ask the user for the query text.
search = input("Find close to: ")

# How many of the most similar records to report.
n_close = 5

# Embed the query with the same encoder used for the news records.
embed_query = encoder.encode(sentences=search, convert_to_tensor=True)

In [None]:
# Similarity of the query to every news embedding; row 0 is taken because
# there is a single query.
cos_sim = util.pytorch_cos_sim(embed_query, embed_news)[0]

# torch.topk raises when k exceeds the number of candidates, so cap k at the
# number of similarity scores available.
top_close = torch.topk(cos_sim, k=min(n_close, cos_sim.shape[0]))

In [None]:
# Echo the query, then list each selected record with its similarity score.
print("Find close to: ", search)
print("\nTop ", n_close, " closer news in the dataset:")

best_scores, best_rows = top_close
for score, idx in zip(best_scores, best_rows):
    score_text = "(score: {:.4f})".format(score)
    print(news[idx], score_text)

---