In [1]:
import warnings
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

import pandas as pd
pd.options.mode.chained_assignment = None

import torch

from datasets import load_dataset

import tensorflow as tf

import tensorflow_hub as hub
import numpy as np

import math
import scipy

import requests
import zipfile

import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('punkt')

from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

import sklearn
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from sentence_transformers import SentenceTransformer
import gc
from transformers import BertTokenizer, BertModel, RobertaTokenizer, RobertaModel
import torch

[nltk_data] Downloading package stopwords to /home/sarba/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/sarba/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [92]:
#!pip install datasets==2.16.1

### Section 0: Load & Preprocess STS Dataset 

In [3]:
from datasets import load_dataset
dataset = load_dataset("stsb_multi_mt", name="en")

In [4]:
sts_train = dataset['train'].to_pandas()
sts_test = dataset['test'].to_pandas()
sts_dev = dataset['dev'].to_pandas()

sts_dev_test = pd.concat([sts_dev, sts_test], ignore_index=True)

In [5]:
def get_updated_df(df):
    
    char_count_columns = df[['sentence1', 'sentence2']].astype(str).applymap(len)
    df = df[char_count_columns.gt(30).all(axis=1)]
    return df



def format_float(x):
    if isinstance(x, float):
        if x.is_integer():
            return x
        else:
            return float("{:.3f}".format(x))
    else:
        return x
    
def sim_score_conversion(df):

    # scale the human-rated values between 0 and 1

    hr_value_scaling = sklearn.preprocessing.minmax_scale(df['similarity_score'].tolist(), 
                                                          feature_range=(0, 1), 
                                                          axis=0, copy=True).tolist()

    # set a threshold to convert the scaled values into boolean labels
    similarity_threshold = 0.60
    ground_truth_labels = [True if score >= similarity_threshold else False 
                           for score in hr_value_scaling]

    df['gt_similar'] = ground_truth_labels
    
    return df

In [6]:
sts_dev_test_updated = get_updated_df(sts_dev_test)
sts_dev_test_updated = sim_score_conversion(sts_dev_test_updated)

sts_train_updated = get_updated_df(sts_train)
sts_train_updated = sim_score_conversion(sts_train_updated)


sts_dev_test_updated['similarity_score'] = sts_dev_test_updated['similarity_score'].apply(format_float)

In [7]:
sts_dev_test_updated.gt_similar.value_counts()

gt_similar
False    1139
True     1034
Name: count, dtype: int64

#### 0.1 Preprocess STS dataset

In [8]:
class Preprocessing:
    def __init__(self):
        
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))

    def rm_specialchars(self, text):

        line = re.sub(r"http\S+", "", text)
        line = re.sub("[^A-Za-z]+", " ", line)
        line = re.sub('\s+', ' ', line)
        line = line.replace('\t',' ')
        line = line.replace('\n',' ')
        line = line.replace('\r',' ')
        line = line.replace(',',' ')
        line = line.replace('-',' ')
        return line.strip()


    def rm_stopwords(self, text):

        word_tokens = word_tokenize(text)
        filtered_text = [word for word in word_tokens if word not in self.stop_words]
        return ' '.join(filtered_text)


    def lemmatize_str(self, text):
        
        lemma_text = [self.lemmatizer.lemmatize(word) for word in text]
        return (''.join(lemma_text)).strip()
    

    def lowercase_str(self, text):
        
        return text.lower()


    def basic_preprocess(self, text):
        """
        In basic preprocess: lowercase the string, remove special characters and lemmatize.
        """
        
        l_str = self.lowercase_str(text)
        sc_str = self.rm_specialchars(l_str)
        updated_text = self.lemmatize_str(sc_str)
        
        return updated_text
    
    def preprocess(self, text):
        """
        In preprocess:  lowercase the string, remove special characters and stop words.
        Finally, lemmatize the string.
        
        """
        
        l_str = self.lowercase_str(text)
        sc_str = self.rm_specialchars(l_str)
        sw_str = self.rm_stopwords(sc_str)
        updated_text = self.lemmatize_str(sw_str)
        
        return updated_text
        

In [9]:
preprocessor = Preprocessing()

def apply_preprocessing(preprocessor, df):
    
    df['b_sentence1'] = df['sentence1'].apply(lambda x: preprocessor.basic_preprocess(x))
    df['b_sentence2'] = df['sentence2'].apply(lambda x: preprocessor.basic_preprocess(x))
    
    df['p_sentence1'] = df['sentence1'].apply(lambda x: preprocessor.preprocess(x))
    df['p_sentence2'] = df['sentence2'].apply(lambda x: preprocessor.preprocess(x))
    
    return df

In [10]:
sts_train_updated = apply_preprocessing(preprocessor, sts_train_updated)
sts_dev_test_updated = apply_preprocessing(preprocessor, sts_dev_test_updated)

In [11]:
sts_dev_test_updated.head()

Unnamed: 0,sentence1,sentence2,similarity_score,gt_similar,b_sentence1,b_sentence2,p_sentence1,p_sentence2
0,A man with a hard hat is dancing.,A man wearing a hard hat is dancing.,5.0,True,a man with a hard hat is dancing,a man wearing a hard hat is dancing,man hard hat dancing,man wearing hard hat dancing
2,A man is feeding a mouse to a snake.,The man is feeding a mouse to the snake.,5.0,True,a man is feeding a mouse to a snake,the man is feeding a mouse to the snake,man feeding mouse snake,man feeding mouse snake
6,A man is erasing a chalk board.,The man is erasing the chalk board.,5.0,True,a man is erasing a chalk board,the man is erasing the chalk board,man erasing chalk board,man erasing chalk board
13,The man cut down a tree with an axe.,A man chops down a tree with an axe.,5.0,True,the man cut down a tree with an axe,a man chops down a tree with an axe,man cut tree axe,man chops tree axe
16,The girl sang into a microphone.,The lady sang into the microphone.,2.4,False,the girl sang into a microphone,the lady sang into the microphone,girl sang microphone,lady sang microphone


#### 0.2 Util Functions

In [12]:
def get_prediction_values(similarities, df, preprocess_type):
    
    # get the highest similar index in each item
    highest_similarity_indices = np.argmax(similarities, axis=1).tolist()
    
    # get the top cosine similarity of sentence2 based on index value
    df['top_similar_sentence2_score'] = [similarities[i, index] for i, index 
                                         in enumerate(highest_similarity_indices)]
    
    if preprocess_type == 'basic':
        
        # get the top similar sentence2 based on index value
        df['top_similar_sentence2'] = [df['b_sentence2'].tolist()[index] for index 
                                       in highest_similarity_indices]
    
    elif preprocess_type == 'preprocess':
        
        # get the top similar sentence2 based on index value
        df['top_similar_sentence2'] = [df['p_sentence2'].tolist()[index] for index 
                                       in highest_similarity_indices]
    
    else:
        df['top_similar_sentence2'] = [df['sentence2'].tolist()[index] for index 
                                       in highest_similarity_indices]
        
    
    # add a column related to top similar sentence indicies
    df['top_similar_sentence2_index'] = highest_similarity_indices
    
    # check if index of row matches with the index of highest similar sentence index
    df['is_index_match'] = [i == index for i, index in enumerate(highest_similarity_indices)]
    
    return df


In [13]:
def evaluate_predictions(df):
    
    """
    True Positives (TP): Cases where is_index_match is True.

    False Negatives (FN): Cases where gt_similar is True (indicating the sentences are similar) 
    but is_index_match is False (the model did not identify them as the top match).

    False Positives (FP): Cases where the model identified them as a match (i.e., is_index_match is True), 
    but they are not actually matches according to gt_similar.

    """
    TP = df['is_index_match'].sum()

    FN = df[(df['gt_similar'] == True) & (df['is_index_match'] == False)].shape[0]

    FP = df[(df['gt_similar'] == False) & (df['is_index_match'] == True)].shape[0]

    precision = TP / (TP + FP) if (TP + FP) > 0 else 0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0
    f1_score = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0

    return format_float(precision), format_float(recall), format_float(f1_score)

### Section 1: Weighted Representation Approach

#### 1.1 tfidf

In [15]:
def get_vectorizer(text_preprocessed):
    
    if text_preprocessed:
        vectorizer = TfidfVectorizer(min_df=3, max_df=0.5, 
                                     ngram_range=(1,3), 
                                     stop_words=None)

        
    else:
        vectorizer = TfidfVectorizer(min_df=3, max_df=0.5, 
                                     ngram_range=(1,3), 
                                     lowercase=False,
                                     stop_words=None) 
 
    return vectorizer


def get_train_vecs(df_train, columnA, columnB):
        
    train_sents = df_train[[columnA, columnB]].values.tolist()
    train_sents = [item for sublist in train_sents for item in sublist]

    return train_sents


def get_tfidf_results(df_train, df_test, columnA, columnB,
                      text_preprocessed, preprocess_type):
    
    train_sents = get_train_vecs(df_train, columnA, columnB)
    vectorizer = get_vectorizer(text_preprocessed)
    vectorizer.fit_transform(train_sents)
    
    list1 = vectorizer.transform(df_test[columnA].tolist())
    list2 = normalize(list1.toarray())
    
    list2 = vectorizer.transform(df_test[columnB].tolist())
    list2 = normalize(list2.toarray())
    
    sims_scores = cosine_similarity(list1, list2)
    
    updated_df = get_prediction_values(sims_scores, df_test, preprocess_type)
    
    precision, recall, f1_score = evaluate_predictions(updated_df)
    
    return precision, recall, f1_score, updated_df

In [16]:
tfidf_train_df = sts_train_updated.copy()
tfidf_test_df = sts_dev_test_updated.copy()

In [23]:
precision, recall, f1_score, tfidf_df = get_tfidf_results(tfidf_train_df, tfidf_test_df,
                                                          'sentence1', 'sentence2', 
                                                          text_preprocessed=False, 
                                                          preprocess_type=None)


bprecision, brecall, bf1_score, btfidf_df = get_tfidf_results(tfidf_train_df, tfidf_test_df,
                                                              'b_sentence1', 'b_sentence2', 
                                                              text_preprocessed=True, 
                                                              preprocess_type='basic')

pprecision, precall, pf1_score, ptfidf_df = get_tfidf_results(tfidf_train_df, tfidf_test_df,
                                                              'p_sentence1', 'p_sentence2', 
                                                              text_preprocessed=True, 
                                                              preprocess_type='preprocess')

In [24]:
ptfidf_df.head()

Unnamed: 0,sentence1,sentence2,similarity_score,gt_similar,b_sentence1,b_sentence2,p_sentence1,p_sentence2,top_similar_sentence2_score,top_similar_sentence2,top_similar_sentence2_index,is_index_match
0,A man with a hard hat is dancing.,A man wearing a hard hat is dancing.,5.0,True,a man with a hard hat is dancing,a man wearing a hard hat is dancing,man hard hat dancing,man wearing hard hat dancing,0.819109,man wearing hard hat dancing,0,True
2,A man is feeding a mouse to a snake.,The man is feeding a mouse to the snake.,5.0,True,a man is feeding a mouse to a snake,the man is feeding a mouse to the snake,man feeding mouse snake,man feeding mouse snake,1.0,man feeding mouse snake,1,True
6,A man is erasing a chalk board.,The man is erasing the chalk board.,5.0,True,a man is erasing a chalk board,the man is erasing the chalk board,man erasing chalk board,man erasing chalk board,1.0,man erasing chalk board,2,True
13,The man cut down a tree with an axe.,A man chops down a tree with an axe.,5.0,True,the man cut down a tree with an axe,a man chops down a tree with an axe,man cut tree axe,man chops tree axe,0.866955,man chops tree axe,3,True
16,The girl sang into a microphone.,The lady sang into the microphone.,2.4,False,the girl sang into a microphone,the lady sang into the microphone,girl sang microphone,lady sang microphone,0.592113,lady sang microphone,4,True


In [25]:
print(precision, recall, f1_score)
print(bprecision, brecall, bf1_score)
print(pprecision, precall, pf1_score)

0.773 0.761 0.767
0.775 0.783 0.779
0.786 0.777 0.782


### Section 2: String-level 

#### 2.1 JSI

In [26]:
jsi_df = sts_dev_test_updated.copy()

In [27]:
def jaccard_similarity(sentence1, sentence2):
    
    tokens1 = set(sentence1.split())
    tokens2 = set(sentence2.split())
    intersection = len(tokens1.intersection(tokens2))    
    return float(intersection) / (len(tokens1) + len(tokens2) - intersection)

def find_highest_jsi(df, columnA, columnB, preprocess_type):

    jsi_results = []
    ref_sent1 = df[columnA].tolist()
    ref_sent2 = df[columnB].tolist()
    
    similarity_matrix = np.zeros((len(ref_sent1), len(ref_sent2)))
    
    for i, sentence1 in enumerate(ref_sent1):
        for j, sentence2 in enumerate(ref_sent2):
            similarity_matrix[i, j] = jaccard_similarity(sentence1, sentence2)
            
    updated_df = get_prediction_values(similarity_matrix, df, preprocess_type)
    precision, recall, f1_score = evaluate_predictions(updated_df)

    return precision, recall, f1_score, updated_df


In [28]:
precision, recall, f1_score, updated_df = find_highest_jsi(jsi_df, 'sentence1', 'sentence2', 
                                                           preprocess_type=None)

bprecision, brecall, bf1_score, bupdated_df = find_highest_jsi(jsi_df, 'b_sentence1', 'b_sentence2',
                                                              preprocess_type='basic')

pprecision, precall, pf1_score, pupdated_df = find_highest_jsi(jsi_df, 'p_sentence1', 'p_sentence2', 
                                                               preprocess_type='preprocess')



In [29]:
print(precision, recall, f1_score)
print(bprecision, brecall, bf1_score)
print(pprecision, precall, pf1_score)

0.772 0.795 0.783
0.772 0.84 0.805
0.769 0.853 0.809


### Section 3: Distributed Representation Approaches

#### 3.1 Glove word2vec

In [190]:
glove_model_url = 'https://nlp.stanford.edu/data/glove.6B.zip'
save_dir = './models/glove_model'

response = requests.get(glove_model_url)

zip_file_path = os.path.join(save_dir, 'glove.6B.zip')

with open(zip_file_path, 'wb') as f:
    f.write(response.content)

with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall('./models/glove_model/')

In [30]:
def load_glove_model(File):
    
    glove_model = {}
    
    with open(File,'r') as f:
        for line in f:
            split_line = line.split()
            word = split_line[0]
            embedding = np.array(split_line[1:], dtype=np.float64)
            glove_model[word] = embedding
    
    print(f"{len(glove_model)} words loaded!")
    return glove_model

glove_model = load_glove_model('./models/glove_model/glove.6B.300d.txt')
glove_vocab = list(glove_model.keys())


400000 words loaded!


In [31]:
def word2vec_similarity(df, columnA, columnB, 
                        glove_model, glove_vocab, preprocessor, preprocess_type):
    
    ref_sent1 = df[columnA].tolist()
    ref_sent2 = df[columnB].tolist()

    ref_sent1 = [preprocessor.lowercase_str(x) for x in ref_sent1]
    ref_sent2 = [preprocessor.lowercase_str(x) for x in ref_sent2]

    glove_embeds1 = [[glove_model[word] for word in sentence.split() if word in glove_vocab] 
                     for sentence in ref_sent1]
    
    glove_embeds2 = [[glove_model[word] for word in sentence.split() if word in glove_vocab] 
                     for sentence in ref_sent2]

    sent_vec1 = [np.mean(normalize(wordvec, axis=1), axis=0) for wordvec in glove_embeds1]
    sent_vec2 = [np.mean(normalize(wordvec, axis=1), axis=0) for wordvec in glove_embeds2]
    
    sims_scores = cosine_similarity(sent_vec1, sent_vec2)
    updated_df = get_prediction_values(sims_scores, df, preprocess_type)
    
    precision, recall, f1_score = evaluate_predictions(updated_df)
    
    return precision, recall, f1_score, updated_df

In [32]:
word2vec_glove_df = sts_dev_test_updated.copy()

In [33]:
precision, recall, f1_score, word2vec_glove_df_updated = word2vec_similarity(word2vec_glove_df,
                                                                             'sentence1', 'sentence2',  
                                                                             glove_model, glove_vocab, 
                                                                             preprocessor, 
                                                                             preprocess_type= None)

bprecision, brecall, bf1_score, bword2vec_glove_df_updated = word2vec_similarity(word2vec_glove_df,
                                                                                 'b_sentence1', 'b_sentence2',  
                                                                                 glove_model, glove_vocab, 
                                                                                 preprocessor, 
                                                                                 preprocess_type = 'basic')

pprecision, precall, pf1_score, pword2vec_glove_df_updated = word2vec_similarity(word2vec_glove_df,
                                                                                 'p_sentence1', 'p_sentence2',  
                                                                                 glove_model, glove_vocab, 
                                                                                 preprocessor,
                                                                                 preprocess_type='preprocess')


In [34]:
print(precision, recall, f1_score)
print(bprecision, brecall, bf1_score)
print(pprecision, precall, pf1_score)


0.774 0.76 0.767
0.766 0.81 0.787
0.775 0.851 0.812


#### 3.2 FastText Embeddings

In [30]:
import fasttext

#  download ft model
#import fasttext.util
#fasttext.util.download_model('en', if_exists='ignore')

In [31]:
ft = fasttext.load_model('./models/cc.en.300.bin')



In [32]:
def fasttext_similarity(df, columnA, columnB, ft, preprocess_type):
    
    ref_sent1 = df[columnA].tolist()
    ref_sent2 = df[columnB].tolist()

    ft_embeds1 = [ft.get_sentence_vector(sent) for sent in ref_sent1]    
    ft_embeds2 = [ft.get_sentence_vector(sent) for sent in ref_sent2]
    
    sims_scores = cosine_similarity(ft_embeds1, ft_embeds2)
    updated_df = get_prediction_values(sims_scores, df, preprocess_type)
    
    precision, recall, f1_score = evaluate_predictions(updated_df)
    
    return precision, recall, f1_score, updated_df

In [33]:
ft_df = sts_dev_test_updated.copy()

In [34]:
precision, recall, f1_score, ft_df_updated = fasttext_similarity(ft_df,
                                                                 'sentence1', 'sentence2',  
                                                                 ft, preprocess_type= None)

bprecision, brecall, bf1_score, bft_df_updated = fasttext_similarity(ft_df,
                                                                     'b_sentence1', 'b_sentence2',  
                                                                     ft, preprocess_type= 'basic')

pprecision, precall, pf1_score, pft_df_updated = fasttext_similarity(ft_df,
                                                                     'p_sentence1', 'p_sentence2',  
                                                                     ft, preprocess_type= 'preprocess')

In [35]:
print(precision, recall, f1_score)
print(bprecision, brecall, bf1_score)
print(pprecision, precall, pf1_score)

0.776 0.796 0.786
0.774 0.819 0.796
0.772 0.867 0.817


### Section 4: Contextual Representation Techniques

#### 4.1 Universal Sentence Encoder

In [47]:
def get_use():
    module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/5" 
    use_model = hub.load(module_url)
    print("module %s loaded" % module_url)
    return use_model

use_model = get_use()

module https://tfhub.dev/google/universal-sentence-encoder-large/5 loaded


In [48]:
def use_cossim(df, columnA, columnB, use_model):
    

    sts_encode1 = tf.nn.l2_normalize(use_model(tf.constant(df[columnA].tolist())), 
                                     axis=1)
    sts_encode2 = tf.nn.l2_normalize(use_model(tf.constant(df[columnB].tolist())), 
                                     axis=1)
    
    cosine_similarities = cosine_similarity(sts_encode1, sts_encode2)

    return cosine_similarities

def get_use_results(df, columnA, columnB, preprocess_type):
    
    similarities = use_cossim(df, columnA, columnB, use_model)

    updated_df = get_prediction_values(similarities, df, preprocess_type)
    
    precision, recall, f1_score = evaluate_predictions(updated_df)
    
    return precision, recall, f1_score, updated_df

In [49]:
use_df = sts_dev_test_updated.copy()

In [50]:
use_precision, use_recall, use_f1_score, use_sts_df = get_use_results(use_df, 'sentence1', 
                                                                      'sentence2', preprocess_type=None)

buse_precision, buse_recall, buse_f1_score, buse_sts_df = get_use_results(use_df, 'b_sentence1', 
                                                                          'b_sentence2', preprocess_type='basic')

puse_precision, puse_recall, puse_f1_score, puse_sts_df = get_use_results(use_df, 'p_sentence1', 
                                                                          'p_sentence2',
                                                                          preprocess_type='preprocess')

In [51]:
print(use_precision, use_recall, use_f1_score)
print(buse_precision, buse_recall, buse_f1_score)
print(puse_precision, puse_recall, puse_f1_score)

0.755 0.903 0.822
0.757 0.902 0.823
0.761 0.897 0.824


#### 4.2 Sentence Transformers

In [16]:
def get_sent_model(model_name):
    return SentenceTransformer(model_name)

def call_sent_transformers(df, columnA, columnB, sent_model, preprocess_type):
    
        ref_sent1 = df[columnA].tolist()
        ref_sent2 = df[columnB].tolist()
        
        sent_embeddings1 = sent_model.encode(ref_sent1)
        sent_embeddings2 = sent_model.encode(ref_sent2)
        
        cosine_scores = cosine_similarity(normalize(sent_embeddings1, axis=1), 
                                          normalize(sent_embeddings2, axis=1))
        
        updated_df = get_prediction_values(cosine_scores, df, preprocess_type)

        precision, recall, f1_score = evaluate_predictions(updated_df)

        return precision, recall, f1_score, updated_df

#### 4.2.1 all-MiniLM-L6-v2

In [53]:
all_minilm_l6_df = sts_dev_test_updated.copy()

sent_model = get_sent_model('sentence-transformers/all-MiniLM-L6-v2')

In [54]:

precision, recall, f1_score, updated_df = call_sent_transformers(all_minilm_l6_df, 'sentence1', 
                                                                 'sentence2', 
                                                                 sent_model, 
                                                                 preprocess_type=None)

b_precision, b_recall, b_f1_score, b_updated_df = call_sent_transformers(all_minilm_l6_df, 'b_sentence1', 
                                                                         'b_sentence2', 
                                                                         sent_model, 
                                                                         preprocess_type='basic')

p_precision, p_recall, p_f1_score, p_updated_df = call_sent_transformers(all_minilm_l6_df, 'p_sentence1', 
                                                                         'p_sentence2', 
                                                                         sent_model, 
                                                                         preprocess_type='preprocess')


del sent_model
gc.collect()
torch.cuda.empty_cache()

In [56]:
print(precision, recall, f1_score)
print(b_precision, b_recall, b_f1_score)
print(p_precision, p_recall, p_f1_score)

0.754 0.909 0.824
0.752 0.91 0.823
0.757 0.908 0.826


#### 4.2.2 Stsb-roberta

In [17]:
stsb_roberta_df = sts_dev_test_updated.copy()

sent_model = get_sent_model('sentence-transformers/stsb-roberta-base-v2')

In [18]:
precision, recall, f1_score, updated_df = call_sent_transformers(stsb_roberta_df, 'sentence1', 
                                                                 'sentence2', 
                                                                 sent_model, preprocess_type=None)

b_precision, b_recall, b_f1_score, b_updated_df = call_sent_transformers(stsb_roberta_df, 'b_sentence1', 
                                                                         'b_sentence2', 
                                                                         sent_model, preprocess_type='basic')

p_precision, p_recall, p_f1_score, p_updated_df = call_sent_transformers(stsb_roberta_df, 'p_sentence1', 
                                                                         'p_sentence2', 
                                                                         sent_model, 
                                                                         preprocess_type='preprocess')


del sent_model
gc.collect()
torch.cuda.empty_cache()

In [19]:
print(precision, recall, f1_score)
print(b_precision, b_recall, b_f1_score)
print(p_precision, p_recall, p_f1_score)

0.784 0.899 0.838
0.782 0.902 0.838
0.784 0.888 0.832


#### 4.3 BERT Model

In [20]:
def get_bert_results(df, columnA, columnB, model, tokenizer, CLS_embedding, preprocess_type):
    
        ref_sent1 = df[columnA].tolist()
        ref_sent2 = df[columnB].tolist()
        
        ref_sent1 = [preprocessor.lowercase_str(x) for x in ref_sent1]
        ref_sent2 = [preprocessor.lowercase_str(x) for x in ref_sent2]
        
        encoded_input1 = tokenizer(ref_sent1, padding=True, 
                                   truncation=True, 
                                   return_tensors="pt")
        
        encoded_input2 = tokenizer(ref_sent2, padding=True, 
                                   truncation=True, 
                                   return_tensors="pt")
        
        with torch.no_grad():
            sent_embeddings1 = model(**encoded_input1)
            sent_embeddings2 = model(**encoded_input2)
            
        if CLS_embedding:
            embedds1 = sent_embeddings1.last_hidden_state[:, 0, :]
            embedds2 = sent_embeddings2.last_hidden_state[:, 0, :]
            
        else:
            embedds1 = sent_embeddings1.last_hidden_state.mean(dim=1)
            embedds2 = sent_embeddings2.last_hidden_state.mean(dim=1)
            
        norm_embeddings1 = torch.nn.functional.normalize(embedds1, p=2, dim=1).numpy()
        norm_embeddings2 = torch.nn.functional.normalize(embedds2, p=2, dim=1).numpy()

        cosine_scores = cosine_similarity(norm_embeddings1, norm_embeddings2)
        
        updated_df = get_prediction_values(cosine_scores, df, preprocess_type)

        precision, recall, f1_score = evaluate_predictions(updated_df)

        return precision, recall, f1_score, updated_df
    

In [21]:
model_name = "bert-base-uncased"

bert_tokenizer = BertTokenizer.from_pretrained(model_name)
bert_model = BertModel.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

#### 4.3.1 bert avg. embeddings

In [22]:
bert_avg_df = sts_dev_test_updated.copy()

In [23]:
precision, recall, f1_score, updated_df = get_bert_results(bert_avg_df, 'sentence1', 'sentence2', 
                                                          bert_model, bert_tokenizer, CLS_embedding=False,
                                                          preprocess_type= None)

bprecision, brecall, bf1_score, bupdated_df = get_bert_results(bert_avg_df, 'b_sentence1', 'b_sentence2', 
                                                           bert_model, bert_tokenizer, CLS_embedding=False,
                                                              preprocess_type = 'basic')


pprecision, precall, pf1_score, pupdated_df = get_bert_results(bert_avg_df, 'p_sentence1', 'p_sentence2', 
                                                            bert_model, bert_tokenizer, CLS_embedding=False,
                                                              preprocess_type='preprocess')


In [24]:
print(precision, recall, f1_score)
print(bprecision, brecall, bf1_score)
print(pprecision, precall, pf1_score)

0.778 0.77 0.774
0.775 0.768 0.772
0.784 0.745 0.764


#### 4.3.2 bert CLS embeddings

In [25]:
bert_cls_df = sts_dev_test_updated.copy()

In [26]:
precision, recall, f1_score, updated_df = get_bert_results(bert_cls_df, 'sentence1', 'sentence2', 
                                                          bert_model, bert_tokenizer, CLS_embedding=True,
                                                           preprocess_type= None)

bprecision, brecall, bf1_score, bupdated_df = get_bert_results(bert_cls_df, 'b_sentence1', 'b_sentence2', 
                                                               bert_model, bert_tokenizer, CLS_embedding=True,
                                                               preprocess_type= 'basic')


pprecision, precall, pf1_score, pupdated_df = get_bert_results(bert_cls_df, 'p_sentence1', 'p_sentence2', 
                                                               bert_model, bert_tokenizer, CLS_embedding=True,
                                                               preprocess_type= 'preprocess')

In [27]:
print(precision, recall, f1_score)
print(bprecision, brecall, bf1_score)
print(pprecision, precall, pf1_score)

0.785 0.573 0.662
0.783 0.593 0.675
0.814 0.591 0.685


In [28]:
del bert_model
gc.collect()
torch.cuda.empty_cache()