### Description

The notebook provides different NLP similarity pipelines, evaluated on public STS dataset.

The notebook details functions for preprocessing text and calculating similarities through cosine similarity. Various embedding strategies are explored, including weighted, string-level, distributed and contexual across different preprocessing techniques. It assesses model performance using pearson and spearson coorelation score, presenting comparative results to determine the most effective methods.

In [None]:
import warnings
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

import pandas as pd
pd.options.mode.chained_assignment = None

import torch
from datasets import load_dataset

import tensorflow as tf
import tensorflow_hub as hub
import numpy as np

import math
import scipy

from sentence_transformers import SentenceTransformer
import gc

from transformers import BertTokenizer, BertModel, RobertaTokenizer, RobertaModel

import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('punkt')

from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

### Section 0: Load & Preprocess STS Dataset 

In [2]:
# Load the 'stsb_multi_mt' dataset specifically for the English language ('en')
# 'stsb_multi_mt' refers to the multilingual STS benchmark dataset which includes various translations
# The 'name="en"' parameter specifies that we are interested in the English portion of this dataset
# This dataset is used for evaluating semantic textual similarity models

dataset = load_dataset("stsb_multi_mt", name = "en")

In [3]:
sts_train = dataset['train'].to_pandas()
sts_test = dataset['test'].to_pandas()
sts_dev = dataset['dev'].to_pandas()

sts_dev_test = pd.concat([sts_dev, sts_test], ignore_index=True)

In [4]:
def get_updated_df(df):
    
    char_count_columns = df[['sentence1', 'sentence2']].astype(str).applymap(len)
    df = df[char_count_columns.gt(30).all(axis=1)]
    return df

def format_float(x):
    if isinstance(x, float):
        if x.is_integer():
            return x
        else:
            return float("{:.2f}".format(x))
    else:
        return x

In [5]:
sts_dev_test_updated = get_updated_df(sts_dev_test)
sts_train_updated = get_updated_df(sts_train)

sts_dev_test_updated['similarity_score'] = sts_dev_test_updated['similarity_score'].apply(format_float)

In [6]:
sts_dev_test_updated.head()

Unnamed: 0,sentence1,sentence2,similarity_score
0,A man with a hard hat is dancing.,A man wearing a hard hat is dancing.,5.0
2,A man is feeding a mouse to a snake.,The man is feeding a mouse to the snake.,5.0
6,A man is erasing a chalk board.,The man is erasing the chalk board.,5.0
13,The man cut down a tree with an axe.,A man chops down a tree with an axe.,5.0
16,The girl sang into a microphone.,The lady sang into the microphone.,2.4


In [7]:
class Preprocessing:
    def __init__(self):
        
        # Initialize the Preprocessing class with a lemmatizer and a set of English stop words
        
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))

    def rm_specialchars(self, text):

        line = re.sub(r"http\S+", "", text)
        line = re.sub("[^A-Za-z]+", " ", line)
        line = re.sub('\s+', ' ', line)
        line = line.replace('\t',' ')
        line = line.replace('\n',' ')
        line = line.replace('\r',' ')
        line = line.replace(',',' ')
        line = line.replace('-',' ')
        return line.strip()


    def rm_stopwords(self, text):

        word_tokens = word_tokenize(text)
        filtered_text = [word for word in word_tokens if word not in self.stop_words]
        return ' '.join(filtered_text)


    def lemmatize_str(self, text):
        
        lemma_text = [self.lemmatizer.lemmatize(word) for word in text]
        return (''.join(lemma_text)).strip()
    

    def lowercase_str(self, text):
        
        return text.lower()


    def basic_preprocess(self, text):
        """
        Perform a basic preprocessing routine on the input text:
        Lowercase the string, remove special characters, and lemmatize the result.
        
        """
        
        l_str = self.lowercase_str(text)
        sc_str = self.rm_specialchars(l_str)
        updated_text = self.lemmatize_str(sc_str)
        
        return updated_text
    
    def preprocess(self, text):
        """
        Perform a comprehensive preprocessing routine on the input text:
        Lowercase the string, remove special characters and stopwords, and lemmatize the result.
        
        """
        
        l_str = self.lowercase_str(text)
        sc_str = self.rm_specialchars(l_str)
        sw_str = self.rm_stopwords(sc_str)
        updated_text = self.lemmatize_str(sw_str)
        
        return updated_text
        

In [8]:
preprocessor = Preprocessing()

def apply_preprocessing(preprocessor, df):
    
    df['b_sentence1'] = df['sentence1'].apply(lambda x: preprocessor.basic_preprocess(x))
    df['b_sentence2'] = df['sentence2'].apply(lambda x: preprocessor.basic_preprocess(x))
    
    df['p_sentence1'] = df['sentence1'].apply(lambda x: preprocessor.preprocess(x))
    df['p_sentence2'] = df['sentence2'].apply(lambda x: preprocessor.preprocess(x))
    
    return df

In [9]:
sts_train_updated = apply_preprocessing(preprocessor, sts_train_updated)
sts_dev_test_updated = apply_preprocessing(preprocessor, sts_dev_test_updated)

In [10]:
sts_dev_test_updated.head()

Unnamed: 0,sentence1,sentence2,similarity_score,b_sentence1,b_sentence2,p_sentence1,p_sentence2
0,A man with a hard hat is dancing.,A man wearing a hard hat is dancing.,5.0,a man with a hard hat is dancing,a man wearing a hard hat is dancing,man hard hat dancing,man wearing hard hat dancing
2,A man is feeding a mouse to a snake.,The man is feeding a mouse to the snake.,5.0,a man is feeding a mouse to a snake,the man is feeding a mouse to the snake,man feeding mouse snake,man feeding mouse snake
6,A man is erasing a chalk board.,The man is erasing the chalk board.,5.0,a man is erasing a chalk board,the man is erasing the chalk board,man erasing chalk board,man erasing chalk board
13,The man cut down a tree with an axe.,A man chops down a tree with an axe.,5.0,the man cut down a tree with an axe,a man chops down a tree with an axe,man cut tree axe,man chops tree axe
16,The girl sang into a microphone.,The lady sang into the microphone.,2.4,the girl sang into a microphone,the lady sang into the microphone,girl sang microphone,lady sang microphone


### Util Functions

In [11]:
def format_corr_values(dictionary):
    
    modified_dict = {key: float(format(value, ".4e")) if 'e' in str(value) else float(format(value, ".4f")) 
                     for key, value in dictionary.items()}
    
    return modified_dict

In [12]:
def get_pearson_spearman_results(sts_pearson_corr, b_pearson_corr, p_pearson_corr,
                                 sts_spearmanr_corr, b_spearmanr_corr, p_spearmanr_corr):
    
    # Define a dictionary to store Pearson and Spearman correlation results for 
    #three different preprocessing stages

    corr_results = {
        "Pearson": {
            "sts": {
                "r": sts_pearson_corr.correlation,
                "pvalue":sts_pearson_corr.pvalue
                
            },
            "basic_sts": {
                "r": b_pearson_corr.correlation,
                "pvalue": b_pearson_corr.pvalue
                

            },
            "preprocess_sts": {
                "r": p_pearson_corr.correlation,
                "pvalue":p_pearson_corr.pvalue
 
            },
        },
        "Spearman": {
            "sts": {
                "r": sts_spearmanr_corr.correlation,
                "pvalue": sts_spearmanr_corr.pvalue,
                
            },
            "basic_sts": {
                "r": b_spearmanr_corr.correlation,
                "pvalue": b_spearmanr_corr.pvalue,

            },
            "preprocess_sts": {
                "r": p_spearmanr_corr.correlation,
                "pvalue": p_spearmanr_corr.pvalue,

            },
        },
    }
    
    #  Convert the nested dictionary into a DataFrame for better readability and further analysis

    corr_results_df = pd.DataFrame.from_dict(corr_results)
    
    
    corr_results_df['Pearson'] = corr_results_df['Pearson'].apply(format_corr_values)
    corr_results_df['Spearman'] = corr_results_df['Spearman'].apply(format_corr_values) 

    return corr_results_df

### Section 1: Weighted Representation Approach

#### 1.1 tfidf

In [31]:
def get_vectorizer(text_preprocessed):
    
    if text_preprocessed:
        vectorizer = TfidfVectorizer(min_df=3, max_df=0.5, 
                                     ngram_range=(1,3), 
                                     stop_words=None)

        
    else:
        vectorizer = TfidfVectorizer(min_df=3, max_df=0.5, 
                                     ngram_range=(1,3), 
                                     lowercase=False,
                                     stop_words=None) 
 
    return vectorizer


def get_train_vecs(df_train, columnA, columnB):
        
    train_sents = df_train[[columnA, columnB]].values.tolist()
    train_sents = [item for sublist in train_sents for item in sublist]

    return train_sents


def get_tfidf_results(df_train, df_test, columnA, columnB, score_column,
                      text_preprocessed):
    
    train_sents = get_train_vecs(df_train, columnA, columnB)
    vectorizer = get_vectorizer(text_preprocessed)
    vectorizer.fit_transform(train_sents)
    
    list1 = vectorizer.transform(df_test[columnA].tolist())
    list2 = normalize(list1.toarray())
    
    list2 = vectorizer.transform(df_test[columnB].tolist())
    list2 = normalize(list2.toarray())
    
    sims_scores = cosine_similarity(list1, list2)
    tfidf_scores = list(np.diagonal(sims_scores))
    tfidf_scores = [float("{:.2f}".format(i)) for i in tfidf_scores]
    
    df_test[score_column] = tfidf_scores
    
    sts_scores = df_test['similarity_score'].tolist()
        
    pearson_corr = scipy.stats.pearsonr(tfidf_scores, sts_scores)
    spearmanr_corr = scipy.stats.spearmanr(tfidf_scores, sts_scores)

    return df_test, pearson_corr, spearmanr_corr


In [32]:
tfidf_train_df = sts_train_updated.copy()
tfidf_test_df = sts_dev_test_updated.copy()

In [33]:
df_tfidf, pearsonr, spearmanr = get_tfidf_results(tfidf_train_df, tfidf_test_df,
                                                 'sentence1', 'sentence2', 
                                                 'sts_tfidf_score', 
                                                 text_preprocessed=False)

df_tfidf, b_pearson, b_spearmanr = get_tfidf_results(tfidf_train_df, tfidf_test_df,
                                                     'b_sentence1', 'b_sentence2', 
                                                     'b_sts_tfidf_score', 
                                                     text_preprocessed=True)


df_tfidf, p_pearson, p_spearmanr = get_tfidf_results(tfidf_train_df, tfidf_test_df,
                                                     'p_sentence1', 'p_sentence2', 
                                                     'p_sts_tfidf_score',
                                                     text_preprocessed=True)


In [34]:
tfidf_corr_results = get_pearson_spearman_results(pearsonr, b_pearson, p_pearson,
                                                  spearmanr, b_spearmanr, p_spearmanr)
tfidf_corr_results

Unnamed: 0,Pearson,Spearman
sts,"{'r': 0.5901, 'pvalue': 4.5673e-204}","{'r': 0.5918, 'pvalue': 1.5367e-205}"
basic_sts,"{'r': 0.6142, 'pvalue': 1.6358e-225}","{'r': 0.6148, 'pvalue': 4.5898e-226}"
preprocess_sts,"{'r': 0.6423, 'pvalue': 4.031e-253}","{'r': 0.635, 'pvalue': 1.1265e-245}"


In [35]:
df_tfidf.head()

Unnamed: 0,sentence1,sentence2,similarity_score,b_sentence1,b_sentence2,p_sentence1,p_sentence2,sts_tfidf_score,b_sts_tfidf_score,p_sts_tfidf_score
0,A man with a hard hat is dancing.,A man wearing a hard hat is dancing.,5.0,a man with a hard hat is dancing,a man wearing a hard hat is dancing,man hard hat dancing,man wearing hard hat dancing,0.76,0.76,0.82
2,A man is feeding a mouse to a snake.,The man is feeding a mouse to the snake.,5.0,a man is feeding a mouse to a snake,the man is feeding a mouse to the snake,man feeding mouse snake,man feeding mouse snake,0.82,0.82,1.0
6,A man is erasing a chalk board.,The man is erasing the chalk board.,5.0,a man is erasing a chalk board,the man is erasing the chalk board,man erasing chalk board,man erasing chalk board,0.7,0.69,1.0
13,The man cut down a tree with an axe.,A man chops down a tree with an axe.,5.0,the man cut down a tree with an axe,a man chops down a tree with an axe,man cut tree axe,man chops tree axe,0.86,0.87,0.87
16,The girl sang into a microphone.,The lady sang into the microphone.,2.4,the girl sang into a microphone,the lady sang into the microphone,girl sang microphone,lady sang microphone,0.32,0.32,0.59


In [17]:
df_tfidf.to_csv('./results/STS_results/TFIDF/tfidf_sts_results.csv', index=False)
tfidf_corr_results.to_csv('./results/STS_results/TFIDF/tfidf_sts_corr_results.csv', index=False)

### Section 2: String-Level

#### 2.1 JSI

In [18]:
jsi_df = sts_dev_test_updated.copy()

In [19]:
def jaccard_similarity(sentence1, sentence2):
    
    tokens1 = set(sentence1.split())
    tokens2 = set(sentence2.split())
    intersection = len(tokens1.intersection(tokens2))    
    return float(intersection) / (len(tokens1) + len(tokens2) - intersection)

def find_highest_jsi(df, result_column, columnA, columnB):

    jsi_results = []
    ref_sent1 = df[columnA].tolist()
    ref_sent2 = df[columnB].tolist()
    
    for sent1, sent2 in zip(ref_sent1, ref_sent2):
        
        jsi_score = jaccard_similarity(sent1, sent2)
        jsi_results.append(float("{:.2f}".format(jsi_score)))
    
    df[result_column] = jsi_results

    sts_scores = df['similarity_score'].tolist()
    
    pearson_corr = scipy.stats.pearsonr(jsi_results, sts_scores)
    spearmanr_corr = scipy.stats.spearmanr(jsi_results, sts_scores)
          
    return df, pearson_corr, spearmanr_corr

In [20]:
jsi_df, pearsonr, spearmanr = find_highest_jsi(jsi_df, 'sts_jsi_score',
                                               'sentence1', 'sentence2')

jsi_df, b_pearson, b_spearmanr = find_highest_jsi(jsi_df, 'b_sts_jsi_score',
                                                  'b_sentence1', 'b_sentence2')

jsi_df, p_pearson, p_spearmanr = find_highest_jsi(jsi_df, 'p_sts_jsi_score',
                                                  'p_sentence1', 'p_sentence2')
jsi_corr_results = get_pearson_spearman_results(pearsonr, b_pearson, p_pearson,
                                                spearmanr, b_spearmanr, p_spearmanr)


In [21]:
jsi_corr_results

Unnamed: 0,Pearson,Spearman
sts,"{'r': 0.5679, 'pvalue': 8.554e-186}","{'r': 0.5757, 'pvalue': 4.7407e-192}"
basic_sts,"{'r': 0.6493, 'pvalue': 2.2982e-260}","{'r': 0.6557, 'pvalue': 2.7874e-267}"
preprocess_sts,"{'r': 0.6982, 'pvalue': 1.8116e-317}","{'r': 0.7076, 'pvalue': 0.0}"


In [22]:
jsi_df.to_csv('./results/STS_results/JSI/JSI_sts_results.csv', 
              index=False)

jsi_corr_results.to_csv('./results/STS_results/JSI/JSI_sts_corr_results.csv', 
                        index=False)

### Section 3: Distributed Representation Approaches

#### 3.1 Glove Word2Vec

In [None]:
import requests
import os
import zipfile

glove_model_url = 'https://nlp.stanford.edu/data/glove.6B.zip'
save_dir = './models/glove_model'

response = requests.get(glove_model_url)

zip_file_path = os.path.join(save_dir, 'glove.6B.zip')

with open(zip_file_path, 'wb') as f:
    f.write(response.content)

with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall('./models/glove_model/')

In [24]:
def load_glove_model(File):
    
    glove_model = {}
    
    with open(File,'r') as f:
        for line in f:
            split_line = line.split()
            word = split_line[0]
            embedding = np.array(split_line[1:], dtype=np.float64)
            glove_model[word] = embedding
    
    print(f"{len(glove_model)} words loaded!")
    return glove_model

glove_model = load_glove_model('./models/glove_model/glove.6B.300d.txt')
glove_vocab = list(glove_model.keys())


400000 words loaded!


In [25]:
def word2vec_similarity(df, result_column, columnA, columnB, 
                        glove_model, glove_vocab, preprocessor):
    

    ref_sent1 = df[columnA].tolist()
    ref_sent2 = df[columnB].tolist()
    
    if result_column == 'sts_word2vec_score':
            # special case where original sentences needs to be lower cased because the loaded 
            #glove model is pretrained on lowercase corpus
        ref_sent1 = [preprocessor.lowercase_str(x) for x in ref_sent1]
        ref_sent2 = [preprocessor.lowercase_str(x) for x in ref_sent2]

    
    glove_embeds1 = [[glove_model[word] for word in sentence.split() if word in glove_vocab] 
                     for sentence in ref_sent1]
    
    glove_embeds2 = [[glove_model[word] for word in sentence.split() if word in glove_vocab] 
                     for sentence in ref_sent2]

    sent_vec1 = [np.mean(normalize(wordvec, axis=1), axis=0) for wordvec in glove_embeds1]
    sent_vec2 = [np.mean(normalize(wordvec, axis=1), axis=0) for wordvec in glove_embeds2]
    
    word2vec_scores = list(np.diagonal(cosine_similarity(sent_vec1, sent_vec2)))
    word2vec_scores = [float("{:.2f}".format(i)) for i in word2vec_scores]
    
    df[result_column] = word2vec_scores
    
    sts_scores = df['similarity_score'].tolist()
        
    pearson_corr = scipy.stats.pearsonr(word2vec_scores, sts_scores)
    spearmanr_corr = scipy.stats.spearmanr(word2vec_scores, sts_scores)
    
    return df, pearson_corr, spearmanr_corr

In [26]:
word2vec_glove_df = sts_dev_test_updated.copy()

In [27]:
word2vec_glove_df, pearsonr, spearmanr = word2vec_similarity(word2vec_glove_df, 'sts_word2vec_score',
                                                        'sentence1', 'sentence2',  
                                                        glove_model, glove_vocab, preprocessor)

word2vec_glove_df, b_pearson, b_spearmanr = word2vec_similarity(word2vec_glove_df, 'b_sts_word2vec_score',
                                                           'b_sentence1', 'b_sentence2', 
                                                           glove_model, glove_vocab, preprocessor)

word2vec_glove_df, p_pearson, p_spearmanr = word2vec_similarity(word2vec_glove_df, 'p_sts_word2vec_score',
                                                           'p_sentence1', 'p_sentence2', 
                                                           glove_model, glove_vocab, preprocessor)

In [28]:
word2vec_glove_corr_results = get_pearson_spearman_results(pearsonr, b_pearson, p_pearson,
                                                           spearmanr, b_spearmanr, p_spearmanr)

In [29]:
word2vec_glove_corr_results

Unnamed: 0,Pearson,Spearman
sts,"{'r': 0.4006, 'pvalue': 1.4899e-84}","{'r': 0.445, 'pvalue': 3.438e-106}"
basic_sts,"{'r': 0.4769, 'pvalue': 7.9044e-124}","{'r': 0.5178, 'pvalue': 2.4746e-149}"
preprocess_sts,"{'r': 0.6636, 'pvalue': 5.4339e-276}","{'r': 0.6653, 'pvalue': 7.1907e-278}"


In [30]:
word2vec_glove_df.to_csv('./results/STS_results/Glove_word2vec/word2vec_sts_results.csv', 
                         index=False)
word2vec_glove_corr_results.to_csv('./results/STS_results/Glove_word2vec/word2vec_sts_corr_results.csv', 
                                   index=False)

#### 3.2 FastText Embeddings

In [25]:
# download ft model

#import fasttext.util
#import shutil

#fasttext.util.download_model('en', if_exists='ignore')

In [32]:
import fasttext
ft = fasttext.load_model('./models/cc.en.300.bin')



In [33]:
def fasttext_similarity(df, result_column, columnA, columnB, ft):
    
    ref_sent1 = df[columnA].tolist()
    ref_sent2 = df[columnB].tolist()

    ft_embeds1 = [ft.get_sentence_vector(sent) for sent in ref_sent1]    
    ft_embeds2 = [ft.get_sentence_vector(sent) for sent in ref_sent2]    
    
    ft_scores = list(np.diagonal(cosine_similarity(ft_embeds1, ft_embeds2)))
    ft_scores = [float("{:.2f}".format(i)) for i in ft_scores]
    
    df[result_column] = ft_scores
    sts_scores = df['similarity_score'].tolist()
        
    pearson_corr = scipy.stats.pearsonr(ft_scores, sts_scores)
    spearmanr_corr = scipy.stats.spearmanr(ft_scores, sts_scores)
    
    return df, pearson_corr, spearmanr_corr
    

In [34]:
ft_df = sts_dev_test_updated.copy()

In [35]:
ft_df, pearsonr, spearmanr = fasttext_similarity(ft_df, 'sts_ft_score',
                                                        'sentence1', 'sentence2', ft)

ft_df, b_pearson, b_spearmanr = fasttext_similarity(ft_df, 'b_ft_score',
                                                           'b_sentence1', 'b_sentence2', ft)

ft_df, p_pearson, p_spearmanr = fasttext_similarity(ft_df, 'p_sts_ft_score',
                                                           'p_sentence1', 'p_sentence2', ft)

In [36]:
ft_corr_results = get_pearson_spearman_results(pearsonr, b_pearson, p_pearson,
                                               spearmanr, b_spearmanr, p_spearmanr)
ft_corr_results

Unnamed: 0,Pearson,Spearman
sts,"{'r': 0.472, 'pvalue': 5.6961e-121}","{'r': 0.4871, 'pvalue': 7.2688e-130}"
basic_sts,"{'r': 0.5367, 'pvalue': 2.1036e-162}","{'r': 0.5486, 'pvalue': 5.4266e-171}"
preprocess_sts,"{'r': 0.7133, 'pvalue': 0.0}","{'r': 0.7064, 'pvalue': 0.0}"


In [37]:
ft_df.to_csv('./results/STS_results/Fasttext_model/ft_sts_results.csv', index=False)
ft_corr_results.to_csv('./results/STS_results/Fasttext_model/ft_sts_corr_results.csv', index=False)

### Section 4: Contextual Representation Techniques

#### 4.1 Universal Sentence Encoder

In [38]:
def get_use():
    module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/5" 
    use_model = hub.load(module_url)
    print("module %s loaded" % module_url)
    return use_model

In [39]:
use_model = get_use()

module https://tfhub.dev/google/universal-sentence-encoder-large/5 loaded


In [40]:
def use_cossim(df, columnA, columnB, use_model):
    
    sts_encode1 = tf.nn.l2_normalize(use_model(tf.constant(df[columnA].tolist())), 
                                     axis=1)
    sts_encode2 = tf.nn.l2_normalize(use_model(tf.constant(df[columnB].tolist())), 
                                     axis=1)
    
    cosine_similarities = tf.reduce_sum(tf.multiply(sts_encode1, sts_encode2), 
                                        axis=1)
    return cosine_similarities

def get_use_results(df, columnA, columnB, result_column):
    
    cos_sim = use_cossim(df, columnA, columnB, use_model)    
    cos_sim_scores = [float("{:.2f}".format(i)) for i in cos_sim.numpy().tolist()]
    
    df[result_column] = cos_sim_scores
    
    sts_scores = df['similarity_score'].tolist()
    
    pearson_corr = scipy.stats.pearsonr(df[result_column].tolist(), sts_scores)
    spearmanr_corr = scipy.stats.spearmanr(df[result_column].tolist(), sts_scores)
    
    return pearson_corr, spearmanr_corr, df

In [41]:
use_df = sts_dev_test_updated.copy()

In [42]:
sts_pearson_corr, sts_spearmanr_corr, sts_df = get_use_results(use_df, 'sentence1', 
                                                               'sentence2', 'sts_use_score')

b_pearson_corr, b_spearmanr_corr, b_sts_df = get_use_results(use_df, 'b_sentence1', 
                                                             'b_sentence2', 'b_sts_use_score')

p_pearson_corr, p_spearmanr_corr, p_sts_df = get_use_results(use_df, 'p_sentence1', 
                                                             'p_sentence2', 'p_sts_use_score')

In [43]:
use_corr_results = get_pearson_spearman_results(sts_pearson_corr, b_pearson_corr, p_pearson_corr,
                                            sts_spearmanr_corr, b_spearmanr_corr, p_spearmanr_corr)
use_corr_results

Unnamed: 0,Pearson,Spearman
sts,"{'r': 0.8061, 'pvalue': 0.0}","{'r': 0.7954, 'pvalue': 0.0}"
basic_sts,"{'r': 0.8013, 'pvalue': 0.0}","{'r': 0.7896, 'pvalue': 0.0}"
preprocess_sts,"{'r': 0.781, 'pvalue': 0.0}","{'r': 0.7674, 'pvalue': 0.0}"


In [44]:
use_df.to_csv('./results/STS_results/USE_model/use_sts_results.csv', index=False)
use_corr_results.to_csv('./results/STS_results/USE_model/use_sts_corr_results.csv', index=False)

#### 4.2 Sentence Transformers

In [13]:
def get_sent_model(model_name):
    return SentenceTransformer(model_name)

def call_sent_transformers(df, columnA, columnB, result_column, sent_model):
    
        ref_sent1 = df[columnA].tolist()
        ref_sent2 = df[columnB].tolist()
        
        sent_embeddings1 = sent_model.encode(ref_sent1)
        sent_embeddings2 = sent_model.encode(ref_sent2)
        
        cosine_scores = cosine_similarity(normalize(sent_embeddings1, axis=1), 
                                          normalize(sent_embeddings2, axis=1))
        
        st_scores = [float("{:.2f}".format(x)) for x in list(np.diagonal(cosine_scores))]
        df[result_column] = st_scores
        
        sts_scores = df['similarity_score'].tolist()
        
        pearson_corr = scipy.stats.pearsonr(st_scores, sts_scores)
        spearmanr_corr = scipy.stats.spearmanr(st_scores, sts_scores)

        return df, pearson_corr, spearmanr_corr

In [14]:
def get_sent_transformer_results(df, model_name):
    
    sent_model = get_sent_model(model_name)
    
    df, pearsonr, spearmanr = call_sent_transformers(df, 'sentence1', 'sentence2', 
                                                     'sts_st_' + model_name + '_score', sent_model)

    df, b_pearsonr, b_spearmanr = call_sent_transformers(df, 'b_sentence1', 'b_sentence2', 
                                                         'b_sts_st_' + model_name + '_score', sent_model)

    df, p_pearsonr, p_spearmanr = call_sent_transformers(df, 'p_sentence1', 'p_sentence2', 
                                                         'p_sts_st_' + model_name + '_score', sent_model)

    corr_results = get_pearson_spearman_results(pearsonr, b_pearsonr, p_pearsonr,
                                                 spearmanr, b_spearmanr, p_spearmanr)
    
    del sent_model
    gc.collect()
    torch.cuda.empty_cache()
    
    return corr_results, df

#### 4.2.1 all-MiniLM-L6-v2

In [47]:
all_minilm_l6_df = sts_dev_test_updated.copy()

In [48]:
corr_res_allMiniLML6, all_minilm_l6_df = get_sent_transformer_results(all_minilm_l6_df, 
                                                                      model_name='sentence-transformers/all-MiniLM-L6-v2')


In [49]:
corr_res_allMiniLML6

Unnamed: 0,Pearson,Spearman
sts,"{'r': 0.8374, 'pvalue': 0.0}","{'r': 0.8242, 'pvalue': 0.0}"
basic_sts,"{'r': 0.8349, 'pvalue': 0.0}","{'r': 0.8202, 'pvalue': 0.0}"
preprocess_sts,"{'r': 0.8196, 'pvalue': 0.0}","{'r': 0.8041, 'pvalue': 0.0}"


In [50]:
all_minilm_l6_df.to_csv('./results/STS_results/Sentence_transformers/all-MiniLM-L6-v2/minilm_sts_results.csv', 
                     index=False)

corr_res_allMiniLML6.to_csv('./results/STS_results/Sentence_transformers/all-MiniLM-L6-v2/minilm_sts_corr_results.csv'
                            , index=False)

#### 4.2.2 Stsb-roberta

In [15]:
model_name = 'sentence-transformers/stsb-roberta-base-v2'
stsb_roberta_df = sts_dev_test_updated.copy()

corr_results_stsb_roberta, stsb_roberta_df = get_sent_transformer_results(stsb_roberta_df, 
                                                                          model_name)

In [16]:
corr_results_stsb_roberta

Unnamed: 0,Pearson,Spearman
sts,"{'r': 0.8766, 'pvalue': 0.0}","{'r': 0.8733, 'pvalue': 0.0}"
basic_sts,"{'r': 0.8548, 'pvalue': 0.0}","{'r': 0.8462, 'pvalue': 0.0}"
preprocess_sts,"{'r': 0.8181, 'pvalue': 0.0}","{'r': 0.8064, 'pvalue': 0.0}"


In [17]:
stsb_roberta_df.to_csv('./results/STS_results/Sentence_transformers/stsb-roberta-base-v2/stsb_roberta_sts_results.csv'
                       , index=False)

corr_results_stsb_roberta.to_csv('./results/STS_results/Sentence_transformers/stsb-roberta-base-v2/stsb_roberta_sts_corr_results.csv'
                                 , index=False)


#### 4.3 BERT Model

In [18]:
def get_bert_results(df, lowercase, columnA, columnB, result_column, CLS_embedding, model, tokenizer):
    
        ref_sent1 = df[columnA].tolist()
        ref_sent2 = df[columnB].tolist()
        
        if lowercase and result_column=='sts_st_' + model_name + '_score':
            # special case where original sentences needs to be lower cased because the loaded 
            #uncased models are pretrained on lowercase corpus
            
            ref_sent1 = [preprocessor.lowercase_str(x) for x in ref_sent1]
            ref_sent2 = [preprocessor.lowercase_str(x) for x in ref_sent2]
        
        encoded_input1 = tokenizer(ref_sent1, padding=True, 
                                   truncation=True, 
                                   return_tensors="pt")
        
        encoded_input2 = tokenizer(ref_sent2, padding=True, 
                                   truncation=True, 
                                   return_tensors="pt")
        
        with torch.no_grad():
            sent_embeddings1 = model(**encoded_input1)
            sent_embeddings2 = model(**encoded_input2)
            
        if CLS_embedding:
            embedds1 = sent_embeddings1.last_hidden_state[:, 0, :]
            embedds2 = sent_embeddings2.last_hidden_state[:, 0, :]
            result_column = result_column + '_CLS'
            
        else:
            embedds1 = sent_embeddings1.last_hidden_state.mean(dim=1)
            embedds2 = sent_embeddings2.last_hidden_state.mean(dim=1)
            
        norm_embeddings1 = torch.nn.functional.normalize(embedds1, p=2, dim=1).numpy()
        norm_embeddings2 = torch.nn.functional.normalize(embedds2, p=2, dim=1).numpy()

        cosine_scores = cosine_similarity(norm_embeddings1, norm_embeddings2)
        bert_scores = [float("{:.2f}".format(x)) for x in list(np.diagonal(cosine_scores))]
        
        df[result_column] = bert_scores
        sts_scores = df['similarity_score'].tolist()
        
        pearson_corr = scipy.stats.pearsonr(bert_scores, sts_scores)
        spearmanr_corr = scipy.stats.spearmanr(bert_scores, sts_scores)
        
        return df, pearson_corr, spearmanr_corr
    

In [19]:
def call_bert_results(df, model_name, model, tokenizer, lowercase, CLS_embedding):
    
    
    df, pearsonr, spearmanr = get_bert_results(df, lowercase, 'sentence1', 'sentence2', 
                                               'sts_st_' + model_name + '_score', CLS_embedding,
                                               model, tokenizer)

    df, b_pearsonr, b_spearmanr = get_bert_results(df, lowercase, 'b_sentence1', 'b_sentence2', 
                                                   'b_sts_' + model_name + '_score', CLS_embedding,
                                                   model, tokenizer)

    df, p_pearsonr, p_spearmanr = get_bert_results(df, lowercase, 'p_sentence1', 'p_sentence2', 
                                                   'p_sts_' + model_name + '_score', CLS_embedding,
                                                   model, tokenizer)

    corr_results = get_pearson_spearman_results(pearsonr, b_pearsonr, p_pearsonr,
                                                 spearmanr, b_spearmanr, p_spearmanr)
    
    return corr_results, df


In [20]:
model_name = "bert-base-uncased"

bert_tokenizer = BertTokenizer.from_pretrained(model_name)
bert_model = BertModel.from_pretrained(model_name)

#### 4.3.1 bert avg. embeddings

In [21]:
bert_avg_df = sts_dev_test_updated.copy()

In [22]:
bert_res_avg, bert_avg_df = call_bert_results(bert_avg_df, model_name, bert_model, 
                                              bert_tokenizer,
                                              lowercase=True, CLS_embedding=False)

In [23]:
bert_res_avg

Unnamed: 0,Pearson,Spearman
sts,"{'r': 0.5921, 'pvalue': 9.4414e-206}","{'r': 0.5924, 'pvalue': 5.4604e-206}"
basic_sts,"{'r': 0.5866, 'pvalue': 4.152e-201}","{'r': 0.5855, 'pvalue': 3.7507e-200}"
preprocess_sts,"{'r': 0.6141, 'pvalue': 2.0284e-225}","{'r': 0.6029, 'pvalue': 3.2489e-215}"


In [24]:
bert_avg_df.to_csv('./results/STS_results/BERT_models/BERT_Avg_Embeddings/bert_avg_sts_results.csv', 
                     index=False)
bert_res_avg.to_csv('./results/STS_results/BERT_models/BERT_Avg_Embeddings/bert_avg_sts_corr_results.csv', 
                    index=False)

#### 4.3.2 bert CLS embeddings

In [25]:
bert_cls_df = sts_dev_test_updated.copy()

In [26]:
bert_corr_res_cls, bert_cls_df = call_bert_results(bert_cls_df, model_name, bert_model, 
                                              bert_tokenizer,
                                              lowercase=True, CLS_embedding=True)

In [27]:
bert_corr_res_cls

Unnamed: 0,Pearson,Spearman
sts,"{'r': 0.313, 'pvalue': 1.3621e-50}","{'r': 0.3445, 'pvalue': 1.3898e-61}"
basic_sts,"{'r': 0.3414, 'pvalue': 1.8817e-60}","{'r': 0.3723, 'pvalue': 2.0093e-72}"
preprocess_sts,"{'r': 0.4081, 'pvalue': 5.3893e-88}","{'r': 0.437, 'pvalue': 4.9384e-102}"


In [28]:
bert_cls_df.to_csv('./results/STS_results/BERT_models/BERT_CLS_Embeddings/bert_cls_sts_results.csv', 
                     index=False)
bert_corr_res_cls.to_csv('./results/STS_results/BERT_models/BERT_CLS_Embeddings/bert_cls_sts_corr_results.csv', 
                    index=False)

In [29]:
del bert_model
gc.collect()
torch.cuda.empty_cache()