## Installing Sentence Transformer and other models/frameworks

In [2]:
!pip install sentence_transformers -q
!pip install gensim -q

# Kindly add all your installations and versions if any in this cell.

## Importing necessary libraries. 
In the final version, all imports should be strictly listed here.

In [3]:
import os
import string

import gensim.downloader
import numpy as np
import pandas as pd
import spacy
import torch
from scipy import stats
from sklearn import linear_model
from sentence_transformers import SentenceTransformer, losses, models, util
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import InputExample
from torch.utils.data import DataLoader


  from .autonotebook import tqdm as notebook_tqdm


## Load dataset: 7 marks
1 Download and unzip the dataset from this link http://ixa2.si.ehu.es/stswiki/images/4/48/Stsbenchmark.tar.gz  **1 mark**

2 Complete the code in `read_sts_csv()`. **4.5 marks**

3 Create 3 dataframes one each for train, test and val and print their final shapes. **1.5 marks**

In [4]:
# Directory that contains the unpacked sts-<split>.csv files.
INPUT_PATH = 'stsbenchmark/'

def read_sts_csv(dataset_type="train", columns=['source', 'type', 'year', 'id', 'score', 'sent_a', 'sent_b']):
  """
  Read one split of the STS benchmark into a dataframe.

  The file is split on tabs by hand because some rows carry extra trailing
  columns; anything beyond len(columns) fields is discarded.

  Args:
    dataset_type: split name; the file read is INPUT_PATH + "sts-<dataset_type>.csv".
    columns: names given to the leading tab-separated fields of every row.

  Returns:
    A dataframe with one row per non-blank line. If a 'score' column is
    present it is converted to a numeric dtype; every other column holds strings.

  Raises:
    FileNotFoundError: if the split file does not exist.
    ValueError: if a score cannot be parsed as a number.
  """
  path = INPUT_PATH + "sts-" + dataset_type + ".csv"

  rows = []
  with open(path, 'r', encoding='utf-8') as f:
    for raw_line in f:
      stripped = raw_line.strip()
      # Blank lines would otherwise turn into one-field rows.
      if not stripped:
        continue
      # Keep only the leading fields that have a column name.
      rows.append(stripped.split('\t')[:len(columns)])

  df = pd.DataFrame(rows, columns=columns)
  # Scores are read as text; make them numeric so they can be used as regression targets.
  if 'score' in df.columns:
    df['score'] = pd.to_numeric(df['score'])
  return df

# Load the train, dev and test splits and report each shape as (rows, columns).
df_train = read_sts_csv("train")
df_dev = read_sts_csv("dev")
df_test = read_sts_csv("test")

for split_name, split_frame in (("train", df_train), ("dev", df_dev), ("test", df_test)):
    print(split_name, split_frame.shape)

## Hyperparameters: 5 Marks
Update this cell with your chosen parameters, except `NUM_EPOCHS`.

In [39]:
# Pretrained gensim word-vector set for Configuration 1 (non-contextual embeddings).
# NOTE(review): "CONEXTUAL" is a misspelling of "CONTEXTUAL" in this name and the next;
# if it is renamed, every use must be renamed with it.
NON_CONEXTUAL_MODEL_TYPE = 'fasttext-wiki-news-subwords-300'
# Pretrained sentence-transformers model for Configuration 2 (contextual embeddings).
CONEXTUAL_MODEL_TYPE = 'paraphrase-multilingual-mpnet-base-v2'
# Hugging Face checkpoint used as the base language model that Configuration 3 fine-tunes.
# Note that this is a different checkpoint from CONEXTUAL_MODEL_TYPE.
HUGGING_FACE_SENTENCE_TRANSFORMER_MODEL = "sentence-transformers/all-mpnet-base-v2" # use the Hugging Face hub name of the model
# Directory that read_sts_csv() reads the sts-<split>.csv files from (same value as in the loading cell).
INPUT_PATH = 'stsbenchmark/'
# Batch size of the DataLoader built in form_data().
BATCH_SIZE = 32
# Output width of the dense layer placed on top of the pooling layer in Configuration 3.
OUT_DIM_DENSE = 556
# Number of fine-tuning epochs.
NUM_EPOCHS = 2 ## fixed by the assignment; do not change

# You are free to add your own hyperparameters as well.
# Number of warm-up steps used when fine-tuning in Configuration 3.
NUM_WARMUP = 500
# Directory the fine-tuned Configuration 3 model is saved to and loaded from.
MODEL_SAVE_PATH = "model/"

## CONFIGURATION 1: Non-contextual Embeddings + ML Regression: 8 marks
1 Load the non-contextual embedding model in variable `non_cont_model1`. **1 mark**

2 Get feature for the sentences using the LM model loaded before. Add the code in the `get_feature_model1()` **2 marks**

2 Using features as X and score as Y, train a ML based regression model (`model1`). You are free to choose any sklearn based regression method, and its hyperparameters. **3.5 marks**

3 Print the correlation scores on the dev and test set predictions using trained `model1`. **1.5 mark**



In [30]:
def get_sentence_vector(sentence, model):
    """
    Embed a sentence as the average of the vectors of its in-vocabulary words.

    The sentence is lower-cased and split on whitespace, and each token has its
    leading and trailing punctuation stripped before it is looked up.

    Args:
        sentence: text to embed.
        model: word-vector lookup supporting `word in model`, `model[word]` and
            `model[0]` (the latter is only used to read the vector length).

    Returns:
        A 1-D float array with the same length as `model[0]`. If none of the
        tokens is found in `model`, an all-zero vector is returned, because
        dividing by a zero count would give an all-NaN vector.
    """
    tokens = [token.strip(string.punctuation) for token in sentence.lower().split()]

    # Accumulator with the same length as the model's vectors.
    total = np.zeros(len(model[0]))
    found = 0
    for token in tokens:
        # Out-of-vocabulary tokens are ignored.
        if token in model:
            total += model[token]
            found += 1

    if found == 0:
        return total
    # Average over the tokens that were actually found in the model.
    return total / found

def get_feature_model1(data_frame):
  """
  Embed both sentence columns of a dataframe with non_cont_model1.

  Args:
    data_frame: dataframe with 'sent_a' and 'sent_b' columns.

  Returns:
    Two matrices (sent_a first, then sent_b), each of shape
    (#_samples, #size_of_word_emb). The embedding size is read from the model
    instead of being fixed at 300, so other word-vector sets also work.
  """
  sent_a = data_frame['sent_a'].values
  sent_b = data_frame['sent_b'].values

  # Same way of reading the vector length as get_sentence_vector().
  emb_dim = len(non_cont_model1[0])
  sent_a_vectors = np.zeros((len(sent_a), emb_dim))
  sent_b_vectors = np.zeros((len(sent_b), emb_dim))

  for i, (text_a, text_b) in enumerate(zip(sent_a, sent_b)):
    sent_a_vectors[i] = get_sentence_vector(text_a, non_cont_model1)
    sent_b_vectors[i] = get_sentence_vector(text_b, non_cont_model1)

  return sent_a_vectors, sent_b_vectors
  
# Embedding every sentence is slow, so the features are cached on disk.
# The cache is loaded when it exists; otherwise the word-vector model is loaded,
# the features are computed and the cache is written, so that the notebook
# also runs from a fresh kernel without a pre-existing cache.
FEATURE_PREFIX_MODEL1 = 'WordEmbeddingSaves/' + 'non_cont_model1'

try:
    # feature_1_<dataset_type>, feature_2_<dataset_type>: embeddings of sent_a and sent_b
    feature_1_train = np.load(FEATURE_PREFIX_MODEL1 + '_feature_1_train.npy')
    feature_2_train = np.load(FEATURE_PREFIX_MODEL1 + '_feature_2_train.npy')
    feature_1_dev = np.load(FEATURE_PREFIX_MODEL1 + '_feature_1_dev.npy')
    feature_2_dev = np.load(FEATURE_PREFIX_MODEL1 + '_feature_2_dev.npy')
    feature_1_test = np.load(FEATURE_PREFIX_MODEL1 + '_feature_1_test.npy')
    feature_2_test = np.load(FEATURE_PREFIX_MODEL1 + '_feature_2_test.npy')
except FileNotFoundError:
    non_cont_model1 = gensim.downloader.load(NON_CONEXTUAL_MODEL_TYPE)

    feature_1_train, feature_2_train = get_feature_model1(df_train)
    feature_1_dev, feature_2_dev = get_feature_model1(df_dev)
    feature_1_test, feature_2_test = get_feature_model1(df_test)

    # Save the features to disk for the next run.
    os.makedirs('WordEmbeddingSaves/', exist_ok=True)
    np.save(FEATURE_PREFIX_MODEL1 + '_feature_1_train.npy', feature_1_train)
    np.save(FEATURE_PREFIX_MODEL1 + '_feature_2_train.npy', feature_2_train)
    np.save(FEATURE_PREFIX_MODEL1 + '_feature_1_dev.npy', feature_1_dev)
    np.save(FEATURE_PREFIX_MODEL1 + '_feature_2_dev.npy', feature_2_dev)
    np.save(FEATURE_PREFIX_MODEL1 + '_feature_1_test.npy', feature_1_test)
    np.save(FEATURE_PREFIX_MODEL1 + '_feature_2_test.npy', feature_2_test)

# X_<dataset_type>: element-wise mean of the two sentence embeddings.
# Y_<dataset_type>: gold scores, cast to float so that they are numeric
# regardless of how the dataframe stores them.
# NOTE(review): the mean of the two embeddings carries no direct signal about how
# the sentences differ; features such as |a - b| or a * b may correlate better.
X_train, Y_train = np.mean([feature_1_train, feature_2_train], axis=0), df_train['score'].values.astype(float)
X_dev, Y_dev = np.mean([feature_1_dev, feature_2_dev], axis=0), df_dev['score'].values.astype(float)
X_test, Y_test = np.mean([feature_1_test, feature_2_test], axis=0), df_test['score'].values.astype(float)

# Initiate a regression model and train it.
regression_model = linear_model.Ridge(alpha=0.5)
regression_model.fit(X_train, Y_train)

# Print spearmanr correlation on the predicted output of the train, dev and test sets.
print("For Train Set", round(stats.spearmanr(regression_model.predict(X_train), Y_train)[0], 3))
print("For Dev Set", round(stats.spearmanr(regression_model.predict(X_dev), Y_dev)[0], 3))
print("For Test Set", round(stats.spearmanr(regression_model.predict(X_test), Y_test)[0], 3))

For Train Set 0.35
For Dev Set 0.178
For Test Set 0.263


## CONFIGURATION 2: Contextual Embeddings + ML Regression: 7 marks
1 Load the contextual embedding model in variable `non_cont_model2`. **1 mark**

2 Get feature for the sentences using the LM model loaded before. Add the code in the `get_feature_model2()` **2 marks**

2 Using features as X and score as Y, train a ML based regression model (`model2`). You are free to choose any sklearn based regression method, and its hyperparameters. **3.5 marks**

3 Print the correlation scores on the dev and test set predictions using trained `model2`. **1.5 mark**

Useful references: https://www.sbert.net/docs/usage/semantic_textual_similarity.html

In [33]:
def get_feature_model2(data_frame):
  """
  Encode the two sentence columns of a dataframe with non_cont_model2.

  Returns a pair (sent_a encodings, sent_b encodings), i.e. whatever
  non_cont_model2.encode() produces for each column, one row per sample.
  """
  first_texts, second_texts = (data_frame[column].values for column in ('sent_a', 'sent_b'))

  first_encoded = non_cont_model2.encode(first_texts)
  second_encoded = non_cont_model2.encode(second_texts)
  return first_encoded, second_encoded

# After testing several models, this one (CONEXTUAL_MODEL_TYPE) worked best for the task.
# Encoding every sentence is slow, so the features are cached on disk.
# The cache is loaded when it exists; otherwise the sentence encoder is loaded,
# the features are computed and the cache is written, so that the notebook
# also runs from a fresh kernel without a pre-existing cache.
FEATURE_PREFIX_MODEL2 = 'WordEmbeddingSaves/' + 'non_cont_model2'

try:
    # feature_1_<dataset_type>, feature_2_<dataset_type>: embeddings of sent_a and sent_b
    feature_1_train = np.load(FEATURE_PREFIX_MODEL2 + '_feature_1_train.npy')
    feature_2_train = np.load(FEATURE_PREFIX_MODEL2 + '_feature_2_train.npy')
    feature_1_dev = np.load(FEATURE_PREFIX_MODEL2 + '_feature_1_dev.npy')
    feature_2_dev = np.load(FEATURE_PREFIX_MODEL2 + '_feature_2_dev.npy')
    feature_1_test = np.load(FEATURE_PREFIX_MODEL2 + '_feature_1_test.npy')
    feature_2_test = np.load(FEATURE_PREFIX_MODEL2 + '_feature_2_test.npy')
except FileNotFoundError:
    non_cont_model2 = SentenceTransformer(CONEXTUAL_MODEL_TYPE)

    feature_1_train, feature_2_train = get_feature_model2(df_train)
    feature_1_dev, feature_2_dev = get_feature_model2(df_dev)
    feature_1_test, feature_2_test = get_feature_model2(df_test)

    # Save the features to disk for the next run.
    os.makedirs('WordEmbeddingSaves/', exist_ok=True)
    np.save(FEATURE_PREFIX_MODEL2 + '_feature_1_train.npy', feature_1_train)
    np.save(FEATURE_PREFIX_MODEL2 + '_feature_2_train.npy', feature_2_train)
    np.save(FEATURE_PREFIX_MODEL2 + '_feature_1_dev.npy', feature_1_dev)
    np.save(FEATURE_PREFIX_MODEL2 + '_feature_2_dev.npy', feature_2_dev)
    np.save(FEATURE_PREFIX_MODEL2 + '_feature_1_test.npy', feature_1_test)
    np.save(FEATURE_PREFIX_MODEL2 + '_feature_2_test.npy', feature_2_test)

# X_<dataset_type>: element-wise mean of the two sentence embeddings.
# Y_<dataset_type>: gold scores, cast to float so that they are numeric
# regardless of how the dataframe stores them.
# NOTE(review): the mean of the two embeddings carries no direct signal about how
# the sentences differ; features such as |a - b| or a * b may correlate better.
X_train, Y_train = np.mean([feature_1_train, feature_2_train], axis=0), df_train['score'].values.astype(float)
X_dev, Y_dev = np.mean([feature_1_dev, feature_2_dev], axis=0), df_dev['score'].values.astype(float)
X_test, Y_test = np.mean([feature_1_test, feature_2_test], axis=0), df_test['score'].values.astype(float)

# Initiate a regression model and train it.
regression_model = linear_model.Ridge(alpha=0.5)
regression_model.fit(X_train, Y_train)

# Print spearman correlation on the predicted output of the train, dev and test sets.
print("For Train Set", round(stats.spearmanr(regression_model.predict(X_train), Y_train)[0], 3))
print("For Dev Set", round(stats.spearmanr(regression_model.predict(X_dev), Y_dev)[0], 3))
print("For Test Set", round(stats.spearmanr(regression_model.predict(X_test), Y_test)[0], 3))

For Train Set 0.491
For Dev Set 0.173
For Test Set 0.254


## CONFIGURATION 3: Fine-Tune a Contextual Embeddings Model: 18 marks
1 Prepare data samples for the DL model to consume. Add the code in the `form_data()`. **4 marks**

3 Create the data loader, one each for train/dev/test data_input sample set obtained from `form_input_example()`. **1.5 marks**

4 Initiate `model3` consisting of **at least** the following 3 components - `base_LM`, a `pooling_layer` and a `dense_layer`. Use appropriate activation function in dense. **At least** one layer of `base_LM` should be set to trainable. **5 marks**

6 Initiate the `loss`. **0.5 marks**

7 Fit the `model3`. Use `NUM_EPOCHS = 2`. **MAX_NUM_EPOCHS allowed will be 3**. **2 marks** 

8 Complete the `get_model_predicts()` to obtain predicted scores for input sentence pairs. **3.5 marks** 

9 Print the correlation scores on the dev and test set predictions. **1.5 mark**

Useful References: https://huggingface.co/blog/how-to-train-sentence-transformers 

In [40]:
def form_data(data_frame):
    """
    Turn a dataframe of sentence pairs into a shuffled DataLoader.

    Each row becomes one InputExample whose texts are [sent_a, sent_b] and whose
    label is the row's score divided by 5, stored as a float32 torch scalar.
    Batches contain BATCH_SIZE examples.
    """
    first_texts = data_frame["sent_a"].values
    second_texts = data_frame["sent_b"].values

    # Scale the gold scores by 1/5 before converting them to a float32 tensor.
    scaled_scores = torch.from_numpy(data_frame["score"].values.astype(float) / 5).float()

    pairs = [
        InputExample(texts = [first, second], label = target)
        for first, second, target in zip(first_texts, second_texts, scaled_scores)
    ]

    return DataLoader(pairs, shuffle = True, batch_size = BATCH_SIZE)

def get_model_predicts(data_type, trained_model):
    """
    Predict a similarity score for every sentence pair of a dataframe.

    Args:
        data_type: dataframe with 'sent_a' and 'sent_b' columns.
        trained_model: model whose encode() maps a sequence of sentences to one
            embedding row per sentence.

    Returns:
        A 1-D torch tensor holding, for each row, the cosine similarity between
        the embedding of sent_a and the embedding of sent_b.
    """
    emb_a = torch.as_tensor(trained_model.encode(data_type["sent_a"].values))
    emb_b = torch.as_tensor(trained_model.encode(data_type["sent_b"].values))
    # Row-wise cosine similarity: only the n matching pairs are compared, instead
    # of building the full n x n similarity matrix and reading its diagonal.
    return torch.nn.functional.cosine_similarity(emb_a, emb_b, dim=1)


# Fine-tuning is expensive (the progress log below shows roughly 50 minutes per epoch),
# so it only runs when no saved model is found in MODEL_SAVE_PATH.
# modules.json is the module index that SentenceTransformer.save() writes.
if not os.path.exists(os.path.join(MODEL_SAVE_PATH, "modules.json")):
    dataloader_train = form_data(df_train)
    dataloader_dev = form_data(df_dev)
    dataloader_test = form_data(df_test)

    # model3 = base language model -> pooling layer -> dense layer
    base_model = models.Transformer(HUGGING_FACE_SENTENCE_TRANSFORMER_MODEL)
    layer_pooling = models.Pooling(base_model.get_word_embedding_dimension())
    layer_dense = models.Dense(in_features = layer_pooling.get_sentence_embedding_dimension(), out_features = OUT_DIM_DENSE)
    model3 = SentenceTransformer(modules = [base_model, layer_pooling, layer_dense])
    loss = losses.CosineSimilarityLoss(model3)

    # Dev scores are divided by 5, the same scaling form_data() applies to the training labels.
    model_evaluator = EmbeddingSimilarityEvaluator(df_dev["sent_a"].values, df_dev["sent_b"].values, df_dev["score"].values.astype(float)/5, batch_size = BATCH_SIZE)

    # Fit the model3; the model is written to MODEL_SAVE_PATH.
    model3.fit(train_objectives = [(dataloader_train, loss)], evaluator = model_evaluator, epochs = NUM_EPOCHS, warmup_steps = NUM_WARMUP, output_path = MODEL_SAVE_PATH)

# Always evaluate the model stored in MODEL_SAVE_PATH, whether it was just trained or saved earlier.
model3 = SentenceTransformer(MODEL_SAVE_PATH)

# Using the get_model_predicts() function, we can get the cosine similarity scores between the sentences
train_preds = get_model_predicts(df_train, model3)
dev_preds = get_model_predicts(df_dev, model3)
test_preds = get_model_predicts(df_test, model3)

# Spearman correlation between predictions and gold scores; the scores are cast
# to float so that they are ranked numerically regardless of how they are stored.
print("For Train Set", round(stats.spearmanr(train_preds, df_train["score"].values.astype(float))[0], 3))
print("For Dev Set", round(stats.spearmanr(dev_preds, df_dev["score"].values.astype(float))[0], 3))
print("For Test Set", round(stats.spearmanr(test_preds, df_test["score"].values.astype(float))[0], 3))

Iteration: 100%|██████████| 180/180 [51:11<00:00, 17.06s/it]
Iteration: 100%|██████████| 180/180 [49:00<00:00, 16.34s/it]
Epoch: 100%|██████████| 2/2 [1:45:23<00:00, 3161.67s/it]


For Train Set 0.928
For Dev Set 0.897
For Test Set 0.864


