# Defining SBERT

In [12]:
import torch
from torch import nn
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity


# QUESTION: 
## SHOULD WE TRAIN FROM SCRATCH OR CAN WE START FROM bert-base-uncased or any other starting point?

## TODO: Add validation set and test on testing set for SBERT classification

## TODO: After finishing training for classification, save the model please to stop training everytime <3

## TODO: After training on the classification data, check how to "fine-tune" on the STS regression data (maybe only the head without the bert itself?)
## Check my question please <3 https://docs.google.com/document/d/1YeohuAr55fKF2nI1RiCgpq_Wa3Yn-CLAwfPoBmViNIM/edit

class SBERT(nn.Module):
    def __init__(self):
        super(SBERT, self).__init__()
        # sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.model = BertModel.from_pretrained("bert-base-uncased")
                
        self.pooling = nn.AvgPool1d(kernel_size=3,stride=1)
        # 768 / 3 -> 256
        self.linear = nn.Linear(in_features=2298, out_features=3) # 2298=(768-2)*3; 153 is the embedding dimension after pooling and stuff..
        self.softmax = nn.Softmax(dim=1)

    def forward(self, sent1, sent2, objective):
        encoded_input1 = self.tokenizer(sent1, padding=True, truncation=True, return_tensors='pt')
        output1 = self.model(**encoded_input1)
        output1 = self.pooling(output1["pooler_output"])

        encoded_input2 = self.tokenizer(sent2, padding=True, truncation=True, return_tensors='pt')
        output2 = self.model(**encoded_input2)
        output2 = self.pooling(output2["pooler_output"])
                
        if objective == "regression":
            return cosine_similarity(output1.detach().numpy(),output2.detach().numpy())

        if objective == "classification":
            diff = abs(torch.subtract(output1,output2))
            concat = torch.cat([output1,output2,diff],axis=1)            
            result = self.linear(concat)
            out = self.softmax(result)
            return out

        # return output

#     def regression_objective(self, text1, text2):
#         vec1 = self.forward(text1).detach().numpy()
#         vec2 = self.forward(text2).detach().numpy()
#         return cosine_similarity(vec1,vec2)
    
#     def classification_objective(self,text1,text2):
#         vec1 = self.forward(text1)
#         vec2 = self.forward(text2)
        
#         diff = abs(torch.subtract(vec1,vec2))

#         concat = torch.cat([vec1,vec2,diff])
        
#         result = self.linear(concat)
        
#         out = self.softmax(result)
#         pred = torch.argmax(out,axis=0)
        
#         return pred

sbert = SBERT()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [2]:
sbert

SBERT(
  (model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
     

# Regression Objective Evaluation

In [3]:
import pandas as pd

df_test = pd.read_csv("Stsbenchmark/sts-test.csv",header=0,names=["main-caption","genre","filename","year","score","sentence1","sentence2"])#,usecols=['score','sentence1','sentence2'])

df_test = df_test[['score','sentence1','sentence2']]
df_test.head()

Unnamed: 0,score,sentence1,sentence2
0,3.6,A group of men play soccer on the beach.,A group of boys are playing soccer on the beach.
1,5.0,One woman is measuring another woman's ankle.,A woman measures another woman's ankle.
2,4.2,A man is cutting up a cucumber.,A man is slicing a cucumber.
3,1.5,A man is playing a harp.,A man is playing a keyboard.
4,1.8,A woman is cutting onions.,A woman is cutting tofu.


In [4]:
df_test.isna().sum()

score        0
sentence1    0
sentence2    5
dtype: int64

In [5]:
df_test.dropna(inplace=True)

In [6]:
df_test.isna().sum()

score        0
sentence1    0
sentence2    0
dtype: int64

In [7]:
def map_score(value, leftMin=0, leftMax=5, rightMin=-1, rightMax=1):
    # Figure out how 'wide' each range is
    leftSpan = leftMax - leftMin
    rightSpan = rightMax - rightMin

    # Convert the left range into a 0-1 range (float)
    valueScaled = float(value - leftMin) / float(leftSpan)

    # Convert the 0-1 range into a value in the right range.
    return rightMin + (valueScaled * rightSpan)

def map_to_label(x): # assuming x is between -1 and 1
    threshold = 0.25
    if abs(x- (-1) ) <= threshold: return 1
    if abs(x- (-0.5) ) <= threshold: return 2
    if abs(x- (0) ) <= threshold: return 3
    if abs(x- (0.5) ) <= threshold: return 4
    if abs(x- (1) ) <= threshold: return 5

In [8]:
preds = []
trues = []
for i,row in df_test.iterrows():
    sentence1 = row.sentence1
    sentence2 = row.sentence2
    
    score_theoretical = row.score
    
    try:
        # score_pred = sbert.regression_objective(sentence1,sentence2)
        score_pred = sbert(sentence1,sentence2,objective="regression")
        label_pred = map_to_label(score_pred)
        
        score_theoretical = map_score(score_theoretical)        
        label_true = map_to_label(score_theoretical)
        
        preds.append(label_pred)
        trues.append(label_true)
        
    except:
        print("i",i)

In [9]:
from sklearn.metrics import classification_report
import json

report = classification_report(trues, preds,output_dict=True)
print(json.dumps(report, sort_keys=True, indent=4))

{
    "1": {
        "f1-score": 0.0,
        "precision": 0.0,
        "recall": 0.0,
        "support": 169
    },
    "2": {
        "f1-score": 0.011627906976744186,
        "precision": 1.0,
        "recall": 0.005847953216374269,
        "support": 171
    },
    "3": {
        "f1-score": 0.0,
        "precision": 0.0,
        "recall": 0.0,
        "support": 252
    },
    "4": {
        "f1-score": 0.07482993197278912,
        "precision": 0.275,
        "recall": 0.04330708661417323,
        "support": 254
    },
    "5": {
        "f1-score": 0.239622641509434,
        "precision": 0.1364124597207304,
        "recall": 0.9844961240310077,
        "support": 129
    },
    "accuracy": 0.14256410256410257,
    "macro avg": {
        "f1-score": 0.06521609609179346,
        "precision": 0.28228249194414606,
        "recall": 0.20673023277231106,
        "support": 975
    },
    "weighted avg": {
        "f1-score": 0.0532374313526448,
        "precision": 0.2650740587733069,


  _warn_prf(average, modifier, msg_start, len(result))


# Classification Objective Evaluation

In [13]:
import json
json_list = list(open("snli_1.0/snli_1.0_train.jsonl","r"))

label2int = {"contradiction": 0, "entailment": 1, "neutral": 2}

In [14]:
import pandas as pd

data = {'sentence1': [], 'sentence2': [], 'gold_label': []}
for json_str in json_list:
    try:
        result = json.loads(json_str)
        result['gold_label']=label2int[result['gold_label']]
        for key in data:
            data[key].append(result[key])
    except:
        pass
df_train = pd.DataFrame.from_dict(data)#.head()

In [15]:
from torch.utils.data import Dataset,DataLoader

class SNLI_Dataset(Dataset):
    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        sent1 = row['sentence1']
        sent2 = row['sentence2']
        label = row['gold_label']
        return (sent1, sent2), label

In [16]:
training_data = SNLI_Dataset(df_train)
train_dataloader = DataLoader(training_data, batch_size=8, shuffle=False)

In [17]:
import torch.optim as optim

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(sbert.parameters(), lr=0.001, momentum=0.9)

- outputs of shape (N,C) where N is the batch size C is the number of classes
- target of shape (N)

In [24]:
training_data = SNLI_Dataset(df_train)
train_dataloader = DataLoader(training_data, batch_size=16, shuffle=False)

n_epochs = 100

for epoch in range(n_epochs):

    for i,((sent1,sent2),label) in enumerate(train_dataloader):
        
        output = sbert(sent1,sent2,objective="classification")
        optimizer.zero_grad()
        
        loss = criterion(output, label)
        
        print(f"epoch={epoch+1} iteration={i+1}/{len(train_dataloader)} loss={loss.detach().numpy()}")
        
        loss.backward()
        optimizer.step()
        
        # break
print('Finished Training')

epoch=0 iteration=0/1 loss=0.9991500973701477
epoch=1 iteration=0/1 loss=0.9819676280021667
epoch=2 iteration=0/1 loss=0.9626067876815796
epoch=3 iteration=0/1 loss=0.9417279958724976
epoch=4 iteration=0/1 loss=0.9187717437744141
epoch=5 iteration=0/1 loss=0.8931153416633606
epoch=6 iteration=0/1 loss=0.8695991635322571
epoch=7 iteration=0/1 loss=0.8479693531990051
epoch=8 iteration=0/1 loss=0.8287663459777832
epoch=9 iteration=0/1 loss=0.8116615414619446
epoch=10 iteration=0/1 loss=0.7956103086471558
epoch=11 iteration=0/1 loss=0.7812157273292542
epoch=12 iteration=0/1 loss=0.7702778577804565
epoch=13 iteration=0/1 loss=0.76219642162323
epoch=14 iteration=0/1 loss=0.7545262575149536
epoch=15 iteration=0/1 loss=0.7498899698257446
epoch=16 iteration=0/1 loss=0.7453569769859314
epoch=17 iteration=0/1 loss=0.7419780492782593
epoch=18 iteration=0/1 loss=0.7394949197769165
epoch=19 iteration=0/1 loss=0.7367589473724365
epoch=20 iteration=0/1 loss=0.735549807548523
epoch=21 iteration=0/1 los

In [25]:
df_train

Unnamed: 0,sentence1,sentence2,gold_label
0,A person on a horse jumps over a broken down a...,A person is training his horse for a competition.,2
1,A person on a horse jumps over a broken down a...,"A person is at a diner, ordering an omelette.",0
2,A person on a horse jumps over a broken down a...,"A person is outdoors, on a horse.",1
3,Children smiling and waving at camera,They are smiling at their parents,2
4,Children smiling and waving at camera,There are children present,1


In [51]:
for idx in range(5):
    pred = torch.argmax(sbert(df_train.iloc[idx].sentence1,df_train.iloc[idx].sentence2,"classification")).numpy().item()
    true = df_train.iloc[idx]['gold_label']
    print(pred==true)

True
True
True
True
True
