# SentenceBERT Model

### Importing Libraries

In [2]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.metrics import classification_report
from sklearn.linear_model import LinearRegression
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange


## Loading the data

Since we are going to use the SentenceBERT model in a zero-shot setting, we only load the val and test datasets.

In [4]:
val_df = pd.read_json('../../data/shroom/train-dev-test-split/SHROOM_dev-v2/val.model-agnostic.json')
test_df = pd.read_json('../../data/shroom/train-dev-test-split/SHROOM_test-labeled/test.model-agnostic.json')

In [6]:
val_df.head()

Unnamed: 0,hyp,ref,src,tgt,model,task,labels,label,p(Hallucination)
0,Resembling or characteristic of a weasel.,tgt,The writer had just entered into his eighteent...,Resembling a weasel (in appearance).,,DM,"[Hallucination, Not Hallucination, Not Halluci...",Not Hallucination,0.2
1,Alternative form of sheath knife,tgt,Sailors ' and fishermen 's <define> sheath - k...,.,,DM,"[Hallucination, Hallucination, Hallucination, ...",Hallucination,0.8
2,(obsolete) A short period of time.,tgt,"As to age , Bead could not form any clear impr...","(poetic) An instant, a short moment.",,DM,"[Not Hallucination, Not Hallucination, Not Hal...",Not Hallucination,0.0
3,(slang) An incel.,tgt,Because redpillers are usually normies or <def...,"(incel, _, slang) A man of a slightly lower ra...",,DM,"[Not Hallucination, Not Hallucination, Halluci...",Not Hallucination,0.2
4,"An island in Lienchiang County, Taiwan.",tgt,On the second day of massive live - fire drill...,"An island in Dongyin, Lienchiang, Taiwan, in t...",,DM,"[Not Hallucination, Not Hallucination, Not Hal...",Not Hallucination,0.0


In [7]:
test_df.head()

Unnamed: 0,id,src,tgt,hyp,task,labels,label,p(Hallucination)
0,1,"Ты удивишься, если я скажу, что на самом деле ...",Would you be surprised if I told you my name i...,You're gonna be surprised if I say my real nam...,MT,"[Not Hallucination, Not Hallucination, Not Hal...",Not Hallucination,0.0
1,2,Еды будет полно.,There will be plenty of food.,The food will be full.,MT,"[Hallucination, Not Hallucination, Hallucinati...",Hallucination,0.8
2,3,"Думаете, Том будет меня ждать?",Do you think that Tom will wait for me?,You think Tom's gonna wait for me?,MT,"[Not Hallucination, Not Hallucination, Not Hal...",Not Hallucination,0.2
3,6,Два брата довольно разные.,The two brothers are pretty different.,There's a lot of friends.,MT,"[Hallucination, Hallucination, Hallucination, ...",Hallucination,1.0
4,7,<define> Infradiaphragmatic </define> intra- a...,(medicine) Below the diaphragm.,(anatomy) Relating to the diaphragm.,DM,"[Hallucination, Hallucination, Hallucination, ...",Hallucination,0.8


## Loading the SentenceBERT model

Here we use the LaBSE model, which is a multilingual SentenceBERT model.

In [8]:
model = SentenceTransformer("sentence-transformers/LaBSE")

modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/2.22k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/804 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


tokenizer_config.json:   0%|          | 0.00/397 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/5.22M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.62M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

2_Dense/config.json:   0%|          | 0.00/114 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

### Running the model

In [9]:
## Val embeddings

val_src_embeddings = model.encode(val_df['src'].tolist(), show_progress_bar=True)
val_hyp_embeddings = model.encode(val_df['hyp'].tolist(), show_progress_bar=True)
val_tgt_embeddings = model.encode(val_df['tgt'].tolist(), show_progress_bar=True)

val_df['src_embeddings'] = val_src_embeddings.tolist()
val_df['hyp_embeddings'] = val_hyp_embeddings.tolist()
val_df['tgt_embeddings'] = val_tgt_embeddings.tolist()

## Test embeddings
test_tgt_embeddings = model.encode(test_df['tgt'].tolist(), show_progress_bar=True)
test_hyp_embeddings = model.encode(test_df['hyp'].tolist(), show_progress_bar=True)
test_src_embeddings = model.encode(test_df['src'].tolist(), show_progress_bar=True)

test_df['src_embeddings'] = test_src_embeddings.tolist()
test_df['hyp_embeddings'] = test_hyp_embeddings.tolist()
test_df['tgt_embeddings'] = test_tgt_embeddings.tolist()

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/47 [00:00<?, ?it/s]

Batches:   0%|          | 0/47 [00:00<?, ?it/s]

Batches:   0%|          | 0/47 [00:00<?, ?it/s]

### Getting similarity scores

In [10]:
val_src_tgt = np.array([np.dot(val_src_embeddings[i], val_tgt_embeddings[i]) for i in range(len(val_tgt_embeddings))])
val_src_hyp = np.array([np.dot(val_src_embeddings[i], val_hyp_embeddings[i]) for i in range(len(val_tgt_embeddings))])
val_hyp_tgt = np.array([np.dot(val_hyp_embeddings[i], val_tgt_embeddings[i]) for i in range(len(val_tgt_embeddings))])

val_df['src_tgt'] = val_src_tgt
val_df['src_hyp'] = val_src_hyp
val_df['hyp_tgt'] = val_hyp_tgt

test_src_tgt = np.array([np.dot(test_src_embeddings[i], test_tgt_embeddings[i]) for i in range(len(test_tgt_embeddings))])
test_src_hyp = np.array([np.dot(test_src_embeddings[i], test_hyp_embeddings[i]) for i in range(len(test_tgt_embeddings))])
test_hyp_tgt = np.array([np.dot(test_hyp_embeddings[i], test_tgt_embeddings[i]) for i in range(len(test_tgt_embeddings))])

test_df['src_tgt'] = test_src_tgt
test_df['src_hyp'] = test_src_hyp
test_df['hyp_tgt'] = test_hyp_tgt

### Train an LR model for the classification

Here we train a simple Logistic Regression model to classify the sentences into the 2 classes by using the similarity scores obtained from the SentenceBERT model.

In [11]:
val_ph = val_df['p(Hallucination)'].tolist()
test_ph = test_df['p(Hallucination)'].tolist()

X_train = np.array([val_src_tgt, val_src_hyp, val_hyp_tgt]).T
y_train = np.array(val_ph)

X_test = np.array([test_src_tgt, test_src_hyp, test_hyp_tgt]).T
y_test = np.array(test_ph)

reg = LinearRegression().fit(X_train, y_train)
y_pred = reg.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"MSE Loss: {mse}")

MSE Loss: 0.09817203001661853


## Results

In [12]:
test_labels = [1 if x > 0.5 else 0 for x in y_test]
test_pred = [1 if x > 0.5 else 0 for x in y_pred]
test_pred_hyp_tgt = [1 if x > 0.5 else 0 for x in test_hyp_tgt]

print("Hyp-Tgt Similarity Results: ")
print(classification_report(test_labels, test_pred_hyp_tgt))
print("Logistic Regression Results: ")
print(classification_report(test_labels, test_pred))

Hyp-Tgt Similarity Results: 
              precision    recall  f1-score   support

           0       0.36      0.14      0.20       889
           1       0.34      0.64      0.45       611

    accuracy                           0.35      1500
   macro avg       0.35      0.39      0.32      1500
weighted avg       0.35      0.35      0.30      1500

Logistic Regression Results: 
              precision    recall  f1-score   support

           0       0.69      0.84      0.76       889
           1       0.66      0.45      0.53       611

    accuracy                           0.68      1500
   macro avg       0.67      0.64      0.65      1500
weighted avg       0.68      0.68      0.67      1500



As seen in the results, the LR model performs much better than by just considering the similarity scores. This is due to the fact that it also takes into account the similarity scores for the src-tgt and src-hyp pairs.

## Results for each task

### DM

In [14]:
## Get the similarity scores where task is DM
test_df_dm = test_df[test_df['task'] == 'DM']
X_test_dm = np.array([test_df_dm['src_tgt'].tolist(), test_df_dm['src_hyp'].tolist(), test_df_dm['hyp_tgt'].tolist()]).T
y_test_dm = np.array(test_df_dm['p(Hallucination)'].tolist())
y_pred_dm = reg.predict(X_test_dm)
test_labels_dm = [1 if x > 0.5 else 0 for x in y_test_dm]
test_pred_dm = [1 if x > 0.5 else 0 for x in y_pred_dm]
print("DM Task Results: ")
print(classification_report(test_labels_dm, test_pred_dm))

DM Task Results: 
              precision    recall  f1-score   support

           0       0.60      0.62      0.61       275
           1       0.63      0.61      0.62       288

    accuracy                           0.61       563
   macro avg       0.61      0.61      0.61       563
weighted avg       0.61      0.61      0.61       563



### MT

In [15]:
## Get the similarity scores where task is MT
test_df_mt = test_df[test_df['task'] == 'MT']
X_test_mt = np.array([test_df_mt['src_tgt'].tolist(), test_df_mt['src_hyp'].tolist(), test_df_mt['hyp_tgt'].tolist()]).T
y_test_mt = np.array(test_df_mt['p(Hallucination)'].tolist())
y_pred_mt = reg.predict(X_test_mt)
test_labels_mt = [1 if x > 0.5 else 0 for x in y_test_mt]
test_pred_mt = [1 if x > 0.5 else 0 for x in y_pred_mt]
print("MT Task Results: ")
print(classification_report(test_labels_mt, test_pred_mt))

MT Task Results: 
              precision    recall  f1-score   support

           0       0.65      0.98      0.78       336
           1       0.88      0.23      0.36       226

    accuracy                           0.68       562
   macro avg       0.77      0.60      0.57       562
weighted avg       0.74      0.68      0.61       562



### PG

In [16]:
## Get the similarity scores where task is PG
test_df_pg = test_df[test_df['task'] == 'PG']
X_test_pg = np.array([test_df_pg['src_tgt'].tolist(), test_df_pg['src_hyp'].tolist(), test_df_pg['hyp_tgt'].tolist()]).T
y_test_pg = np.array(test_df_pg['p(Hallucination)'].tolist())
y_pred_pg = reg.predict(X_test_pg)
test_labels_pg = [1 if x > 0.5 else 0 for x in y_test_pg]
test_pred_pg = [1 if x > 0.5 else 0 for x in y_pred_pg]
print("PG Task Results: ")
print(classification_report(test_labels_pg, test_pred_pg))

PG Task Results: 
              precision    recall  f1-score   support

           0       0.84      0.88      0.86       278
           1       0.60      0.51      0.55        97

    accuracy                           0.79       375
   macro avg       0.72      0.70      0.71       375
weighted avg       0.78      0.79      0.78       375



As seen in the results, the model performs well in the PG task. It also performs fairly well in the DM and MT tasks, but not as well as in the PG task.