# Model training and evaluation

In [24]:
import csv
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import paired_cosine_distances
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sentence_transformers import SentenceTransformer, CrossEncoder

In [25]:
# Locations of the train / dev / test files, relative to the working directory.
path_train, path_dev, path_test = (
    'data/sts-train.csv',
    'data/sts-dev.csv',
    'data/sts-test.csv',
)

In [26]:
# Names given to the seven columns kept from each file, in file order.
columns = [
    'genre',
    'file',
    'year',
    'index',
    'score',
    'sentence1',
    'sentence2',
]

In [27]:
# Parsing options shared by all three splits: tab-separated, no header row,
# only the first seven columns, and quote characters treated as literal text.
read_options = dict(
    sep='\t',
    usecols=range(7),
    header=None,
    quoting=csv.QUOTE_NONE,
    names=columns,
    encoding='UTF-8',
)
df_train = pd.read_csv(path_train, **read_options)
df_dev = pd.read_csv(path_dev, **read_options)
df_test = pd.read_csv(path_test, **read_options)

# Pre-processing

In [28]:
def pre_processing(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of one data split.

    The following columns are normalised:

    * ``genre``: every ``'main-'`` substring is removed, then values equal to
      ``'forum'`` are renamed to ``'forums'``.
    * ``year``: every non-digit character is stripped.
    * ``score``: rescaled to the ``[0, 1]`` range with a ``MinMaxScaler``
      fitted on this frame only, so the scaling depends on the minimum and
      maximum score present in ``df``.

    Args:
        df: Frame containing at least the ``genre``, ``year`` and ``score``
            columns.

    Returns:
        A new DataFrame; the frame passed in is left untouched.
    """
    # Work on a copy so that calling this function never alters the caller's
    # frame (re-running a notebook cell then always starts from the same data).
    cleaned = df.copy()
    cleaned['genre'] = (
        cleaned['genre']
        .replace('main-', '', regex=True)
        .replace('forum', 'forums')
    )
    cleaned['year'] = cleaned['year'].replace(r'\D', '', regex=True)
    # fit_transform returns an (n, 1) array; flatten it to a plain 1-D column.
    cleaned['score'] = MinMaxScaler().fit_transform(cleaned[['score']]).ravel()
    return cleaned

In [29]:
# Clean every split with its own pre_processing call (scores are therefore
# rescaled per split), then stack train and dev into one training frame with
# a fresh 0..n-1 index.
df_train, df_dev, df_test = (
    pre_processing(split) for split in (df_train, df_dev, df_test)
)
df_train_dev = pd.concat([df_train, df_dev], ignore_index=True)

# Sentence Transformer

In [30]:
# Load the pre-trained bi-encoder. The device is not forced to 'cuda', so the
# notebook also runs on machines without a GPU and matches how the fine-tuned
# model is loaded further down.
# NOTE(review): with no explicit device the library is expected to pick CUDA
# when it is available - confirm against the installed sentence-transformers.
model_st = SentenceTransformer('all-mpnet-base-v2')

In [31]:
# Embed both sides of every test pair with the pre-trained model.
# Plain Python lists are passed instead of pandas Series so that encoding does
# not depend on the frame's index labels.
s1_test = model_st.encode(df_test['sentence1'].tolist(), show_progress_bar=True)
s2_test = model_st.encode(df_test['sentence2'].tolist(), show_progress_bar=True)

Batches:   0%|          | 0/44 [00:00<?, ?it/s]

Batches:   0%|          | 0/44 [00:00<?, ?it/s]

## Pre-trained

In [32]:
def evaluate(scores: np.ndarray) -> None:
    """Print R2, MAE and RMSE of ``scores`` against the test-set scores.

    The reference values are read from the notebook-level ``df_test['score']``
    column, so ``scores`` must hold one prediction per row of ``df_test``, in
    the same order.

    Args:
        scores: Predicted similarity scores.

    Returns:
        None. The three metrics are printed with five decimals.
    """
    actual_scores = df_test['score'].to_numpy()
    print(f"R2: {r2_score(actual_scores, scores):.5f}")
    print(f"MAE: {mean_absolute_error(actual_scores, scores):.5f}")
    # The square root is taken explicitly instead of passing ``squared=False``:
    # that keyword is deprecated in recent scikit-learn releases and removed in
    # later ones, where it raises a TypeError.
    print(f"RMSE: {np.sqrt(mean_squared_error(actual_scores, scores)):.5f}")

In [33]:
# Sanity check: show the shape of each embedding matrix side by side.
tuple(embeddings.shape for embeddings in (s1_test, s2_test))

((1379, 768), (1379, 768))

In [34]:
# Predicted similarity = 1 - paired cosine distance between the two embedding
# matrices; the result is then compared with the test scores.
cosine_distances = paired_cosine_distances(s1_test, s2_test)
pred_scores = 1 - cosine_distances
evaluate(pred_scores)

R2: 0.64980
MAE: 0.13896
RMSE: 0.18049


## Fine-tuning

In [35]:
from sentence_transformers import losses, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.cross_encoder.evaluation import CECorrelationEvaluator
from torch.utils.data import DataLoader
from math import ceil

In [37]:
def _row_to_example(row):
    """Wrap one frame row as an InputExample: sentence pair as texts, score as label."""
    return InputExample(texts=[row['sentence1'], row['sentence2']], label=row['score'])


# Training examples come from the combined train+dev frame, test examples from the test frame.
train_samples = df_train_dev.apply(_row_to_example, axis=1).to_list()
test_samples = df_test.apply(_row_to_example, axis=1).to_list()
len(train_samples), len(test_samples)

(7249, 1379)

In [38]:
# Training hyper-parameters
train_batch_size = 32
num_epochs = 5

In [39]:
# Shuffled mini-batches over the training examples.
train_dataloader = DataLoader(train_samples, batch_size=train_batch_size, shuffle=True)

# Loss attached to the bi-encoder that is being fine-tuned.
loss = losses.CosineSimilarityLoss(model=model_st)

# NOTE(review): the evaluator is built from the *test* examples; if it is used
# for checkpoint selection during training, the reported test metrics are
# optimistic - confirm whether a held-out dev split should be used instead.
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples)

# Warm-up lasts for 10% of all training steps (batches per epoch x epochs), rounded up.
total_steps = len(train_dataloader) * num_epochs
warmup_steps = ceil(total_steps * 0.1)

In [40]:
# Train the model
# The fit call below is commented out, so running this cell trains nothing;
# a later cell loads already-saved weights instead.
# NOTE(review): output_path here is "all-mpnet-base-v2-fine-tuned", while the
# later cell loads "models/all-mpnet-base-v2-fine-tuned" - confirm where the
# weights were actually written before re-enabling this call.
# model_st.fit(train_objectives=[(train_dataloader, loss)],
#           evaluator=evaluator,
#           epochs=num_epochs,
#           evaluation_steps=1000,
#           warmup_steps=warmup_steps,
#           output_path="all-mpnet-base-v2-fine-tuned")

Performance after fine-tuning

In [41]:
# Location of the previously saved fine-tuned bi-encoder.
st_fine_tuned_path = "models/all-mpnet-base-v2-fine-tuned"
st_fine_tuned = SentenceTransformer(st_fine_tuned_path)

In [42]:
# Re-embed both sides of every test pair, this time with the fine-tuned model
# (this overwrites the embeddings produced by the pre-trained model).
# Plain Python lists are passed instead of pandas Series so that encoding does
# not depend on the frame's index labels.
s1_test = st_fine_tuned.encode(df_test['sentence1'].tolist())
s2_test = st_fine_tuned.encode(df_test['sentence2'].tolist())

In [43]:
# Similarity from the current (fine-tuned) embeddings: one minus the paired
# cosine distance, followed by the metric report.
pred_scores = np.subtract(1, paired_cosine_distances(s1_test, s2_test))
evaluate(pred_scores)

R2: 0.75328
MAE: 0.11547
RMSE: 0.15149


# Cross-encoders

In [44]:
# Load the pre-trained cross-encoder. The device is not forced to 'cuda', so
# the notebook also runs on machines without a GPU and matches how the
# fine-tuned cross-encoder is loaded further down.
# NOTE(review): with no explicit device the library is expected to pick CUDA
# when it is available - confirm against the installed sentence-transformers.
model_ce = CrossEncoder('cross-encoder/stsb-roberta-large')

## Pre-trained

In [45]:
# One [sentence1, sentence2] list per test row, in row order.
sentence_pairs = [
    [first, second]
    for first, second in zip(df_test['sentence1'], df_test['sentence2'])
]

In [46]:
# Score every test pair with the pre-trained cross-encoder, then print
# R2 / MAE / RMSE against the scores stored in df_test.
pred_scores = model_ce.predict(sentence_pairs, show_progress_bar=True)
evaluate(pred_scores)

Batches:   0%|          | 0/44 [00:00<?, ?it/s]

R2: 0.83807
MAE: 0.09132
RMSE: 0.12273


## Fine-tuning

In [47]:
# Training hyper-parameters
train_batch_size = 24
num_epochs = 5

In [48]:
# Build the example lists for the cross-encoder: each row becomes an
# InputExample holding the sentence pair as texts and the score as label.
train_samples, test_samples = (
    frame.apply(
        lambda row: InputExample(texts=[row['sentence1'], row['sentence2']], label=row['score']),
        axis=1,
    ).to_list()
    for frame in (df_train_dev, df_test)
)
len(train_samples), len(test_samples)

(7249, 1379)

In [49]:
# Shuffled mini-batches for the cross-encoder, using this section's batch size.
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)
# NOTE(review): the evaluator is built from the *test* examples; if it drives
# checkpoint selection during training, the reported test metrics are
# optimistic - confirm whether a held-out dev split should be used instead.
cross_evaluator = CECorrelationEvaluator.from_input_examples(test_samples)
# Recompute the warm-up length for this dataloader: the batch size changed, so
# a value derived from an earlier dataloader would no longer equal 10% of the
# training steps (batches per epoch x epochs), rounded up.
warmup_steps = ceil(len(train_dataloader) * num_epochs * 0.1)

In [50]:
# The fit call below is commented out, so running this cell trains nothing;
# a later cell loads already-saved weights instead.
# NOTE(review): before re-enabling, make sure `warmup_steps` was computed from
# this section's dataloader and is not a leftover from the bi-encoder section.
# NOTE(review): output_path here is "stsb-roberta-large-fine-tuned", while the
# later cell loads "models/stsb-roberta-large-fine-tuned" - confirm where the
# weights were actually written.
# # Train the model
# model_ce.fit(train_dataloader=train_dataloader,
#           evaluator=cross_evaluator,
#           epochs=num_epochs,
#           warmup_steps=warmup_steps,
#           output_path="stsb-roberta-large-fine-tuned")

In [51]:
# Location of the previously saved fine-tuned cross-encoder.
ce_fine_tuned_path = "models/stsb-roberta-large-fine-tuned"
ce_fine_tuned = CrossEncoder(ce_fine_tuned_path)

In [52]:
# Score every test pair with the fine-tuned cross-encoder, then print
# R2 / MAE / RMSE against the scores stored in df_test.
# NOTE(review): the recorded metrics are almost identical to the pre-trained
# cross-encoder run - confirm that the loaded weights really differ from the
# pre-trained ones.
pred_scores = ce_fine_tuned.predict(sentence_pairs, show_progress_bar=True)
evaluate(pred_scores)

Batches:   0%|          | 0/44 [00:00<?, ?it/s]

R2: 0.83811
MAE: 0.09161
RMSE: 0.12272
