This notebook fine-tunes the transformer model that is later used as the sentence-BERT model.

Mostly written by Mathis Lucka ([GitHub](https://github.com/mathislucka), [Kaggle](https://www.kaggle.com/mathislucka)).

# Imports

In [None]:
!pip install sentence-transformers
!pip install torch

In [None]:
from sentence_transformers import SentenceTransformer, util, losses, models
import torch
import numpy as np
import pandas as pd
import math
import random
import os

In [None]:
# Open up gdrive to get files
# Mounts Google Drive at the relative path 'gdrive'; BASE_PATH in this
# notebook starts with that directory. Requires the google.colab package.

from google.colab import drive
drive.mount('gdrive')

In [None]:
### GLOBAL VARIABLES ###

# Root folder under the mounted 'gdrive' directory. The training cells join
# their data and model sub-paths onto it with os.path.join.
BASE_PATH = 'gdrive/MyDrive/colabNotebooks/commonLitReadabilityPrize/firstPlace_CodeFiles'

# Functions

In [None]:
def normalize_scores(df):
  """Add a min-max normalized copy of ``df.target`` as column ``target_norm``.

  The scores are shifted and scaled so that the smallest target maps to 0.0
  and the largest to 1.0. If every target is identical, all normalized scores
  are 0.0. ``df`` is modified in place and also returned.

  Args:
    df: pandas DataFrame with a numeric ``target`` column.

  Returns:
    The same DataFrame object, with the ``target_norm`` column added.
  """
  # Copy as float: the in-place arithmetic below must not touch the caller's
  # column, and integer targets cannot be divided in place.
  x = np.array(df.target.values, dtype=float)
  if x.size:
    x -= x.min()
    # np.ptp() rather than the ndarray.ptp() method, which NumPy 2.0 removed.
    span = np.ptp(x)
    if span > 0:
      x /= span
    # Otherwise all targets are equal and x is already all zeros; skip 0/0.
  df['target_norm'] = x
  return df

In [None]:
def draw_random_pairs(items, use_both_directions=False):
  """Randomly split ``items`` into disjoint pairs.

  The items are shuffled (on a copy, so the caller's list keeps its order),
  cut into two halves, and the halves are paired element by element. Every
  item is used at most once; with an odd number of items one is left over.

  Args:
    items: Sequence of items to pair up.
    use_both_directions: If True, each pair ``(a, b)`` is immediately
      followed by its mirror ``(b, a)``.

  Returns:
    List of 2-tuples. Contains ``len(items) // 2`` pairs, or twice that many
    entries when ``use_both_directions`` is True.
  """
  shuffled = list(items)
  random.shuffle(shuffled)

  # Slice ends are exclusive, so the halves are [:split] and [split:2*split];
  # subtracting one from the end index would silently drop items.
  split = len(shuffled) // 2
  first_half = shuffled[:split]
  second_half = shuffled[split:2 * split]

  results = []
  for left, right in zip(first_half, second_half):
    results.append((left, right))
    if use_both_directions:
      results.append((right, left))

  return results

In [None]:
def make_easy_hard_cv_data(fold_dir, out_dir, n_folds=6, use_both_directions=True):
  """Write binary "which excerpt scores lower" pair files for every CV fold.

  For each fold, ``train_fold_<fold>.csv`` and ``val_fold_<fold>.csv`` are
  read from ``fold_dir``, their targets are normalized with
  ``normalize_scores``, and random excerpt pairs are drawn with
  ``draw_random_pairs``. A pair is labeled 0 when the left excerpt's
  normalized score is lower than the right one's, otherwise 1. The label is
  stored in the ``distance`` column.

  Args:
    fold_dir: Directory containing the per-fold input CSV files.
    out_dir: Directory receiving ``train_fold_<fold>_simplerAlgo.csv`` and
      ``val_fold_<fold>_simplerAlgo.csv``.
    n_folds: Number of folds; folds ``0 .. n_folds - 1`` are processed.
    use_both_directions: Forwarded to ``draw_random_pairs`` for the training
      pairs only; validation pairs always use the default.
  """
  def scored_excerpts(frame):
    # (excerpt text, normalized score) tuples in row order.
    texts = [str(t) for t in frame['excerpt'].values]
    scores = [float(s) for s in frame['target_norm'].values]
    return list(zip(texts, scores))

  def labeled_frame(pairs):
    # Each pair is ((left_text, left_score), (right_text, right_score)).
    return pd.DataFrame({
        'left_text': [left[0] for left, _ in pairs],
        'right_text': [right[0] for _, right in pairs],
        'distance': [0 if left[1] < right[1] else 1 for left, right in pairs],
    })

  for fold in range(n_folds):
    train_raw = pd.read_csv(f'{fold_dir}/train_fold_{fold}.csv')
    val_raw = pd.read_csv(f'{fold_dir}/val_fold_{fold}.csv')

    train_scored = normalize_scores(train_raw)
    val_scored = normalize_scores(val_raw)

    # Draw training pairs before validation pairs so the random stream is
    # consumed in a fixed order.
    train_pairs = draw_random_pairs(
        scored_excerpts(train_scored),
        use_both_directions=use_both_directions,
    )
    val_pairs = draw_random_pairs(scored_excerpts(val_scored))

    labeled_frame(train_pairs).to_csv(f'{out_dir}/train_fold_{fold}_simplerAlgo.csv')
    labeled_frame(val_pairs).to_csv(f'{out_dir}/val_fold_{fold}_simplerAlgo.csv')

In [None]:
def make_distance_cv_data(fold_dir, out_dir, n_folds=6, pseudolabel_dir=None, num_draws=1):
  """Write score-similarity pair files for every CV fold.

  For each fold, ``train_fold_<fold>.csv`` and ``val_fold_<fold>.csv`` are
  read from ``fold_dir`` and normalized with ``normalize_scores``. Random
  excerpt pairs are drawn with ``draw_random_pairs`` and labeled with
  ``1 - abs(left_score - right_score)``, stored in the ``distance`` column
  (so identical scores give 1.0).

  Args:
    fold_dir: Directory containing the per-fold input CSV files.
    out_dir: Directory receiving ``train_fold_<fold>_simplerAlgo.csv`` and
      ``val_fold_<fold>_simplerAlgo.csv``.
    n_folds: Number of folds; folds ``0 .. n_folds - 1`` are processed.
    pseudolabel_dir: Optional path to a CSV (despite the name, a file path is
      passed to ``pd.read_csv``) with ``excerpt`` and ``target`` columns. Its
      rows are normalized on their own and appended to the training items of
      every fold.
    num_draws: Number of independent pair draws over the training items;
      exactly this many draws are concatenated.
  """
  def scored_excerpts(frame):
    # (excerpt text, normalized score) tuples in row order.
    texts = [str(t) for t in frame['excerpt'].values]
    scores = [float(s) for s in frame['target_norm'].values]
    return list(zip(texts, scores))

  def similarity_frame(pairs):
    # Each pair is ((left_text, left_score), (right_text, right_score)).
    return pd.DataFrame({
        'left_text': [left[0] for left, _ in pairs],
        'right_text': [right[0] for _, right in pairs],
        'distance': [float(1 - abs(left[1] - right[1])) for left, right in pairs],
    })

  # The pseudo-labeled data is the same for every fold, so load it once.
  pseudo_items = []
  if pseudolabel_dir:
    pseudo_items = scored_excerpts(normalize_scores(pd.read_csv(pseudolabel_dir)))

  for fold in range(n_folds):
    train = pd.read_csv(fold_dir + '/train_fold_' + str(fold) + '.csv')
    val = pd.read_csv(fold_dir + '/val_fold_' + str(fold) + '.csv')

    train_norm = normalize_scores(train)
    val_norm = normalize_scores(val)

    train_items = scored_excerpts(train_norm) + pseudo_items

    train_pairs = []
    for _ in range(num_draws):
      # Hand over a fresh copy per draw so train_items keeps its order.
      train_pairs.extend(draw_random_pairs(list(train_items)))

    val_pairs = draw_random_pairs(scored_excerpts(val_norm))

    # File names follow '<split>_fold_<fold>_simplerAlgo.csv', the pattern
    # that train_sent_transformer_cv reads back.
    similarity_frame(train_pairs).to_csv(
        out_dir + '/train_fold_' + str(fold) + '_simplerAlgo' + '.csv')
    similarity_frame(val_pairs).to_csv(
        out_dir + '/val_fold_' + str(fold) + '_simplerAlgo' + '.csv')

In [None]:
from torch.utils.data import DataLoader
import math
from sentence_transformers import LoggingHandler, util
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CEBinaryClassificationEvaluator, CECorrelationEvaluator
from sentence_transformers import InputExample
import logging
from datetime import datetime
import sys
import os
import gzip
import csv

def train_crossencoder(model_dir, out_dir, train_data, eval_data, hyperparams):
  """
  Fine-tune a sentence-transformers CrossEncoder on labeled text pairs.

  Adapted from the STSbenchmark training example in
  https://github.com/UKPLab/sentence-transformers. A CrossEncoder takes a
  text pair as input and outputs a label for the pair. It does NOT produce a
  sentence embedding and does NOT work for individual sentences.

  Args:
    model_dir: Model name or path handed to ``CrossEncoder``.
    out_dir: Passed to ``fit`` as ``output_path``.
    train_data: DataFrame with 'left_text', 'right_text' and 'distance'
      columns; 'distance' is used as the label of each pair.
    eval_data: DataFrame with the same columns, used to build the evaluator.
    hyperparams: Dict read for the keys 'bs' (batch size), 'ep' (epochs),
      'num_labels', 'logging' (passed as ``evaluation_steps``), 'lr' and
      'weight_decay'.
  """
  # Route log records through the sentence-transformers LoggingHandler.
  logging.basicConfig(format='%(asctime)s - %(message)s',
                      datefmt='%Y-%m-%d %H:%M:%S',
                      level=logging.INFO,
                      handlers=[LoggingHandler()])
  logger = logging.getLogger(__name__)

  batch_size = hyperparams['bs']
  epochs = hyperparams['ep']

  cross_encoder = CrossEncoder(model_dir, num_labels=hyperparams['num_labels'])

  def to_examples(frame):
    # One InputExample per row: [left, right] texts with 'distance' as label.
    return [
        InputExample(texts=[row['left_text'], row['right_text']], label=row['distance'])
        for _, row in frame.iterrows()
    ]

  train_examples = to_examples(train_data)
  dev_examples = to_examples(eval_data)

  loader = DataLoader(train_examples, shuffle=True, batch_size=batch_size)

  # Evaluator that is run on the dev pairs during training.
  dev_evaluator = CECorrelationEvaluator.from_input_examples(dev_examples, name='clrp-dev')

  # Warm up over 10% of the total number of training steps.
  warmup_steps = math.ceil(len(loader) * epochs * 0.1)
  logger.info("Warmup-steps: {}".format(warmup_steps))

  cross_encoder.fit(
      train_dataloader=loader,
      evaluator=dev_evaluator,
      epochs=epochs,
      warmup_steps=warmup_steps,
      evaluation_steps=hyperparams['logging'],
      optimizer_params={'lr': hyperparams['lr']},
      weight_decay=hyperparams['weight_decay'],
      output_path=out_dir,
      save_best_model=True,
      use_amp=True,
  )


In [None]:
def train_sent_transformer_cv(fold_dir, model_dir, out_dir, hyperparams, kfolds=(0, 1, 2, 3, 4, 5), continue_training=False, use_crossencoder=False):
  """Train one model per cross-validation fold.

  For every fold, ``train_fold_<fold>_simplerAlgo.csv`` and
  ``val_fold_<fold>_simplerAlgo.csv`` are read from ``fold_dir`` and handed
  to ``train_crossencoder`` or ``train_bi_encoder``. Each fold's model is
  written to ``<out_dir>/model_fold_<fold>_simplerAlgo``.

  Args:
    fold_dir: Directory containing the per-fold pair/triplet CSV files.
    model_dir: Starting model. Used as-is when ``continue_training`` is
      False; otherwise treated as a directory holding one
      ``model_fold_<fold>_simplerAlgo`` sub-folder per fold.
    out_dir: Directory under which the per-fold models are saved.
    hyperparams: Dict forwarded unchanged to the selected training function.
    kfolds: Iterable of fold indices to train.
    continue_training: If True, start each fold from the fold-specific model
      below ``model_dir`` instead of from ``model_dir`` itself.
    use_crossencoder: If True, call ``train_crossencoder``; otherwise call
      ``train_bi_encoder``.
  """
  for fold in kfolds:
    # Release cached GPU memory left over from the previous fold.
    torch.cuda.empty_cache()

    fold_tag = str(fold) + '_simplerAlgo'
    train_data = pd.read_csv(fold_dir + '/train_fold_' + fold_tag + '.csv')
    val_data = pd.read_csv(fold_dir + '/val_fold_' + fold_tag + '.csv')

    if continue_training:
      start_model = model_dir + '/model_fold_' + fold_tag
    else:
      start_model = model_dir

    # Both trainers share one signature, so pick the function and call once.
    trainer = train_crossencoder if use_crossencoder else train_bi_encoder
    trainer(
        model_dir=start_model,
        out_dir=out_dir + '/model_fold_' + fold_tag,
        train_data=train_data,
        eval_data=val_data,
        hyperparams=hyperparams,
    )

In [None]:
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.evaluation import TripletEvaluator

def train_bi_encoder(model_dir, out_dir, train_data, eval_data, hyperparams):
  """Fine-tune a SentenceTransformer bi-encoder (transformer + mean pooling).

  Args:
    model_dir: Model name or path handed to ``models.Transformer``.
    out_dir: Passed to ``fit`` as ``output_path``.
    train_data: DataFrame of training rows. For task type 'sts' the columns
      'left_text', 'right_text' and 'distance' are read; for 'triplet' the
      columns 'anchor', 'similar' and 'dissimilar'.
    eval_data: DataFrame of evaluation rows, read the same way according to
      ``hyperparams['val_type']``.
    hyperparams: Dict read for the keys 'task_type', 'val_type', 'bs'
      (batch size), 'ep' (epochs), 'logging' (passed as
      ``evaluation_steps``), 'lr' and 'weight_decay'.

  Raises:
    ValueError: If 'task_type' or 'val_type' is neither 'sts' nor 'triplet'.
  """
  task_type = hyperparams['task_type']
  val_type = hyperparams['val_type']
  # Fail fast: an unsupported type would otherwise yield no samples and
  # surface later as an unrelated error.
  for key, value in (('task_type', task_type), ('val_type', val_type)):
    if value not in ('sts', 'triplet'):
      raise ValueError(
          "hyperparams['{}'] must be 'sts' or 'triplet', got {!r}".format(key, value))

  print('called sent trans', train_data.head())
  #### Just some code to print debug information to stdout
  logging.basicConfig(format='%(asctime)s - %(message)s',
                      datefmt='%Y-%m-%d %H:%M:%S',
                      level=logging.INFO,
                      handlers=[LoggingHandler()])

  train_batch_size = hyperparams['bs']
  model_save_path = out_dir
  max_seq_length = 256

  # Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
  word_embedding_model = models.Transformer(model_dir, max_seq_length=max_seq_length)

  # Apply mean pooling to get one fixed sized sentence vector
  pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                pooling_mode_mean_tokens=True,
                                pooling_mode_cls_token=False,
                                pooling_mode_max_tokens=False)

  model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

  def build_examples(frame, kind):
    # The kind is fixed per frame, so branch once instead of once per row.
    if kind == 'sts':
      return [
          InputExample(texts=[row['left_text'], row['right_text']], label=row['distance'])
          for _, row in frame.iterrows()
      ]
    return [
        InputExample(texts=[row['anchor'], row['similar'], row['dissimilar']], label=0)
        for _, row in frame.iterrows()
    ]

  train_samples = build_examples(train_data, task_type)
  dev_samples = build_examples(eval_data, val_type)

  train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)
  if task_type == 'sts':
    train_loss = losses.CosineSimilarityLoss(model=model)
  else:
    train_loss = losses.TripletLoss(model=model)

  if val_type == 'sts':
    dev_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, batch_size=train_batch_size, name='clrp-dev')
  else:
    dev_evaluator = TripletEvaluator.from_input_examples(dev_samples, name='clrp-dev')

  # Warm up over 10% of the total number of training steps.
  warmup_steps = math.ceil(len(train_dataloader) * hyperparams['ep'] * 0.1)
  logging.info("Warmup-steps: {}".format(warmup_steps))

  # Train the model
  model.fit(train_objectives=[(train_dataloader, train_loss)],
            evaluator=dev_evaluator,
            epochs=hyperparams['ep'],
            evaluation_steps=hyperparams['logging'],
            warmup_steps=warmup_steps,
            output_path=model_save_path,
            optimizer_params={'lr': hyperparams['lr']},
            weight_decay=hyperparams['weight_decay']
            )

In [None]:
def prepare_triplets(df, n_samples, replace=False, margin=0.3):
  """Sample (anchor, similar, dissimilar) excerpt triplets from ``df``.

  Three rows are drawn at a time and one of them is picked at random as the
  anchor. The similarity of each remaining row to the anchor is
  ``1 - abs(anchor_score - row_score)``. Draws whose two similarities differ
  by less than ``margin`` are discarded.

  Note: the loop only ends once ``n_samples`` triplets were accepted, so it
  does not terminate if no draw can ever satisfy ``margin``.

  Args:
    df: DataFrame with ``excerpt`` and ``target_norm`` columns.
    n_samples: Number of triplets to collect.
    replace: Forwarded to ``df.sample``.
    margin: Minimum absolute difference between the two similarities.

  Returns:
    List of 5-element lists
    ``[anchor, similar, dissimilar, sim_similar, sim_dissimilar]``.
  """
  collected = []
  while len(collected) < n_samples:
    drawn = df.sample(n=3, replace=replace)
    texts = [str(t) for t in drawn.excerpt.values]
    scores = [float(s) for s in drawn.target_norm.values]

    # Remove the anchor; the two remaining rows are the candidates.
    pick = random.randrange(3)
    anchor_text = texts.pop(pick)
    anchor_score = scores.pop(pick)
    first_text, second_text = texts
    first_sim, second_sim = (1 - abs(anchor_score - s) for s in scores)

    if abs(first_sim - second_sim) < margin:
      continue

    # The more similar candidate goes first; ties keep the sampled order.
    if first_sim >= second_sim:
      ranked = [first_text, second_text, first_sim, second_sim]
    else:
      ranked = [second_text, first_text, second_sim, first_sim]
    collected.append([anchor_text] + ranked)

  return collected

In [None]:
def make_cv_triplet_data(fold_dir, out_dir, n_samples, kfolds=(0, 1, 2, 3, 4, 5), replace=True, margin=0.3, remove_duplicates=True, generate_dev=True):
  """Write triplet training (and optionally validation) files per CV fold.

  For each fold, ``train_fold_<fold>.csv`` is read from ``fold_dir``,
  normalized with ``normalize_scores`` and turned into triplets with
  ``prepare_triplets``. The result is written to
  ``<out_dir>/train_fold_<fold>_simplerAlgo.csv``. When ``generate_dev`` is
  True the same is done for ``val_fold_<fold>.csv`` with
  ``floor(n_samples * 0.2)`` triplets.

  Args:
    fold_dir: Directory containing the per-fold input CSV files.
    out_dir: Directory receiving the triplet CSV files.
    n_samples: Number of training triplets to sample per fold, before
      duplicate removal.
    kfolds: Iterable of fold indices to process.
    replace: Forwarded to ``prepare_triplets``.
    margin: Forwarded to ``prepare_triplets``.
    remove_duplicates: If True, drop repeated triplets, keeping the first
      occurrence so the row order is reproducible.
    generate_dev: If True, also generate and write validation triplets.
  """
  columns = ['anchor', 'similar', 'dissimilar', 'sim_similar', 'sim_dissimilar']

  def dedupe(triplets):
    # dict.fromkeys keeps first-seen order; a set would reorder the rows
    # differently from run to run.
    return list(dict.fromkeys(tuple(t) for t in triplets))

  for fold in kfolds:
    train_df = normalize_scores(pd.read_csv(fold_dir + '/train_fold_' + str(fold) + '.csv'))
    train_triplets = prepare_triplets(train_df, n_samples, replace, margin)
    if remove_duplicates:
      train_triplets = dedupe(train_triplets)

    val_triplets = None
    if generate_dev:
      # The validation file is only needed, and only read, in this branch.
      val_df = normalize_scores(pd.read_csv(fold_dir + '/val_fold_' + str(fold) + '.csv'))
      val_triplets = prepare_triplets(val_df, math.floor(n_samples * 0.2), replace, margin)
      if remove_duplicates:
        val_triplets = dedupe(val_triplets)

    train_triplet_df = pd.DataFrame(train_triplets, columns=columns)
    train_triplet_df.to_csv(out_dir + '/train_fold_' + str(fold) + '_simplerAlgo' + '.csv')
    if generate_dev:
      val_triplet_df = pd.DataFrame(val_triplets, columns=columns)
      val_triplet_df.to_csv(out_dir + '/val_fold_' + str(fold) + '_simplerAlgo' + '.csv')


# Training

I used a few different strategies for training bi-encoders. Results were similar, but they might change if more data is available. You can prepare the data in a few different ways and use different loss functions for training. It is also possible to train a model on an intermediate task first; in my experience this gives slightly better results.

In [None]:
# First, we prepare the data for training.
# All paths live under BASE_PATH; the three *_data_dir folders receive the
# generated per-fold CSV files.
fold_dir = os.path.join(BASE_PATH, 'data/training/simplerAlgo/cv') # structure is the same as in other notebooks (train_fold_0.csv, ...)
pseudo_labeled_dir = os.path.join(BASE_PATH, 'data/training/simplerAlgo/predicted/predicted.csv') # the dataframe that was pseudo-labeled
distance_data_dir = os.path.join(BASE_PATH, 'data/training/simplerAlgo/trainingData1')
easy_hard_data_dir = os.path.join(BASE_PATH, 'data/training/simplerAlgo/trainingData2')
triplet_data_dir = os.path.join(BASE_PATH, 'data/training/simplerAlgo/trainingData3')

# this will generate random pairs of excerpts labeled with
# 1 - |difference of their normalized scores| (stored in the 'distance' column)
make_distance_cv_data(fold_dir=fold_dir, out_dir=distance_data_dir, pseudolabel_dir=pseudo_labeled_dir, num_draws=1)

# this will generate random pairs of excerpts where a pair is labeled 0 if the
# left excerpt has the lower normalized score, or 1 otherwise
make_easy_hard_cv_data(fold_dir, easy_hard_data_dir)

# this will generate random triplets of excerpts following the schema of: anchor, positive (closer), negative (farther)
# generate_dev=False: only the training triplet files are written
make_cv_triplet_data(fold_dir, triplet_data_dir, 10000, margin=0.2, generate_dev=False)

In [None]:
# We will now train a cross-encoder on the easy/hard data as an intermediate training step

#### IF CHANGING THE MODEL BELOW, ALSO HAVE TO CHANGE IT IN THE NEXT BLOCK FINAL MODEL ####
base_model = 'roberta-base'
base_model_save_dir = os.path.join(BASE_PATH, 'models/roberta-base_simplerAlgo')

# 'task_type' and 'val_type' are only read by train_bi_encoder; the
# cross-encoder path used here does not look at them.
hyperparams = {
    'ep': 5,
    'bs': 8, # changed batch size (Mathis had 16)
    'logging': 500,
    'lr': 2e-5,
    'weight_decay': 0.01,
    'num_labels': 1,
    'task_type': 'sts',
    'val_type': 'sts'
}

# continue_training=False: every fold starts from base_model itself and is
# saved to <base_model_save_dir>/model_fold_<fold>_simplerAlgo.
train_sent_transformer_cv(fold_dir=easy_hard_data_dir, out_dir=base_model_save_dir, model_dir=base_model, hyperparams=hyperparams, use_crossencoder=True, kfolds=[0,1,2,3,4,5], continue_training=False)

In [None]:
# Afterwards, we will use the distance data to train a bi-encoder
final_model_save_dir = os.path.join(BASE_PATH, 'models/finalModel_robertabase_simplerAlgo')

# 'num_labels' is only read by train_crossencoder; the bi-encoder path used
# here does not look at it.
hyperparams = {
    'ep': 5,
    'bs': 8, 
    'logging': 500,
    'lr': 2e-5,
    'weight_decay': 0.01,
    'num_labels': 1,
    'task_type': 'sts',
    'val_type': 'sts'
}

# continue_training=True: fold i starts from
# <base_model_save_dir>/model_fold_<i>_simplerAlgo, i.e. the per-fold model
# directory that the cross-encoder cell above saved to.
train_sent_transformer_cv(fold_dir=distance_data_dir, out_dir=final_model_save_dir, model_dir=base_model_save_dir, hyperparams=hyperparams, use_crossencoder=False, kfolds=[0,1,2,3,4,5], continue_training=True)