# README

# Setup

In [None]:
!pip install torch
!pip install transformers
!pip install numpy
!pip install pandas
!pip install sentence-transformers
!pip install sklearn
!pip install datasets


In [None]:
import numpy as np
import pandas as pd
import math
import itertools
import random
import torch
import os
import gzip
import json
from tqdm import tqdm
from torch import nn
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.metrics import mean_squared_error
from sentence_transformers import SentenceTransformer, util, losses, models
from transformers import AutoConfig, AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers import AutoModelForMaskedLM, DataCollatorForWholeWordMask, DataCollatorForLanguageModeling, pipeline
from transformers import AdamW, get_linear_schedule_with_warmup, TrainerCallback
from sklearn.model_selection import StratifiedKFold
import shutil
from datasets import load_metric
import gc
gc.enable()
from sklearn.svm import SVR, LinearSVR
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import Lasso, BayesianRidge, Perceptron, SGDRegressor

In [None]:
# Mount Google Drive at ./gdrive (relative to the working directory); BASE_PATH below
# is built on top of this mount point. Requires the Google Colab runtime.
from google.colab import drive
drive.mount('gdrive')

# Constants

In [None]:
# Root folder on the mounted Drive; all data and model paths in this notebook are joined onto it.
BASE_PATH = 'gdrive/My Drive/colabNotebooks/commonLitReadabilityPrize/firstPlace_CodeFiles'

In [None]:
def seed_everything(seed=1234):
    """Seed Python, NumPy and PyTorch (CPU + current CUDA device) RNGs.

    Also exports PYTHONHASHSEED and switches cuDNN to deterministic mode.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)
    torch.backends.cudnn.deterministic = True


# Maximum number of tokens per excerpt passed to the tokenizers.
MAX_LENGTH = 256
# Global seed, also handed to TrainingArguments in train_model.
SEED = 28
seed_everything(seed=SEED)

In [None]:

# Output directories for the fine-tuned models. train_cv_v2 writes each fold's best
# checkpoint to `<dir>/model_fold_<k>/best`.
# Adjust these paths if you have saved the models in different directories.
# Some folders are shared with the pretraining stage further down (e.g. ALBERT_PRETRAINED
# resolves to the same folder as ALBERT_TRAINED_1); the pretrained checkpoint is kept in
# that folder's `best/` sub-directory.
ALBERT_TRAINED_1 = os.path.join(BASE_PATH, 'models/albertxxlarge2models')
ALBERT_TRAINED_2 = os.path.join(BASE_PATH, 'models/albertxxlargelowlr')
ALBERT_TRAINED_3 = os.path.join(BASE_PATH, 'models/albertxxlargealldata')
DEBERTA_TRAINED_1 = os.path.join(BASE_PATH, 'models/debertalarge')
DEBERTA_TRAINED_2 = os.path.join(BASE_PATH, 'models/debertalargelowlr')
DEBERTA_TRAINED_3 = os.path.join(BASE_PATH, 'models/debertabootstrap')
ROBERTA_TRAINED_1 = os.path.join(BASE_PATH, 'models/robertalargetwomodels')
ELECTRA_TRAINED_1 = os.path.join(BASE_PATH, 'models/electralarge')

# Functions

In [None]:
def train_model(
    model_dir,
    out_dir,
    data,
    data_labels,
    test_data=None,
    test_labels=None,
    do_eval=False,
    do_epoch_eval=False,
    do_save_best=False,
    hyperparams=None,
    cfg=None
    ):
  """Fine-tune a Hugging Face sequence-classification model with ``Trainer``.

  Args:
    model_dir: Model name or checkpoint directory for the tokenizer and model.
    out_dir: Directory for Trainer output and the saved model. With
      ``do_save_best`` the best checkpoint goes to ``<out_dir>/best``.
    data: List of training texts.
    data_labels: Training labels, indexable in the same order as ``data``.
    test_data: Validation texts; required when ``do_save_best`` is true.
    test_labels: Validation labels; required when ``do_save_best`` is true.
    do_eval: Unused; kept for interface compatibility.
    do_epoch_eval: Unused; kept for interface compatibility.
    do_save_best: If true, score the validation set at every Trainer logging
      event and keep the checkpoint with the lowest score.
    hyperparams: Dict with ``bs``, ``lr``, ``ep``, ``bias``, ``init`` and
      ``weight_decay``. Missing keys fall back to defaults. The caller's dict
      is never modified.
    cfg: Dict with ``num_labels``, ``logging_steps``, ``is_multilabel`` and
      ``keep_layers``. Missing keys fall back to defaults.

  Returns:
    The lowest validation score observed when ``do_save_best`` is true,
    otherwise ``None`` (the final model is then saved directly to ``out_dir``).

  Raises:
    ValueError: if ``do_save_best`` is true but validation data is missing.
  """
  # Work on private copies so that neither the caller's dicts nor shared
  # defaults are mutated, and so that partially specified dicts still work.
  default_hyperparams = {'bs': 16, 'lr': 1e-4, 'ep': 5, 'bias': False, 'init': None, 'weight_decay': 0.0}
  default_cfg = {'num_labels': 1, 'logging_steps': 500, 'is_multilabel': False, 'keep_layers': None}
  hyperparams = dict(hyperparams or {})
  for key, value in default_hyperparams.items():
    hyperparams.setdefault(key, value)
  cfg = dict(cfg or {})
  for key, value in default_cfg.items():
    cfg.setdefault(key, value)

  if do_save_best and (test_data is None or test_labels is None):
    raise ValueError('do_save_best=True requires test_data and test_labels')

  tokenizer = AutoTokenizer.from_pretrained(model_dir)
  train_encodings = tokenizer(data, truncation=True, padding=True, max_length=MAX_LENGTH)

  class LitDataset(torch.utils.data.Dataset):
    """Serves tokenizer encodings plus a ``labels`` entry as tensors."""

    def __init__(self, encodings, labels):
      self.encodings = encodings
      self.labels = labels

    def __getitem__(self, idx):
      item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
      item['labels'] = torch.tensor(self.labels[idx])
      return item

    def __len__(self):
      return len(self.labels)

  train_dataset = LitDataset(train_encodings, data_labels)

  # Number of optimizer steps for the linear schedule: batches per epoch
  # (last partial batch included) times epochs; 6% of them are warm-up.
  steps_per_epoch = math.ceil(len(train_dataset) / hyperparams['bs'])
  training_steps = steps_per_epoch * hyperparams['ep']
  warmup_steps = math.ceil(training_steps * 0.06)

  training_args = TrainingArguments(
      output_dir=out_dir,                              # output directory
      num_train_epochs=hyperparams['ep'],              # total number of training epochs
      per_device_train_batch_size=hyperparams['bs'],   # batch size per device during training
      per_device_eval_batch_size=1,                    # batch size per device during evaluation
      logging_dir='/tmp/logs',                         # directory for storing logs
      logging_steps=cfg['logging_steps'],              # also the cadence of EvalCallback.on_log
      seed=SEED,
      weight_decay=hyperparams['weight_decay'],
      learning_rate=hyperparams['lr'],
      save_strategy='no'                               # checkpoints are saved manually below
  )

  # NOTE(review): `hidden_dropout` / `attention_probs_dropout` from `hyperparams`
  # are not applied: the model is built from the checkpoint's own config. Likewise
  # `weight_decay` only reaches TrainingArguments, while the optimizer below is
  # created without it. Left as is to keep training behaviour unchanged; confirm
  # whether this is intended.
  model = AutoModelForSequenceClassification.from_pretrained(model_dir, num_labels=cfg['num_labels'])
  if hyperparams['init']:
    # `reinitialize_layers` is not defined in the visible part of this notebook.
    model = reinitialize_layers(model, hyperparams['init'])
  model.config = AutoConfig.from_pretrained(model_dir, num_labels=cfg['num_labels'])
  model.num_labels = cfg['num_labels']
  if cfg['keep_layers']:
    # Keep only the encoder layers whose index is listed in cfg['keep_layers'].
    new_layers = torch.nn.ModuleList([layer_module for i, layer_module in enumerate(model.base_model.encoder.layer) if i in cfg['keep_layers']])
    model.base_model.encoder.layer = new_layers
    model.config.num_hidden_layers = len(cfg['keep_layers'])

  optimizer = AdamW(model.parameters(), correct_bias=hyperparams['bias'], lr=hyperparams['lr'])
  scheduler = get_linear_schedule_with_warmup(optimizer=optimizer, num_training_steps=training_steps, num_warmup_steps=warmup_steps)
  scores = []
  # The accuracy metric is only needed on the multilabel path.
  metric = load_metric("accuracy") if cfg['is_multilabel'] else None

  class EvalCallback(TrainerCallback):
    """Scores the validation set on every log event and keeps the best checkpoint."""

    def on_log(self, args, state, control, **kwargs):
      if not do_save_best:
        return
      model = kwargs['model']
      y_pred = predict_fast(init_model=model, tokenizer=tokenizer, data=test_data, num_labels=cfg['num_labels'], is_multilabel=cfg['is_multilabel'])
      model.train()  # predict_fast switches the model to eval mode
      if cfg['is_multilabel']:
        curr_score = metric.compute(predictions=y_pred, references=test_labels)['accuracy']
      else:
        curr_score = rms(test_labels, y_pred)
      print('Score: ', curr_score)

      # NOTE(review): "lower is better" is also applied to accuracy on the
      # multilabel path, which looks inverted - confirm before relying on it.
      if len(scores) == 0 or min(scores) > curr_score:
        print(f'is min {curr_score} is smaller than {scores}')
        save_dir = os.path.join(out_dir, 'best')
        model.save_pretrained(save_dir)
        tokenizer.save_pretrained(save_dir)
        # Record the run settings next to the checkpoint without touching `hyperparams`.
        run_record = dict(hyperparams)
        run_record['score'] = curr_score
        run_record['step'] = state.global_step
        run_record['trainset_size'] = len(data_labels)
        with open(os.path.join(save_dir, 'hyperparams.txt'), 'w') as f:
          f.write(json.dumps(run_record))
      scores.append(curr_score)

  trainer = Trainer(
      model=model,                         # the instantiated Transformers model to be trained
      args=training_args,                  # training arguments, defined above
      train_dataset=train_dataset,         # training dataset
      optimizers=(optimizer, scheduler),   # custom optimizer / LR schedule
      callbacks=[EvalCallback]             # validation scoring + best-checkpoint saving
  )

  trainer.train()

  if not do_save_best:
    model.save_pretrained(out_dir)
    tokenizer.save_pretrained(out_dir)
  print('Training done')

  if do_save_best:
    del model
    gc.collect()
    return min(scores)

In [None]:
def train_cv_v2(model_dir, out_dir, fold_dir, hyperparams, cfg, kfolds=[0, 1, 2, 3, 4, 5], continue_training=False, deduplicate=False, soft_label_model=None):
  """Train one model per cross-validation fold and record the mean best score.

  For every fold ``k`` in ``kfolds`` the files ``train_fold_<k>.csv`` and
  ``val_fold_<k>.csv`` are read from ``fold_dir`` (columns ``excerpt`` and
  ``target``), and ``train_model`` is run with ``do_save_best=True`` into
  ``<out_dir>/model_fold_<k>``.

  Args:
    model_dir: Starting checkpoint. With ``continue_training`` the per-fold
      checkpoint ``<model_dir>/model_fold_<k>/best`` is used instead.
    out_dir: Parent directory for the fold models and ``eval.txt``.
    fold_dir: Directory holding the fold CSV files.
    hyperparams: Passed through to ``train_model``.
    cfg: Passed through to ``train_model``. The optional key ``soft_labels``
      (``'add'``, ``'only'`` or ``None``/absent) controls pseudo-labelling.
    kfolds: Fold indices to train.
    continue_training: See ``model_dir``.
    deduplicate: Drop training rows with duplicate ``excerpt`` values.
    soft_label_model: Optional directory of fold models
      (``<dir>/model_fold_<k>/best``) used to produce soft labels.

  Returns:
    The mean of the per-fold best scores (also written to ``<out_dir>/eval.txt``).
  """
  # `.get` so that cfgs without the key behave as if soft labels were disabled.
  soft_labels = cfg.get('soft_labels')
  scores = []
  for fold in kfolds:
    train_df = pd.read_csv(fold_dir + '/train_fold_' + str(fold) + '.csv')
    val_df = pd.read_csv(fold_dir + '/val_fold_' + str(fold) + '.csv')
    if deduplicate:
      train_df = train_df.drop_duplicates(subset=['excerpt'])
    train_tx = [str(t) for t in train_df.excerpt.values]
    train_sc = [float(t) for t in train_df.target.values]
    val_tx = [str(t) for t in val_df.excerpt.values]
    val_sc = [float(t) for t in val_df.target.values]

    model_out_dir = out_dir + '/model_fold_' + str(fold)
    if continue_training:
      final_model_dir = model_dir + '/model_fold_' + str(fold) + '/best'
    else:
      final_model_dir = model_dir

    fold_soft_model = (soft_label_model + '/model_fold_' + str(fold) + '/best') if soft_label_model else None

    if soft_labels == 'add':
      # Append a second copy of the texts labelled by the starting checkpoint.
      preds = predict_fast(final_model_dir, train_tx)
      train_tx = train_tx + train_tx
      train_sc = train_sc + preds
      if fold_soft_model:
        # Unchanged from the original: the (already doubled) set is doubled
        # again with labels from the soft-label model.
        # NOTE(review): confirm this 4x expansion is intended.
        preds = predict_fast(fold_soft_model, train_tx)
        train_sc = train_sc + preds
        train_tx = train_tx + train_tx
    elif soft_labels == 'only':
      # Replace the gold labels. When a soft-label model is given only its
      # predictions are kept, so the starting checkpoint is not run at all.
      train_sc = predict_fast(fold_soft_model or final_model_dir, train_tx)

    best_score = train_model(
        model_dir=final_model_dir,
        out_dir=model_out_dir,
        data=train_tx,
        data_labels=train_sc,
        test_data=val_tx,
        test_labels=val_sc,
        do_save_best=True,
        hyperparams=hyperparams,
        cfg=cfg
      )
    scores.append(best_score)

  cv_score = np.mean(scores)
  os.makedirs(out_dir, exist_ok=True)
  with open(out_dir + '/eval.txt', 'w') as f:
    f.write('CV score is ' + str(cv_score))
  return cv_score

In [None]:
def predict_fast(model_name=None, data=None, init_model=None, tokenizer=None, num_labels=1, is_multilabel=False, output_logits=False, use_softmax=False):
  """Run batched inference (batch size 32) and return one prediction per text.

  Args:
    model_name: Checkpoint name/directory. When given, tokenizer and model
      are loaded from it and ``init_model`` / ``tokenizer`` are ignored.
    data: List of input texts.
    init_model: Already instantiated model, used when ``model_name`` is falsy.
    tokenizer: Tokenizer matching ``init_model``.
    num_labels: Number of output labels used when loading from ``model_name``.
    is_multilabel: If true (and ``output_logits`` is false) the arg-max class
      index is returned for each text.
    output_logits: If true, return raw scores even when ``is_multilabel``.
    use_softmax: Apply a softmax over the last axis before post-processing.

  Returns:
    A list with one entry per text: a float for single-output models, a list
    of scores for multi-output models, or a class index in the arg-max case.
    The model is left in eval mode on the inference device.
  """
  # Use the GPU when present; fall back to CPU instead of failing.
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
  if model_name:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    config = AutoConfig.from_pretrained(model_name, num_labels=num_labels)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config)
  else:
    model = init_model
  model.to(device)
  model.eval()

  y_pred = []
  for batch in tqdm(chunks(data, 32)):
    encoded = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=MAX_LENGTH)
    # Only ids and attention mask are forwarded to the model.
    inputs = {
        'input_ids': encoded['input_ids'].to(device),
        'attention_mask': encoded['attention_mask'].to(device)
    }
    with torch.no_grad():
      outputs = model(**inputs)
    batch_scores = outputs[0]
    if use_softmax:
      batch_scores = nn.functional.softmax(batch_scores, dim=-1)
    batch_scores = batch_scores.detach().cpu().numpy()
    # Drop only a trailing singleton label axis. A bare squeeze() would also
    # remove the batch axis of a one-element batch, turning the result into a
    # scalar (or a flat row) that cannot be extended onto y_pred correctly.
    if batch_scores.ndim > 1 and batch_scores.shape[-1] == 1:
      batch_scores = batch_scores.squeeze(-1)
    logits = batch_scores.tolist()
    if is_multilabel and not output_logits:
      logits = np.argmax(logits, axis=-1)
    y_pred.extend(logits)
  del model
  gc.collect()
  return y_pred

In [None]:
def get_oof_predictions(model_dirs, fold_dir, out_dir, kfolds=[0,1,2,3,4,5]):
  """Collect out-of-fold predictions of several fold-trained models into one CSV.

  For every fold ``k`` the validation file ``<fold_dir>/val_fold_<k>.csv`` is
  read and each model's ``<model_dir>/model_fold_<k>/best`` checkpoint predicts
  its excerpts. The result has the columns ``fold``, ``excerpt``, ``target``,
  ``id`` plus one prediction column per model, named after the last path
  component of the model directory.

  Args:
    model_dirs: Directories of fold-trained models.
    fold_dir: Directory containing the ``val_fold_<k>.csv`` files.
    out_dir: Path of the CSV file to write (passed straight to ``to_csv``).
    kfolds: Fold indices to process.
  """
  fold_frames = []

  for fold in kfolds:
    val_df = pd.read_csv(fold_dir + '/val_fold_' + str(fold) + '.csv')
    val_tx = [str(t) for t in val_df.excerpt.values]
    val_sc = [float(t) for t in val_df.target.values]
    fold_df = pd.DataFrame()
    fold_df['fold'] = [fold for v in val_sc]
    fold_df['excerpt'] = val_tx
    fold_df['target'] = val_sc
    fold_df['id'] = val_df['id']

    for model in model_dirs:
      final_model_dir = model + '/model_fold_' + str(fold) + '/best'
      model_name = model.split('/')[-1]
      preds = predict_fast(final_model_dir, val_tx)
      fold_df[model_name] = preds
    fold_frames.append(fold_df)

  # DataFrame.append was removed in pandas 2.0; concatenate once at the end.
  df = pd.concat(fold_frames, ignore_index=True) if fold_frames else pd.DataFrame()
  df.to_csv(out_dir)

In [None]:
def train_leaky_ensembler(oof_dir, model_names, out_dir, kfolds=[0,1,2,3,4,5], model_bins=[], clf='ridge', find_opt_avg=False, bin_avg_dir=None, use_postprocessing=False):
  """Fit one stacking regressor per fold on the out-of-fold prediction CSV.

  Args:
    oof_dir: Path of the CSV written by ``get_oof_predictions``.
    model_names: Prediction columns used as features.
    out_dir: Directory receiving ``model_fold_<k>/ridge_model.joblib`` per
      fold and the summary file ``eval.txt``.
    kfolds: Fold indices; fold ``k`` is held out while fitting on the others.
    model_bins: Columns holding JSON-encoded per-bin scores. They are used as
      extra features unless ``use_postprocessing`` is true.
    clf: Estimator key (``'ridge'``, ``'linearsvr'``, ``'svr'``, ``'kernel'``,
      ``'gbr'``, ``'linear'``, ``'lasso'``, ``'bayes'``, ``'perceptron'``) or
      an estimator object exposing ``fit`` / ``predict``.
    find_opt_avg: Hold out a random ~20% of the rows, predict them with every
      fold model and pass the result to ``find_best_stack``.
    bin_avg_dir: JSON file with per-bin averages; each bin prediction is
      replaced by the average of its arg-max bin.
    use_postprocessing: Post-process the fold predictions with
      ``postprocess_predictions`` using ``model_bins[0]`` and ``bin_avg_dir``.

  Raises:
    ValueError: if ``clf`` is a string that is not a known estimator key.
  """
  # One factory per key so every fold gets a fresh estimator and the `clf`
  # argument itself is never rebound.
  estimator_factories = {
      'ridge': lambda: Ridge(alpha=1.0),
      'linearsvr': lambda: LinearSVR(max_iter=1000000),
      'svr': SVR,
      'kernel': KernelRidge,
      'gbr': GradientBoostingRegressor,
      'linear': LinearRegression,
      'lasso': Lasso,
      'bayes': BayesianRidge,
      'perceptron': SGDRegressor,  # the key says perceptron, the estimator is SGDRegressor
  }
  if isinstance(clf, str) and clf not in estimator_factories:
    raise ValueError('Unknown clf: ' + clf)

  df = pd.read_csv(oof_dir)

  if find_opt_avg:
    msk = np.random.rand(len(df)) < 0.2
    df_test = df[msk].reset_index(drop=True)
    df = df[~msk].reset_index(drop=True)

  # Defined outside the visible part of this notebook and its return value is
  # ignored - presumably it (re)assigns df['fold'] in place; TODO confirm.
  get_bin_stratified(df, n_splits=6)

  results = []
  if find_opt_avg:
    avg_df = pd.DataFrame()
    avg_df['target'] = [float(f) for f in df_test['target']]

  for fold in kfolds:
    train_df = df.loc[df.fold!=fold].reset_index(drop=True)
    val_df = df.loc[df.fold==fold].reset_index(drop=True)

    val_sc = [float(f) for f in val_df.target.values]
    train_sc = [float(f) for f in train_df.target.values]

    train_predictions = []
    val_predictions = []
    avg_predictions = []

    if len(model_bins) > 0 and not use_postprocessing:
      for model_name in model_bins:
        preds = [json.loads(p) for p in train_df[model_name].values]
        preds_val = [json.loads(p) for p in val_df[model_name].values]
        if bin_avg_dir:
          with open(bin_avg_dir, 'r') as f:
            averages = json.loads(f.read())
          preds = [averages[np.argmax(p)] for p in preds]
          preds_val = [averages[np.argmax(p)] for p in preds_val]

        train_predictions.append(preds)
        val_predictions.append(preds_val)

    for model_name in model_names:
      preds = [float(f) for f in train_df[model_name].values]
      train_predictions.append(np.array(preds))
      preds_val = [float(f) for f in val_df[model_name].values]
      val_predictions.append(np.array(preds_val))
      if find_opt_avg:
        preds_avg = [float(f) for f in df_test[model_name].values]
        avg_predictions.append(np.array(preds_avg))

    X = np.column_stack(train_predictions)

    estimator = estimator_factories[clf]() if isinstance(clf, str) else clf
    estimator.fit(X, train_sc)

    final_out = out_dir + '/model_fold_' + str(fold) + '/'
    os.makedirs(os.path.dirname(final_out), exist_ok=True)
    # NOTE(review): `dump` is not imported in the visible part of this
    # notebook - presumably joblib.dump; make sure it is in scope.
    dump(estimator, final_out + 'ridge_model.joblib')

    Y = np.column_stack(val_predictions)

    y_preds = estimator.predict(Y)
    if use_postprocessing:
      preds_val = [json.loads(p) for p in val_df[model_bins[0]].values]
      with open(bin_avg_dir, 'r') as f:
        averages = json.loads(f.read())
      preds_val_bins = [np.argmax(p) for p in preds_val]
      zipped = list(zip(preds_val_bins, preds_val))
      y_preds = postprocess_predictions(y_preds, zipped, averages)

    score = rms(val_sc, y_preds)
    print('Score is: ', score)
    results.append(score)

    if find_opt_avg:
      Y_test = np.column_stack(avg_predictions)
      y_preds_test = estimator.predict(Y_test)
      avg_df['fold_' + str(fold)] = y_preds_test

  if find_opt_avg:
    # `kfolds` is a list of fold ids, so iterate it directly (range() on a list raises).
    ridge_names = ['fold_' + str(fold) for fold in kfolds]
    print(find_best_stack(avg_df, ridge_names, drop_models=False))

  with open(out_dir + '/eval.txt', 'w') as f:
    mean = np.mean(results)
    print('CV ist: ', mean)
    f.write('CV is: ' + str(mean))

In [None]:
def chunks(lst, n):
    """Lazily yield consecutive slices of ``lst`` of length ``n`` (the last may be shorter)."""
    yield from (lst[start:start + n] for start in range(0, len(lst), n))

In [None]:
def rms(y_actual, y_predicted):
  """Return the root-mean-squared error between targets and predictions.

  Written without the ``squared=False`` flag of ``mean_squared_error``, which
  is deprecated in scikit-learn 1.4 and removed in 1.6. The RMSE is computed
  per output column and then averaged, which is what ``squared=False`` did.
  """
  per_output_mse = mean_squared_error(y_actual, y_predicted, multioutput='raw_values')
  return float(np.mean(np.sqrt(per_output_mse)))

# Pretraining models

In [None]:
# Pseudo-labeled excerpts used as the training set for the pretraining stage.
pseudo_labeled_csv = os.path.join(BASE_PATH, 'data/training/predicted/predicted.csv')
train_df = pd.read_csv(pseudo_labeled_csv)
train_tx = list(map(str, train_df['excerpt'].values))
train_sc = list(map(float, train_df['target'].values))

In [None]:
# Load the entire training set from the original competition for validation during pretraining.
# The texts and targets must come from val_df; reading them from train_df would validate on
# the pseudo-labeled training data itself.
val_df = pd.read_csv(os.path.join(BASE_PATH, 'data/training/original/train.csv'))
val_tx = [str(t) for t in val_df.excerpt.values]
val_sc = [float(t) for t in val_df.target.values]

In [None]:
# Train an ALBERT model
# Settings for pretraining albert-xxlarge-v2 on the pseudo-labeled data.
model_name = 'albert-xxlarge-v2'
ALBERT_PRETRAINED = os.path.join(BASE_PATH, 'models/albertxxlarge2models')

hyperparams = dict(
    bs=3,
    lr=9e-6,
    weight_decay=0.01,
    ep=5,
    bias=True,
    init=None,
    hidden_dropout=0.07,
    attention_probs_dropout=0.1,
)
cfg = dict(
    num_labels=1,
    is_multilabel=False,
    logging_steps=60,
    keep_layers=None,
    soft_labels=None,
)

In [None]:
# Pretrain ALBERT; the checkpoint with the lowest validation score seen at a
# logging step is written to ALBERT_PRETRAINED/best.
train_model(
    model_dir=model_name, out_dir=ALBERT_PRETRAINED,
    data=train_tx, data_labels=train_sc,
    test_data=val_tx, test_labels=val_sc,
    do_save_best=True, hyperparams=hyperparams, cfg=cfg,
)

In [None]:
# Train a DEBERTA model
model_name = 'microsoft/deberta-large'
hyperparams = {
  'bs': 3,
  'lr': 9e-6,
  'weight_decay': 0.1,
  'ep': 4,
  'bias': True,
  'init': None,
  'hidden_dropout': 0.1,
  'attention_probs_dropout': 0.1
}
cfg = {
  'num_labels': 1,
  'is_multilabel': False,
  'logging_steps': 20,
  'keep_layers': None,
  'soft_labels': None
}

DEBERTA_PRETRAINED = os.path.join(BASE_PATH, 'models/debertabootstrap')

# Save into DEBERTA_PRETRAINED: the fine-tuning cells load
# os.path.join(DEBERTA_PRETRAINED, 'best'), and `out_dir` is not defined yet
# at this point on a fresh kernel.
train_model(
    model_dir=model_name,
    out_dir=DEBERTA_PRETRAINED,
    data=train_tx,
    data_labels=train_sc,
    test_data=val_tx,
    test_labels=val_sc,
    do_save_best=True,
    hyperparams=hyperparams,
    cfg=cfg
)

In [None]:
# Train a RoBERTa model
model_name = 'roberta-large'
hyperparams = {
  'bs': 8,
  'lr': 1e-5,
  'weight_decay': 0.01,
  'ep': 4,
  'bias': True,
  'init': None,
  'hidden_dropout': 0.1,
  'attention_probs_dropout': 0.1
}
cfg = {
  'num_labels': 1,
  'is_multilabel': False,
  'logging_steps': 10,
  'keep_layers': None,
  'soft_labels': None
}

ROBERTA_PRETRAINED = os.path.join(BASE_PATH, 'models/robertalargetwomodels')

# Save into ROBERTA_PRETRAINED: the fine-tuning cell loads
# os.path.join(ROBERTA_PRETRAINED, 'best'), and `out_dir` is either undefined
# here or left over from a different model's cell.
train_model(
    model_dir=model_name,
    out_dir=ROBERTA_PRETRAINED,
    data=train_tx,
    data_labels=train_sc,
    test_data=val_tx,
    test_labels=val_sc,
    do_save_best=True,
    hyperparams=hyperparams,
    cfg=cfg
)

In [None]:
# Train an ELECTRA model
# Pretrain on the pseudo-labeled data; the best checkpoint goes to ELECTRA_PRETRAINED/best.
model_name = 'google/electra-large-discriminator'
ELECTRA_PRETRAINED = os.path.join(BASE_PATH, 'models/electralarge')

hyperparams = dict(
    bs=4,
    lr=8e-6,
    weight_decay=0.1,
    ep=7,
    bias=True,
    init=None,
    hidden_dropout=0.1,
    attention_probs_dropout=0.1,
)
cfg = dict(
    num_labels=1,
    is_multilabel=False,
    logging_steps=10,
    keep_layers=None,
    soft_labels=None,
)

train_model(
    model_dir=model_name, out_dir=ELECTRA_PRETRAINED,
    data=train_tx, data_labels=train_sc,
    test_data=val_tx, test_labels=val_sc,
    do_save_best=True, hyperparams=hyperparams, cfg=cfg,
)

# Training models

In total, I trained 3 deberta-large, 1 roberta-large, 3 albert-xxlarge and 1 electra-large model for my winning submission.



In [None]:
# Training the ALBERT models

In [None]:
# albert 1
# Fine-tune the best pretrained ALBERT checkpoint on every cross-validation fold.
model_name = os.path.join(ALBERT_PRETRAINED, 'best')
fold_dir = os.path.join(BASE_PATH, 'data/training/cv')
out_dir = ALBERT_TRAINED_1

hyperparams = dict(
    bs=3,
    lr=9e-6,
    weight_decay=0.01,
    ep=5,
    bias=True,
    init=None,
    hidden_dropout=0.07,
    attention_probs_dropout=0.1,
)
cfg = dict(
    num_labels=1,
    is_multilabel=False,
    logging_steps=10,
    keep_layers=None,
    soft_labels=None,
)

train_cv_v2(model_dir=model_name, out_dir=out_dir, fold_dir=fold_dir,
            hyperparams=hyperparams, cfg=cfg)

In [None]:
# albert 2
# Same starting checkpoint as albert 1, with a lower learning rate and different regularisation settings.
model_name = os.path.join(ALBERT_PRETRAINED, 'best')
fold_dir = os.path.join(BASE_PATH, 'data/training/cv')
out_dir = ALBERT_TRAINED_2

hyperparams = dict(
    bs=3,
    lr=7e-6,
    weight_decay=0.07,
    ep=5,
    bias=True,
    init=None,
    hidden_dropout=0.1,
    attention_probs_dropout=0.1,
)
cfg = dict(
    num_labels=1,
    is_multilabel=False,
    logging_steps=10,
    keep_layers=None,
    soft_labels=None,
)

train_cv_v2(model_dir=model_name, out_dir=out_dir, fold_dir=fold_dir,
            hyperparams=hyperparams, cfg=cfg)

In [None]:
# albert 3
# albert 3 is special: it is trained on all training data without evaluation.

# YANISA ADD - so I can get this pretrained model I already have from Mathis but still train
#   the new ALBERT_TRAINED_3
ALBERT_PRETRAINED = os.path.join(BASE_PATH, 'models/albertxxlarge2models')
model_name = os.path.join(ALBERT_PRETRAINED, 'best')
out_dir = ALBERT_TRAINED_3

hyperparams = dict(
    bs=3,
    lr=9e-6,
    weight_decay=0.1,
    ep=4,
    bias=True,
    init=None,
    hidden_dropout=0.1,
    attention_probs_dropout=0.1,
)
cfg = dict(
    num_labels=1,
    is_multilabel=False,
    logging_steps=600,
    keep_layers=None,
    soft_labels=None,
)

# Note: this rebinds the notebook-level train_df / train_tx / train_sc, which held the
# pseudo-labeled data during pretraining, to the original competition training set.
train_df = pd.read_csv(os.path.join(BASE_PATH, 'data/training/original/train.csv'))
train_tx = list(map(str, train_df['excerpt'].values))
train_sc = list(map(float, train_df['target'].values))

# No validation data and no do_save_best: the final model is saved directly to out_dir.
train_model(
    model_dir=model_name, out_dir=out_dir,
    data=train_tx, data_labels=train_sc,
    hyperparams=hyperparams, cfg=cfg,
)


In [None]:
# Training the deberta models

In [None]:
# deberta 1
# Fine-tune the best pretrained DeBERTa checkpoint on every cross-validation fold.
model_name = os.path.join(DEBERTA_PRETRAINED, 'best')
fold_dir = os.path.join(BASE_PATH, 'data/training/cv')
out_dir = DEBERTA_TRAINED_1

hyperparams = dict(
    bs=3,
    lr=9e-6,
    weight_decay=0.1,
    ep=4,
    bias=True,
    init=None,
    hidden_dropout=0.1,
    attention_probs_dropout=0.1,
)
cfg = dict(
    num_labels=1,
    is_multilabel=False,
    logging_steps=10,
    keep_layers=None,
    soft_labels=None,
)

train_cv_v2(model_dir=model_name, out_dir=out_dir, fold_dir=fold_dir,
            hyperparams=hyperparams, cfg=cfg)

In [None]:
# deberta 2
# Same as deberta 1 but with a lower learning rate.
model_name = os.path.join(DEBERTA_PRETRAINED, 'best')
fold_dir = os.path.join(BASE_PATH, 'data/training/cv')
out_dir = DEBERTA_TRAINED_2

hyperparams = dict(
    bs=3,
    lr=7e-6,
    weight_decay=0.1,
    ep=4,
    bias=True,
    init=None,
    hidden_dropout=0.1,
    attention_probs_dropout=0.1,
)
cfg = dict(
    num_labels=1,
    is_multilabel=False,
    logging_steps=10,
    keep_layers=None,
    soft_labels=None,
)

train_cv_v2(model_dir=model_name, out_dir=out_dir, fold_dir=fold_dir,
            hyperparams=hyperparams, cfg=cfg)

In [None]:
# deberta 3
# This deberta model was trained on data sampled using bootstrapping instead of cross validation
# Only models trained on 2 folds/bags were used in the final submission
model_name = os.path.join(DEBERTA_PRETRAINED, 'best')
fold_dir = os.path.join(BASE_PATH, 'data/training/cv')
out_dir = DEBERTA_TRAINED_3

hyperparams = dict(
    bs=3,
    lr=9e-6,
    weight_decay=0.08,
    ep=4,
    bias=True,
    init=None,
    hidden_dropout=0.1,
    attention_probs_dropout=0.1,
)
cfg = dict(
    num_labels=1,
    is_multilabel=False,
    logging_steps=10,
    keep_layers=None,
    soft_labels=None,
)

# Restrict training to the first two folds/bags.
train_cv_v2(model_dir=model_name, out_dir=out_dir, fold_dir=fold_dir,
            hyperparams=hyperparams, cfg=cfg, kfolds=[0,1])

In [None]:
# Training the ELECTRA model

In [None]:
# electra 1
# Fine-tune the best pretrained ELECTRA checkpoint on every cross-validation fold.
model_name = os.path.join(ELECTRA_PRETRAINED, 'best')
fold_dir = os.path.join(BASE_PATH, 'data/training/cv')
out_dir = ELECTRA_TRAINED_1

hyperparams = dict(
    bs=3,
    lr=8e-6,
    weight_decay=0.1,
    ep=5,
    bias=True,
    init=None,
    hidden_dropout=0.1,
    attention_probs_dropout=0.1,
)
cfg = dict(
    num_labels=1,
    is_multilabel=False,
    logging_steps=10,
    keep_layers=None,
    soft_labels=None,
)

train_cv_v2(model_dir=model_name, out_dir=out_dir, fold_dir=fold_dir,
            hyperparams=hyperparams, cfg=cfg)

In [None]:
# Training the RoBERTa model

In [None]:
# roberta 1
# Fine-tune the best pretrained RoBERTa checkpoint on every cross-validation fold.
model_name = os.path.join(ROBERTA_PRETRAINED, 'best')
fold_dir = os.path.join(BASE_PATH, 'data/training/cv')
out_dir = ROBERTA_TRAINED_1

hyperparams = dict(
    bs=8,
    lr=1e-5,
    weight_decay=0.1,
    ep=4,
    bias=True,
    init=None,
    hidden_dropout=0.1,
    attention_probs_dropout=0.1,
)
cfg = dict(
    num_labels=1,
    is_multilabel=False,
    logging_steps=10,
    keep_layers=None,
    soft_labels=None,
)

train_cv_v2(model_dir=model_name, out_dir=out_dir, fold_dir=fold_dir,
            hyperparams=hyperparams, cfg=cfg)

# Stacking

In [None]:
# Fold-trained models whose out-of-fold predictions feed the stacking models.
# Each entry becomes one column named after the last path component, so a
# directory listed twice is predicted twice and simply overwrites its own column.
# The fourth entry used to repeat DEBERTA_TRAINED_1; DEBERTA_TRAINED_2 is the only
# model trained on all folds that was missing from the list.
# NOTE(review): confirm this matches the feature order expected at inference time.
model_dirs = [
    ALBERT_TRAINED_1,
    DEBERTA_TRAINED_1,
    ALBERT_TRAINED_2,
    DEBERTA_TRAINED_2,
    ROBERTA_TRAINED_1,
    ELECTRA_TRAINED_1
]

fold_dir = os.path.join(BASE_PATH, 'data/training/cv')
# Despite the name, this is the path of the CSV file that get_oof_predictions writes.
out_dir = os.path.join(BASE_PATH, 'data/training/oof')

In [None]:
# Predict every validation fold with each model's matching fold checkpoint and write
# the combined table to `out_dir` (used as a CSV file path by get_oof_predictions).
get_oof_predictions(model_dirs=model_dirs, fold_dir=fold_dir, out_dir=out_dir)

In [None]:
# Feature columns for the stackers. get_oof_predictions names each prediction column
# after the last path component of its model directory, so derive the names from
# `model_dirs` (defined in the stacking setup cell above) with the same rule instead
# of maintaining a second hand-written list that can drift from the OOF file.
model_names_ensemble_1 = [model_dir.split('/')[-1] for model_dir in model_dirs]

# Ensemble 2 uses the same columns without the last one.
model_names_ensemble_2 = model_names_ensemble_1[:-1]

# oof_dir = os.path.join(BASE_PATH, 'data/training/oof')

# out_dir_ensemble_1 = os.path.join(BASE_PATH, 'models/electraensembling')
# out_dir_ensemble_2 = os.path.join(BASE_PATH, 'models/hugeensembler')
# ^^ YANISA COMMENT - PUT BELOW

In [None]:
# YANISA ADD BLOCK FROM ABOVE ^^

# Path of the out-of-fold prediction CSV (same path that get_oof_predictions wrote to).
oof_dir = os.path.join(BASE_PATH, 'data/training/oof')

# Output directories for the two stacking models.
out_dir_ensemble_1 = os.path.join(BASE_PATH, 'models/electraensembling')
out_dir_ensemble_2 = os.path.join(BASE_PATH, 'models/hugeensembler')

In [None]:
# Train ensemble 1: default ridge stacker on the columns in model_names_ensemble_1;
# one fitted model per fold is written below out_dir_ensemble_1.
train_leaky_ensembler(oof_dir=oof_dir, model_names=model_names_ensemble_1, out_dir=out_dir_ensemble_1)

In [None]:
# Train ensemble 2: default ridge stacker on the columns in model_names_ensemble_2;
# one fitted model per fold is written below out_dir_ensemble_2.
train_leaky_ensembler(oof_dir=oof_dir, model_names=model_names_ensemble_2, out_dir=out_dir_ensemble_2)

You have finished training the models.

**YANISA ADD** - I don't think I have to train anything here? Mathis included it on GitHub.