In [None]:
# Colab bootstrap: mount Google Drive and make the project's `config` module importable.
import sys

from google.colab import drive
# force_remount=True is passed so the mount is redone even when a mount already exists.
drive.mount('/content/drive', force_remount=True)
# The project checkout lives on Drive; putting it on sys.path lets `config` be imported below.
sys.path.append("/content/drive/MyDrive/DL4NLP/abstract-to-title-generation")
# NOTE(review): star import — PROJECT_ROOT and DATA_DIR used by later cells presumably come from here; confirm.
from config import *

In [None]:
import os

# `!cd ...` runs in a throwaway subshell and leaves the kernel's working directory
# untouched, so later relative paths (e.g. "requirements.txt") would not resolve
# against the project root. os.chdir changes the directory of the kernel itself.
os.chdir(PROJECT_ROOT)
# Make the project's own modules and its bundled dependencies importable.
sys.path.append(f"{PROJECT_ROOT}/src")
sys.path.append(f"{PROJECT_ROOT}/deps")

In [None]:
# Install the project's requirements; "requirements.txt" is resolved against the current
# working directory. NOTE(review): the output redirect also hides install failures — confirm
# that silencing is intended.
!pip install -r "requirements.txt" &> /dev/null

In [None]:
#!dvc pull -f # <- uncomment to pull data from dvc

In [None]:
import pandas as pd
import numpy as np
import torch
import datasets
from torch.utils.data import Dataset, DataLoader
from datasets import Dataset
from tqdm import trange 
from transformers import AutoConfig, AutoTokenizer
import torch.nn as nn
from torch import optim
from transformers import BertModel,BertPreTrainedModel
import torch.nn as nn
from scipy import stats
import os
from pathlib import Path
import matplotlib.pyplot as plt
import math
import time
import datetime
import model_utils
import dataset_utils
from fast_soft_sort.pytorch_ops import soft_rank
from sklearn.model_selection import train_test_split
import copy
from scipy.interpolate import make_interp_spline, BSpline

In [None]:
## Model Configurations
# Hyperparameters and run switches read by the cells below via p[...].
p = dict(
    max_len=512,
    batch_size=6,
    lr=4.0638e-05,  # 4.0638e-06
    epochs=16,  # 18
    humor_epochs=18,  # 18
    train_runs=3,
    humor_train_runs=4,
    dropout=0.5,
    num_threads=1,
    model_name='allenai/scibert_scivocab_uncased',
    train_quality=False,
    train_humor=True,
    random_seed=24,
)

# Seed through the project helper before any model or data loader is created.
model_utils.setup_seed(p['random_seed'])
# Sample count of the annotated dataset; used in the dataset file name and the run name.
df_size = 230

In [None]:
# Run name = dataset size + learning rate + epoch count + wall-clock timestamp,
# so every execution of this cell yields a fresh output directory.
_run_stamp = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d__%H_%M_%S')
model_name = f"finetuned_size{df_size}_lr{p['lr']}_ep{p['epochs']}_{_run_stamp}"
# Directory that receives this run's stats CSVs and the saved weights.
model_save_path = f"{PROJECT_ROOT}/evaluation_models/reward_model/{model_name}"

def save_training_stats(run, phase_title, training_stats, save_path):
  """Write one phase's training statistics to `<save_path>/<run>_<phase_title>_stats.csv`.

  `training_stats` is passed through `np.array` and must form a three-column
  table, labelled episode / accuracy / correlation. `save_path` is created
  (including parents) when it does not exist yet.
  """
  out_dir = Path(save_path)
  out_dir.mkdir(parents=True, exist_ok=True)

  table = pd.DataFrame(np.array(training_stats))
  table.columns = ["episode", "accuracy", "correlation"]

  csv_file = f"{save_path}/{run}_{phase_title}_stats.csv"
  table.to_csv(csv_file)

def save_model(model, save_path):
  """Store `model.state_dict()` as `<save_path>/model.pth`, creating the directory if needed."""
  target_dir = f"{save_path}"
  Path(target_dir).mkdir(parents=True, exist_ok=True)
  weights_file = f"{target_dir}/model.pth"
  torch.save(model.state_dict(), weights_file)
  

## Fine Tuning

In [None]:
## Pretrained configuration and tokenizer for the configured checkpoint
aconfig = AutoConfig.from_pretrained(p['model_name'])
tokenizer = AutoTokenizer.from_pretrained(p['model_name'])
## Regression model built on top of the pretrained transformer
model = model_utils.HumorBertRegresser.from_pretrained(p['model_name'], config=aconfig)

# Freeze every parameter whose name contains none of these substrings, i.e. only
# the BERT pooler and `regressor.1` remain trainable.
unfreeze_layers = ['bert.pooler', 'regressor.1']
for name, params in model.named_parameters():
  params.requires_grad = any(ele in name for ele in unfreeze_layers)

# Sanity check: list what is still trainable.
for name, params in model.named_parameters():
  if not params.requires_grad:
    continue
  print(name, params.size())

# Use the first GPU when one is available, otherwise the CPU.
device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
model = model.to(device)
# Adam is handed every parameter of the model, including the frozen ones.
optimizer = optim.Adam(params=model.parameters(), lr=p['lr'])

In [None]:
# Loss building blocks: mean squared error and cosine similarity along dim 0.
mse = nn.MSELoss()
cos = nn.CosineSimilarity(dim=0)
# Exploratory combined loss (cosine distance + MSE); it is only used for the prints
# below, because `criterion` is rebound to plain MSE at the end of this cell.
criterion = lambda a, b: 1.0 - cos(a, b) + mse(a, b)
# Print the combined loss on a few hand-picked inputs.
print(criterion(torch.as_tensor(1.0), torch.as_tensor(1.0)))
print(criterion(torch.as_tensor([0.0, 0.1]), torch.as_tensor([0.0, 1.0])))
print(criterion(torch.as_tensor(-1.0), torch.as_tensor(1.0)))
print(criterion(torch.as_tensor(0.1), torch.as_tensor(1.0)))
#print(cos(torch.as_tensor(0.0), torch.as_tensor(0.0)))
#print(cos(torch.as_tensor(1.0), torch.as_tensor(1.0)))
#print(cos(torch.as_tensor(0.1), torch.as_tensor(1.0)))
# Plain MSE on one of the inputs above, for comparison with the combined loss.
print(mse(torch.as_tensor([0.0, 0.1]), torch.as_tensor([0.0, 1.0])))
# The training cells pass `criterion`, so from here on they optimise plain MSE.
criterion = mse


def corrcoef(target, pred):
    """Correlation coefficient of two tensors, computed with torch operations.

    Each input is mean-centred and scaled to unit norm; the result is the sum of
    the element-wise product of the two normalised tensors.

    Torch version of np.corrcoef from @mdo:
    https://forum.numer.ai/t/custom-loss-functions-for-xgboost-using-pytorch/960
    """
    def _centre_and_scale(values):
        centred = values - values.mean()
        return centred / centred.norm()

    return (_centre_and_scale(pred) * _centre_and_scale(target)).sum()

def spearman(
    pred,
    target,
    regularization="l2",
    regularization_strength=1.0,
):
    """Soft-rank based correlation between `pred` and `target`.

    `pred` is replaced by its soft ranks (fast_soft_sort's `soft_rank`, called
    with the given regularization settings), the ranks are divided by the size of
    their last dimension, and the result is correlated with `target` through
    `corrcoef`. `target` is used as given; it is not ranked here.
    """
    # fast_soft_sort uses 1-based indexing; dividing by the length gives the rank as a fraction
    soft_ranks = soft_rank(
        pred,
        regularization=regularization,
        regularization_strength=regularization_strength,
    )
    rank_fraction = soft_ranks / soft_ranks.shape[-1]
    return corrcoef(target, rank_fraction)


def corr_loss(a, b):
  """Combined loss: (1 - spearman(a, b)) plus the MSE between `a` and `b`.

  Both tensors must have the same shape; a mismatch trips the assertion.
  """
  assert a.shape == b.shape, f"{a.shape} != {b.shape}"
  rank_term = 1.0 - spearman(a, b)
  return rank_term + mse(a, b)
# Smoke-check corr_loss on two tiny hand-written inputs: a 2x2 batch and a single row,
# each compared against the same values with the two columns swapped.
print(corr_loss(torch.as_tensor([[0.1, 1.0], [1.0, 0.1]]), torch.as_tensor([[1.0, 0.1], [0.1, 1.0]])))
print(corr_loss(torch.as_tensor([[0.1, 1.0]]), torch.as_tensor([[1.0, 0.1]])))

### Generate training data

In [None]:
# Annotated title-quality data; the file is selected by `df_size`.
annotations = pd.read_json(f'{DATA_DIR}/annotated/dataset_{df_size}samples.json')
# Replace the column labels from the JSON file with the positions 0..5.
annotations.columns = list(range(6))

# The project helper returns three loaders, unpacked here as train / dev / test.
quality_loaders = dataset_utils.gen_datasets(
    tokenizer, annotations, p["max_len"], p["batch_size"], p["num_threads"]
)
quality_train_loader, quality_dev_loader, quality_test_loader = quality_loaders

### Training

In [None]:
def map_quality(output, target):
  """Select column 0 of both the model output and the target.

  Returns the pair `(output[:, 0], target[:, 0])`.
  """
  quality_pred = output[:, 0]
  quality_true = target[:, 0]
  return (quality_pred, quality_true)

In [None]:
# Load annotated humor data
# NOTE: this rebinds `annotations`, which held the quality dataset until this cell runs.
annotations = pd.read_csv(f'{DATA_DIR}/humor/quirky_annotated.csv')
# NOTE(review): same call as in the fine-tuning cell; the reloaded config is not
# referenced again in the cells shown here — confirm whether it is still needed.
aconfig = AutoConfig.from_pretrained(p['model_name'])

# add tokens [humor=0][humor=1][humor=2]
# Both names are rebound to whatever the project helper returns.
tokenizer, model = dataset_utils.add_humor_token(tokenizer, model)

# create humor dataset and auto annotate for quality
def create_humor_dataset(tokenizer, model, annotations, batch_size=3):
  """Build the humor data loaders, using `model` to annotate a quality score.

  Args:
    tokenizer: tokenizer forwarded to both dataset helpers.
    model: model whose first output column is used as the quality annotation.
    annotations: annotated humor data handed to `gen_humor_dataframe`.
    batch_size: batch size of the generated loaders. Defaults to 3, the value
      that used to be hard-coded here.

  Returns:
    The result of `dataset_utils.gen_humor_datasets`; callers in this notebook
    unpack it as (train_loader, dev_loader, test_loader).

  Relies on the notebook globals `device` and `p`.
  """
  # annotate quality score with quality_model
  df = dataset_utils.gen_humor_dataframe(
      tokenizer,
      model,
      lambda output, target: (output[:, 0], target),
      device,
      annotations,
      p["max_len"],
      p["num_threads"]
  )

  return dataset_utils.gen_humor_datasets(
    tokenizer,
    df,
    p["max_len"],
    p["num_threads"],
    batch_size=batch_size
  )

## Train Quality model only

In [None]:
# Phase table for quality-only training. Each value is
# (map_sample, train_loader, dev_loader, epochs).
maps = dict(
    map_quality=(map_quality, quality_train_loader, quality_dev_loader, p['epochs']),
)

In [None]:
if p['train_quality']:
  # Every run uses the single quality phase, so look it up once by name.
  map_sample, train_loader, dev_loader, epochs = maps["map_quality"]

  # The same model and optimizer objects are passed to every run; nothing in this
  # loop re-creates them.
  for e in range(p['train_runs']):
    training_stats = model_utils.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        map_sample=map_sample,
        train_loader=train_loader,
        val_loader=dev_loader,
        epochs=epochs,
        device=device,
    )
    save_training_stats(e, "quality", training_stats, save_path=model_save_path)

## Train Humor Model

In [None]:
# training pipeline for humor and quality learning
# Each value is (map_sample, train_loader, dev_loader, epochs). Only "map_quality"
# carries loaders; for the other phases the training cell supplies the humor loaders
# and reads just map_sample and epochs. The training cell relies on this insertion order.
humor_maps = {
    "map_quality": (
        map_quality,
        quality_train_loader,
        quality_dev_loader,
        p['epochs'],
    ),
    "map_mock_quality": (
        # Compares output column 0 against a constant target of 1.0 per sample.
        # NOTE(review): torch.full is called without device=, so the target is created on
        # the default device — confirm the training loop moves it next to `output`.
        lambda output, target: (output[:, 0], torch.full([target.shape[0]], 1.0)),
        None,
        None,
        p['epochs']
    ),
    "map_humor": (
        # Column 1 of both output and target.
        lambda output, target: (output[:, 1], target[:, 1]),
        None,
        None,
        p['humor_epochs']
    ),
    "map_id": (
        # Identity mapping: output and target are passed through unchanged.
        lambda output, target: (output, target),
        None,
        None,
        p['epochs']
    ),
}

In [None]:
if p['train_humor']:
  # Phases trained on the humor loaders, in the order they appear in humor_maps.
  humor_phases = ("map_mock_quality", "map_humor", "map_id")

  # NOTE(review): the loop count comes from p['train_runs'] although the config also
  # defines p['humor_train_runs'] — confirm which one is intended.
  for e in range(p['train_runs']):

    # Step 1: train on the quality data with its own loaders.
    map_sample, train_loader, dev_loader, epochs = humor_maps["map_quality"]
    training_stats = model_utils.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        map_sample=map_sample,
        train_loader=train_loader,
        val_loader=dev_loader,
        epochs=epochs,
        device=device,
    )
    save_training_stats(e, "quality", training_stats, save_path=model_save_path)

    # Step 2: rebuild the humor loaders with the model in its current state.
    train_loader, dev_loader, test_loader = create_humor_dataset(tokenizer, model, annotations)

    # Step 3: run the remaining phases on the humor loaders.
    for phase_name in humor_phases:
        map_sample, _, _, epochs = humor_maps[phase_name]
        training_stats = model_utils.train(
            model=model,
            criterion=criterion,
            optimizer=optimizer,
            map_sample=map_sample,
            train_loader=train_loader,
            val_loader=dev_loader,
            epochs=epochs,
            device=device,
        )
        save_training_stats(e, phase_name, training_stats, save_path=model_save_path)

In [None]:
# Persist the weights of `model` as it stands after the training cells, next to the stats CSVs.
save_model(model, model_save_path)


### Evaluate Quality

In [None]:
def print_quality_stats(model):
  """Print the split name, then call `model_utils.display_correlation` for `model`, on the
  quality train, dev and test loaders (notebook globals), using the quality mapping."""
  splits = (
      ("Train", quality_train_loader),
      ("Dev", quality_dev_loader),
      ("Test", quality_test_loader),
  )
  for label, loader in splits:
    print(label)
    model_utils.display_correlation(model, loader, maps['map_quality'][0], device)

### Evaluate Humor & Quality

In [None]:
def print_humor_quality_stats(model, humor_train_loader, humor_dev_loader, humor_test_loader):
  """Print the split name, then call `model_utils.display_correlation` for `model`, on each
  of the three given humor loaders, using the identity mapping from `humor_maps`."""
  splits = (
      ("Train", humor_train_loader),
      ("Dev", humor_dev_loader),
      ("Test", humor_test_loader),
  )
  for label, loader in splits:
    print(label)
    model_utils.display_correlation(model, loader, humor_maps['map_id'][0], device)

### Load best model

In [None]:
def load_model(path):
  """Load the state dict stored at `path` into the notebook-global `model`.

  Args:
    path: file written by `torch.save(model.state_dict(), ...)`.

  Returns:
    The global `model`, with the loaded weights, after `model.to(device)`.
  """
  # map_location remaps the stored tensors onto the active device, so a checkpoint
  # saved on a GPU can also be loaded on a CPU-only runtime.
  state_dict = torch.load(path, map_location=device)
  model.load_state_dict(state_dict)
  model.to(device)
  return model

### Plotting & evaluation

In [None]:
# Output directories of earlier humor+quality runs, ordered by annotated-dataset
# size (80, 140, 230); the plotting helpers zip them against [80, 140, 230] in this order.
humor_model_paths = [
    f"{PROJECT_ROOT}/evaluation_models/reward_model/finetuned_size80_lr4.0638e-05_ep16_2022-08-20__13_17_24_final_new",
    f"{PROJECT_ROOT}/evaluation_models/reward_model/finetuned_size140_lr4.0638e-05_ep16_2022-08-18__11_34_24_final",
    f"{PROJECT_ROOT}/evaluation_models/reward_model/finetuned_size230_lr4.0638e-05_ep16_2022-08-20__13_03_57_final_new"
]

# Output directories of earlier quality-only runs, in the same 80 / 140 / 230 order.
quality_model_paths = [
    f"{PROJECT_ROOT}/evaluation_models/reward_model/only_quality_finetuned_size80_lr4.0638e-05_ep20_2022-08-18__21_29_20_final",
    f"{PROJECT_ROOT}/evaluation_models/reward_model/only_quality_finetuned_size140_lr4.0638e-05_ep20_2022-08-18__15_05_43_final",
    f"{PROJECT_ROOT}/evaluation_models/reward_model/only_quality_finetuned_size230_lr4.0638e-05_ep20_2022-08-18__15_17_48_final"
]

# Font sizes shared by all figures in this notebook.
SMALL_SIZE = 12
MEDIUM_SIZE = 12
BIGGER_SIZE = 15

# Global matplotlib styling, applied in a single update.
plt.rcParams.update({
    "font.size": SMALL_SIZE,           # default text size
    "font.weight": "light",
    "axes.titlesize": BIGGER_SIZE,     # axes title
    "axes.labelsize": BIGGER_SIZE,     # x and y labels
    "axes.labelweight": "light",
    "axes.linewidth": 0.5,
    "xtick.labelsize": SMALL_SIZE,     # tick labels
    "ytick.labelsize": SMALL_SIZE,
    "legend.fontsize": SMALL_SIZE,
    "figure.titlesize": BIGGER_SIZE,   # figure title
})

def plot_model_quality_stats(runs, phase, paths):
  """Plot loss and correlation curves of the quality-only runs.

  For each directory in `paths` the files `<path>/<run>_<phase>.csv` for
  run = 0 .. runs-1 are read and stacked in run order, so the x axis is the row
  position across all runs. The "accuracy" column is divided by the matching
  training-set size (54, 96, 162) and drawn in the left panel (labelled as mean
  MSE loss); the "correlation" column is drawn in the right panel.

  Args:
    runs: number of per-run CSV files to read from each directory.
    phase: file-name stem after the run index, e.g. "quality_stats".
    paths: directories, expected in the order of dataset sizes 80, 140, 230.
  """
  fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 3))
  for train_size, df_len, path in zip([54,96,162], [80,140,230], paths):
    # Read all runs first and concatenate once. The original seeded the result with an
    # empty frame (which added a stray column) and assigned the column names to the
    # `scipy.stats` module instead of a frame.
    run_frames = [
        pd.read_csv(f"{path}/{run}_{phase}.csv", index_col=0).drop(columns=["episode"])
        for run in range(runs)
    ]
    stats_df = pd.concat(run_frames, ignore_index=True)

    # The original also called interpolate() here and discarded its result; that
    # call never affected the plot and is dropped.
    mean_loss = stats_df["accuracy"] / train_size
    mean_loss.plot(ax=axes[0], label=f"{df_len}")
    stats_df["correlation"].plot(ax=axes[1], label=f"{df_len}")

  # Titles, legends and axis labels are the same for every curve, so set them once.
  axes[0].legend(loc="upper right")
  axes[0].title.set_text('(a) MSE loss for reward model trained on title quality')
  axes[1].legend(loc="lower right")
  axes[1].title.set_text('(b) Spearman correlation for title quality')
  for idx, ax in enumerate(axes.flat):
    ax.set(xlabel='epochs', ylabel='mean mse loss' if idx == 0 else 'spearman correlation')

  fig.tight_layout()
  plt.show()

def plot_model_quality_humor_stats(runs, phase, paths, humor_train_size=138, run_boundaries=(16, 32)):
  """Plot loss, quality-correlation and humor-correlation curves of the humor runs.

  For each directory in `paths` the files `<path>/<run>_<phase>.csv` for
  run = 0 .. runs-1 are read and stacked in run order. The "correlation" column
  holds a bracketed pair "[quality humor]", which is split into two float columns.

  Args:
    runs: number of per-run CSV files to read from each directory.
    phase: file-name stem after the run index, e.g. "map_id_stats".
    paths: directories, expected in the order of dataset sizes 80, 140, 230.
    humor_train_size: divisor applied to the "accuracy" column before plotting.
      Defaults to 138, the value that used to be hard-coded here.
    run_boundaries: x positions of the dashed vertical marker lines. Defaults to
      (16, 32), the values that used to be hard-coded here.
  """
  def _split_pair(cell):
    # "[q h]" -> ["q", "h"]: strip the brackets, then split at the first space.
    return cell[1:-1].strip().split(' ', 1)

  fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 5))
  for df_len, path in zip([80,140,230], paths):
    run_frames = []
    for run in range(runs):
      raw = pd.read_csv(f"{path}/{run}_{phase}.csv", index_col=0, converters={'correlation': _split_pair})
      # Build the split columns on the same index as `raw` so the column-wise concat
      # lines rows up even if the stored index is not 0..n-1.
      pairs = pd.DataFrame(
          raw['correlation'].tolist(),
          columns=['correlation_quality', 'correlation_humor'],
          index=raw.index,
      )
      run_df = pd.concat([raw.drop(columns=["episode", "correlation"]), pairs], axis=1)
      run_df = run_df.astype({'correlation_quality':'float','correlation_humor':'float'})
      run_frames.append(run_df)

    # Concatenate once. The original seeded the result with an empty frame (which added
    # a stray column) and assigned the column names to the `scipy.stats` module.
    stats_df = pd.concat(run_frames, ignore_index=True)

    # The original also called interpolate() here and discarded its result; that
    # call never affected the plot and is dropped.
    mean_loss = stats_df["accuracy"] / humor_train_size
    mean_loss.plot(ax=axes[0], label=f"{df_len}")
    stats_df["correlation_quality"].plot(ax=axes[1], label=f"{df_len} quality")
    stats_df["correlation_humor"].plot(ax=axes[2], label=f"{df_len} humor")

  # Titles, legends and axis labels are the same for every curve, so set them once.
  axes[0].legend(loc="upper right")
  axes[0].title.set_text("(a) MSE loss / epoch")
  axes[1].legend(loc="lower right")
  axes[1].title.set_text('(b) title quality correlation / epoch')
  axes[2].legend(loc="lower left")
  axes[2].title.set_text('(c) title humor correlation / epoch')
  for idx, ax in enumerate(axes.flat):
    ax.set(xlabel='epochs', ylabel='mean mse loss' if idx == 0 else 'spearman correlation')

  # Dashed marker lines spanning each panel's current y range.
  boundaries = list(run_boundaries)
  for ax in axes:
    ymin, ymax = ax.get_ylim()
    ax.vlines(x=boundaries, ymin=ymin, ymax=ymax, colors='black', ls='--', lw=2)
  fig.tight_layout()
  plt.show()


# Humor+quality curves: 3 runs per directory, read from `<run>_map_id_stats.csv`.
plot_model_quality_humor_stats(3, "map_id_stats", humor_model_paths)
# Quality-only curves: 1 run per directory, read from `<run>_quality_stats.csv`.
plot_model_quality_stats(1, "quality_stats", quality_model_paths)

In [None]:
# Drop the reference to the previously used model before building a fresh one.
model = None
model = model_utils.HumorBertRegresser.from_pretrained(p["model_name"])
model.to(device)
tokenizer, model = dataset_utils.add_humor_token(tokenizer, model)
# NOTE(review): these loaders are rebuilt for every checkpoint in the loop below, so this
# initial build looks redundant — confirm before removing it.
humor_train_loader, humor_dev_loader, humor_test_loader = create_humor_dataset(tokenizer, model, annotations)

# Evaluate each saved humor checkpoint with a freshly built tokenizer and model.
for path in humor_model_paths:
  model_path = f"{path}/model.pth"
  tokenizer = AutoTokenizer.from_pretrained(p['model_name'])
  model = model_utils.HumorBertRegresser.from_pretrained(p["model_name"])
  # The humor tokens are added before the weights are loaded.
  tokenizer, model = dataset_utils.add_humor_token(tokenizer, model)
  # map_location remaps the stored tensors onto the active device, so checkpoints
  # saved on a GPU can also be loaded on a CPU-only runtime.
  model.load_state_dict(torch.load(model_path, map_location=device))
  model.to(device)
  # The humor loaders are rebuilt with the loaded checkpoint before it is evaluated.
  humor_train_loader, humor_dev_loader, humor_test_loader = create_humor_dataset(tokenizer, model, annotations)
  print_humor_quality_stats(model, humor_train_loader, humor_dev_loader, humor_test_loader)
  print("-"*20)