In [None]:
# Colab bootstrap: mount Google Drive and add the project checkout to sys.path
# so that project modules (such as `config`) can be imported.
import sys

from google.colab import drive
# force_remount=True is passed so the mount is redone on every run of this cell.
drive.mount('/content/drive', force_remount=True)
sys.path.append("/content/drive/MyDrive/DL4NLP/abstract-to-title-generation")
# NOTE(review): the star-import hides which names `config` provides; later cells
# use PROJECT_ROOT, DATA_DIR and OUTPUT_DIR, presumably defined there — confirm.
from config import *

In [None]:
# Install the project's Python dependencies from its requirements file.
!pip install -r "{PROJECT_ROOT}/requirements.txt"

In [None]:
# Fetch the DVC-tracked files; -f forces the pull instead of prompting.
# NOTE(review): runs in the notebook's current working directory — confirm that
# this is inside the DVC repository.
!dvc pull -f

In [None]:
import pandas as pd
import numpy as np
import torch
import datasets
from torch.utils.data import Dataset, DataLoader
from datasets import Dataset
from tqdm import trange 
from transformers import AutoConfig, AutoTokenizer
import torch.nn as nn
from torch import optim
from transformers import BertModel,BertPreTrainedModel
import torch.nn as nn
from scipy import stats
import os
from pathlib import Path
import math
import time
import datetime
from model_utils import BertRegresser, Excerpt_Dataset, map2index, map_model, train, evaluate

In [None]:
## Model Configurations
# Hyper-parameters and run switches read by the cells below through `p[...]`.
p = dict(
    max_len=512,
    batch_size=6,
    lr=4.0638e-05,
    epochs=18,  # 18
    dropout=0.5,
    num_threads=1,
    # Alternative checkpoint: 'bert-base-uncased'
    model_name='allenai/scibert_scivocab_uncased',
    do_train=True,
    random_seed=24,
)

## Fine Tuning

In [None]:
# All three objects are created from the same checkpoint identifier.
pretrained_name = p['model_name']
# Tokenizer matching the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(pretrained_name)
# Model configuration as published with the checkpoint.
aconfig = AutoConfig.from_pretrained(pretrained_name)
# Regression model initialised from the checkpoint using that configuration.
model = BertRegresser.from_pretrained(pretrained_name, config=aconfig)


In [None]:
# Freeze every parameter, except those whose name contains one of the
# substrings below (the BERT pooler and `regressor.1`), which stay trainable.
unfreeze_layers = ['bert.pooler', 'regressor.1']
for name, params in model.named_parameters():
    params.requires_grad = any(ele in name for ele in unfreeze_layers)

# Sanity check: list the parameters that are still trainable.
for name, params in model.named_parameters():
    if not params.requires_grad:
        continue
    print(name, params.size())

In [None]:
# Use the first CUDA device when one is available, otherwise fall back to CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
# Move the model onto the selected device.
model = model.to(device)
# Mean-squared-error loss is used as the training criterion.
# (Earlier alternative, kept for reference: criterion = nn.BCEWithLogitsLoss())
criterion = nn.MSELoss()
# Adam over the model's parameters with the configured learning rate.
optimizer = optim.Adam(model.parameters(), lr=p['lr'])

### Generate training data

In [None]:
# Title sources, in the column order used by the frames built below:
# the human-written title first, followed by the title generators.
title_classes = [
    'human_title',
    'bart_base',
    'bart_cnn',
    'bart_xsum',
    't5_small',
    'gpt2',
    'pegasus_xsum',
]
# Annotated samples, loaded from the project's data directory.
annotations = pd.read_json(f'{DATA_DIR}/annotated/dataset_230samples.json')

def match_titles(titles, classes, fillin):
    """Align each row of `titles` to the fixed column order given by `classes`.

    Args:
        titles: iterable of rows; each row must be convertible with ``dict(row)``
            (a mapping or an iterable of key/value pairs).
        classes: keys to look up, in the desired output order.
        fillin: value used for a key that is missing from a row.

    Returns:
        A list with one list per input row, holding the row's value for every
        key in `classes` (or `fillin` where the key is absent).
    """
    return [
        [lookup.get(generator, fillin) for generator in classes]
        for lookup in map(dict, titles)
    ]

# --- Titles -----------------------------------------------------------------
# One output row per annotated sample: columns 0 and 1 of `annotations`
# followed by the entries of column 2 aligned to the generator order
# (title_classes without the human title). Missing generators become ''.
matched = match_titles(annotations[2], title_classes[1:], fillin="")
print(matched[1])
human_annotation_pairs = [
    [row[0], row[1]] + matched[idx]
    for idx, row in annotations.iterrows()
]

human_annotations_230 = pd.DataFrame(
    np.array(human_annotation_pairs),
    columns=["abstract"] + title_classes,
)
display(human_annotations_230)

human_annotations_230.to_csv(f'{DATA_DIR}/annotated/230_annotations_pairs.csv')

# --- Scores -----------------------------------------------------------------
# Column 3 of `annotations` is aligned to "<title class>_bws" keys; a missing
# score is filled with ''.
score_classes = [f"{cls}_bws" for cls in title_classes]
matched_human_scores = match_titles(annotations[3], score_classes, fillin="")
human_scores = matched_human_scores[:len(annotations)]

human_scores_230 = pd.DataFrame(np.array(human_scores), columns=title_classes)
display(human_scores_230)

human_scores_230.to_csv(f'{DATA_DIR}/annotated/230_humanannotation_withoutunannotated.csv')


In [None]:
# Read back the two CSV files written above. Cells that were written as ''
# come back as NaN, so they are turned into '' again.
text_map = pd.read_csv(
    f'{DATA_DIR}/annotated/230_annotations_pairs.csv', index_col=0
).fillna('')
scores = pd.read_csv(
    f'{DATA_DIR}/annotated/230_humanannotation_withoutunannotated.csv', index_col=0
).fillna('')
display(scores)

In [None]:
# Split the text frame into the abstract column and the seven title columns,
# then convert everything to numpy arrays.
abstract_df = text_map[['abstract']]
title_df = text_map[['human_title', 'bart_base', 'bart_cnn', 'bart_xsum', 't5_small', 'gpt2', 'pegasus_xsum']]
abstract_np = abstract_df.to_numpy()
scores_np = scores.to_numpy()
title_np = title_df.to_numpy()
# Drop the '' placeholders row by row, keeping only the entries that are present.
# NOTE(review): titles and scores are filtered independently; the later cells
# assume that both keep the same six positions per row — confirm for new data.
title_np_picked = np.array([[s for s in row if s != ''] for row in title_np])
score_np_picked = np.array([[s for s in row if s != ''] for row in scores_np])
# Earlier index-based selection, kept for reference only. It used to live in a
# string literal, which was evaluated and echoed as this cell's output.
# idx_map = [map2index(row) for row in map_model[:140]]
# title_np_picked = np.array([np.take(row1, np.sort(row2)) for row1, row2 in zip(title_np, idx_map)])
# score_np_picked = np.array([np.take(row1, np.sort(row2)) for row1, row2 in zip(scores_np, idx_map)])

In [None]:
# One row per abstract: [abstract, picked titles..., picked scores...].
pairs_np_picked = np.concatenate([abstract_np, title_np_picked, score_np_picked], axis=1)
# Shuffle the rows with a generator seeded from the configured random seed, so
# that the train/dev/test files derived from this order are reproducible.
# Change p['random_seed'] to obtain a different shuffle.
shuffle_rng = np.random.default_rng(p['random_seed'])
pairs_np_picked_shuffled = shuffle_rng.permutation(pairs_np_picked)

In [None]:
# Column layout of the shuffled array: column 0 is the abstract, columns 1-6
# are the titles, and the remaining columns are the scores.
abstracs = pairs_np_picked_shuffled[:, :1]
titles = pairs_np_picked_shuffled[:, 1:7]
# Scores are converted to float and rounded to four decimals in one step.
scores = np.around(pairs_np_picked_shuffled[:, 7:].astype(float), 4)

In [None]:
# Flatten to one training example per (abstract, title) pair:
#   excerpt = "<abstract>[SEP]<title>", target = the score of that title.
lst = []

for abstract_cell, title_row, score_row in zip(abstracs, titles, scores):
    n_titles, n_scores = len(title_row), len(score_row)
    assert n_titles == n_scores, f"{n_titles}, {n_scores}"
    assert n_titles == 6
    for title, score in zip(title_row, score_row):
        # Report suspicious entries; they are still appended.
        if np.isnan(score):
            print('found nan score')
        if title == '':
            print('found empty title')
        lst.append([abstract_cell[0] + '[SEP]' + title, score])

# NOTE(review): np.array() over mixed [str, float] rows produces a string
# array, so the score column is stored as text here — confirm that the
# downstream dataset class converts it back.
df = pd.DataFrame(np.array(lst))

In [None]:
# Name the two columns: the combined text and its score.
excerpt_columns = ['excerpt', 'target']
df.columns = excerpt_columns
# Row-level reshuffle left disabled:
#dataframe = dataframe.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# Split sizes. Every abstract contributes 6 consecutive rows to `df`, so the
# sizes are computed in rows as 6 * ratio * number_of_abstracts.
train_ratio = 0.7
val_ration = 0.1
test_ratio = 0.2
df_len = 230
train_range = round(6 * train_ratio * df_len)
val_range = round(6 * val_ration * df_len)
test_range = round(6 * test_ratio * df_len)

print(f"{train_range}-{val_range}-{test_range}")

# The *_range values are sizes, not positions. Turn them into cumulative
# boundaries so the three splits are contiguous and disjoint. Using the sizes
# directly as slice bounds made dev and test overlap the training rows.
train_end = train_range
val_end = train_end + val_range

dftrain = df.iloc[:train_end].reset_index(drop=True)
dfdev = df.iloc[train_end:val_end].reset_index(drop=True)
dftest = df.iloc[val_end:].reset_index(drop=True)
# Every row must land in exactly one split.
assert len(dftrain) + len(dfdev) + len(dftest) == len(df)

Path(f'{OUTPUT_DIR}/reward_model_robust_test/').mkdir(parents=True, exist_ok=True)

dftrain.to_csv(f'{OUTPUT_DIR}/reward_model_robust_test/sciBert_shuffled_train.csv')
dfdev.to_csv(f'{OUTPUT_DIR}/reward_model_robust_test/sciBert_shuffled_dev.csv')
dftest.to_csv(f'{OUTPUT_DIR}/reward_model_robust_test/sciBert_shuffled_test.csv')

### Training Dataset

In [None]:
# Wrap each split in an Excerpt_Dataset with the shared tokenizer and max length.
train_set, dev_set, test_set = [
    Excerpt_Dataset(data=split_df, maxlen=p['max_len'], tokenizer=tokenizer)
    for split_df in (dftrain, dfdev, dftest)
]

### Data Loaders

In [None]:
# All three loaders share the same batching, worker and shuffling options.
loader_options = dict(
    batch_size=p['batch_size'],
    num_workers=p['num_threads'],
    shuffle=True,
)
train_loader = DataLoader(dataset=train_set, **loader_options)
dev_loader = DataLoader(dataset=dev_set, **loader_options)
test_loader = DataLoader(dataset=test_set, **loader_options)

In [None]:
# Do Train (do not use this for training of reward model, reward model trained using ray tune)
# NOTE(review): whatever train() returns is discarded here, while a later cell
# reads `training_stats`, which no visible cell assigns — confirm where it
# is supposed to come from.
if p['do_train']:
    train_kwargs = dict(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        train_loader=train_loader,
        val_loader=dev_loader,
        epochs=p['epochs'],
        device=device,
    )
    train(**train_kwargs)


### Save model checkpoint


In [None]:
# Tabulate the per-episode training statistics.
# NOTE(review): `training_stats` is not assigned in any cell visible in this
# notebook — confirm its origin before relying on Restart & Run All.
stats_df = pd.DataFrame(
    np.array(training_stats),
    columns=["episode", "accuracy", "val_loss"],
)
display(stats_df)

In [None]:
# Each run is saved into its own folder, named after the dataset size, the
# epoch count and the current local timestamp.
run_stamp = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d__%H_%M_%S')
save_folder = f"{PROJECT_ROOT}/reward_model/finetuned_size{df_len}_ep{p['epochs']}_{run_stamp}"
save_file = "model.pth"
save_path = f"{save_folder}/{save_file}"

Path(save_folder).mkdir(parents=True, exist_ok=True)

# Only the model weights (state dict) are stored, next to the stats table.
torch.save(model.state_dict(), save_path)
stats_df.to_csv(f"{save_folder}/stats.csv")

### Load best reward model

In [None]:
# Reload the weights written by the save cell. `save_path` is already the full
# path to model.pth, and that file holds only `model.state_dict()`. It must
# therefore not be prefixed with the reward_model folder again, joined with
# "checkpoint", or unpacked as a (model_state, optimizer_state) pair.
# map_location keeps the load working when the current device is the CPU.
# NOTE(review): if a ray-tune checkpoint was meant instead, point this at that
# checkpoint file and unpack its contents accordingly.
model_state = torch.load(save_path, map_location=device)
model.load_state_dict(model_state)

### Title prediction function

In [None]:
def predict(model, dataloader, device):
    """Run `model` over `dataloader` and collect per-sample outputs and targets.

    The model is switched to evaluation mode for the duration of the call, so
    that dropout and other train-only layers do not perturb the predictions.
    The mode it was in before the call is restored afterwards. Gradients are
    disabled while predicting.

    Args:
        model: module invoked as ``model(input_ids, attention_mask)``.
        dataloader: iterable yielding ``(input_ids, attention_mask, target)``
            tensor triples.
        device: device the batch tensors are moved to before the forward pass.

    Returns:
        ``(predicted_label, actual_label)``: two lists holding one tensor per
        sample (the batch dimension is unrolled), in iteration order.
    """
    predicted_label = []
    actual_label = []

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for input_ids, attention_mask, target in dataloader:
                input_ids = input_ids.to(device)
                attention_mask = attention_mask.to(device)
                target = target.to(device)

                output = model(input_ids, attention_mask)

                # Iterating a tensor walks its first (batch) dimension, so
                # each sample becomes its own list entry.
                predicted_label.extend(output)
                actual_label.extend(target)
    finally:
        # Leave the model in the mode the caller had it in.
        model.train(was_training)

    return predicted_label, actual_label


## Display Correlation

In [None]:
# Rebuild the regressor and load the weights of one specific fine-tuned run.
# NOTE(review): the run folder is hard-coded; it must exist under
# PROJECT_ROOT/reward_model for this cell to work.
model = BertRegresser.from_pretrained(p['model_name'], config=aconfig)
# map_location lets a checkpoint saved from a GPU load on a CPU-only runtime,
# matching the CPU fallback used when `device` is chosen.
model.load_state_dict(
    torch.load(
        f"{PROJECT_ROOT}/reward_model/finetuned_size230_ep18_2022-08-08__14_05_45_false/model.pth",
        map_location=device,
    )
)
model.to(device)

In [None]:
# Spearman rank correlation between predictions and gold scores on the training split.
output, GS_label = predict(model, train_loader, device)
cpu_output = np.squeeze(np.array([pred.cpu().data.numpy() for pred in output]))
cpu_target = np.squeeze(np.array([gold.cpu().data.numpy() for gold in GS_label]))
train_spearman = stats.spearmanr(cpu_output, cpu_target)
train_spearman[0]

In [None]:
# Spearman rank correlation between predictions and gold scores on the dev split.
dev_output, dev_GS_label = predict(model, dev_loader, device)
cpu_dev_output = np.squeeze(np.array([pred.cpu().data.numpy() for pred in dev_output]))
cpu_dev_target = np.squeeze(np.array([gold.cpu().data.numpy() for gold in dev_GS_label]))
dev_spearman = stats.spearmanr(cpu_dev_output, cpu_dev_target)
dev_spearman[0]

In [None]:
# Spearman rank correlation between predictions and gold scores on the test split.
test_output, test_GS_label = predict(model, test_loader, device)
cpu_test_output = np.squeeze(np.array([pred.cpu().data.numpy() for pred in test_output]))
cpu_test_target = np.squeeze(np.array([gold.cpu().data.numpy() for gold in test_GS_label]))
test_spearman = stats.spearmanr(cpu_test_output, cpu_test_target)
test_spearman[0]