In [None]:
import sys

from google.colab import drive
# Mount Google Drive at /content/drive; the project checkout is read from there.
drive.mount('/content/drive', force_remount=True)
# Put the project root on sys.path so that the `config` module below can be imported.
sys.path.append("/content/drive/MyDrive/DL4NLP/abstract-to-title-generation")
# NOTE(review): star import. Later cells use PROJECT_ROOT and DATA_DIR, which are not
# defined anywhere else in this notebook, so they presumably come from `config`.
from config import *

In [None]:
# `!cd ...` runs in a throw-away subshell, so it never changes the kernel's own
# working directory. Change it from Python instead so relative paths used by
# later cells resolve against the project root.
import os

os.chdir(PROJECT_ROOT)
# Make the modules under <project>/src (model_utils, dataset_utils) importable.
sys.path.append(f"{PROJECT_ROOT}/src")

In [None]:
#!pip install -r "requirements.txt"

In [None]:
#!dvc pull -f

In [None]:
import pandas as pd
import numpy as np
import torch
import datasets
from torch.utils.data import Dataset, DataLoader
# NOTE(review): the next line rebinds `Dataset`; from here on the name refers to
# `datasets.Dataset`, not `torch.utils.data.Dataset`.
from datasets import Dataset
from tqdm import trange 
from transformers import AutoConfig, AutoTokenizer
import torch.nn as nn
from torch import optim
from transformers import BertModel,BertPreTrainedModel
# NOTE(review): duplicate of the `torch.nn` import a few lines above (harmless).
import torch.nn as nn
from scipy import stats
import os
from pathlib import Path
import math
import time
import datetime
# Project-local helpers; resolvable because <PROJECT_ROOT>/src was appended to sys.path earlier.
from model_utils import BertRegresser, Excerpt_Dataset, map2index, map_model, train, evaluate
from dataset_utils import gen_datasets

In [None]:
## Model Configurations
p = {
    'max_len': 512,          # forwarded to gen_datasets
    'batch_size': 6,         # forwarded to gen_datasets
    'lr': 4.0638e-05,        # learning rate handed to the Adam optimizer
    'epochs': 18,            # number of epochs passed to train()
    'dropout': 0.5,          # NOTE(review): not referenced by any cell in this notebook
    'num_threads': 1,        # forwarded to gen_datasets
    'model_name': 'allenai/scibert_scivocab_uncased',
    #'model_name': 'bert-base-uncased',
    'do_train': True,        # gates the in-notebook training cell
    'random_seed': 24
}

# The seed was configured but never applied anywhere in the notebook. Seed NumPy
# and PyTorch here, before the model is created, so that randomly initialised
# weights and any RNG-driven shuffling are repeatable across fresh runs.
np.random.seed(p['random_seed'])
torch.manual_seed(p['random_seed'])

## Fine Tuning

In [None]:
## Pretrained configuration for the selected checkpoint
aconfig = AutoConfig.from_pretrained(pretrained_model_name_or_path=p['model_name'])
## Tokenizer that belongs to the same checkpoint
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=p['model_name'])
## Regression model instantiated from that checkpoint with the configuration above
model = BertRegresser.from_pretrained(p['model_name'], config=aconfig)


In [None]:
# Freeze every parameter except those whose name contains one of the
# substrings below (the BERT pooler and the 'regressor.1' layer).
unfreeze_layers = ['bert.pooler', 'regressor.1']
for param_name, param in model.named_parameters():
  param.requires_grad = any(tag in param_name for tag in unfreeze_layers)

# Report which parameters are still trainable, with their shapes.
for param_name, param in model.named_parameters():
  if not param.requires_grad:
    continue
  print(param_name, param.size())

In [None]:
## Use the first CUDA device when one is available, otherwise the CPU
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
## Move the model onto that device
model = model.to(device)
## Mean squared error loss (nn.BCEWithLogitsLoss was the previously tried alternative)
criterion = nn.MSELoss()
## Adam over the model's parameters with the configured learning rate
optimizer = optim.Adam(model.parameters(), lr=p['lr'])

### Generate training data

In [None]:
# Annotated samples, read from the JSON file under DATA_DIR.
annotations = pd.read_json(f'{DATA_DIR}/annotated/dataset_230samples.json')

# gen_datasets returns the train / dev / test loaders, in that order.
train_loader, dev_loader, test_loader = gen_datasets(tokenizer, annotations, p["max_len"], p["batch_size"], p["num_threads"])

### Training

In [None]:
# Do Train (do not use this for training of reward model, reward model trained using ray tune)
#
# The stats cell further down reads `training_stats`, which no cell in this
# notebook ever defines, while the return value of train() was being thrown
# away. Keep the return value under that name.
# NOTE(review): assumes train() returns the per-epoch statistics -- confirm
# against model_utils.train.

if p['do_train']:
  training_stats = train(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    train_loader=train_loader,
    val_loader=dev_loader,
    epochs=p['epochs'],
    device=device,
  )


### Save model checkpoint


In [None]:
# Tabulate the collected training statistics, one named column per tracked value.
stats_df = pd.DataFrame(
    np.array(training_stats),
    columns=["episode", "accuracy", "val_loss"],
)
display(stats_df)

In [None]:
# `df_len` was used in the folder name but never defined in this notebook,
# so this cell raised NameError on a fresh run.
# NOTE(review): assumes the size tag is the number of annotated samples -- confirm.
df_len = len(annotations)

# Timestamp suffix keeps successive runs from overwriting each other.
timestamp = datetime.datetime.now().strftime('%Y-%m-%d__%H_%M_%S')
save_folder = f"{PROJECT_ROOT}/reward_model/finetuned_size{df_len}_ep{p['epochs']}_{timestamp}"
save_file = "model.pth"
save_path = f"{save_folder}/{save_file}"

Path(save_folder).mkdir(parents=True, exist_ok=True)

# Persist only the weights (state_dict), plus the training statistics as CSV.
torch.save(model.state_dict(), save_path)
stats_df.to_csv(f"{save_folder}/stats.csv")

### Load best reward model

In [None]:
# The original path nested `save_path` (already a complete path to model.pth)
# under PROJECT_ROOT/reward_model and appended "checkpoint", so it could never
# match the file written by the save cell. Keep that location as first choice
# and fall back to `save_path` itself when it does not exist.
checkpoint_file = os.path.join(f'{PROJECT_ROOT}/reward_model/{save_path}', "checkpoint")
if not os.path.exists(checkpoint_file):
  checkpoint_file = save_path

# map_location lets a checkpoint saved on GPU be loaded on a CPU-only runtime.
checkpoint = torch.load(checkpoint_file, map_location=device)

# Accept both formats: a (model_state, optimizer_state) pair, as the original
# code unpacked, or the bare state_dict written by the save cell.
if isinstance(checkpoint, (tuple, list)):
  model_state, optimizer_state = checkpoint
else:
  model_state, optimizer_state = checkpoint, None

model.load_state_dict(model_state)

### Title prediction function

In [None]:
def predict(model, dataloader, device):
    """Run ``model`` over every batch of ``dataloader`` and collect outputs and targets.

    The model is put into evaluation mode for the duration of the call, because
    ``torch.no_grad()`` alone does not disable dropout. Its previous
    training/eval mode is restored afterwards.

    Args:
        model: ``torch.nn.Module`` called as ``model(input_ids, attention_mask)``.
        dataloader: iterable yielding ``(input_ids, attention_mask, target)``
            tensor triples.
        device: device the batch tensors are moved to before the forward pass.

    Returns:
        ``(predicted_label, actual_label)``: two lists holding, for every batch
        in order, the entries obtained by iterating over the first dimension of
        the model output and of the target tensor respectively.
    """
    predicted_label = []
    actual_label = []

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for input_ids, attention_mask, target in dataloader:
                input_ids = input_ids.to(device)
                attention_mask = attention_mask.to(device)
                target = target.to(device)

                output = model(input_ids, attention_mask)

                # Extending a list with a tensor appends one entry per row.
                predicted_label.extend(output)
                actual_label.extend(target)
    finally:
        # Leave the model in whatever mode the caller had it in.
        model.train(was_training)

    return predicted_label, actual_label


## Display Correlation

In [None]:
# Rebuild the model and load previously fine-tuned weights from a fixed
# checkpoint folder.
# NOTE(review): the path is hard-coded to one specific earlier run; it does not
# use the checkpoint written by the save cell of this session.
model = BertRegresser.from_pretrained(p['model_name'], config=aconfig)
finetuned_weights = torch.load(
    f"{PROJECT_ROOT}/reward_model/finetuned_size230_ep18_2022-08-08__14_05_45_false/model.pth",
    map_location=device,  # allows a GPU-saved checkpoint to load on a CPU-only runtime
)
model.load_state_dict(finetuned_weights)
# Evaluation mode disables dropout for the correlation cells below; assigning
# the result also keeps the full model repr out of the cell output.
model = model.to(device).eval()

In [None]:
# Spearman rank correlation between model outputs and targets on the training loader.
output, GS_label = predict(model, train_loader, device)
cpu_output = np.squeeze(np.array([pred.detach().cpu().numpy() for pred in output]))
cpu_target = np.squeeze(np.array([gold.detach().cpu().numpy() for gold in GS_label]))
stats.spearmanr(cpu_output, cpu_target)[0]

In [None]:
# Spearman rank correlation between model outputs and targets on the dev loader.
dev_output, dev_GS_label = predict(model, dev_loader, device)
cpu_dev_output = np.squeeze(np.array([pred.detach().cpu().numpy() for pred in dev_output]))
cpu_dev_target = np.squeeze(np.array([gold.detach().cpu().numpy() for gold in dev_GS_label]))
stats.spearmanr(cpu_dev_output, cpu_dev_target)[0]

In [None]:
# Spearman rank correlation between model outputs and targets on the test loader.
test_output, test_GS_label = predict(model, test_loader, device)
cpu_test_output = np.squeeze(np.array([pred.detach().cpu().numpy() for pred in test_output]))
cpu_test_target = np.squeeze(np.array([gold.detach().cpu().numpy() for gold in test_GS_label]))
stats.spearmanr(cpu_test_output, cpu_test_target)[0]