# Finetune the Summary Grader

## Initialize

In [16]:
%%capture
from pathlib import Path
import pandas as pd
import numpy as np
!pip install SentencePiece
!pip install "ray[tune]"
!pip install wandb
import seaborn as sns

sns.set_theme(font='Liberation Serif',
              rc={'figure.figsize': (7.5,3.75),
                  'font.size': 11,
                  'figure.dpi': 300,
                 })

In [17]:
# Data locations, resolved relative to the parent of the current working directory.
DATA = Path.cwd().parent.joinpath('data')
SUMM_FOLDER = DATA.joinpath('summaries_finetune')
TEXT_FILES = SUMM_FOLDER.joinpath('text_files_copy')
SOURCE_TEXTS = SUMM_FOLDER.joinpath('source_texts')

## Load and clean the data

In [18]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np

# Raw summaries table read from the fine-tuning folder.
summaries_df = pd.read_csv(SUMM_FOLDER / 'final_summaries_ai_aloe_fixed.csv')

# Normalize on a copy so the raw frame stays available unchanged.
df_normalized = summaries_df.copy()

# Min-max scale each score column on its own; a fresh scaler is fitted per column.
df_normalized['content_pca'] = MinMaxScaler().fit_transform(df_normalized[['content_pca']])
df_normalized['paraphrase_pca'] = MinMaxScaler().fit_transform(df_normalized[['paraphrase_pca']])

In [19]:
# Write the descriptive statistics of the normalized frame to 'table.csv'
# (a relative path, so it lands in the current working directory).
df_normalized.describe().to_csv('table.csv')

### Separate out a test set to avoid prompt effect

In [20]:
# source_texts = df_normalized['source_text'].value_counts().to_frame()
# texts_to_remove = list(source_texts.iloc[1:6].index)

# test_df = df_normalized[df_normalized['source_text'].isin(texts_to_remove)]
# train_df = df_normalized[df_normalized['source_text'].isin(texts_to_remove) == False]
# print('test n:', len(test_df))
# print('train n:', len(train_df))

## Transformer time

In [21]:
from datasets import load_dataset, load_metric, Dataset, Value, ClassLabel, Features, DatasetDict
# from transformers import LongformerTokenizer, LongformerForSequenceClassification, LongformerConfig
from transformers import RobertaTokenizer, RobertaForSequenceClassification, RobertaConfig, DataCollatorWithPadding, Trainer, TrainingArguments, EarlyStoppingCallback
from transformers import LongformerTokenizer, LongformerForSequenceClassification, LongformerConfig


import torch
seed = 42

# Checkpoint name used for the tokenizer. Other names left in the original comment:
# "google/bigbird-roberta-base", 'allenai/longformer-base-4096'.
model_name = 'roberta-base'
# The original comment also listed LongformerTokenizer.from_pretrained(model_name) as an alternative.
tokenizer = RobertaTokenizer.from_pretrained(model_name)

# Use the GPU when CUDA is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Build datasets

In [22]:
def buildDataset(df):
    """Turn a DataFrame into a DatasetDict with 'train', 'valid' and 'test' splits.

    The frame is converted to a Dataset without its index, split 70/30, and the
    30% portion is then split in half into validation and test. Both splits use
    the module-level `seed`.
    """
    dataset = Dataset.from_pandas(df, preserve_index=False)
    # First cut: 70% for training, 30% held out.
    holdout_split = dataset.train_test_split(test_size=0.30, seed=seed)
    # Second cut: halve the held-out part into validation and test.
    valid_test_split = holdout_split['test'].train_test_split(test_size=0.5, seed=seed)
    splits = {
        'train': holdout_split['train'],
        'valid': valid_test_split['train'],
        'test': valid_test_split['test'],
    }
    return DatasetDict(splits)

In [23]:
# Content score: keep the text and its score, exposing the score as 'labels'.
content_df = df_normalized[['text', 'content_pca']].rename(columns={'content_pca': 'labels'})
content_ds = buildDataset(content_df)

# Paraphrase score: same layout, built from the paraphrase column.
paraphrase_df = df_normalized[['text', 'paraphrase_pca']].rename(columns={'paraphrase_pca': 'labels'})
paraphrase_ds = buildDataset(paraphrase_df)

In [24]:
# Display the paraphrase DatasetDict (splits, features and row counts).
paraphrase_ds

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 3283
    })
    valid: Dataset({
        features: ['text', 'labels'],
        num_rows: 703
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 704
    })
})

In [25]:
# # this adds in the test set that we separated earlier
# content_ds['test'] = Dataset.from_pandas(test_df[['text', 'content_pca']].rename(columns={'content_pca':'labels'}), preserve_index=False)
# paraphrase_ds['test'] = test_dataset = Dataset.from_pandas(test_df[['text', 'paraphrase_pca']].rename(columns={'paraphrase_pca':'labels'}), preserve_index=False)

In [26]:
# NOTE(review): both lines assign a name to itself, so this cell changes nothing;
# it only requires that `model_name` and `tokenizer` are already defined.
model_name = model_name
tokenizer = tokenizer

In [27]:
# tokenize them
def tokenize_inputs(example):
    """Run the module-level tokenizer over the 'text' field with truncation enabled."""
    texts = example['text']
    return tokenizer(texts, truncation=True)

# Tokenize every split of both datasets in batched mode (content first, then paraphrase).
content_ds_t, paraphrase_ds_t = (
    ds.map(tokenize_inputs, batched=True) for ds in (content_ds, paraphrase_ds)
)

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

### Define metrics

In [28]:
from sklearn.metrics import mean_squared_error, r2_score, mean_squared_error, mean_absolute_error
from scipy.stats import pearsonr 

def compute_metrics_for_regression(eval_pred):
    """Compute regression metrics for a (predictions, labels) pair.

    Args:
        eval_pred: a pair ``(logits, labels)`` as unpacked by the first line.

    Returns:
        dict with keys "mse", "rmse", "mae", "r2" and "smape" (SMAPE in percent).
    """
    logits, labels = eval_pred
    # Flatten both sides before any elementwise arithmetic.
    # NOTE(review): with num_labels=1 the predictions presumably arrive as (n, 1)
    # while labels are (n,); `logits - labels` would then broadcast to (n, n)
    # and corrupt SMAPE. Confirm the shapes against the Trainer output.
    preds = np.asarray(logits, dtype=float).reshape(-1)
    labels = np.asarray(labels, dtype=float).reshape(-1)

    mse = mean_squared_error(labels, preds)
    # Derive RMSE from MSE instead of the `squared=False` flag, which newer
    # scikit-learn releases deprecate.
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(labels, preds)
    r2 = r2_score(labels, preds)

    # SMAPE: mean of 2|p - y| / (|y| + |p|), in percent. A pair where both values
    # are exactly 0 has zero error and contributes 0 instead of producing 0/0.
    abs_err = np.abs(preds - labels)
    denom = np.abs(labels) + np.abs(preds)
    ratios = np.divide(2 * abs_err, denom, out=np.zeros_like(abs_err), where=denom != 0)
    smape = 1 / len(labels) * np.sum(ratios * 100)

    return {"mse": mse, "rmse": rmse, "mae": mae, "r2": r2, "smape": smape}

### Set hyperparameters

In [29]:
# Fine-tuning hyperparameters read by the TrainingArguments of both training cells.
learning_rate = 3e-5
batch_size = 36
seed = 42
num_epochs = 4

# Checkpoint directory that `model_init` loads its starting weights from.
da_model_name = './results/checkpoint-78500'

# def model_init():
#     return RobertaForSequenceClassification.from_pretrained(model_name,
#                                                               num_labels=1).to(device)
def model_init():
    """Load a single-label RoBERTa sequence classifier from `da_model_name` and move it to `device`."""
    model = RobertaForSequenceClassification.from_pretrained(da_model_name, num_labels=1)
    return model.to(device)
# Padding collator built from the tokenizer; both Trainers receive it as `data_collator`.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

### Train the content model

In [30]:
# The unused `trainer = Trainer(model_init=model_init)` that used to open this
# cell was removed: nothing read it, and building a Trainer from `model_init`
# instantiates an extra model on `device`, which wastes GPU memory.

# Training configuration for the content-score model.
# NOTE(review): with num_epochs = 4 and one evaluation per epoch, an early-stopping
# patience of 3 can only trigger at the final epoch - confirm this is intended.
training_args = TrainingArguments(
    output_dir = './results/da_content_checkpoints',
    optim = 'adamw_torch',
    num_train_epochs = num_epochs,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    weight_decay = 0.01,
    learning_rate = learning_rate,
    logging_dir = './logs/content',
    save_total_limit = 10,
    load_best_model_at_end = True,
    metric_for_best_model = 'mse',   # lower is better, see greater_is_better below
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    greater_is_better = False,
    seed=seed,
    log_level = 'error',
    disable_tqdm = False,            # keep the progress bars visible
)

# Trainer for the content model: trains on 'train', evaluates on 'valid'.
content_trainer = Trainer(
    model_init = model_init,
    args = training_args,
    data_collator=data_collator,
    train_dataset = content_ds_t['train'],
    eval_dataset = content_ds_t['valid'],
    compute_metrics = compute_metrics_for_regression,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
)

# Train the model
content_trainer.train()

loading configuration file ./results/checkpoint-78500/config.json
Model config RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.20.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading weights file ./results/checkpoint-78500/pytorch_model.bin
Some weights of the model checkpoint at ./results/checkpoint-78500 were not used when init

RuntimeError: CUDA out of memory. Tried to allocate 148.00 MiB (GPU 0; 47.54 GiB total capacity; 11.83 GiB already allocated; 42.31 MiB free; 12.23 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

### Train the paraphrase model

In [None]:
# The unused `trainer = Trainer(model_init=model_init)` that used to open this
# cell was removed: nothing read it, and building a Trainer from `model_init`
# instantiates an extra model on `device`, which wastes GPU memory.

# Training configuration for the paraphrase-score model.
# NOTE(review): with num_epochs = 4 and one evaluation per epoch, an early-stopping
# patience of 3 can only trigger at the final epoch - confirm this is intended.
training_args = TrainingArguments(
    output_dir = './results/da_paraphrase_checkpoints',
    optim = 'adamw_torch',
    num_train_epochs = num_epochs,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    weight_decay = 0.01,
    learning_rate = learning_rate,
    logging_dir = './logs/paraphrase',
    save_total_limit = 10,
    load_best_model_at_end = True,
    metric_for_best_model = 'mse',   # lower is better, see greater_is_better below
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    greater_is_better = False,
    seed=seed,
    log_level = 'error',
    disable_tqdm = False,            # keep the progress bars visible
)

# Trainer for the paraphrase model: trains on 'train', evaluates on 'valid'.
paraphrase_trainer = Trainer(
    model_init = model_init,
    args = training_args,
    data_collator=data_collator,
    train_dataset = paraphrase_ds_t['train'],
    eval_dataset = paraphrase_ds_t['valid'],
    compute_metrics = compute_metrics_for_regression,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
)

# Train the model
paraphrase_trainer.train()

## Evaluate the models on the test groups

In [31]:
import scipy.stats
from matplotlib import pyplot as plt

# Predict on the held-out test split of the content dataset.
con_preds, con_labs, con_metrics = content_trainer.predict(content_ds_t['test'])
# flatten() returns a new array, so the result must be assigned; the original
# discarded it and passed the unflattened predictions to pearsonr and scatter.
con_preds = con_preds.flatten()
con_actual = content_ds_t['test']['labels']
print(scipy.stats.pearsonr(con_actual, con_preds))

# Predicted score on x, true score on y.
plt.scatter(con_preds, con_actual, alpha=0.5)
plt.ylabel('true content score')
plt.xlabel('predicted content score')
plt.title('Content Model Accuracy')
plt.show()

The following columns in the test set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 704
  Batch size = 36


RuntimeError: CUDA out of memory. Tried to allocate 42.00 MiB (GPU 0; 47.54 GiB total capacity; 11.83 GiB already allocated; 42.31 MiB free; 12.23 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
# Imported here as well so this cell does not depend on another cell's imports.
import scipy.stats
from matplotlib import pyplot as plt

# Predict on the held-out test split of the paraphrase dataset.
para_preds, para_labs, para_metrics = paraphrase_trainer.predict(paraphrase_ds_t['test'])
# flatten() returns a new array, so the result must be assigned; the original
# discarded it and passed the unflattened predictions to pearsonr and scatter.
para_preds = para_preds.flatten()
paraphrase_actual = paraphrase_ds_t['test']['labels']
print(scipy.stats.pearsonr(paraphrase_actual, para_preds))

# Predicted score on x, true score on y.
plt.scatter(para_preds, paraphrase_actual, alpha=0.5)
plt.ylabel('true paraphrase score')
plt.xlabel('predicted paraphrase score')
plt.title('Paraphrase Model Accuracy')
plt.show()

In [None]:
# Destination directory for each fine-tuned model.
paraphrase_model_dir = SUMM_FOLDER / 'paraphrase_model_roberta_da'
content_model_dir = SUMM_FOLDER / 'content_model_roberta_da'

paraphrase_trainer.save_model(paraphrase_model_dir)
content_trainer.save_model(content_model_dir)

# `save_pretrained` takes a directory, not a file path. The original passed
# '<model dir>/tokenizer.json', which writes the tokenizer files into a
# sub-directory of that name. Saving into the model directory keeps the
# tokenizer next to the model weights.
tokenizer.save_pretrained(paraphrase_model_dir)
tokenizer.save_pretrained(content_model_dir)