<a href="https://colab.research.google.com/github/Zihooo/Text-selection-codes-pub/blob/main/Prediction_Model_(RoBERTa_and_Longformer).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Transformer Models for Personality Score Prediction
This Colab notebook is written in **Python** to illustrate the process of *fine-tuning* state-of-the-art **Transformer** models to predict personality scores. In this context, the fine-tuning process involves training models on a relatively small number of samples with known trait scores.

In [None]:
# Mount Google Drive to get access to the data (the data and checkpoint paths
# used later in this notebook all live under /content/drive).
# force_remount=True asks Colab to remount even if the drive is already mounted.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
## install required pacakges
! pip install transformers==4.28.0
! pip install sentencepiece
! pip install datasets
! pip install scipy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers==4.28.0
  Downloading transformers-4.28.0-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m105.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0 (from transformers==4.28.0)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m27.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.28.0)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m122.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transfo

In [None]:
# import pacakges
from transformers import AutoConfig, AutoTokenizer, TrainingArguments, Trainer

from torch.utils.data import Dataset
import torch

from scipy.special import softmax
from sklearn.metrics import precision_recall_fscore_support
import pandas as pd
import numpy as np
from warnings import warn
import os
import sys
import datetime
import gc

import scipy
from scipy.stats import pearsonr


### Using a GPU
To speed things up you can use a *GPU* (*optional*).

First, you'll need to enable GPUs for the notebook:

- Navigate to Edit→Notebook Settings
- select GPU from the Hardware Accelerator drop-down

Next, confirm that you can connect to the GPU with PyTorch:

In [None]:
# A helper function to check for a GPU
def get_gpu():
  """Return the index of the current CUDA device, or -1 when CUDA is unavailable.

  When a GPU is available, the CUDA cache is emptied and a garbage-collection
  pass is run before the device index is looked up.
  """
  if not torch.cuda.is_available():
    return -1

  # Release cached GPU memory and collect garbage before reporting the device.
  torch.cuda.empty_cache()
  gc.collect()
  return torch.cuda.current_device()

In [None]:
# Displays the current CUDA device index; -1 means no GPU was detected.
get_gpu()

0

In [None]:
# Show details of the attached NVIDIA GPU using the nvidia-smi command-line tool.
!nvidia-smi

### Functions and Classes

In [None]:
#@title Load user-defined utility functions
# Import Data function
def import_data(path, text_col, label_col, index_col = None, index_val = None, enc = 'latin1'):
  """Import a CSV of sentences and (optionally) their labels.

  Args:
    path: A csv file path
    text_col: Name of column in csv containing sentences
    label_col: Name of column containing labels, or None when the file has
      no labels
    index_col: Name of a column used to subset the rows (optional); required
      when index_val is given
    index_val: Keep only the rows whose index_col equals this value (optional)
    enc: File encoding to be used (optional)

  Returns:
    (texts, labels, df) when label_col is given, otherwise (texts, df).
    texts and labels are plain lists; df is the (possibly filtered) DataFrame.

  Raises:
    KeyError: if index_val is given without index_col.
  """
  # keep_default_na=False: empty cells are read as empty strings rather than NaN.
  df = pd.read_csv(path, encoding = enc, keep_default_na=False)

  if index_val is not None:
    if index_col is None:
      # pandas would raise KeyError(None) on df[None]; keep the exception type
      # but give the caller an actionable message.
      raise KeyError("index_col must be provided when index_val is given")
    df = df[df[index_col] == index_val]

  texts = df[text_col].tolist()
  if label_col is None:
    return texts, df
  return texts, df[label_col].tolist(), df


# Map labels to keys
# NOTE: map_labels_to_keys() is disabled in this notebook; only its signature and
# docstring remain. They are kept below as comments for reference.
#def map_labels_to_keys(labels, sort_labels = True):
#  """Map text labels to integers
#
#  Args:
#    labels: a list/vector of text labels
#    sort_labels: Sort labels alphabetically before recoding (optional)
#  """


# Get model for simple transformers
def get_model(model_type):
    """Resolve a short model alias to a (model_type, model_name) pair.

    Args:
        model_type: One of the pre-defined aliases in the table below, or any
            other string, which is then used as the model name as-is.

    Returns:
        A (model_type, model_name) tuple. For some aliases the returned
        model_type differs from the alias passed in (e.g. "distilroberta"
        maps to "roberta"). For an unknown alias both elements equal the
        input and a warning is issued.
    """
    # alias -> (model_type, model_name)
    known_models = {
        "specter": ("specter", "allenai/specter"),
        "bert": ("bert", "bert-base-cased"),
        "roberta": ("roberta", "roberta-large"),
        "distilbert": ("distilbert", "distilbert-base-cased-distilled-squad"),
        "distilroberta": ("roberta", "cross-encoder/stsb-distilroberta-base"),
        "electra-base": ("electra", "cross-encoder/ms-marco-electra-base"),
        "xlnet": ("xlnet", "xlnet-large-cased"),
        "bart": ("bart", "facebook/bart-large"),
        "deberta": ("debertav2", "microsoft/deberta-v3-large"),
        "albert": ("albert", "albert-xlarge-v2"),
        "xlmroberta": ("xlmroberta", "xlm-roberta-large"),
    }
    if model_type in known_models:
        return known_models[model_type]

    # Use `warn`, which the notebook imports via `from warnings import warn`;
    # the `warnings` module itself is not imported, so `warnings.warn(...)`
    # would raise NameError on this path.
    warn("model_type is not pre-defined; using it as model_name")
    return model_type, model_type
  


In [None]:
# eval metrics
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import r_regression
from scipy.stats import pearsonr

def compute_metrics_for_regression(eval_pred):
    """Compute regression metrics from a (predictions, labels) pair.

    This function is passed to the Trainer as ``compute_metrics``.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``; both must
            support ``reshape``.
            # assumes logits has shape (n, 1) and labels (n,) - TODO confirm

    Returns:
        dict with
            "mse": mean squared error between labels and predictions
            "r":   Pearson correlation between labels and predictions (float)
    """
    logits, labels = eval_pred
    # Turn the labels into a column so they line up with the predictions.
    labels = labels.reshape(-1, 1)
    mse = mean_squared_error(labels, logits)
    # pearsonr expects 1-D inputs; element 0 of its result is the coefficient.
    r = pearsonr(labels.reshape(-1), logits.reshape(-1))
    rscore = r[0].tolist()
    return {"mse": mse, "r": rscore}

In [None]:
#@title Data Class
class TextClassificationDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with optional labels.

    Args:
        encodings: Mapping from feature name to a per-example sequence of
            values (e.g. the tokenizer output); must support ``.items()``.
        labels: Per-example labels aligned with ``encodings``. May be left as
            ``None`` for unlabeled data, in which case the returned items
            carry no ``'labels'`` entry.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples in the dataset."""
        if self.labels is not None:
            return len(self.labels)
        # Without labels, fall back to the length of the first encoded feature.
        for _, values in self.encodings.items():
            return len(values)
        return 0
      

### Defining Variables


---


We define our variables for the purposes described in our research manuscript. However, we encourage researchers and practitioners to try out alternative models. In addition, we wanted to minimize hyper-parameter tuning during training, as the aim of this research is to highlight Transformers in a baseline sense.

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding
from torch.utils.data import DataLoader

BASE_MODEL = 'roberta-base' # for Longformer, replace with "allenai/longformer-base-4096" (its Hugging Face Hub id)
LEARNING_RATE = 5e-7
MAX_LENGTH = 512      # can be increased up to 4096 when using Longformer; longer sequences mean a heavier computational load
BATCH_SIZE = 12       # chosen based on the available computational resources (GPU memory)
EPOCHS = 50           # may be increased if the evaluation metric has not yet reached diminishing returns
                      # NOTE(review): EPOCHS is not referenced by the TrainingArguments cell, which hard-codes num_train_epochs=1 - confirm which is intended



---


## Fine-tuning A Transformer Model


---
This example demonstrates the fine-tuning process for the purpose of score prediction from text data.


### Importing and formatting Training Data


---


Since we have already mounted Google Drive in this notebook, we can import the data directly from it.

In [None]:
#@title Importing custom datasets

# "textn" is the column that contains the text; "nscore" is the column that contains the labels (scores)
# with a label column given, import_data returns a list of texts, a list of labels, and the DataFrame they were read from
# training set
train_text, train_labels, train_raw_data = import_data("/content/drive/MyDrive/Text Selection Paper Codes/data/train_relevant_10.csv", "textn", "nscore")

# evaluation set
eval_text, eval_labels, eval_raw_data = import_data("/content/drive/MyDrive/Text Selection Paper Codes/data/eval_relevant_10.csv", "textn", "nscore")

## testing set
test_text, test_labels, test_raw_data = import_data("/content/drive/MyDrive/Text Selection Paper Codes/data/test_relevant_10.csv", "textn", "nscore")

To properly import the training data, we must specify the file path, the name of the column containing our items, and the name of the column containing our labels. `import_data()` then returns three objects:

- a list (vector) of items
- a list (vector) of labels
- a copy of our training data

The code above assigns these to objects named `train_text`, `train_labels` and `train_raw_data` respectively (and likewise for the evaluation and test sets).

In [None]:
#@title Tokenize data

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

def build_dataset(texts, labels):
    """Tokenize `texts` and pair the encodings with `labels`.

    Every text is truncated or padded to exactly MAX_LENGTH tokens
    (truncation=True, padding='max_length').

    Args:
        texts: list of input strings
        labels: list of target scores aligned with `texts`

    Returns:
        (encodings, dataset): the tokenizer output and a
        TextClassificationDataset built from it.
    """
    encodings = tokenizer(texts, truncation=True, max_length = MAX_LENGTH, padding='max_length')
    return encodings, TextClassificationDataset(encodings, labels)

# The scores are passed through unchanged as targets, so no label-to-key
# mapping is applied to any of the splits.
train_encodings, train_dataset = build_dataset(train_text, train_labels)
eval_encodings, eval_dataset = build_dataset(eval_text, eval_labels)
test_encodings, test_dataset = build_dataset(test_text, test_labels)


### Training the model



In [None]:
# Load the pretrained model with a single-output sequence-classification head.
# With num_labels=1, transformers treats the task as regression.
MODEL = AutoModelForSequenceClassification.from_pretrained(BASE_MODEL, num_labels=1) # problem_type is set to 'regression' when num_labels = 1

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classi

In [None]:
# NOTE(review): num_train_epochs is hard-coded to 1 although EPOCHS (= 50) is defined in the
# variables cell and the logged run below reports epoch 50 - confirm whether
# num_train_epochs=EPOCHS is intended when training from scratch.
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/Text Selection Paper Codes/checkpoints/relevant-n", # directory where checkpoints are saved
    learning_rate=LEARNING_RATE,
    seed = 100,                                                    # although the training seed is fixed here, there may still be some randomness in model initialization
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=1,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps = 50,
    metric_for_best_model="mse", greater_is_better = False,    # This metric can also be "r"; in that case set greater_is_better to True.
                                                               # Whichever is used, inspect the training log before selecting a model (avoid automatically selecting one of the first few epochs).
    load_best_model_at_end=True,     # reload the best checkpoint (according to metric_for_best_model) when training finishes
    weight_decay=0.01
)

In [None]:
  # initialize trainer
trainer = Trainer(model=MODEL,
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset = eval_dataset,                          # evaluated at the end of every epoch (evaluation_strategy="epoch")
    compute_metrics = compute_metrics_for_regression,     # reports "mse" and "r" at each evaluation
    )

In [None]:
# RUN: uncomment the line below to fine-tune the model from scratch.
# trainer.train()



Epoch,Training Loss,Validation Loss,Mse,R
1,No log,7.562261,7.56226,0.059764
2,6.999300,6.850716,6.850716,0.091776
3,5.580600,5.580133,5.580133,0.097186
4,5.580600,3.128383,3.128383,0.067326
5,3.696600,1.524704,1.524703,-0.071475
6,1.569800,1.13631,1.13631,-0.076346
7,1.569800,1.11898,1.11898,0.006629
8,1.038900,1.114224,1.114224,0.04358
9,0.954900,1.116863,1.116863,0.089022
10,0.954900,1.111495,1.111495,0.13998


TrainOutput(global_step=2050, training_loss=1.2784752375905106, metrics={'train_runtime': 3526.1023, 'train_samples_per_second': 6.863, 'train_steps_per_second': 0.581, 'total_flos': 6367230370406400.0, 'train_loss': 1.2784752375905106, 'epoch': 50.0})

### Predict scores with the fine-tuned model

---

Since we've fine-tuned the model, we can use the `.predict()` method to predict the target scores.

In [None]:
# Use the same initial model as for training and restore the state saved in checkpoint-41.
# NOTE(review): presumably no further training steps run here because the checkpoint already
# covers the configured num_train_epochs=1 - confirm; with a larger num_train_epochs this
# call would continue training from the checkpoint.
trainer.train(resume_from_checkpoint = "/content/drive/MyDrive/Text Selection Paper Codes/checkpoints/relevant-n/checkpoint-41")

In [None]:
# Evaluate the loaded model on the evaluation set; compare the reported metrics
# with the training log to check which epoch was selected.
trainer.eval_dataset=eval_dataset
trainer.evaluate()

{'eval_loss': 0.8695018887519836,
 'eval_mse': 0.8695018887519836,
 'eval_r': 0.5055594380513734,
 'eval_runtime': 3.8433,
 'eval_samples_per_second': 31.744,
 'eval_steps_per_second': 2.862,
 'epoch': 50.0}

In [None]:
# Run prediction on the test set; the following cells read element [0] as the
# predictions and element [1] as the labels.
pred_set = trainer.predict(test_dataset)

In [None]:
# Flatten the per-example prediction rows into one flat list of predicted scores.
xss = pred_set[0]
flat_list = [score for row in xss for score in row]


In [None]:
# Calculate the Pearson correlation between the predicted scores and the test labels (pred_set[1]).
pearsonr(flat_list,pred_set[1])

In [None]:
# Save the predicted scores.
# pandas is already imported as `pd` in the imports cell at the top of the notebook.
dfpred = pd.DataFrame(flat_list)
#dfpred.to_csv('/content/drive/MyDrive/personality prediction/final-saved outputs/wd/relevance/test_O_epoch8.csv')