<a href="https://colab.research.google.com/github/Zihooo/Text-selection-codes-pub/blob/main/New_Prediction_Model_(DeBERTa).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Transformer Models for Personality Score Prediction
This Colab notebook is written in **Python** to illustrate the process of *fine-tuning* state-of-the-art **Transformer** models to predict personality scores. In this code sample, we use **DeBERTa-v3-base** as the example transformer and **neuroticism** as a sample personality trait. We've made notes in the code about the changes you'd need to make to use other transformers or predict other personality traits.

In [None]:
# Mount Google Drive at /content/drive so the notebook can read the data files
# (and write checkpoints/outputs) stored there.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
## install required packages
! pip install transformers==4.28.0
! pip install sentencepiece
! pip install datasets
! pip install scipy

Collecting transformers==4.28.0
  Downloading transformers-4.28.0-py3-none-any.whl (7.0 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/7.0 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.1/7.0 MB[0m [31m3.4 MB/s[0m eta [36m0:00:02[0m[2K     [91m━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/7.0 MB[0m [31m47.2 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m6.9/7.0 MB[0m [31m71.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m53.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.28.0)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m98.5 MB/s[0m eta [36m0:00

In [None]:
# import packages
from transformers import AutoConfig, AutoTokenizer, TrainingArguments, Trainer, EarlyStoppingCallback, IntervalStrategy

import torch
from torch.utils.data import Dataset

import scipy
from scipy.stats import pearsonr
from scipy.special import softmax
import statistics
from sklearn.metrics import precision_recall_fscore_support
import pandas as pd
import numpy as np
from warnings import warn
import os
import sys
import gc


### Using a GPU
To speed things up you can use a *GPU* (*optional*).

First, you'll need to enable GPUs for the notebook:

- Navigate to Edit→Notebook Settings
- select GPU from the Hardware Accelerator drop-down

Next, confirm that you can connect to the GPU with PyTorch:

In [None]:
# A helper function to check for a GPU
def get_gpu():
  """Return the index of the current CUDA device, or -1 when CUDA is unavailable.

  When a GPU is present, the CUDA cache is emptied and a garbage-collection
  pass is run before the device index is returned.
  """
  if not torch.cuda.is_available():
    return -1
  torch.cuda.empty_cache()
  gc.collect()
  return torch.cuda.current_device()

In [None]:
get_gpu()

0

In [None]:
!nvidia-smi

Sat Feb  3 17:21:44 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   59C    P8              11W /  70W |      3MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

### Functions and Classes

In [None]:
#@title Load user-defined utility functions
# Import Data function
def import_data(path, text_col, label_col, index_col = None, index_val = None, enc = 'latin1'):
  """Load a CSV of texts and, optionally, their labels.

  Args:
    path: Path to the CSV file.
    text_col: Name of the column holding the texts.
    label_col: Name of the column holding the labels, or None to skip labels.
    index_col: Name of the column used to filter rows; only read when
      index_val is not None.
    index_val: If not None, keep only the rows where index_col equals this value.
    enc: File encoding handed to pandas (optional).

  Returns:
    (texts, frame) when label_col is None, otherwise (texts, labels, frame).
    texts and labels are plain Python lists; frame is the (possibly filtered)
    DataFrame. The file is read with keep_default_na=False, so strings such
    as "NA" are kept as text instead of becoming NaN.
  """
  frame = pd.read_csv(path, encoding = enc, keep_default_na=False)
  if index_val is not None:
    row_mask = frame[index_col] == index_val
    frame = frame[row_mask]
  texts = frame[text_col].tolist()
  if label_col is not None:
    return texts, frame[label_col].tolist(), frame
  return texts, frame


# Get model for simple transformers
def get_model(model_type):
    """Resolve a short model alias to a (model_type, model_name) pair.

    Args:
        model_type: One of the pre-defined aliases listed below, or any other
            string, which is then used verbatim as the model name.

    Returns:
        A (model_type, model_name) tuple. For a few aliases the returned
        model_type differs from the input (e.g. "deberta" -> "debertav2").
        For an unknown alias both elements are the input string and a
        warning is emitted.
    """
    # alias -> (resolved model_type, checkpoint name)
    predefined = {
        "specter": ("specter", "allenai/specter"),
        "bert": ("bert", "bert-base-cased"),
        "roberta": ("roberta", "roberta-large"),
        "distilbert": ("distilbert", "distilbert-base-cased-distilled-squad"),
        "distilroberta": ("roberta", "cross-encoder/stsb-distilroberta-base"),
        "electra-base": ("electra", "cross-encoder/ms-marco-electra-base"),
        "xlnet": ("xlnet", "xlnet-large-cased"),
        "bart": ("bart", "facebook/bart-large"),
        "deberta": ("debertav2", "microsoft/deberta-v3-large"),
        "albert": ("albert", "albert-xlarge-v2"),
        "xlmroberta": ("xlmroberta", "xlm-roberta-large"),
    }
    if model_type in predefined:
        return predefined[model_type]

    # Bug fix: the file imports `warn` (from warnings import warn), not the
    # `warnings` module, so the previous `warnings.warn(...)` raised NameError
    # for every non-predefined model instead of falling back gracefully.
    warn("model_type not a pre-defined, setting model_type to model_name")
    return model_type, model_type



In [None]:
# eval metrics
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import r_regression
from scipy.stats import pearsonr

def compute_metrics_for_regression(eval_pred):
    """Compute the mean squared error and Pearson correlation for a regression eval pass.

    Args:
        eval_pred: A (predictions, labels) pair of NumPy arrays, unpacked below.

    Returns:
        A dict with "mse" (mean squared error) and "r" (Pearson correlation
        between labels and predictions, as a Python float).
    """
    logits, labels = eval_pred
    # Flatten both sides once so (n,) and (n, 1) inputs are handled alike.
    # The previous version also built an unused `single_squared_errors` list
    # from `logits - labels`; with 1-D logits and (n, 1) labels that expression
    # broadcast to an n x n matrix, so it has been removed.
    y_true = labels.reshape(-1)
    y_pred = logits.reshape(-1)
    mse = mean_squared_error(y_true, y_pred)
    rscore = float(pearsonr(y_true, y_pred)[0])
    return {"mse": mse, "r": rscore}

In [None]:
#@title Data Class
class TextClassificationDataset(torch.utils.data.Dataset):
    """Dataset that pairs tokenizer encodings with one label per example.

    `encodings` is a mapping from field name to a per-example sequence and
    `labels` is an indexable collection of the same length. Each item is a
    dict of tensors: one entry per encoding field plus a 'labels' entry.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is defined by the labels.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample


### Defining Variables


---


We define our variables for the purposes described in our research manuscript. However, we encourage researchers and practitioners to try out alternative models. In addition, we wanted to minimize hyper-parameter tuning during training, as the aim of this research is to highlight Transformers in a baseline sense.

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding
from torch.utils.data import DataLoader

# Hyper-parameters shared by every fold of the fine-tuning run below.
BASE_MODEL = 'microsoft/deberta-v3-base' # replace with "allenai/longformer-base-4096" to use Longformer
LEARNING_RATE = 1e-5
MAX_LENGTH = 512      # can be increased to 4096 when using Longformer; longer sequences mean a heavier computational load
BATCH_SIZE = 12       # chosen to fit the available computational resources (GPU memory)
EPOCHS = 10           # may be increased if the evaluation metric is still improving at the end of training



---


## Fine-tuning A Transformer Model


---
This example demonstrates the fine-tuning process for the purpose of score prediction from text data.


### Importing and formatting Training Data


---


Since we have already mounted Google Drive in this notebook, we can import the data directly from Google Drive.

In [None]:
# Read the whole dataset (selected text): the texts come from the "texte" column
# and the target scores from the "escore" column; all_raw_data is the full DataFrame.
all_text, all_labels, all_raw_data = import_data("/content/drive/MyDrive/Text Selection Paper Codes/data/all_text_latent_extract_10.csv", "texte", "escore")


# Load the tokenizer that matches BASE_MODEL
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
from sklearn.model_selection import RandomizedSearchCV, KFold, train_test_split

all_pred_scores = []

num_folds = 5

# Evaluate, log and checkpoint on the same schedule. Per the EarlyStoppingCallback
# documentation, the "best metric" it compares against is only set when a
# checkpoint is saved, so a save interval larger than the evaluation interval
# (previously save_steps=440 vs. eval_steps=22) kept early stopping and
# best-model selection from ever taking effect.
EVAL_EVERY_STEPS = 22
CHECKPOINT_ROOT = "/content/drive/MyDrive/Text Selection Paper Codes/checkpoints/deberta"
CRITERION_COLUMNS = ("task", "people", "char", "ethic")


def build_dataset(texts, labels):
    """Tokenize `texts` (truncated and padded to MAX_LENGTH) and pair them with `labels`."""
    encodings = tokenizer(texts, truncation=True, max_length=MAX_LENGTH, padding='max_length')
    return TextClassificationDataset(encodings, labels)


kf_outer = KFold(n_splits=num_folds, shuffle=True, random_state=42)
for fold_index, (train_index, test_index) in enumerate(kf_outer.split(all_raw_data)):
    # Release the previous fold's model/trainer before loading a new model so
    # that two models are never resident on the GPU at the same time.
    MODEL = trainer = None
    gc.collect()
    torch.cuda.empty_cache()

    X_train, X_test = [all_text[i] for i in train_index], [all_text[i] for i in test_index]
    y_train, y_test = [all_labels[i] for i in train_index], [all_labels[i] for i in test_index]

    # Hold out 25% of the training fold as the evaluation set. Texts and labels
    # are split in a single call so they cannot drift out of alignment.
    train_text, eval_text, train_labels, eval_labels = train_test_split(
        X_train, y_train, test_size=0.25, random_state=42)

    # Tokenize the three splits
    train_dataset = build_dataset(train_text, train_labels)
    eval_dataset = build_dataset(eval_text, eval_labels)
    test_dataset = build_dataset(X_test, y_test)

    # Load a fresh copy of the pretrained model with a single regression output.
    # The locally cached weights are reused rather than re-downloaded every fold.
    MODEL = AutoModelForSequenceClassification.from_pretrained(BASE_MODEL, num_labels=1)

    # Training arguments
    training_args = TrainingArguments(
        output_dir=os.path.join(CHECKPOINT_ROOT, f"fold_{fold_index}"),  # one checkpoint directory per fold
        learning_rate=LEARNING_RATE,
        seed=100,                          # the training seed is fixed, but model initialization still has some randomness
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        num_train_epochs=EPOCHS,
        evaluation_strategy=IntervalStrategy.STEPS,
        eval_steps=EVAL_EVERY_STEPS,
        logging_steps=EVAL_EVERY_STEPS,
        save_strategy="steps",
        save_steps=EVAL_EVERY_STEPS,       # must match eval_steps for early stopping / best-model tracking
        save_total_limit=1,                # bound disk usage on Drive while checkpointing frequently
        metric_for_best_model="r", greater_is_better=True,  # can also be "mse" with greater_is_better=False;
                                                            # either way, inspect the training log when selecting a model
        load_best_model_at_end=True,       # reload the checkpoint with the best metric_for_best_model after training
        weight_decay=0.01)

    # Initialize the trainer
    trainer = Trainer(
        model=MODEL,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics_for_regression,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)])

    # Training
    trainer.train()

    # Evaluation on the held-out evaluation set
    print(trainer.evaluate())

    # Prediction on the test fold. Predictions and labels are flattened to 1-D
    # so the code below works whether the model returns logits of shape (n,)
    # or (n, 1).
    pred_set = trainer.predict(test_dataset)
    xss = np.asarray(pred_set.predictions).reshape(-1)
    test_labels = np.asarray(pred_set.label_ids).reshape(-1)

    # Convergent validity: predicted scores vs. the labels
    correlation = pearsonr(xss, test_labels)

    # Criterion validity: predicted scores vs. each criterion column
    criterion_correlations = {
        column: np.corrcoef(xss, all_raw_data[column].iloc[test_index])[0, 1]
        for column in CRITERION_COLUMNS
    }

    # Collect this fold's predictions; they are concatenated and saved after the loop
    fold_results = pd.DataFrame({'fold_index': fold_index, 'test_index': test_index, 'predicted_scores': xss})
    all_pred_scores.append(fold_results)

    # Report the correlations for this fold
    print(f"Pearson Correlation on Test Set: {correlation}")
    for column, value in criterion_correlations.items():
        print(f"Criterion Correlation on {column}: {value}")
    print("-" * 30)

    torch.cuda.empty_cache()

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2ForSequenceClassification: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.weight', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.dense.bias']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a

Step,Training Loss,Validation Loss,Mse,R
22,7.2315,3.974746,3.974747,0.161354
44,1.8933,1.035832,1.035832,0.051497
66,0.9795,0.916905,0.916905,0.253923
88,0.9481,1.016483,1.016483,0.314495
110,0.8423,1.000284,1.000283,0.325517
132,1.0354,0.855607,0.855607,0.357989
154,0.8674,0.865467,0.865467,0.370635
176,0.7253,0.977432,0.977432,0.380449
198,0.6669,0.895045,0.895045,0.400874
220,0.6691,0.869849,0.869849,0.407648


{'eval_loss': 0.9856811165809631, 'eval_mse': 0.9856811165809631, 'eval_r': 0.4161641013456649, 'eval_runtime': 8.6539, 'eval_samples_per_second': 20.107, 'eval_steps_per_second': 1.733, 'epoch': 10.0}
Pearson Correlation on Test Set: PearsonRResult(statistic=0.4689066908917161, pvalue=6.763984293833086e-11)
Criterion Correlation on task: 0.14363784134515695
Criterion Correlation on people: -0.04409930115527775
Criterion Correlation on char: 0.2160891202695821
Criterion Correlation on ethic: 0.02392907050438864
------------------------------


config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2ForSequenceClassification: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.weight', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.dense.bias']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a

Step,Training Loss,Validation Loss,Mse,R
22,8.0607,3.857506,3.857506,-0.130384
44,1.65,0.94634,0.94634,-0.170496
66,1.0505,0.869951,0.869951,0.219352
88,0.9942,0.853567,0.853567,0.298176
110,0.8726,0.836778,0.836778,0.356259
132,0.9243,1.076268,1.076268,0.327185
154,0.8574,0.800043,0.800043,0.41362
176,0.7073,0.828233,0.828233,0.392489
198,0.8035,1.470005,1.470005,0.377338
220,0.9805,0.91339,0.91339,0.36686


{'eval_loss': 0.8447206020355225, 'eval_mse': 0.8447206020355225, 'eval_r': 0.44419190399287106, 'eval_runtime': 8.6691, 'eval_samples_per_second': 20.071, 'eval_steps_per_second': 1.73, 'epoch': 10.0}
Pearson Correlation on Test Set: PearsonRResult(statistic=0.40834609398372346, pvalue=2.2266783835649338e-08)
Criterion Correlation on task: 0.201871879927724
Criterion Correlation on people: 0.06339582916000937
Criterion Correlation on char: 0.19386015246348498
Criterion Correlation on ethic: 0.11837134961947918
------------------------------


config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2ForSequenceClassification: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.weight', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.dense.bias']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a

Step,Training Loss,Validation Loss,Mse,R
22,7.7064,3.795171,3.795171,-0.112494
44,1.7361,1.023241,1.023241,0.057416
66,1.0811,0.952955,0.952955,0.310644
88,1.0689,1.063635,1.063635,0.316879
110,0.9257,0.922421,0.922421,0.493637
132,0.9854,0.803822,0.803822,0.469284
154,0.7246,0.874362,0.874362,0.426651
176,0.8345,0.809708,0.809707,0.518185
198,0.7039,0.735157,0.735157,0.521811
220,0.6151,0.811234,0.811234,0.506113


{'eval_loss': 0.7968816757202148, 'eval_mse': 0.7968816757202148, 'eval_r': 0.5604631903097335, 'eval_runtime': 8.6428, 'eval_samples_per_second': 20.132, 'eval_steps_per_second': 1.736, 'epoch': 10.0}
Pearson Correlation on Test Set: PearsonRResult(statistic=0.3871445114775576, pvalue=1.4263259334617803e-07)
Criterion Correlation on task: 0.19087622975827623
Criterion Correlation on people: -0.0681252838249055
Criterion Correlation on char: 0.14134367958044639
Criterion Correlation on ethic: 0.07896877763282645
------------------------------


config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2ForSequenceClassification: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.weight', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.dense.bias']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a

Step,Training Loss,Validation Loss,Mse,R
22,8.1184,3.392234,3.392233,-0.088471
44,1.7042,1.183476,1.183476,0.192892
66,0.986,1.192318,1.192318,0.41588
88,0.9638,0.996923,0.996923,0.273069
110,1.0201,1.206266,1.206266,0.340946
132,0.8968,1.022464,1.022464,0.331295
154,0.9408,0.834678,0.834678,0.398961
176,0.7695,0.930266,0.930266,0.494861
198,0.7597,0.790033,0.790033,0.495626
220,0.7648,0.763581,0.763581,0.523978


{'eval_loss': 0.7735636234283447, 'eval_mse': 0.7735634446144104, 'eval_r': 0.5082484104004104, 'eval_runtime': 8.6412, 'eval_samples_per_second': 20.136, 'eval_steps_per_second': 1.736, 'epoch': 10.0}
Pearson Correlation on Test Set: PearsonRResult(statistic=0.5066318232355349, pvalue=1.1451570638990365e-12)
Criterion Correlation on task: 0.11290416579322039
Criterion Correlation on people: 0.005944441850567828
Criterion Correlation on char: 0.26767447048991955
Criterion Correlation on ethic: -0.08323563155071935
------------------------------


config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2ForSequenceClassification: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.weight', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.dense.bias']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a

Step,Training Loss,Validation Loss,Mse,R
22,7.9874,3.383742,3.383741,-0.053935
44,1.7602,0.814881,0.814881,-0.089708
66,1.0991,0.88188,0.88188,0.301821
88,0.9599,0.796788,0.796788,0.260645
110,0.9225,0.847062,0.847062,0.310914
132,0.9376,0.659645,0.659645,0.413297
154,0.7993,0.713381,0.713381,0.410961
176,0.6704,0.823682,0.823682,0.439375
198,0.7415,0.852146,0.852146,0.416528
220,0.5363,0.880415,0.880416,0.418243


{'eval_loss': 0.883706271648407, 'eval_mse': 0.8837060928344727, 'eval_r': 0.42481590532245755, 'eval_runtime': 8.6751, 'eval_samples_per_second': 20.057, 'eval_steps_per_second': 1.729, 'epoch': 10.0}
Pearson Correlation on Test Set: PearsonRResult(statistic=0.467441004789258, pvalue=8.951490274586152e-11)
Criterion Correlation on task: 0.20137400750182552
Criterion Correlation on people: -0.0798564983978116
Criterion Correlation on char: 0.17316962548302944
Criterion Correlation on ethic: 0.07895054176431227
------------------------------


In [None]:
# Stack the per-fold prediction frames into one table (with a fresh 0..n-1 index)
# and write it to Google Drive as a CSV file.
save_pred_scores = pd.concat(all_pred_scores, ignore_index=True)
save_pred_scores.to_csv('/content/drive/MyDrive/Text Selection Paper Codes/final saved outputs/selection/Oscore2.csv')