In [1]:
import os
# Expose only GPU 0 to this process. This cell runs before the cell that imports
# torch, so the restriction is already in the environment when torch is loaded.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  

In [3]:
import pandas as pd
import datasets
from datasets import DatasetDict, Dataset
from transformers import LongformerTokenizerFast, LongformerForSequenceClassification, Trainer, TrainingArguments, LongformerConfig
import torch.nn as nn
import torch
#from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm
import os

from sklearn.model_selection import train_test_split



In [4]:
# Names of the personality-trait columns that can be used as prediction target;
# one of them is selected by index (`id_target`) further down.
targets = ['agreeableness', 'openness', 'conscientiousness', 'extraversion','neuroticism']

In [5]:
# Author profile table, read from a path relative to the notebook's working directory.
# Presumably holds the per-author trait scores — TODO confirm the schema.
authors_df = pd.read_parquet('PANDORA_author_profiles.parquet')

In [6]:
# Table with the text per author (joined on 'author' below; its 'full_text'
# column is what gets chunked and tokenized later).
full_text = pd.read_parquet('authors_full_text.parquet')

In [7]:
# Inner join on 'author': only authors present in BOTH tables are kept.
author_full_text_join = pd.merge(authors_df, full_text, on='author', how='inner')

In [8]:
# Index into `targets` choosing the trait for this run (4 -> 'neuroticism').
id_target = 4

In [9]:
# Column name of the trait used as label source, as output paths and in the run name.
target = targets[id_target]
target #use a different target to train on a different personality trait

'neuroticism'

In [10]:
# Sequence length (in tokens) passed to the tokenizer for padding/truncation.
# Also used, multiplied by 5 and counted in WORDS, as the default chunk size of
# divide_text_into_batches — that default is captured when the function is defined.
max_length = 4096

In [11]:
# Keep only authors that have a value for the selected trait column.
author_full_text_join_cleaned = author_full_text_join.dropna(subset=[target])

In [12]:
# Quick look at the text column of the filtered frame.
# NOTE(review): the rendered output shows raw user text — consider clearing it before sharing.
author_full_text_join_cleaned['full_text']


1        Your first and second question is the same que...
6        I've been asked to cum everywhere with my ex j...
7        I'm currently in the middle of making a Payday...
8        First and foremost I extend my condolences to ...
23       I failed both...I'm great at reading people ir...
                               ...                        
10290    Hakuna Matata man. The problem free philosophy...
10291    How has no one mentioned [Gerkin's](http://www...
10292    What should we do? I knew it from the moment I...
10293    TYPE_MENTION for morals and emotional toleranc...
10294    YES. my god, thank you haha It could be that y...
Name: full_text, Length: 1603, dtype: object

In [13]:
def divide_text_into_batches(text, batch_size=max_length * 5):
    """Split ``text`` into consecutive chunks of at most ``batch_size`` words.

    Words are obtained with ``str.split()``, i.e. any run of whitespace is a
    separator, and each chunk is re-joined with single spaces. Original spacing
    and line breaks are therefore not preserved.

    Args:
        text: The string to split.
        batch_size: Maximum number of whitespace-separated words per chunk.
            The default expression is evaluated once, when this ``def`` runs,
            from the value ``max_length`` has at that moment; changing
            ``max_length`` afterwards does not change the default.

    Returns:
        A list of chunk strings in their original order. The list is empty
        when ``text`` contains no words.

    Raises:
        ValueError: If ``batch_size`` is not a positive number. Without this
            check a negative value would silently produce an empty list and
            drop the whole text.
    """
    # NOTE(review): batch_size counts words, while the tokenizer call in this
    # notebook truncates to max_length tokens — a full chunk is likely longer
    # than that and would lose its tail. Confirm the 5x factor is intended.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    words = text.split()
    return [
        ' '.join(words[start:start + batch_size])
        for start in range(0, len(words), batch_size)
    ]

In [14]:
# Flat, index-aligned lists: texts[k] is one text chunk and y_values[k] is the
# trait score of the author that chunk was cut from.
texts = []
y_values = []
# One entry per author: the list of chunks produced for that author.
batches_tot = []

for i, row in author_full_text_join_cleaned.iterrows():
    batches = divide_text_into_batches(row['full_text'])
    batches_tot.append(batches)
    texts.extend(batches)
    # Every chunk inherits the score of the author it came from.
    y_values.extend([row[target]] * len(batches))
    

In [19]:
#texts = (texts[0:5000])
#y_values =  (y_values[0:5000])

In [15]:
# Bin edges for the trait score. With right=False each bin is half-open:
# [0, 20) -> 0, [20, 40) -> 1, [40, 60) -> 2, [60, 80) -> 3, [80, 101) -> 4.
bins = [0, 20, 40, 60, 80, 101]  # note: upper bound of the last bin is 101 to include 100
labels = [0, 1, 2, 3, 4]

# classify the values into bins
categories = np.digitize(y_values, bins, right=False) - 1  # -1 to shift to zero-based indexing

# Values outside [0, 101) fall outside every bin: a score below 0 yields index -1,
# which `labels[-1]` would silently turn into class 4, and a score >= 101 (or NaN)
# yields an index past the end of `labels`. Fail loudly instead of mislabelling.
out_of_range = (categories < 0) | (categories >= len(labels))
if np.any(out_of_range):
    raise ValueError(
        f"{int(np.count_nonzero(out_of_range))} value(s) of '{target}' fall outside "
        f"the supported range [{bins[0]}, {bins[-1]})"
    )

# map bin indices to labels
classified_values = [labels[i] for i in categories]

y = classified_values

In [16]:
# Share of chunks per class, in ascending order of the class label.
class_counts = np.unique(y, return_counts=True)[1]
print(class_counts / len(y))

[0.2585244  0.14888301 0.19826573 0.16240447 0.2319224 ]


In [17]:
# Convert the chunk texts and their class labels to NumPy arrays for the split below.
# Note that `y` is rebound here: before this cell it is the Python list of labels.
X = np.array(texts)
y = np.array(y)

In [18]:
# 70/30 split over individual text chunks, stratified on the class label and
# seeded for reproducibility. X and y are already NumPy arrays at this point.
# NOTE(review): the split is per chunk, not per author, so chunks cut from the
# same author's text can end up in both train and test. If held-out scores are
# meant to reflect unseen authors, split by author instead — confirm intent.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)
#X_train, X_val, y_train, y_val = train_test_split(np.array(X_train), np.array(y_train) , test_size=0.1, stratify=y_train, random_state=42)

In [19]:
# Number of text chunks in the training split.
len(X_train)

4762

In [20]:
# Wrap both splits as Hugging Face datasets with a 'text' and a 'label' column.
train_frame = pd.DataFrame({'text': X_train, 'label': y_train})
test_frame = pd.DataFrame({'text': X_test, 'label': y_test})

ds = DatasetDict({
    'train': Dataset.from_pandas(train_frame),
    'test': Dataset.from_pandas(test_frame),
})


In [21]:
# load model and tokenizer and define length of the text sequence

# Sequence length in tokens used for padding/truncation (same value as set earlier).
max_length = 4096

model = LongformerForSequenceClassification.from_pretrained(
    'allenai/longformer-base-4096',
    gradient_checkpointing=False,
    attention_window=512,
    # Size the classifier head from the defined class list rather than from the
    # classes that happen to occur in `y`: if one bin were empty, len(set(y))
    # would be smaller than the largest label id and the loss would index out of range.
    num_labels=len(labels),
)

# `model_max_length` is the tokenizer setting for its length limit; a plain
# `max_length` keyword is not a tokenizer constructor option and would not
# set that limit.
tokenizer = LongformerTokenizerFast.from_pretrained(
    'allenai/longformer-base-4096',
    model_max_length=max_length,
)

Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# define a function that will tokenize the text, and will return the relevant inputs for the model
def tokenization(batched_text):
    """Tokenize the 'text' field of a batch of examples.

    Args:
        batched_text: Mapping with a 'text' entry holding the text(s) to
            tokenize, as handed over by ``Dataset.map``.

    Returns:
        The tokenizer output. Every sequence is padded to, and truncated at,
        ``max_length`` tokens, so all rows end up with the same length.
    """
    return tokenizer(batched_text['text'], padding='max_length', truncation=True, max_length=max_length)

# batched=True passes lists of texts to the tokenizer instead of one example per
# call, which is what the function is written for. The resulting columns are the
# same because every sequence is padded/truncated to max_length either way.
train_data = ds['train'].map(tokenization, batched=True)
test_data = ds['test'].map(tokenization, batched=True)

In [23]:
# Sanity check: with truncation and padding set to the maximum length, the first
# tokenized training example should contain exactly max_length token ids.
len(train_data['input_ids'][0])

4096

In [24]:
# Have both tokenized splits return torch tensors, restricted to the columns
# handed to the model.
model_columns = ['input_ids', 'attention_mask', 'label']
for tokenized_split in (train_data, test_data):
    tokenized_split.set_format('torch', columns=model_columns)

In [26]:
# define accuracy metrics
def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for an evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (the true class ids) and
            ``predictions`` (per-class scores; the arg-max over the last axis
            is taken as the predicted class).

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
        Precision, recall and F1 use ``average='weighted'``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    # Tuple layout: (precision, recall, f1, support).
    weighted_scores = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    accuracy = accuracy_score(y_true, y_pred)

    return {
        'accuracy': accuracy,
        'f1': weighted_scores[2],
        'precision': weighted_scores[0],
        'recall': weighted_scores[1],
    }

In [None]:
import torch
from transformers import Trainer, TrainingArguments

# Your training arguments
# Training configuration. Effective train batch per optimizer step on one device:
# per_device_train_batch_size * gradient_accumulation_steps = 8 * 32 = 256 chunks.
# NOTE(review): with the 4762 training chunks reported above, that is only about
# 18 optimizer steps per epoch (~90 over 5 epochs), i.e. fewer than warmup_steps=200,
# so the learning rate would still be warming up when training ends — TODO confirm
# against the Trainer's step accounting and adjust if unintended.
training_args = TrainingArguments(
    # --- output / bookkeeping ---
    output_dir=f'./{target}/output_dir',
    logging_dir=f'./{target}/data_files',
    run_name=f'{target}_longformer-classification-updated-rtx3090_paper_replication_2_warm',
    logging_steps=4,
    disable_tqdm=False,
    # --- schedule ---
    num_train_epochs=5,
    warmup_steps=200,
    weight_decay=0.01,
    # --- batching ---
    per_device_train_batch_size=8,
    gradient_accumulation_steps=32,
    per_device_eval_batch_size=16,
    dataloader_num_workers=0,
    # --- evaluation / checkpoints: both once per epoch ---
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    # --- precision ---
    fp16=True,
)

# Define the Trainer class
class CustomTrainer(Trainer):
    """Trainer subclass with a hand-written training loop.

    ``train`` replaces the stock ``Trainer.train``. It only runs forward and
    backward passes with gradient accumulation and prints the loss. It does
    not evaluate, write checkpoints, reload the best model or apply mixed
    precision, regardless of what ``args`` requests.
    """

    def __init__(self, model, args, train_dataset=None, eval_dataset=None, compute_metrics=None):
        """Build the underlying Trainer and move the model to GPU when available.

        Args:
            model: Model to train; its forward pass must return an object with ``.loss``.
            args: ``TrainingArguments`` supplying epochs, accumulation and logging steps.
            train_dataset: Dataset used by ``get_train_dataloader``.
            eval_dataset: Forwarded to the base class; not used by ``train`` here.
            compute_metrics: Forwarded to the base class; not used by ``train`` here.
        """
        super().__init__(model=model, args=args, train_dataset=train_dataset, eval_dataset=eval_dataset, compute_metrics=compute_metrics)
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.model.to(self.device)

    def train(self):
        """Run ``args.num_train_epochs`` epochs over the training dataloader.

        The optimizer and learning-rate scheduler are created here, because the
        base class only builds them inside its own ``train``. Gradients are
        accumulated over ``args.gradient_accumulation_steps`` batches, and any
        remainder at the end of an epoch is applied rather than dropped.
        """
        train_loader = self.get_train_dataloader()
        num_epochs = int(self.args.num_train_epochs)
        accumulation_steps = max(1, int(self.args.gradient_accumulation_steps))
        batches_per_epoch = len(train_loader)
        # Ceil division: a trailing partial accumulation group also gets an update.
        updates_per_epoch = -(-batches_per_epoch // accumulation_steps)

        # Populates self.optimizer and self.lr_scheduler (both unset until now).
        self.create_optimizer_and_scheduler(num_training_steps=updates_per_epoch * num_epochs)

        self.model.train()
        self.optimizer.zero_grad()
        for epoch in range(num_epochs):
            for step, batch in enumerate(train_loader):
                # Move the batch to the same device as the model (works on CPU too).
                batch = {k: v.to(self.device) for k, v in batch.items()}
                outputs = self.model(**batch)
                loss = outputs.loss
                # Scale so the accumulated gradient is the mean over the group,
                # not the sum of accumulation_steps batch gradients.
                (loss / accumulation_steps).backward()

                is_last_batch = (step + 1) == batches_per_epoch
                if (step + 1) % accumulation_steps == 0 or is_last_batch:
                    self.optimizer.step()
                    self.lr_scheduler.step()
                    self.optimizer.zero_grad()

                if step % self.args.logging_steps == 0:
                    print(f'Epoch: {epoch}, Step: {step}, Loss: {loss.item()}')

# Instantiate the trainer
trainer = CustomTrainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_data,
    eval_dataset=test_data
)

# Start training
trainer.train()


In [27]:
# Training configuration for the stock Trainer. Effective train batch per optimizer
# step on one device: per_device_train_batch_size * gradient_accumulation_steps
# = 8 * 16 = 128 chunks.
# NOTE(review): with the 4762 training chunks reported above, that is only about
# 37 optimizer steps per epoch (~185 over 5 epochs), i.e. fewer than warmup_steps=200,
# so the learning rate would still be warming up when training ends — TODO confirm
# against the Trainer's step accounting and adjust if unintended.
training_args = TrainingArguments(
    # --- output / bookkeeping ---
    output_dir=f'./{target}/output_dir',
    logging_dir=f'./{target}/data_files',
    run_name=f'{target}_longformer-classification-updated-rtx3090_paper_replication_2_warm',
    logging_steps=4,
    disable_tqdm=False,
    # --- schedule ---
    num_train_epochs=5,
    warmup_steps=200,
    weight_decay=0.01,
    # --- batching ---
    per_device_train_batch_size=8,
    gradient_accumulation_steps=16,
    per_device_eval_batch_size=16,
    dataloader_num_workers=0,
    # --- evaluation / checkpoints: both once per epoch ---
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    # --- precision ---
    fp16=True,
    #no_cuda = True
)

In [28]:
# Build the stock Trainer from the model, the arguments above, both tokenized
# splits and the metric function.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data,
    compute_metrics=compute_metrics,
)

# Informational only: shows whether torch can see a GPU. The value is displayed
# as the cell output and is not passed to the trainer.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [None]:
# Train the model with the stock Trainer configured above (evaluation and
# checkpointing happen once per epoch, per `training_args`).
trainer.train()

In [None]:
# Persist the trained model under a per-trait directory.
# NOTE(review): the tokenizer was not handed to the Trainer, so it is presumably
# not written alongside the model — save it separately if the directory must be
# loadable on its own. TODO confirm.
trainer.save_model(f'./{target}/model_saved/model')