In [19]:
# Importing Packages
import numpy as np
import pandas as pd
import os
import torch
from datasets import Dataset
from torch.utils.data import DataLoader
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback, TrainerCallback
from sklearn.metrics import accuracy_score, f1_score
import joblib

In [20]:
# Pick the compute device: use CUDA when a GPU is visible, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


In [21]:
# Load the session transcript (one utterance per row, with its speaker) and preview it
segmentation_data_path = r"C:\Users\sanke\Desktop\Therapist_Model\Segmentation Data\Data\Final Data\Therapy_Session.parquet"
segmentation_df = pd.read_parquet(segmentation_data_path)
segmentation_df.head(10)

Unnamed: 0,utterance,speaker
0,I told you.,Client
1,Told me what?,Therapist
2,That you'd be sorry you ever encouraged me to ...,Client
3,I'm not sorry at all.,Therapist
4,"You didn't expect it to be like this, I bet.",Client
5,Like what?,Therapist
6,You know what? It's disappointing. I thought I...,Client
7,And it isn't?,Therapist
8,"No, it's horrible. I don't know if I'm able to...",Client
9,Are you all right?,Therapist


In [22]:
# Split into train / validation / test sets.
# Step 1 holds out 30% of the rows; step 2 halves that hold-out into validation and test.
# Both steps stratify on 'speaker' and share one seed so the split is repeatable.
split_seed = 310
train_df, temp_df = train_test_split(
    segmentation_df,
    test_size=0.3,
    stratify=segmentation_df['speaker'],
    random_state=split_seed,
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    stratify=temp_df['speaker'],
    random_state=split_seed,
)
print(f"The shape of the training data: {train_df.shape}")
print(f"The shape of the validation data: {val_df.shape}")
print(f"The shape of the test data: {test_df.shape}")

The shape of the training data: (21134, 2)
The shape of the validation data: (4529, 2)
The shape of the test data: (4529, 2)


In [23]:
# Write the held-out test split to disk, but only if no file is there yet.
# NOTE: an existing file is never overwritten, so it is not refreshed when the split changes.
test_data_dir = r"C:\Users\sanke\Desktop\Therapist_Model\Segmentation Data\Data\Final Data"
test_data_path = os.path.join(test_data_dir, "Therapy_Session_Test.parquet")
os.makedirs(test_data_dir, exist_ok=True)
if os.path.exists(test_data_path):
    print(f"Test data already exists at {test_data_path}, skipping save.")
else:
    test_df.to_parquet(test_data_path, index=False)
    print(f"Test data saved to {test_data_path}")

Test data already exists at C:\Users\sanke\Desktop\Therapist_Model\Segmentation Data\Data\Final Data\Therapy_Session_Test.parquet, skipping save.


In [24]:
# Label Encoding
# The encoder is fitted on the training speakers only; the validation split reuses that
# fitted mapping. `.assign()` returns new frames rather than writing a column into the
# frames handed back by train_test_split, which avoids pandas' SettingWithCopyWarning
# and makes the cell safe to re-run (an existing 'label' column is simply replaced).
label_encoder = LabelEncoder()
train_df = train_df.assign(label=label_encoder.fit_transform(train_df["speaker"]))
val_df = val_df.assign(label=label_encoder.transform(val_df["speaker"]))

In [25]:
# Tokenization
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")


def tokenize_data(examples):
    """Tokenize the "utterance" field of a batch of examples.

    Every sequence is truncated and padded to exactly 512 tokens, so all
    encoded rows have the same length.

    Args:
        examples: Mapping with an "utterance" entry holding the text to encode.

    Returns:
        The tokenizer's output for those utterances.
    """
    utterances = examples["utterance"]
    encoded = tokenizer(
        utterances,
        truncation=True,
        padding="max_length",
        max_length=512,
    )
    return encoded

In [26]:
# Convert DataFrames to Dataset
# Only the text and its encoded label are carried over. The split frames keep their
# original (shuffled) row index, which from_pandas would otherwise store as an extra
# `__index_level_0__` column; preserve_index=False drops it.
train_dataset = Dataset.from_pandas(train_df[['utterance', 'label']], preserve_index=False)
val_dataset = Dataset.from_pandas(val_df[['utterance', 'label']], preserve_index=False)

In [27]:
# Run the tokenizer over both splits in batches (training split first, then validation)
train_dataset, val_dataset = (
    split.map(tokenize_data, batched=True)
    for split in (train_dataset, val_dataset)
)

Map:   0%|          | 0/21134 [00:00<?, ? examples/s]

Map:   0%|          | 0/4529 [00:00<?, ? examples/s]

In [28]:
# Rename the target column to "labels" and return only the listed columns as torch tensors
model_columns = ['input_ids', 'attention_mask', 'labels']

train_dataset = train_dataset.rename_column("label", "labels")
train_dataset.set_format(type='torch', columns=list(model_columns))

val_dataset = val_dataset.rename_column("label", "labels")
val_dataset.set_format(type='torch', columns=list(model_columns))

In [29]:
# Model setup
# The classification head is not part of the roberta-base checkpoint and is initialised
# randomly, so seed torch first to get the same starting weights on every run
# (310 is the same seed value used for the train/validation/test split).
torch.manual_seed(310)
# One output unit per class learned by the label encoder; the model is moved to the selected device.
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=len(label_encoder.classes_)).to(device)

# Training arguments
# Evaluation, logging and checkpointing all use the same per-epoch cadence.
training_args = TrainingArguments(
    # Output locations
    output_dir="./results",
    logging_dir="./logs",
    # Per-epoch cadence
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    # Batch sizes and maximum number of epochs
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=7,
    # Reload the best checkpoint once training ends; no external experiment reporting
    load_best_model_at_end=True,
    report_to="none",
)

def compute_metrics(eval_pred):
    """Score predictions with accuracy and weighted F1.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class for each
            row is the argmax over the last axis of ``logits``.

    Returns:
        dict with keys ``"accuracy"`` and ``"f1"`` (weighted average).
    """
    logits, labels = eval_pred
    predicted_classes = logits.argmax(axis=-1)
    return {
        "accuracy": accuracy_score(labels, predicted_classes),
        "f1": f1_score(labels, predicted_classes, average='weighted'),
    }

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
# Custom Callback for tracking metrics
class MetricTrackerCallback(TrainerCallback):
    """Callback that keeps every logged metrics payload in memory.

    Attributes:
        logs: List of dicts, one per ``on_log`` call that received a payload.
            Each dict starts with the trainer state's ``epoch`` and is then
            filled with the logged values (a logged ``"epoch"`` key overrides it).
    """

    def __init__(self):
        self.logs = []

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Append the current epoch plus the logged values to ``self.logs``."""
        if logs is None:
            # Nothing was logged on this call.
            return
        self.logs.append({"epoch": state.epoch, **logs})

In [32]:
# Callbacks: one records every logged metric, the other stops training early
# (early_stopping_patience=2).
metric_tracker = MetricTrackerCallback()
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)

# Assemble the Trainer from the model, arguments, tokenized splits and metric function
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping, metric_tracker],
)

# Run fine-tuning; the returned training summary is shown as the cell output
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2627,0.216985,0.911239,0.911205
2,0.2113,0.261163,0.91433,0.91428
3,0.1815,0.35465,0.905056,0.904885


TrainOutput(global_step=3963, training_loss=0.21849208295631073, metrics={'train_runtime': 4781.9176, 'train_samples_per_second': 30.937, 'train_steps_per_second': 1.934, 'total_flos': 1.668176713193472e+16, 'train_loss': 0.21849208295631073, 'epoch': 3.0})

In [33]:
# Store the model, the tokenizer and the fitted label encoder together in one directory
saved_model_dir = r"C:\Users\sanke\Desktop\Therapist_Model\Saved Model"
label_encoder_path = os.path.join(saved_model_dir, "label_encoder.pkl")
os.makedirs(saved_model_dir, exist_ok=True)

# Model first, then tokenizer: both expose save_pretrained and write into the same folder
for artifact in (model, tokenizer):
    artifact.save_pretrained(saved_model_dir)

# The label encoder is serialized with joblib next to the model files
joblib.dump(label_encoder, label_encoder_path)
print(f"Model, tokenizer and label encoder saved to {saved_model_dir}")

Model, tokenizer and label encoder saved to C:\Users\sanke\Desktop\Therapist_Model\Saved Model
