In [1]:
import os
import pandas as pd
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from torch.utils.data import DataLoader
from datasets import Dataset
import joblib

In [2]:
# Set device
device = torch.device("cuda")
assert torch.cuda.is_available(), "CUDA GPU is not available. Please check your setup."

In [3]:
# Load data
csv_path = r"C:\Users\sanke\Desktop\Therapist_Model\Segmentation Data\Data\Final Data\session_transcript.csv"
df = pd.read_csv(csv_path)

In [4]:
# Load model and tokenizer
model_dir = r"C:\Users\sanke\Desktop\Therapist_Model\Saved Model"
model = RobertaForSequenceClassification.from_pretrained(model_dir)
tokenizer = RobertaTokenizer.from_pretrained(model_dir)
label_encoder = joblib.load(os.path.join(model_dir, "label_encoder.pkl"))
model.to(device)
model.eval()

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [5]:
# Prediction function
def predict_speaker_class(df_subset):
    dataset = Dataset.from_pandas(df_subset[["Text"]].rename(columns={"Text": "utterance"}))
    dataset = dataset.map(lambda x: tokenizer(x["utterance"], padding="max_length", truncation=True, max_length=512), batched=True)
    dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])
    loader = DataLoader(dataset, batch_size=16)

    all_preds = []
    with torch.no_grad():
        for batch in loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            preds = torch.argmax(outputs.logits, axis=-1).cpu().numpy()
            all_preds.extend(preds)

    predicted_labels = label_encoder.inverse_transform(all_preds)
    return predicted_labels

# Predict for SPEAKER_00
speaker_00_df = df[df["Speaker"] == "SPEAKER_00"].copy()
speaker_00_preds = predict_speaker_class(speaker_00_df)
speaker_00_df["Predicted"] = speaker_00_preds
majority_00 = speaker_00_df["Predicted"].value_counts().idxmax()
percent_00 = (speaker_00_df["Predicted"].value_counts()[majority_00] / len(speaker_00_df)) * 100

# Predict for SPEAKER_01
speaker_01_df = df[df["Speaker"] == "SPEAKER_01"].copy()
speaker_01_preds = predict_speaker_class(speaker_01_df)
speaker_01_df["Predicted"] = speaker_01_preds
majority_01 = speaker_01_df["Predicted"].value_counts().idxmax()
percent_01 = (speaker_01_df["Predicted"].value_counts()[majority_01] / len(speaker_01_df)) * 100

Map:   0%|          | 0/63 [00:00<?, ? examples/s]

Map:   0%|          | 0/11 [00:00<?, ? examples/s]

In [6]:
# Decide which speaker has higher majority confidence
if percent_00 > percent_01:
    speaker_map = {
        "SPEAKER_00": majority_00,
        "SPEAKER_01": "therapist" if majority_00.lower() == "client" else "client"
    }
else:
    speaker_map = {
        "SPEAKER_01": majority_01,
        "SPEAKER_00": "therapist" if majority_01.lower() == "client" else "client"
    }

# Apply speaker mapping
updated_df = df.copy()
updated_df["Speaker"] = updated_df["Speaker"].replace(speaker_map)

# Print results
print("\nPrediction Summary:")
print(f"SPEAKER_00 → Majority class: {majority_00}, Confidence: {percent_00:.2f}%")
print(f"SPEAKER_01 → Majority class: {majority_01}, Confidence: {percent_01:.2f}%")
print(f"\nSelected Mapping Based on Higher Confidence: {speaker_map}\n")
print(updated_df["Speaker"].head(20))
print(updated_df.iloc[:10, :3])


Prediction Summary:
SPEAKER_00 → Majority class: Client, Confidence: 55.56%
SPEAKER_01 → Majority class: Client, Confidence: 100.00%

Selected Mapping Based on Higher Confidence: {'SPEAKER_01': 'Client', 'SPEAKER_00': 'therapist'}

0     therapist
1     therapist
2     therapist
3     therapist
4     therapist
5     therapist
6     therapist
7        Client
8     therapist
9     therapist
10       Client
11       Client
12       Client
13    therapist
14    therapist
15    therapist
16    therapist
17    therapist
18    therapist
19    therapist
Name: Speaker, dtype: object
   Start (sec)  End (sec)    Speaker
0        1.837     10.055  therapist
1       10.055     19.454  therapist
2       19.454     30.440  therapist
3       30.440     44.497  therapist
4       44.497     52.580  therapist
5       52.580     61.456  therapist
6       61.456     76.340  therapist
7       76.829     87.258     Client
8       87.595     99.610  therapist
9       99.610    113.178  therapist


In [7]:
# Helper function to convert seconds to MM:SS format
def sec_to_min_sec(seconds):
    minutes = int(seconds) // 60
    seconds = int(seconds) % 60
    return f"{minutes:02d}:{seconds:02d}"

In [8]:
# Initialize with the first row
merged_data = []
current_speaker = updated_df.loc[0, 'Speaker']
current_start = updated_df.loc[0, 'Start (sec)']
current_end = updated_df.loc[0, 'End (sec)']
current_text = updated_df.loc[0, 'Text']

# Iterate through rows to merge consecutive therapist segments, keeping client entries separate
for i in range(1, len(updated_df)):
    row = updated_df.loc[i]
    speaker = row['Speaker']
    if speaker == "therapist" and current_speaker == "therapist":
        current_end = row['End (sec)']
        current_text += " " + row['Text']
    else:
        merged_data.append({
            'Start': sec_to_min_sec(current_start),
            'End': sec_to_min_sec(current_end),
            'Speaker': current_speaker,
            'Text': current_text
        })
        current_speaker = speaker
        current_start = row['Start (sec)']
        current_end = row['End (sec)']
        current_text = row['Text']
merged_data.append({
    'Start': sec_to_min_sec(current_start),
    'End': sec_to_min_sec(current_end),
    'Speaker': current_speaker,
    'Text': current_text
})

In [9]:
# Create the final conversation DataFrame
final_conversation_df = pd.DataFrame(merged_data)
print(final_conversation_df.head(10))

   Start    End    Speaker                                               Text
0  00:01  01:16  therapist   All right, hi everybody. My name is Daniel Ga...
1  01:16  01:27     Client   And I'm a peer educator at CAPS here, and I t...
2  01:27  01:53  therapist   All right, so we're going to go ahead and div...
3  01:53  02:08     Client   Um, so nothing much has changed in terms of w...
4  02:08  02:21     Client   But as for the homework. I felt that sometime...
5  02:22  02:33     Client   and my thoughts were like controlling me. So ...
6  02:33  04:40  therapist   the same thing going on every day, right? Rig...
7  04:41  04:55     Client   So whenever I felt like my anxiety was throug...
8  04:56  05:08     Client   Like emails from my college about my GPA, fro...
9  05:09  05:37  therapist   And it just like builds up so I can't even co...


In [10]:
# Save final conversation data only if it does not exist
final_data_dir = r"C:\Users\sanke\Desktop\Therapist_Model\Segmentation Data\Data\Final Data"
os.makedirs(final_data_dir, exist_ok=True)
final_data_path = os.path.join(final_data_dir, "Final_Conversation.csv")
if not os.path.exists(final_data_path):
    final_conversation_df.to_csv(final_data_path, index=False)
    print(f"Final conversation data saved to {final_data_path}")
else:
    print(f"Final conversation CSV already exists at {final_data_path}, skipping save.")

Final conversation CSV already exists at C:\Users\sanke\Desktop\Therapist_Model\Segmentation Data\Data\Final Data\Final_Conversation.csv, skipping save.
