In [None]:
import torch
from moviepy import *
import whisperx
import os
import pandas as pd
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from torch.utils.data import DataLoader
from datasets import Dataset
import joblib
import cv2
import numpy as np
from IPython.display import display, clear_output
import ipywidgets as widgets
import shutil
import torch.nn as nn
from transformers import DebertaV2TokenizerFast, DebertaV2ForSequenceClassification
from transformers.modeling_outputs import SequenceClassifierOutput

# Report the CUDA / PyTorch environment so GPU problems are visible up front.
print("CUDA Available:", torch.cuda.is_available())
print("CUDA Version:", torch.version.cuda)
print("Torch Version:", torch.__version__)
print("Device Name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No CUDA")

# Select the CUDA device unconditionally; the assert below stops the notebook
# when no GPU is present.
device = torch.device("cuda")
assert torch.cuda.is_available(), "CUDA GPU is not available. Please check your setup."

INFO:speechbrain.utils.quirks:Applied quirks (see `speechbrain.utils.quirks`): [disable_jit_profiling, allow_tf32]
INFO:speechbrain.utils.quirks:Excluded quirks specified by the `SB_DISABLE_QUIRKS` environment (comma-separated list): []
INFO:datasets:PyTorch version 2.6.0+cu118 available.


CUDA Available: True
CUDA Version: 11.8
Torch Version: 2.6.0+cu118
Device Name: NVIDIA GeForce RTX 4070 Laptop GPU


### Transcription & Speaker Diarization

In [2]:
def extract_audio(video_path):
    """Write the audio track of a video to a sibling ``.wav`` file.

    The output path is the video path with its extension replaced by ``.wav``.

    Args:
        video_path: Path of the video file to read.

    Returns:
        The path of the ``.wav`` file that was written.

    Raises:
        ValueError: If the video has no audio track.
    """
    base = os.path.splitext(video_path)[0]
    audio_output_path = base + ".wav"
    video = VideoFileClip(video_path)
    try:
        # Fail with a clear message instead of an AttributeError on None.
        if video.audio is None:
            raise ValueError(f"No audio track found in video: {video_path}")
        video.audio.write_audiofile(audio_output_path)
    finally:
        # Always release the clip's file readers, even if the export fails.
        video.close()
    return audio_output_path

def transcribe_and_diarize(video_path, hf_token, whisper_model="large-v2", device="cuda"):
    """Transcribe a video with WhisperX and attach a diarized speaker to each segment.

    Args:
        video_path: Path of the video; its audio is first exported via ``extract_audio``.
        hf_token: Token passed to ``whisperx.DiarizationPipeline`` as ``use_auth_token``.
        whisper_model: Model name passed to ``whisperx.load_model``.
        device: Device passed to both the ASR model and the diarization pipeline.

    Returns:
        A DataFrame with one row per transcript segment and the columns
        ``Start (sec)``, ``End (sec)``, ``Speaker``, ``Text`` and ``total_duration``.
        Segments without a ``speaker`` key are labelled ``"Unknown"``.
    """
    wav_path = extract_audio(video_path)

    # NOTE(review): compute_type is fixed to "float16" whatever `device` is —
    # confirm this is valid when the function is called with device="cpu".
    asr_model = whisperx.load_model(whisper_model, device=device, compute_type="float16")
    diarizer = whisperx.DiarizationPipeline(use_auth_token=hf_token, device=device)

    transcription = asr_model.transcribe(wav_path, chunk_size=15)
    speaker_turns = diarizer(wav_path)
    labelled = whisperx.assign_word_speakers(speaker_turns, transcription)

    rows = []
    for seg in labelled["segments"]:
        who = seg.get("speaker", "Unknown")
        said = seg["text"]
        rows.append([seg["start"], seg["end"], who, said])

    segments_df = pd.DataFrame(rows, columns=["Start (sec)", "End (sec)", "Speaker", "Text"])
    segments_df["total_duration"] = segments_df["End (sec)"] - segments_df["Start (sec)"]
    return segments_df


# Recording to analyse.
video_path = r"D:\Data Science Projects Github\ai-multimodal-emotion-therapy\notebooks_code\segmentation\videos\Telemental Health Mock Session.mp4"
# Read the Hugging Face token from the environment instead of committing it to
# the notebook. The token that used to be hard-coded here is exposed in source
# history and should be revoked. Export HF_TOKEN before starting the kernel.
hf_token = os.environ.get("HF_TOKEN")

transcript_df = transcribe_and_diarize(video_path, hf_token)

{'video_found': True, 'audio_found': True, 'metadata': {'major_brand': 'mp42', 'minor_version': '0', 'compatible_brands': 'isommp42', 'creation_time': '2023-02-25T20:14:20.000000Z'}, 'inputs': [{'streams': [{'input_number': 0, 'stream_number': 0, 'stream_type': 'video', 'language': None, 'default': True, 'size': [640, 360], 'bitrate': 139, 'fps': 25.0, 'codec_name': 'h264', 'profile': '(Main)', 'metadata': {'Metadata': '', 'creation_time': '2023-02-25T20:14:20.000000Z', 'handler_name': 'ISO Media file produced by Google Inc. Created on: 02/25/2023.', 'vendor_id': '[0][0][0][0]'}}, {'input_number': 0, 'stream_number': 1, 'stream_type': 'audio', 'language': None, 'default': True, 'fps': 44100, 'bitrate': 127, 'metadata': {'Metadata': '', 'creation_time': '2023-02-25T20:14:20.000000Z', 'handler_name': 'ISO Media file produced by Google Inc. Created on: 02/25/2023.', 'vendor_id': '[0][0][0][0]'}}], 'input_number': 0}], 'duration': 884.1, 'bitrate': 270, 'start': 0.0, 'default_video_input_n

                                                                        

MoviePy - Done.
No language specified, language will be first be detected for each audio file (increases inference time).


  if ismodule(module) and hasattr(module, '__file__'):
Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.5.1. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint c:\Users\aliir\anaconda3\envs\whisperx\lib\site-packages\whisperx\assets\pytorch_model.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.3.2. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.6.0+cu118. Bad things might happen unless you revert torch to 1.x.


It can be re-enabled by calling
   >>> import torch
   >>> torch.backends.cuda.matmul.allow_tf32 = True
   >>> torch.backends.cudnn.allow_tf32 = True
See https://github.com/pyannote/pyannote-audio/issues/1370 for more details.



Detected language: en (1.00) in first 30s of audio...


  std = sequences.std(dim=-1, correction=1)


In [4]:
# Display the diarized transcript built by transcribe_and_diarize.
transcript_df

Unnamed: 0,Start (sec),End (sec),Speaker,Text,total_duration
0,1.837,10.055,SPEAKER_00,"All right, hi everybody. My name is Daniel Ga...",8.218
1,10.055,19.454,SPEAKER_00,Today I am joined by a fellow student here on...,9.399
2,19.454,30.440,SPEAKER_00,The purpose of this recording is to try to sh...,10.986
3,30.440,44.497,SPEAKER_00,"the foreseeable future, and we understand tha...",14.057
4,44.497,52.580,SPEAKER_00,So I'm going to allow Prita to introduce hers...,8.083
...,...,...,...,...,...
69,826.636,833.740,SPEAKER_01,"Yeah, I will definitely try to practice it, a...",7.104
70,834.196,846.700,SPEAKER_00,"Give it a couple of tries, okay? Again, it's ...",12.504
71,846.953,861.871,SPEAKER_00,completely different from the square breathin...,14.918
72,861.871,868.486,SPEAKER_00,The tools that we're currently lacking to hel...,6.615


### Predicting Speakers from Text 

In [None]:
# Load model components
def load_speaker_model(model_dir, device):
    """Load the speaker-role classifier, its tokenizer and its label encoder.

    Args:
        model_dir: Directory holding the RoBERTa model/tokenizer files and
            ``label_encoder.pkl``.
        device: Device the classifier is moved to.

    Returns:
        ``(model, tokenizer, label_encoder)`` with the model in eval mode.
    """
    encoder_path = os.path.join(model_dir, "label_encoder.pkl")

    classifier = RobertaForSequenceClassification.from_pretrained(model_dir)
    classifier = classifier.to(device)
    roberta_tokenizer = RobertaTokenizer.from_pretrained(model_dir)
    # NOTE(review): joblib.load unpickles the file — only use trusted model directories.
    speaker_label_encoder = joblib.load(encoder_path)

    classifier.eval()
    return classifier, roberta_tokenizer, speaker_label_encoder

# Predict speaker label
def predict_speaker_class(df_subset, model, tokenizer, label_encoder, device):
    """Predict a speaker-role label for every row of ``df_subset``.

    Args:
        df_subset: DataFrame with a ``Text`` column; one prediction is made per row.
        model: Sequence classifier whose output exposes ``.logits``.
        tokenizer: Tokenizer applied to the text (padded/truncated to 512 tokens).
        label_encoder: Encoder whose ``inverse_transform`` maps class ids to labels.
        device: Device the input tensors are moved to.

    Returns:
        The result of ``label_encoder.inverse_transform`` on the predicted class ids.
    """
    def _tokenize(batch):
        return tokenizer(batch["utterance"], padding="max_length", truncation=True, max_length=512)

    utterances = df_subset[["Text"]].rename(columns={"Text": "utterance"})
    dataset = Dataset.from_pandas(utterances).map(_tokenize, batched=True)
    dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])

    predicted_ids = []
    with torch.no_grad():
        for batch in DataLoader(dataset, batch_size=16):
            on_device = {name: tensor.to(device) for name, tensor in batch.items()}
            logits = model(**on_device).logits
            predicted_ids.extend(torch.argmax(logits, dim=-1).cpu().numpy())

    return label_encoder.inverse_transform(predicted_ids)

# Classify and relabel speakers
def classify_and_map_speakers(df, model, tokenizer, label_encoder, device):
    """Relabel ``SPEAKER_00`` / ``SPEAKER_01`` with predicted conversation roles.

    Every segment of each diarized speaker is classified with
    ``predict_speaker_class``. The speaker whose majority label covers the
    larger share of its own segments keeps that label; the other speaker gets
    the opposite role (``"therapist"`` if the winner is a client, else
    ``"client"``).

    A speaker with no segments (e.g. diarization found a single voice) is
    treated as having 0% agreement instead of raising. If neither speaker is
    present the frame is returned unchanged.

    Args:
        df: Transcript with ``Speaker`` and ``Text`` columns.
        model, tokenizer, label_encoder, device: Forwarded to ``predict_speaker_class``.

    Returns:
        A copy of ``df`` with the ``Speaker`` values replaced. Other speaker
        ids (e.g. ``"Unknown"``) are left as they are.
    """
    def _majority_vote(speaker_id):
        # Returns (majority label, % of this speaker's segments carrying it),
        # or (None, 0.0) when the speaker has no segments at all.
        subset = df[df["Speaker"] == speaker_id]
        if subset.empty:
            return None, 0.0
        predicted = pd.Series(
            predict_speaker_class(subset, model, tokenizer, label_encoder, device)
        )
        counts = predicted.value_counts()
        winner = counts.idxmax()
        return winner, (counts[winner] / len(predicted)) * 100

    def _opposite_role(label):
        # Literal casing kept as-is: downstream cells match these exact strings.
        return "therapist" if label.lower() == "client" else "client"

    majority_00, percent_00 = _majority_vote("SPEAKER_00")
    majority_01, percent_01 = _majority_vote("SPEAKER_01")

    if majority_00 is None and majority_01 is None:
        return df.copy()

    # A non-empty speaker always has a share > 0, so the branch taken below
    # never dereferences a missing majority label.
    if percent_00 > percent_01:
        speaker_map = {
            "SPEAKER_00": majority_00,
            "SPEAKER_01": _opposite_role(majority_00),
        }
    else:
        speaker_map = {
            "SPEAKER_01": majority_01,
            "SPEAKER_00": _opposite_role(majority_01),
        }

    return df.copy().replace({"Speaker": speaker_map})

# Convert seconds to MM:SS
def sec_to_min_sec(seconds):
    """Format a duration in seconds as a zero-padded ``MM:SS`` string.

    Fractional seconds are truncated; minutes are not wrapped at 60.
    """
    whole_minutes, leftover_seconds = divmod(int(seconds), 60)
    return f"{whole_minutes:02d}:{leftover_seconds:02d}"

# Merge therapist segments
def merge_conversation_segments(df):
    """Collapse consecutive therapist segments into single conversation turns.

    Only runs of therapist rows are merged (text joined with a space, end time
    extended). Every other row stays a separate turn. The therapist label is
    matched case-insensitively, and rows are walked by position, so the frame
    may carry any index.

    Args:
        df: Transcript with ``Start (sec)``, ``End (sec)``, ``Speaker`` and
            ``Text`` columns, in chronological order.

    Returns:
        A DataFrame with ``Start``, ``End`` (both ``MM:SS`` strings),
        ``Speaker`` and ``Text`` columns; empty if ``df`` has no rows.
    """
    output_columns = ['Start', 'End', 'Speaker', 'Text']
    if len(df) == 0:
        return pd.DataFrame(columns=output_columns)

    def _is_therapist(label):
        return str(label).lower() == "therapist"

    blocks = []
    current = None
    wanted = df[['Start (sec)', 'End (sec)', 'Speaker', 'Text']]
    for start, end, speaker, text in wanted.itertuples(index=False, name=None):
        if current is not None and _is_therapist(speaker) and _is_therapist(current['speaker']):
            # Same therapist turn continues: extend it instead of opening a new one.
            current['end'] = end
            current['text'] += " " + text
        else:
            if current is not None:
                blocks.append(current)
            current = {'start': start, 'end': end, 'speaker': speaker, 'text': text}
    blocks.append(current)

    return pd.DataFrame([
        {
            'Start': sec_to_min_sec(block['start']),
            'End': sec_to_min_sec(block['end']),
            'Speaker': block['speaker'],
            'Text': block['text'],
        }
        for block in blocks
    ])


# Directory holding the fine-tuned RoBERTa speaker classifier and label_encoder.pkl.
model_dir = r"D:\Data Science Projects Github\ai-multimodal-emotion-therapy\models\speaker_prediction_roberta_model"
# Rebinds the notebook-level `device`, this time with a CPU fallback.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Loading model components
model, tokenizer, label_encoder = load_speaker_model(model_dir, device)

# Re-map generic speakers to "client" or "therapist"
updated_df = classify_and_map_speakers(transcript_df, model, tokenizer, label_encoder, device)

# Merge therapist blocks and return the final conversation format
final_conversation_df = merge_conversation_segments(updated_df)

Map:   0%|          | 0/63 [00:00<?, ? examples/s]

Map:   0%|          | 0/11 [00:00<?, ? examples/s]

In [6]:
# Display the merged conversation (MM:SS timestamps, role-labelled speakers).
final_conversation_df

Unnamed: 0,Start,End,Speaker,Text
0,00:01,01:16,therapist,"All right, hi everybody. My name is Daniel Ga..."
1,01:16,01:27,Client,"And I'm a peer educator at CAPS here, and I t..."
2,01:27,01:53,therapist,"All right, so we're going to go ahead and div..."
3,01:53,02:08,Client,"Um, so nothing much has changed in terms of w..."
4,02:08,02:21,Client,But as for the homework. I felt that sometime...
5,02:22,02:33,Client,and my thoughts were like controlling me. So ...
6,02:33,04:40,therapist,"the same thing going on every day, right? Rig..."
7,04:41,04:55,Client,So whenever I felt like my anxiety was throug...
8,04:56,05:08,Client,"Like emails from my college about my GPA, fro..."
9,05:09,05:37,therapist,And it just like builds up so I can't even co...


### Image Extraction

In [None]:
# Initialize the Haar cascade frontal-face detector from the XML file bundled
# with OpenCV; extract_face_from_frame reads this module-level object.
face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')

def time_to_seconds(t):
    """Convert an ``MM:SS`` string to a whole number of seconds.

    Raises:
        ValueError: If ``t`` does not consist of exactly two integer fields
            separated by ``:``.
    """
    minute_part, second_part = (int(piece) for piece in t.split(':'))
    return 60 * minute_part + second_part

def prepare_client_df(conversation_df):
    """Return the client rows of a conversation with numeric time columns added.

    Rows whose ``Speaker`` equals ``client`` (case-insensitive) are copied, then
    ``Start_sec`` / ``End_sec`` are derived from the ``MM:SS`` ``Start`` / ``End``
    columns and ``Mid_sec`` is set to their midpoint.
    """
    is_client = conversation_df['Speaker'].str.lower() == 'client'
    client_rows = conversation_df[is_client].copy()

    for source_col, target_col in (('Start', 'Start_sec'), ('End', 'End_sec')):
        client_rows[target_col] = client_rows[source_col].apply(time_to_seconds)

    client_rows['Mid_sec'] = (client_rows['Start_sec'] + client_rows['End_sec']) / 2
    return client_rows

def setup_output_folder(output_dir):
    """Make ``output_dir`` an empty directory.

    Anything already at that path is removed with ``shutil.rmtree`` first, so
    previous contents are lost.
    """
    already_present = os.path.exists(output_dir)
    if already_present:
        shutil.rmtree(output_dir)

    os.makedirs(output_dir, exist_ok=True)

def extract_face_from_frame(frame, view_type, option, idx, output_dir, target_size=(256, 256)):
    """Detect, crop, resize and save one face from a video frame.

    * ``'Gallery View'``: looks only at the ``'Left'`` or ``'Right'`` half of
      the frame and saves a face when exactly one is detected there.
    * ``'Speaker View'``: requires exactly two detections in the full frame and
      saves the ``'Large'`` or ``'Small'`` one, ranked by bounding-box area.

    Args:
        frame: Image array with three dimensions (height, width, channels).
        view_type: ``'Gallery View'`` or ``'Speaker View'``.
        option: ``'Left'`` / ``'Right'`` or ``'Large'`` / ``'Small'``.
        idx: Value embedded in the output file name.
        output_dir: Directory the JPEG is written to.
        target_size: Size the crop is resized to before saving.

    Returns:
        The path of the saved image, or ``None`` when nothing was saved.
    """
    _, frame_width, _ = frame.shape

    def _crop_and_save(region, box, prefix):
        left, top, box_w, box_h = box
        crop = region[top:top + box_h, left:left + box_w]
        resized = cv2.resize(crop, target_size, interpolation=cv2.INTER_AREA)
        out_path = os.path.join(output_dir, f"{prefix}_{idx}.jpg")
        cv2.imwrite(out_path, resized)
        return out_path

    if view_type == 'Gallery View':
        if option == 'Left':
            half, prefix = frame[:, :frame_width // 2], "left"
        elif option == 'Right':
            half, prefix = frame[:, frame_width // 2:], "right"
        else:
            return None

        detections = face_cascade.detectMultiScale(half, 1.3, 5)
        if len(detections) == 1:
            return _crop_and_save(half, detections[0], prefix)
        return None

    if view_type == 'Speaker View':
        detections = face_cascade.detectMultiScale(frame, 1.3, 5)
        if len(detections) != 2:
            return None

        # Largest bounding box first.
        ranked = sorted(
            [(bw * bh, (x, y, bw, bh)) for (x, y, bw, bh) in detections],
            reverse=True,
        )
        if option == 'Large':
            return _crop_and_save(frame, ranked[0][1], "large")
        if option == 'Small':
            return _crop_and_save(frame, ranked[1][1], "small")

    return None

def run_face_extraction(client_df, final_conversation_df, video_path, output_dir, view_value, option_value):
    """Save one face crop per client segment and record the image paths.

    For each row of ``client_df`` the frame at ``Mid_sec`` is read from the
    video and passed to ``extract_face_from_frame``. Both input frames are
    updated in place with an ``Image_Path`` column. Rows for which no face was
    saved hold ``None``.

    Args:
        client_df: Client rows with a ``Mid_sec`` column; gains ``Image_Path``.
        final_conversation_df: Full conversation; ``Image_Path`` is filled for
            the index labels present in ``client_df``.
        video_path: Video to sample frames from.
        output_dir: Directory for the crops; it is emptied first.
        view_value, option_value: Layout options forwarded to
            ``extract_face_from_frame``.

    Returns:
        A copy of ``final_conversation_df`` including ``Image_Path``.
    """
    setup_output_folder(output_dir)

    # Start from a clean column: the output folder was just wiped, so paths
    # from an earlier run are stale, and the column must exist even when no
    # face is found (otherwise the lookup below raises KeyError).
    client_df['Image_Path'] = None

    cap = cv2.VideoCapture(video_path)
    try:
        for idx, row in client_df.iterrows():
            cap.set(cv2.CAP_PROP_POS_MSEC, row['Mid_sec'] * 1000)
            ret, frame = cap.read()
            if not ret:
                continue
            path = extract_face_from_frame(frame, view_value, option_value, idx, output_dir)
            if path:
                client_df.at[idx, 'Image_Path'] = path
    finally:
        # Release the capture handle even if frame processing fails.
        cap.release()

    if 'Image_Path' not in final_conversation_df.columns:
        final_conversation_df['Image_Path'] = None
    for idx in client_df.index:
        final_conversation_df.at[idx, 'Image_Path'] = client_df.at[idx, 'Image_Path']

    return final_conversation_df.copy()

def show_dropdown_ui(callback_on_confirm):
    """Display the view/face-option selectors and run a callback on confirmation.

    The face-option choices follow the selected video view. When the button is
    clicked with both values chosen, the selection is printed and
    ``callback_on_confirm(view, option)`` is called inside the output area;
    otherwise a reminder is printed.
    """
    view_dropdown = widgets.Dropdown(
        options=['', 'Gallery View', 'Speaker View'],
        description='Video View:',
        style={'description_width': 'initial'},
    )
    option_dropdown = widgets.Dropdown(
        options=[''],
        description='Face Option:',
        style={'description_width': 'initial'},
    )
    confirm_button = widgets.Button(description="Confirm Selection", button_style='success')
    output_area = widgets.Output()

    def update_option_dropdown(change):
        # Offer only the face options that make sense for the chosen layout.
        chosen_view = change['new']
        if chosen_view == 'Gallery View':
            choices = ['', 'Left', 'Right']
        elif chosen_view == 'Speaker View':
            choices = ['', 'Large', 'Small']
        else:
            choices = ['']
        option_dropdown.options = choices

    def confirm_selection(_button):
        chosen_view = view_dropdown.value
        chosen_option = option_dropdown.value
        with output_area:
            clear_output()
            if not (chosen_view and chosen_option):
                print("Please select both dropdown values.")
                return
            print(f"Selected View: {chosen_view} | Option: {chosen_option}")
            callback_on_confirm(chosen_view, chosen_option)

    view_dropdown.observe(update_option_dropdown, names='value')
    confirm_button.on_click(confirm_selection)
    display(view_dropdown, option_dropdown, confirm_button, output_area)


# Folder that receives the cropped faces; run_face_extraction empties it on every run.
output_dir = r"D:\Data Science Projects Github\ai-multimodal-emotion-therapy\notebooks_code\segmentation\Extracted_Images"
client_df = prepare_client_df(final_conversation_df)

def on_confirm(view_val, option_val):
    """Widget callback: extract faces for the chosen layout and preview the result."""
    preview_columns = ['Start', 'End', 'Speaker', 'Text', 'Image_Path']
    post_df = run_face_extraction(
        client_df, final_conversation_df, video_path, output_dir, view_val, option_val
    )
    display(post_df[preview_columns].head())

show_dropdown_ui(on_confirm)


Dropdown(description='Video View:', options=('', 'Gallery View', 'Speaker View'), style=DescriptionStyle(descr…

Dropdown(description='Face Option:', options=('',), style=DescriptionStyle(description_width='initial'), value…

Button(button_style='success', description='Confirm Selection', style=ButtonStyle())

Output()

In [8]:
# Display the client rows; Image_Path is filled in by run_face_extraction.
client_df

Unnamed: 0,Start,End,Speaker,Text,Start_sec,End_sec,Mid_sec,Image_Path
1,01:16,01:27,Client,"And I'm a peer educator at CAPS here, and I t...",76,87,81.5,D:\Data Science Projects Github\ai-multimodal-...
3,01:53,02:08,Client,"Um, so nothing much has changed in terms of w...",113,128,120.5,D:\Data Science Projects Github\ai-multimodal-...
4,02:08,02:21,Client,But as for the homework. I felt that sometime...,128,141,134.5,D:\Data Science Projects Github\ai-multimodal-...
5,02:22,02:33,Client,and my thoughts were like controlling me. So ...,142,153,147.5,D:\Data Science Projects Github\ai-multimodal-...
7,04:41,04:55,Client,So whenever I felt like my anxiety was throug...,281,295,288.0,D:\Data Science Projects Github\ai-multimodal-...
8,04:56,05:08,Client,"Like emails from my college about my GPA, fro...",296,308,302.0,D:\Data Science Projects Github\ai-multimodal-...
10,05:38,05:52,Client,distract myself from like the initial thing t...,338,352,345.0,D:\Data Science Projects Github\ai-multimodal-...
11,05:53,06:04,Client,tempted to like check whatever all the media ...,353,364,358.5,D:\Data Science Projects Github\ai-multimodal-...
12,06:04,06:12,Client,racing. My thoughts are keep racing with ever...,364,372,368.0,D:\Data Science Projects Github\ai-multimodal-...
14,09:53,10:04,Client,"to myself in front of a mirror. And honestly,...",593,604,598.5,D:\Data Science Projects Github\ai-multimodal-...


### Client/Patient Text Emotion Prediction

In [None]:
# Emotion mapping
# Class id -> emotion name; used to decode the predictions of both the text
# and the face emotion models in this notebook.
emotion_labels = {
    0: "Anger",
    1: "Fear",
    2: "Happy",
    3: "Sadness",
    4: "Neutral",
    5: "Surprise",
    6: "Confusion",
    7: "Disgust",
}

# Updated model class with dropout=0.3
class WeightedDeBERTa(nn.Module):
    """DeBERTa-v2 sequence classifier with logit dropout and a class-weighted loss.

    Args:
        model_name: Checkpoint name/path given to ``from_pretrained``.
        num_labels: Number of output classes.
        class_weights: Per-class weights for the cross-entropy loss.
    """

    def __init__(self, model_name, num_labels, class_weights):
        super().__init__()
        self.num_labels = num_labels
        self.model = DebertaV2ForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels,
        )
        # Dropout is applied to the classifier logits (see forward).
        self.dropout = nn.Dropout(p=0.3)
        self.loss_fn = nn.CrossEntropyLoss(weight=class_weights)

    def forward(self, input_ids, attention_mask, labels=None):
        """Return logits, plus the weighted cross-entropy loss when ``labels`` is given."""
        backbone_out = self.model(input_ids=input_ids, attention_mask=attention_mask)
        logits = self.dropout(backbone_out.logits)

        if labels is None:
            return SequenceClassifierOutput(logits=logits)

        loss = self.loss_fn(logits.view(-1, self.num_labels), labels.view(-1))
        return SequenceClassifierOutput(loss=loss, logits=logits)

# Main prediction function
def add_speech_emotions_to_client_df(client_df, model_path):
    """Predict an emotion for each client utterance with the custom DeBERTa model.

    Args:
        client_df: DataFrame with a ``Text`` column. It is not modified.
        model_path: Directory containing the tokenizer files and
            ``custom_model.pth``, a checkpoint holding ``class_weights`` and
            ``model_state_dict``.

    Returns:
        A copy of ``client_df`` with a ``speech_predicted_emotion`` column
        holding names from ``emotion_labels``.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load tokenizer
    tokenizer = DebertaV2TokenizerFast.from_pretrained(model_path, local_files_only=True)
    print("Tokenizer loaded.")

    # Load checkpoint. map_location lets a checkpoint saved on a GPU be opened
    # on a CPU-only machine.
    checkpoint = torch.load(os.path.join(model_path, 'custom_model.pth'), map_location=device)
    class_weights = checkpoint['class_weights']

    # Initialize and load model
    model = WeightedDeBERTa(
        "microsoft/deberta-v3-small",
        num_labels=8,
        class_weights=class_weights
    ).to(device)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()
    print("Custom DeBERTa model loaded.")

    # Set label mapping in model config
    model.model.config.id2label = emotion_labels
    model.model.config.label2id = {v: k for k, v in emotion_labels.items()}

    def predict_emotion(text):
        inputs = tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=512,
            return_tensors="pt"
        ).to(device)

        # WeightedDeBERTa.forward only accepts input_ids / attention_mask.
        if 'token_type_ids' in inputs:
            del inputs['token_type_ids']

        with torch.no_grad():
            logits = model(**inputs).logits
            predicted_class = torch.argmax(logits, dim=-1).item()
            return emotion_labels[predicted_class]

    print("🔍 Predicting speech-based emotions...")
    # Work on a copy so the caller's frame (possibly a slice of another frame)
    # is not mutated; this also avoids pandas' SettingWithCopyWarning.
    result_df = client_df.copy()
    result_df["speech_predicted_emotion"] = [
        predict_emotion(text) for text in result_df["Text"].tolist()
    ]

    print("Emotion prediction complete.")
    return result_df

# Directory with the DeBERTa tokenizer files and custom_model.pth.
model_path = r"D:\Data Science Projects Github\ai-multimodal-emotion-therapy\models\deberta_model"
# Rebinds client_df to the frame that carries speech_predicted_emotion.
client_df = add_speech_emotions_to_client_df(client_df, model_path)

Tokenizer loaded.


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Custom DeBERTa model loaded.
🔍 Predicting speech-based emotions...
Emotion prediction complete.


In [11]:
# Display the client rows with the text-based emotion predictions.
client_df

Unnamed: 0,Start,End,Speaker,Text,Image_Path,Start_sec,End_sec,Mid_sec,speech_predicted_emotion
1,01:16,01:27,Client,"And I'm a peer educator at CAPS here, and I t...",D:\Data Science Projects Github\ai-multimodal-...,76,87,81.5,Happy
3,01:53,02:08,Client,"Um, so nothing much has changed in terms of w...",D:\Data Science Projects Github\ai-multimodal-...,113,128,120.5,Fear
4,02:08,02:21,Client,But as for the homework. I felt that sometime...,D:\Data Science Projects Github\ai-multimodal-...,128,141,134.5,Fear
5,02:22,02:33,Client,and my thoughts were like controlling me. So ...,D:\Data Science Projects Github\ai-multimodal-...,142,153,147.5,Anger
7,04:41,04:55,Client,So whenever I felt like my anxiety was throug...,D:\Data Science Projects Github\ai-multimodal-...,281,295,288.0,Fear
8,04:56,05:08,Client,"Like emails from my college about my GPA, fro...",D:\Data Science Projects Github\ai-multimodal-...,296,308,302.0,Fear
10,05:38,05:52,Client,distract myself from like the initial thing t...,D:\Data Science Projects Github\ai-multimodal-...,338,352,345.0,Fear
11,05:53,06:04,Client,tempted to like check whatever all the media ...,D:\Data Science Projects Github\ai-multimodal-...,353,364,358.5,Neutral
12,06:04,06:12,Client,racing. My thoughts are keep racing with ever...,D:\Data Science Projects Github\ai-multimodal-...,364,372,368.0,Happy
14,09:53,10:04,Client,"to myself in front of a mirror. And honestly,...",D:\Data Science Projects Github\ai-multimodal-...,593,604,598.5,Fear


### Client/Patient Face Emotion Prediction

In [13]:
import torch
import torchvision.models as models
import torch.nn as nn
from PIL import Image
from torchvision import transforms

In [14]:
# Load trained ResNet18-based face emotion model
def load_cv_emotion_model(weights_path):
    """Build the ResNet18 face-emotion classifier and load its trained weights.

    The final layer is replaced by BatchNorm -> Dropout(0.5) -> Linear with
    8 outputs, matching the head stored in the checkpoint.

    Args:
        weights_path: Path of the saved ``state_dict``.

    Returns:
        The model on CPU, in eval mode.
    """
    # `weights=None` replaces the deprecated `pretrained=False`: no ImageNet
    # weights are fetched because the checkpoint below supplies every parameter.
    model = models.resnet18(weights=None)
    num_ftrs = model.fc.in_features
    model.fc = nn.Sequential(
        nn.BatchNorm1d(num_ftrs),
        nn.Dropout(0.5),
        nn.Linear(num_ftrs, 8)
    )
    model.load_state_dict(torch.load(weights_path, map_location=torch.device('cpu')))
    model.eval()
    return model

# Transform for image preprocessing
# Preprocessing applied to each face crop before inference: resize to 224x224
# and convert to a tensor.
# NOTE(review): no Normalize step here — confirm this matches the preprocessing
# used when the face model was trained.
image_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor()
])

# Predict emotion from image
def predict_face_emotion(image_path, model):
    """Predict the emotion name for one face image.

    Any failure (unreadable path, bad image, model error) is reported with a
    printed message and mapped to the string ``"Error"`` instead of raising.

    Args:
        image_path: Path of the image to classify.
        model: Classifier called on a single-image batch.

    Returns:
        A name from ``emotion_labels``, or ``"Error"``.
    """
    try:
        rgb_image = Image.open(image_path).convert("RGB")
        batch = image_transform(rgb_image).unsqueeze(0)  # (1, 3, 224, 224)
        with torch.no_grad():
            scores = model(batch)
        label_id = torch.argmax(scores, dim=1).item()
        return emotion_labels[label_id]
    except Exception as err:
        print(f"Error processing {image_path}: {err}")
        return "Error"

# Main function to apply predictions to all rows
def add_face_emotions_to_client_df(client_df, model_path):
    """Predict a facial emotion for every row from its saved face image.

    Args:
        client_df: DataFrame with an ``Image_Path`` column. It is not modified.
        model_path: Path of the face-emotion model weights.

    Returns:
        A copy of ``client_df`` with a ``face_emotion_prediction`` column. Rows
        whose image cannot be processed get ``"Error"`` (see
        ``predict_face_emotion``).
    """
    print("📦 Loading face emotion model...")
    model = load_cv_emotion_model(model_path)

    print("🔍 Predicting emotions from images...")
    # Work on a copy so the caller's frame (possibly a slice of another frame)
    # is not mutated; this also avoids pandas' SettingWithCopyWarning.
    result_df = client_df.copy()
    result_df["face_emotion_prediction"] = result_df["Image_Path"].apply(
        lambda path: predict_face_emotion(path, model)
    )

    print("✅ Face emotion predictions added to DataFrame.")
    return result_df


# Path of the trained face-emotion weights. Note this rebinds `model_path`,
# which an earlier cell pointed at the DeBERTa directory.
model_path = r"D:\Data Science Projects Github\ai-multimodal-emotion-therapy\models\cv model\best_model (1).pth"
client_df = add_face_emotions_to_client_df(client_df, model_path)


📦 Loading face emotion model...
🔍 Predicting emotions from images...




✅ Face emotion predictions added to DataFrame.


In [15]:
# Display the client rows with both text- and face-based emotion predictions.
client_df

Unnamed: 0,Start,End,Speaker,Text,Image_Path,Start_sec,End_sec,Mid_sec,speech_predicted_emotion,face_emotion_prediction
1,01:16,01:27,Client,"And I'm a peer educator at CAPS here, and I t...",D:\Data Science Projects Github\ai-multimodal-...,76,87,81.5,Happy,Disgust
3,01:53,02:08,Client,"Um, so nothing much has changed in terms of w...",D:\Data Science Projects Github\ai-multimodal-...,113,128,120.5,Fear,Neutral
4,02:08,02:21,Client,But as for the homework. I felt that sometime...,D:\Data Science Projects Github\ai-multimodal-...,128,141,134.5,Fear,Disgust
5,02:22,02:33,Client,and my thoughts were like controlling me. So ...,D:\Data Science Projects Github\ai-multimodal-...,142,153,147.5,Anger,Sadness
7,04:41,04:55,Client,So whenever I felt like my anxiety was throug...,D:\Data Science Projects Github\ai-multimodal-...,281,295,288.0,Fear,Disgust
8,04:56,05:08,Client,"Like emails from my college about my GPA, fro...",D:\Data Science Projects Github\ai-multimodal-...,296,308,302.0,Fear,Disgust
10,05:38,05:52,Client,distract myself from like the initial thing t...,D:\Data Science Projects Github\ai-multimodal-...,338,352,345.0,Fear,Disgust
11,05:53,06:04,Client,tempted to like check whatever all the media ...,D:\Data Science Projects Github\ai-multimodal-...,353,364,358.5,Neutral,Happy
12,06:04,06:12,Client,racing. My thoughts are keep racing with ever...,D:\Data Science Projects Github\ai-multimodal-...,364,372,368.0,Happy,Disgust
14,09:53,10:04,Client,"to myself in front of a mirror. And honestly,...",D:\Data Science Projects Github\ai-multimodal-...,593,604,598.5,Fear,Happy


### Testing Pipeline

In [16]:
import os
import pandas as pd

# Video path
video_path = r"D:\Data Science Projects Github\ai-multimodal-emotion-therapy\uploads\Zoom practice therapy session.mp4"

# Model paths
speaker_model_path = r"D:\Data Science Projects Github\ai-multimodal-emotion-therapy\models\speaker_prediction_roberta_model"
text_model_path = r"D:\Data Science Projects Github\ai-multimodal-emotion-therapy\models\deberta_model"
face_model_path = r"D:\Data Science Projects Github\ai-multimodal-emotion-therapy\models\cv model\best_model (1).pth"

# Output folder for extracted faces
output_dir = r"D:\Data Science Projects Github\ai-multimodal-emotion-therapy\notebooks_code\segmentation\Extracted_Images"

# HF Token (if WhisperX diarization needs auth). Read from the environment
# instead of committing it to the notebook. The token that used to be
# hard-coded here is exposed in source history and should be revoked.
hf_token = os.environ.get("HF_TOKEN")


In [55]:
from pipeline import (
    transcribe_and_diarize,
    load_speaker_model,
    classify_and_map_speakers,
    merge_conversation_segments,
    extract_faces_for_client_segments,
    add_speech_emotions_to_client_df,
    add_face_emotions_to_client_df
)

In [18]:
# Stage 1: transcribe and diarize the test video, then preview the first segments.
df_transcript = transcribe_and_diarize(video_path, hf_token)
df_transcript.head()

{'video_found': True, 'audio_found': True, 'metadata': {'major_brand': 'mp42', 'minor_version': '0', 'compatible_brands': 'isommp42', 'encoder': 'Google'}, 'inputs': [{'streams': [{'input_number': 0, 'stream_number': 0, 'stream_type': 'video', 'language': None, 'default': True, 'size': [640, 360], 'bitrate': 103, 'fps': 25.0, 'codec_name': 'h264', 'profile': '(Main)', 'metadata': {'Metadata': '', 'handler_name': 'ISO Media file produced by Google Inc.', 'vendor_id': '[0][0][0][0]'}}, {'input_number': 0, 'stream_number': 1, 'stream_type': 'audio', 'language': None, 'default': True, 'fps': 44100, 'bitrate': 95, 'metadata': {'Metadata': '', 'handler_name': 'ISO Media file produced by Google Inc.', 'vendor_id': '[0][0][0][0]'}}], 'input_number': 0}], 'duration': 691.91, 'bitrate': 202, 'start': 0.0, 'default_video_input_number': 0, 'default_video_stream_number': 0, 'video_codec_name': 'h264', 'video_profile': '(Main)', 'video_size': [640, 360], 'video_bitrate': 103, 'video_fps': 25.0, 'def

                                                                        

MoviePy - Done.


Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.5.1. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint c:\Users\aliir\anaconda3\envs\whisperx\lib\site-packages\whisperx\assets\pytorch_model.bin`


No language specified, language will be first be detected for each audio file (increases inference time).
Model was trained with pyannote.audio 0.0.1, yours is 3.3.2. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.6.0+cu118. Bad things might happen unless you revert torch to 1.x.
Detected language: en (1.00) in first 30s of audio...


  std = sequences.std(dim=-1, correction=1)


Unnamed: 0,Start (sec),End (sec),Speaker,Text,total_duration
0,0.031,9.92,SPEAKER_01,"what brought you here. Now go. Hi, Ms. Jalen....",9.889
1,10.443,19.707,SPEAKER_01,It's great to see you today. Let's start by r...,9.264
2,20.517,28.685,SPEAKER_00,"Um, I have been struggling with anxiety, espe...",8.168
3,31.942,46.724,SPEAKER_01,"That makes sense. Yeah, it's important to und...",14.782
4,47.129,61.068,SPEAKER_00,"Oh, I am aiming for being less anxious during...",13.939


In [56]:
# Stage 2: load the speaker-role classifier and relabel the diarized speakers.
speaker_model, speaker_tokenizer, label_encoder = load_speaker_model(speaker_model_path, device="cuda")
df_mapped = classify_and_map_speakers(df_transcript, speaker_model, speaker_tokenizer, label_encoder, device="cuda")

Map:   0%|          | 0/15 [00:00<?, ? examples/s]

Map:   0%|          | 0/33 [00:00<?, ? examples/s]

In [67]:
# Stage 3: merge consecutive therapist segments into single turns and preview.
df_merged = merge_conversation_segments(df_mapped)
df_merged.head()

Unnamed: 0,Start,End,Speaker,Text
0,00:00,00:19,therapist,"what brought you here. Now go. Hi, Ms. Jalen...."
1,00:20,00:28,Client,"Um, I have been struggling with anxiety, espe..."
2,00:31,00:46,therapist,"That makes sense. Yeah, it's important to und..."
3,00:47,01:01,Client,"Oh, I am aiming for being less anxious during..."
4,01:04,01:15,therapist,Can you tell me more about situations where y...


In [70]:
# Stage 4: face extraction with the layout chosen in code rather than via widgets.
view_type = "Gallery View"  # or "Speaker View"
face_option = "Right"        # Gallery: Left/Right, Speaker: Large/Small

# Keep only the client turns and drop rows with missing values.
df_client = df_merged[df_merged["Speaker"].str.lower() == "client"].copy()
df_client = df_client.dropna()
# extract_faces_for_client_segments is imported from the `pipeline` module; it
# is not defined in this notebook.
df_with_faces = extract_faces_for_client_segments(df_client, video_path, output_dir, view_type, face_option)
# Drop rows that still contain missing values after extraction.
# NOTE(review): presumably these are segments without a saved face image — confirm.
df_with_faces = df_with_faces.dropna()

In [71]:
# Preview the client rows that remain after face extraction and dropna.
df_with_faces.head()

Unnamed: 0,Start,End,Speaker,Text,Start_sec,End_sec,Mid_sec,Image_Path
1,00:20,00:28,Client,"Um, I have been struggling with anxiety, espe...",20,28,24.0,D:\Data Science Projects Github\ai-multimodal-...
3,00:47,01:01,Client,"Oh, I am aiming for being less anxious during...",47,61,54.0,D:\Data Science Projects Github\ai-multimodal-...
7,02:00,02:10,Client,I would say that it's been somewhat helpful. ...,120,130,125.0,D:\Data Science Projects Github\ai-multimodal-...
9,02:25,02:37,Client,I think the breathing exercises have been rea...,145,157,151.0,D:\Data Science Projects Github\ai-multimodal-...
11,03:00,03:12,Client,"Um, I would say that I'm probably in the prep...",180,192,186.0,D:\Data Science Projects Github\ai-multimodal-...


In [72]:
# Stage 5: add the text-based and then the face-based emotion predictions.
df_with_text_emotion = add_speech_emotions_to_client_df(df_with_faces, text_model_path)
df_final = add_face_emotions_to_client_df(df_with_text_emotion, face_model_path)
df_final.head()

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  client_df["speech_predicted_emotion"] = client_df["Text"].apply(predict_emotion)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  client_df["face_emotion_prediction"] =

Unnamed: 0,Start,End,Speaker,Text,Start_sec,End_sec,Mid_sec,Image_Path,speech_predicted_emotion,face_emotion_prediction
1,00:20,00:28,Client,"Um, I have been struggling with anxiety, espe...",20,28,24.0,D:\Data Science Projects Github\ai-multimodal-...,Confusion,Anger
3,00:47,01:01,Client,"Oh, I am aiming for being less anxious during...",47,61,54.0,D:\Data Science Projects Github\ai-multimodal-...,Fear,Sadness
7,02:00,02:10,Client,I would say that it's been somewhat helpful. ...,120,130,125.0,D:\Data Science Projects Github\ai-multimodal-...,Fear,Anger
9,02:25,02:37,Client,I think the breathing exercises have been rea...,145,157,151.0,D:\Data Science Projects Github\ai-multimodal-...,Happy,Sadness
11,03:00,03:12,Client,"Um, I would say that I'm probably in the prep...",180,192,186.0,D:\Data Science Projects Github\ai-multimodal-...,Confusion,Neutral
