In [1]:
import pandas as pd
import os

# Paths
BASE_DIR = "../data"
TRANSCRIPT_DIR = os.path.join(BASE_DIR, "Transcripts")
AUDIO_DIR = os.path.join(BASE_DIR, "Audio")
LABELS_PATH = os.path.join(BASE_DIR, "Labels", "train_split.csv")  # assuming participant 300 is in train

# Load label file
labels_df = pd.read_csv(LABELS_PATH)
print("PHQ Score Labels:")
display(labels_df.head())

# Filter participant 300
participant_id = 303
participant_label = labels_df[labels_df['Participant_ID'] == participant_id]
print(f"\nPHQ-8 Score for Participant {participant_id}:")
display(participant_label)

# Load transcript for participant 300
transcript_path = os.path.join(TRANSCRIPT_DIR, f"{participant_id}_TRANSCRIPT.csv")
transcript_df = pd.read_csv(transcript_path, sep='\t')
print(f"\nTranscript for Participant {participant_id}:")
display(transcript_df.head())

PHQ Score Labels:


Unnamed: 0,Participant_ID,PHQ8_Binary,PHQ8_Score,Gender,PHQ8_NoInterest,PHQ8_Depressed,PHQ8_Sleep,PHQ8_Tired,PHQ8_Appetite,PHQ8_Failure,PHQ8_Concentrating,PHQ8_Moving
0,303,0,0,0,0,0,0.0,0,0,0,0,0
1,304,0,6,0,0,1,1.0,2,2,0,0,0
2,305,0,7,1,0,1,1.0,2,2,1,0,0
3,310,0,4,1,1,1,0.0,0,0,1,1,0
4,312,0,2,1,0,0,1.0,1,0,0,0,0



PHQ-8 Score for Participant 303:


Unnamed: 0,Participant_ID,PHQ8_Binary,PHQ8_Score,Gender,PHQ8_NoInterest,PHQ8_Depressed,PHQ8_Sleep,PHQ8_Tired,PHQ8_Appetite,PHQ8_Failure,PHQ8_Concentrating,PHQ8_Moving
0,303,0,0,0,0,0,0.0,0,0,0,0,0



Transcript for Participant 303:


Unnamed: 0,start_time,stop_time,speaker,value
0,26.276,48.696,Ellie,hi i'm ellie thanks for coming in today i was ...
1,49.256,50.406,Ellie,how are you doing today
2,50.686,51.836,Participant,okay how 'bout yourself
3,52.576,54.136,Ellie,i'm great thanks
4,54.816,56.236,Ellie,where are you from originally


In [2]:
!pip install transformers
!pip install torch



In [3]:
from transformers import BertTokenizer, BertModel
import torch

# Load tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = BertModel.from_pretrained("bert-base-uncased")
bert_model.eval();  # inference mode

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Filter only participant's utterances
user_utterances = transcript_df[transcript_df['speaker'] == 'Participant']

# Clean text if needed (optional)
def clean_text(text):
    return str(text).strip().replace('\n', ' ')

user_utterances['cleaned'] = user_utterances['value'].apply(clean_text)
display(user_utterances[['start_time', 'cleaned']].head())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_utterances['cleaned'] = user_utterances['value'].apply(clean_text)


Unnamed: 0,start_time,cleaned
2,50.686,okay how 'bout yourself
5,56.586,here in california
7,59.066,yeah
9,63.396,oh well that it's big and broad there's a lot ...
10,69.416,um job opportunities than other states


In [6]:
# Function to get BERT sentence embedding
def get_bert_embedding(text):
    with torch.no_grad():
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
        outputs = bert_model(**inputs)
        # Mean-pooling over token embeddings (dim=1)
        sentence_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
        return sentence_embedding

# Generate embeddings
user_utterances['bert_emb'] = user_utterances['cleaned'].apply(get_bert_embedding)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_utterances['bert_emb'] = user_utterances['cleaned'].apply(get_bert_embedding)


In [7]:
import numpy as np

# Check shape of one embedding
print("Embedding shape:", user_utterances['bert_emb'].iloc[0].shape)
print("First 5 dims:", user_utterances['bert_emb'].iloc[0][:5])

Embedding shape: (768,)
First 5 dims: [ 0.13236138 -0.38385934 -0.1249822  -0.35145512 -0.08211304]


In [8]:
!pip install opensmile

Collecting opensmile
  Downloading opensmile-2.5.1-py3-none-macosx_11_0_arm64.whl.metadata (15 kB)
Collecting audobject>=0.6.1 (from opensmile)
  Downloading audobject-0.7.12-py3-none-any.whl.metadata (2.7 kB)
Collecting audinterface>=0.7.0 (from opensmile)
  Downloading audinterface-1.3.1-py3-none-any.whl.metadata (4.3 kB)
Collecting audeer>=2.1.1 (from audinterface>=0.7.0->opensmile)
  Downloading audeer-2.2.2-py3-none-any.whl.metadata (4.1 kB)
Collecting audformat<2.0.0,>=1.0.1 (from audinterface>=0.7.0->opensmile)
  Downloading audformat-1.3.2-py3-none-any.whl.metadata (4.7 kB)
Collecting audiofile>=1.3.0 (from audinterface>=0.7.0->opensmile)
  Downloading audiofile-1.5.1-py3-none-any.whl.metadata (4.9 kB)
Collecting audmath>=1.4.1 (from audinterface>=0.7.0->opensmile)
  Downloading audmath-1.4.2-py3-none-any.whl.metadata (3.7 kB)
Collecting audresample<2.0.0,>=1.1.0 (from audinterface>=0.7.0->opensmile)
  Downloading audresample-1.3.4-py3-none-macosx_11_0_arm64.whl.metadata (4.4 k

In [10]:
import os
from opensmile import Smile, FeatureSet, FeatureLevel

# Set up OpenSMILE with eGeMAPSv02 (common for affective tasks)
smile = Smile(
    feature_set=FeatureSet.eGeMAPSv02,
    feature_level=FeatureLevel.Functionals,  # Summarized features
)

# Audio path for participant 300
audio_path = os.path.join("../data", "Audio", "300_AUDIO.wav")

# Extract features
audio_features_df = smile.process_file(audio_path)

print("Extracted Audio Features:")
display(audio_features_df.head())

Extracted Audio Features:


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,F0semitoneFrom27.5Hz_sma3nz_amean,F0semitoneFrom27.5Hz_sma3nz_stddevNorm,F0semitoneFrom27.5Hz_sma3nz_percentile20.0,F0semitoneFrom27.5Hz_sma3nz_percentile50.0,F0semitoneFrom27.5Hz_sma3nz_percentile80.0,F0semitoneFrom27.5Hz_sma3nz_pctlrange0-2,F0semitoneFrom27.5Hz_sma3nz_meanRisingSlope,F0semitoneFrom27.5Hz_sma3nz_stddevRisingSlope,F0semitoneFrom27.5Hz_sma3nz_meanFallingSlope,F0semitoneFrom27.5Hz_sma3nz_stddevFallingSlope,...,slopeUV0-500_sma3nz_amean,slopeUV500-1500_sma3nz_amean,spectralFluxUV_sma3nz_amean,loudnessPeaksPerSec,VoicedSegmentsPerSec,MeanVoicedSegmentLengthSec,StddevVoicedSegmentLengthSec,MeanUnvoicedSegmentLength,StddevUnvoicedSegmentLength,equivalentSoundLevel_dBp
file,start,end,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
../data/Audio/300_AUDIO.wav,0 days,0 days 00:10:48.500000,28.989614,0.164996,25.515335,26.992317,33.286697,7.771362,212.503174,396.768768,62.987938,103.834175,...,-0.019914,0.008097,0.069432,0.243643,1.131965,0.191049,0.241436,0.690924,1.544863,-31.463453


In [11]:
audio_feat_vector = audio_features_df.iloc[0].values  # shape: (88,)
print("Audio Feature Vector shape:", audio_feat_vector.shape)

Audio Feature Vector shape: (88,)


In [12]:
import numpy as np

# Stack all utterance embeddings
all_utterance_embs = np.stack(user_utterances['bert_emb'].values)  # shape: (num_utterances, 768)

# Mean pool all text embeddings into one vector
text_embedding_avg = np.mean(all_utterance_embs, axis=0)  # shape: (768,)

In [13]:
# Final multimodal feature vector (768 text + 88 audio = 856)
multimodal_feature_vector = np.concatenate([text_embedding_avg, audio_feat_vector])
print("Multimodal Feature Shape:", multimodal_feature_vector.shape)  # (856,)

Multimodal Feature Shape: (856,)


In [15]:
# Assuming you already loaded `labels_df` from train_split.csv
phq_score = labels_df[labels_df['Participant_ID'] == 303]['PHQ8_Score'].values[0]
print("Target PHQ-8 Score:", phq_score)

Target PHQ-8 Score: 0
