<a href="https://colab.research.google.com/github/cgenevier/CSCI5622-HW4/blob/part-f-transformers/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Study 1: Designing explainable speech-based machine learning models of depression

To open this ipynb in Colab, click the "Open in Colab" button at the top of the ipynb in Github, or [follow this link](https://colab.research.google.com/github/cgenevier/CSCI5622-HW4/blob/main/main.ipynb).

Given that Colab doesn't automatically load any of the content (data or other functions) from the Github repo, running the code below will copy the repo into the workspace directory for use. To save this ipynb file back to Github, select **File > Save** (which should show the repo if you're signed in) or **File > Save a copy in Github** if it's in the menu.

Note that changes to the data files or to any other files in the cloned repo are not saved back to GitHub when you save the notebook, so commit those changes to GitHub separately.

In [None]:
# Clone Github Repo into the temporary local environment so data can be accessed and manipulated
!git clone https://github.com/cgenevier/CSCI5622-HW4.git
%cd CSCI5622-HW4

Cloning into 'CSCI5622-HW4'...
remote: Enumerating objects: 436, done.[K
remote: Counting objects: 100% (436/436), done.[K
remote: Compressing objects: 100% (431/431), done.[K
remote: Total 436 (delta 30), reused 394 (delta 3), pack-reused 0 (from 0)[K
Receiving objects: 100% (436/436), 5.19 MiB | 9.91 MiB/s, done.
Resolving deltas: 100% (30/30), done.
/content/CSCI5622-HW4


In [None]:
# Import necessary libraries

# Helpers
import glob

# Pandas, seaborn, and numpy for data manipulation
import pandas as pd
pd.set_option("display.max_rows", None)
import statistics as stat
import seaborn as sns
import numpy as np
np.random.seed(42)

# Keras & TensorFlow for building the neural networks
import itertools, json, time
from itertools import count
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models, regularizers, callbacks, backend as K
tf.random.set_seed(42)

# Feature extraction
!pip install vaderSentiment transformers torch
from sklearn.feature_extraction.text import TfidfVectorizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from transformers import logging, BertTokenizer, BertModel
import torch

# Matplotlib for graphing
import matplotlib.pyplot as plt

# Disable progress bars (necessary for it to show up correctly in Github)
logging.set_verbosity_error()

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


##### Formatting the data - Depression Labels

In [None]:
# Import Depression Labels
# Columns: Participant_ID, PHQ_Score
depression_labels = pd.read_csv("data/DepressionLabels.csv")

# Rename Participant_ID to ParticipantID to match the acoustic files below, and force a
# trimmed string type so the ID can be used as a merge key against those files
depression_labels = depression_labels.rename(columns={"Participant_ID": "ParticipantID"})
depression_labels["ParticipantID"] = depression_labels["ParticipantID"].astype(str).str.strip()

##### Formatting the data - Text Features

In [None]:
# Import Text Dataset (for text feature extraction)
# Note: When comparing the E-DAIC_Transcripts files to the corresponding E-DAIC Acoustics files,
# it looks like the transcripts sometimes only contain partial data from the acoustics text column -
# for example, 386_Transcript.csv - so it seems to make sense to concatenate Text data in the
# Acoustics file for completeness.
rows = []
for p in glob.glob("data/E-DAIC_Acoustics/*_utterance_agg.csv"):
    df = pd.read_csv(p)
    # Same ID normalisation as the labels so the merge below matches on equal strings
    df["ParticipantID"] = df["ParticipantID"].astype(str).str.strip()
    # Join every non-null utterance of the file into a single transcript string
    full_text = " ".join(df["Text"].dropna().astype(str))
    full_text = " ".join(full_text.split())  # collapse whitespace
    # The ID is read from the first row only (assumes one participant per file - TODO confirm)
    rows.append({"ParticipantID": df["ParticipantID"].iloc[0], "FullText": full_text})

# Columns: ParticipantID, FullText
text_df = pd.DataFrame(rows)
# Merge with labels. Columns: ParticipantID, PHQ_Score, FullText
# (inner join: participants missing from either side are dropped)
lang_df = depression_labels.merge(text_df, on="ParticipantID", how="inner")

# Inspect results
lang_df.head()

Unnamed: 0,ParticipantID,PHQ_Score,FullText
0,386,11,might have pulled something that I'm going to ...
1,387,2,when she's done she'll let you know alrighty t...
2,388,17,are you okay with yes doing all right from Pas...
3,389,14,and please are you okay sure I'm okay small to...
4,390,9,and now she's going to chat with you for a bit...


##### Formatting the data - Acoustic Features

In [None]:
# Import Acoustic Dataset (for parts c, d)

# Helper function for mean, standard dev, & IQR (interquartile range)
def summarize_cols(num_df: pd.DataFrame) -> pd.DataFrame:
    """
    Collapse a table of numeric columns into one row of per-column summary statistics.

    For every numeric column the mean, the standard deviation (pandas default, ddof=1)
    and the interquartile range (Q3 - Q1) are computed; NaNs are skipped.

    Args:
        num_df: Frame whose numeric columns are summarised; each row is one observation.

    Returns:
        A single-row DataFrame with columns named ``<feature>__mean``,
        ``<feature>__std`` and ``<feature>__iqr``, grouped by feature in input
        column order. Every feature always yields all three columns; a statistic
        that cannot be computed (e.g. the std of a single observation) is NaN
        rather than being silently dropped.
    """
    # mean and std per column (NaN-safe)
    mean_s = num_df.mean(numeric_only=True)
    std_s = num_df.std(numeric_only=True)
    # IQR = Q3 - Q1 per column
    q75 = num_df.quantile(0.75, numeric_only=True)
    q25 = num_df.quantile(0.25, numeric_only=True)
    iqr_s = q75 - q25

    # index = feature, columns = [mean, std, iqr]
    stats = pd.concat(
        {"mean": mean_s, "std": std_s, "iqr": iqr_s},
        axis=1
    )

    # Flatten row-major (feature first, then statistic) into one wide row.
    # This deliberately avoids DataFrame.stack(): its legacy implementation drops NaN
    # entries, which made whole feature__stat columns disappear for some inputs.
    names = [f"{feat}__{stat}" for feat in stats.index for stat in stats.columns]
    return pd.DataFrame(stats.to_numpy().reshape(1, -1), columns=names)

# Each file in E-DAIC_Acoustics contains utterance-level acoustic features for one participant.
# Every file is reduced to ONE row of summary statistics (see summarize_cols), built twice:
# once with the Confidence column and once without it.
rows_with_conf = []
rows_no_conf = []
for p in glob.glob("data/E-DAIC_Acoustics/*_utterance_agg.csv"):
    df = pd.read_csv(p)
    df["ParticipantID"] = df["ParticipantID"].astype(str).str.strip()

    # Include Confidence (column 5) + all acoustic features (6+)
    # Columns are selected by 0-based position, so this relies on every file sharing the same
    # column order - TODO confirm.
    numeric_with_conf = df.columns[5:]
    # Non-numeric cells become NaN instead of raising
    df[numeric_with_conf] = df[numeric_with_conf].apply(pd.to_numeric, errors="coerce")

    agg_with_conf = summarize_cols(df[numeric_with_conf])
    # ID taken from the first row only (assumes one participant per file - TODO confirm)
    agg_with_conf.insert(0, "ParticipantID", df["ParticipantID"].iloc[0])
    rows_with_conf.append(agg_with_conf)

    # Excludes Confidence - only include acoustic features
    numeric_no_conf = df.columns[6:]
    # These columns are a subset of the ones already coerced above
    df[numeric_no_conf] = df[numeric_no_conf].apply(pd.to_numeric, errors="coerce")

    agg_no_conf = summarize_cols(df[numeric_no_conf])
    agg_no_conf.insert(0, "ParticipantID", df["ParticipantID"].iloc[0])
    rows_no_conf.append(agg_no_conf)

# Combine into one DataFrame each (one row per participant file)
acoustic_features_with_conf = pd.concat(rows_with_conf, ignore_index=True)
acoustic_features_no_conf = pd.concat(rows_no_conf, ignore_index=True)

# Merge with labels to add PHQ_Score (inner join: unlabeled participants are dropped)
acoustic_df_with_confidence = depression_labels.merge(acoustic_features_with_conf, on="ParticipantID", how="inner")
acoustic_df_no_confidence   = depression_labels.merge(acoustic_features_no_conf, on="ParticipantID", how="inner")

# Reorder columns: ParticipantID, PHQ_Score, then features
cols = ["ParticipantID", "PHQ_Score"] + [c for c in acoustic_df_with_confidence.columns if c not in ["ParticipantID", "PHQ_Score"]]
acoustic_df_with_confidence = acoustic_df_with_confidence[cols]
cols = ["ParticipantID", "PHQ_Score"] + [c for c in acoustic_df_no_confidence.columns if c not in ["ParticipantID", "PHQ_Score"]]
acoustic_df_no_confidence = acoustic_df_no_confidence[cols]

# Inspect results
display(acoustic_df_with_confidence.head())
display(acoustic_df_no_confidence.head())

Unnamed: 0,ParticipantID,PHQ_Score,Confidence__mean,Confidence__std,Confidence__iqr,Loudness_sma3__mean,Loudness_sma3__std,Loudness_sma3__iqr,alphaRatio_sma3__mean,alphaRatio_sma3__std,...,F2frequency_sma3nz__iqr,F2amplitudeLogRelF0_sma3nz__mean,F2amplitudeLogRelF0_sma3nz__std,F2amplitudeLogRelF0_sma3nz__iqr,F3frequency_sma3nz__mean,F3frequency_sma3nz__std,F3frequency_sma3nz__iqr,F3amplitudeLogRelF0_sma3nz__mean,F3amplitudeLogRelF0_sma3nz__std,F3amplitudeLogRelF0_sma3nz__iqr
0,386,11,0.933917,0.055195,0.0378,0.14739,0.064438,0.089581,-15.719342,2.71392,...,42.335639,-128.087978,39.510693,63.620488,2584.433064,57.013353,47.965921,-131.009492,37.808278,59.861069
1,387,2,0.931755,0.06202,0.064689,0.094281,0.061988,0.060425,-17.290294,2.972634,...,117.523721,-140.359126,45.511084,86.525518,2525.71599,84.201997,76.811123,-142.535582,43.59494,82.822549
2,388,17,0.888103,0.105329,0.12352,0.103571,0.045205,0.069353,-16.480069,2.705549,...,81.395833,-156.9911,32.353199,56.016943,2549.702244,75.558349,94.16342,-158.777582,30.838568,53.547303
3,389,14,0.897373,0.099174,0.103688,0.095695,0.03837,0.055492,-20.799213,2.820568,...,81.503827,-143.443426,33.056194,47.504207,2549.11012,70.157241,97.720753,-145.861499,31.530769,45.670156
4,390,9,0.900761,0.082217,0.08813,0.106446,0.045064,0.052923,-17.288813,2.328784,...,79.791693,-139.494192,31.536717,39.07632,2450.182795,73.798789,106.3734,-142.697953,29.870122,36.615301


Unnamed: 0,ParticipantID,PHQ_Score,Loudness_sma3__mean,Loudness_sma3__std,Loudness_sma3__iqr,alphaRatio_sma3__mean,alphaRatio_sma3__std,alphaRatio_sma3__iqr,hammarbergIndex_sma3__mean,hammarbergIndex_sma3__std,...,F2frequency_sma3nz__iqr,F2amplitudeLogRelF0_sma3nz__mean,F2amplitudeLogRelF0_sma3nz__std,F2amplitudeLogRelF0_sma3nz__iqr,F3frequency_sma3nz__mean,F3frequency_sma3nz__std,F3frequency_sma3nz__iqr,F3amplitudeLogRelF0_sma3nz__mean,F3amplitudeLogRelF0_sma3nz__std,F3amplitudeLogRelF0_sma3nz__iqr
0,386,11,0.14739,0.064438,0.089581,-15.719342,2.71392,3.345443,26.607891,2.620843,...,42.335639,-128.087978,39.510693,63.620488,2584.433064,57.013353,47.965921,-131.009492,37.808278,59.861069
1,387,2,0.094281,0.061988,0.060425,-17.290294,2.972634,3.86012,27.380149,3.369106,...,117.523721,-140.359126,45.511084,86.525518,2525.71599,84.201997,76.811123,-142.535582,43.59494,82.822549
2,388,17,0.103571,0.045205,0.069353,-16.480069,2.705549,3.389694,26.983015,3.08138,...,81.395833,-156.9911,32.353199,56.016943,2549.702244,75.558349,94.16342,-158.777582,30.838568,53.547303
3,389,14,0.095695,0.03837,0.055492,-20.799213,2.820568,3.971909,31.428066,3.209966,...,81.503827,-143.443426,33.056194,47.504207,2549.11012,70.157241,97.720753,-145.861499,31.530769,45.670156
4,390,9,0.106446,0.045064,0.052923,-17.288813,2.328784,2.859256,29.355163,3.087194,...,79.791693,-139.494192,31.536717,39.07632,2450.182795,73.798789,106.3734,-142.697953,29.870122,36.615301


### (a) (2 points) Extracting language features.

**Syntactic vectorizers:** count vectorizer (e.g., CountVectorizer from sklearn), transforming
a collection of text documents into a numerical matrix of word or token counts; TF-IDF vectorizer (e.g., TfidfVectorizer from sklearn), incorporating document-level weighting
that emphasizes words significant to specific documents; part-of-speech features, counting
the distribution of part-of-speech tags over a document

In [None]:
# Use TfidfVectorizer from sklearn
# max_features=1000 caps the vocabulary size, so the matrix has at most 1000 term columns
vect = TfidfVectorizer(max_features=1000)
# NOTE(review): the vectorizer is fit on every transcript in lang_df, before any train/test
# split - confirm this is acceptable for the later modelling parts.
X_tfidf = vect.fit_transform(lang_df["FullText"])

# Convert sparse matrix to DataFrame (one column per vocabulary term)
syntactic_df = pd.DataFrame(
    X_tfidf.toarray(),
    columns=vect.get_feature_names_out()
)

# Add ParticipantID column & move to first column
# (.values assigns positionally: row i of the TF-IDF matrix belongs to row i of lang_df)
syntactic_df["ParticipantID"] = lang_df["ParticipantID"].values
cols = ["ParticipantID"] + [c for c in syntactic_df.columns if c != "ParticipantID"]
syntactic_df = syntactic_df[cols]

# Add back in PHQ_Score & move to second column
syntactic_df = syntactic_df.merge(depression_labels, on="ParticipantID", how="inner")
cols = ["ParticipantID", "PHQ_Score"] + [c for c in syntactic_df.columns if c not in ["ParticipantID", "PHQ_Score"]]
syntactic_df = syntactic_df[cols]

# Inspect dataframe
syntactic_df.head()

Unnamed: 0,ParticipantID,PHQ_Score,10,12,15,16,18,19,20,30,...,years,yes,yesterday,yet,york,you,young,younger,your,yourself
0,386,11,0.008629,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.017304,0.010626,0.0,0.0,0.0,0.352579,0.011235,0.0,0.016919,0.019328
1,387,2,0.027755,0.0,0.0,0.0,0.0,0.0,0.032038,0.0,...,0.074209,0.0,0.0,0.0,0.047486,0.346512,0.072275,0.0,0.108838,0.031083
2,388,17,0.031186,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.041692,0.038404,0.0,0.040133,0.0,0.283166,0.0,0.0,0.020382,0.034926
3,389,14,0.054573,0.052964,0.0,0.0,0.0,0.0,0.0,0.0,...,0.036479,0.0,0.0,0.0,0.0,0.325182,0.0,0.0,0.089168,0.0
4,390,9,0.0,0.0,0.0,0.0,0.0,0.0,0.016775,0.0,...,0.067999,0.008948,0.020119,0.0,0.0,0.123706,0.0,0.0,0.009498,0.016275


**Semantic features:** sentiment scores (e.g., Vader, https://github.com/cjhutto/vaderSentiment),
topic distribution (using topic modeling), or named entities

In [None]:
# Score every transcript with VADER
analyzer = SentimentIntensityAnalyzer()

# polarity_scores returns one dict per transcript; collecting the dicts and building the
# frame once yields the 4 sentiment columns, aligned to lang_df's index
score_records = [analyzer.polarity_scores(str(transcript)) for transcript in lang_df["FullText"]]
vader_scores = pd.DataFrame(score_records, index=lang_df.index)

# Original columns first, sentiment columns appended on the right
semantic_df = pd.concat([lang_df, vader_scores], axis=1)

# Inspect dataframe
semantic_df.head()


Unnamed: 0,ParticipantID,PHQ_Score,FullText,neg,neu,pos,compound
0,386,11,might have pulled something that I'm going to ...,0.046,0.77,0.184,0.9999
1,387,2,when she's done she'll let you know alrighty t...,0.05,0.665,0.285,0.9996
2,388,17,are you okay with yes doing all right from Pas...,0.07,0.769,0.161,0.9953
3,389,14,and please are you okay sure I'm okay small to...,0.057,0.827,0.116,0.9822
4,390,9,and now she's going to chat with you for a bit...,0.067,0.74,0.193,0.9996


**Advanced features:** word embeddings, such as Word2Vec or BERT (e.g., pytorch-pretrained-bert), for capturing contextual meaning

In [None]:
# Use BERT to capture contextual meaning (note: takes about 4 minutes to run on T4)
# NOTE(review): no .to(device) call appears in this cell, so the model and inputs stay on
# their default device - confirm whether GPU use was intended.

# Load uncased base model
# NOTE: the names `tokenizer` and `model` are rebound to the Llama objects in part (f);
# re-running this cell afterwards replaces them again.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")
model.eval()

# Loop through text data and get embeddings
embeddings = []
for text in lang_df["FullText"]:
    # Truncate long text (BERT max = 512 tokens)
    # Only the first 512 tokens of each transcript are embedded; the rest is discarded.
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    # Position 0 of the last hidden state is used as the transcript-level vector
    cls_embedding = outputs.last_hidden_state[:, 0, :].squeeze().numpy()  # [CLS] token
    embeddings.append(cls_embedding)

# Convert list of embeddings (each 768-dim) to DataFrame
bert_df = pd.DataFrame(np.vstack(embeddings))
bert_df.columns = [f"bert_{i}" for i in range(bert_df.shape[1])]

# Add ParticipantID and PHQ_Score
# (index is reset so the positional concat lines up row-for-row with the embeddings)
bert_df = pd.concat([lang_df[["ParticipantID", "PHQ_Score"]].reset_index(drop=True), bert_df], axis=1)

# Inspect dataframe
bert_df.head()

Unnamed: 0,ParticipantID,PHQ_Score,bert_0,bert_1,bert_2,bert_3,bert_4,bert_5,bert_6,bert_7,...,bert_758,bert_759,bert_760,bert_761,bert_762,bert_763,bert_764,bert_765,bert_766,bert_767
0,386,11,0.024454,0.250101,0.48064,-0.209235,-0.06423,-0.409601,0.119545,0.92613,...,0.003123,-0.820313,-0.263505,-0.272017,0.424426,0.617825,-0.222068,-0.407772,0.242542,-0.204929
1,387,2,-0.085553,-0.097101,0.22818,0.072628,-0.162793,-0.457378,-0.005894,0.718334,...,0.354261,-1.054273,-0.246188,-0.318969,0.135777,0.865151,-0.351662,-0.323795,0.236854,-0.311686
2,388,17,-0.077363,0.149377,0.376863,0.118528,-0.10099,-0.355886,0.216298,0.758371,...,0.062572,-0.927725,-0.235412,-0.189596,0.669311,0.589867,-0.421785,-0.244375,0.100614,-0.064971
3,389,14,-0.305623,0.097801,0.182487,0.152354,-0.197408,-0.22955,0.315507,0.775767,...,0.160983,-1.14603,-0.109301,-0.369335,0.432705,0.457361,-0.329305,-0.434079,0.119847,-0.388671
4,390,9,-0.075243,-0.034517,0.310655,0.051466,-0.14426,-0.145557,0.093521,0.791168,...,-0.124968,-0.980401,-0.147667,-0.18862,0.437721,0.50131,-0.376553,-0.422697,0.162246,-0.142558


**Combined dataset:** Combined the three dataframes above into one with all the features

In [None]:
# Merge all three on ParticipantID (PHQ_Score is part of the key too, since every frame carries it)
# NOTE: semantic_df is lang_df plus the VADER columns, so the raw FullText column is carried
# into text_feature_df as well.
text_feature_df = (
    syntactic_df
    .merge(semantic_df, on=["ParticipantID", "PHQ_Score"], how="outer")
    .merge(bert_df, on=["ParticipantID", "PHQ_Score"], how="outer")
)
text_feature_df.head()

# Optional: sort by ParticipantID for clarity
#text_feature_df = text_feature_df.sort_values("ParticipantID").reset_index(drop=True)

Unnamed: 0,ParticipantID,PHQ_Score,10,12,15,16,18,19,20,30,...,bert_758,bert_759,bert_760,bert_761,bert_762,bert_763,bert_764,bert_765,bert_766,bert_767
0,386,11,0.008629,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.003123,-0.820313,-0.263505,-0.272017,0.424426,0.617825,-0.222068,-0.407772,0.242542,-0.204929
1,387,2,0.027755,0.0,0.0,0.0,0.0,0.0,0.032038,0.0,...,0.354261,-1.054273,-0.246188,-0.318969,0.135777,0.865151,-0.351662,-0.323795,0.236854,-0.311686
2,388,17,0.031186,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.062572,-0.927725,-0.235412,-0.189596,0.669311,0.589867,-0.421785,-0.244375,0.100614,-0.064971
3,389,14,0.054573,0.052964,0.0,0.0,0.0,0.0,0.0,0.0,...,0.160983,-1.14603,-0.109301,-0.369335,0.432705,0.457361,-0.329305,-0.434079,0.119847,-0.388671
4,390,9,0.0,0.0,0.0,0.0,0.0,0.0,0.016775,0.0,...,-0.124968,-0.980401,-0.147667,-0.18862,0.437721,0.50131,-0.376553,-0.422697,0.162246,-0.142558


### (b) (2 points) Estimating depression severity with interpretable models using language features.

### (c) (2 points) Estimating depression severity with interpretable models using acoustic features.

### (d) (2 points) Estimating depression severity with unimodal and multimodal deep learning models.

### (e) (2 points) Explainable ML.

### (f) (Bonus, 2 points) Experimenting with transformers.

In [None]:
# Install dependencies for tinyllama transformer
!pip install -U transformers accelerate bitsandbytes
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import re
from sklearn.model_selection import train_test_split


# Disable progress bars (necessary for it to show up correctly in Github)
logging.set_verbosity_error()



In [None]:
import os

from huggingface_hub import login

# Authenticate with the Hugging Face Hub; the meta-llama checkpoint loaded below requires it.
# The token is read from the HF_TOKEN environment variable so that no secret is ever typed
# into (and committed with) the notebook. When the variable is unset or empty, token=None
# makes login() fall back to its interactive prompt.
login(token=os.environ.get("HF_TOKEN") or None)

In [None]:
# Load the Llama chat model used for part (f), quantized to 4 bits
# Ungated alternative that needs no HF authentication: "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# Pick your model – you can swap this for a larger checkpoint if you have access/VRAM:
# model_name = "meta-llama/Llama-3.2-3B-Instruct"
model_name = "meta-llama/Llama-3.2-1B-Instruct"  # requires HF auth

# Used later to move the tokenized prompt onto the same device as the model
device = "cuda" if torch.cuda.is_available() else "cpu"

# NOTE: `tokenizer` and `model` replace the BERT objects bound to the same names in part (a).
tokenizer = AutoTokenizer.from_pretrained(model_name)

# 4-bit quantization config
# NOTE(review): bitsandbytes 4-bit loading generally expects a CUDA GPU - confirm a GPU
# runtime is selected before running this cell.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,  # good default for GPUs
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quant_config,
    device_map="auto",  # let HF place it on the GPU
)

# Inference only
model.eval()

In [None]:
# System message sent with every request in classify_phq: describes the PHQ-8 items and
# constrains the reply to a bare integer so it can be parsed with a simple digit regex.
system_prompt = """
You are a research assistant that estimates depression severity from transcripts using the PHQ-8 questionnaire.

The PHQ-8 total score ranges from 0 to 24 and is the sum of 8 items (each scored 0–3):
1. Little interest or pleasure in doing things
2. Feeling down, depressed, or hopeless
3. Trouble falling or staying asleep, or sleeping too much
4. Feeling tired or having little energy
5. Poor appetite or overeating
6. Feeling bad about yourself or that you are a failure or have let yourself or your family down
7. Trouble concentrating on things
8. Moving or speaking so slowly that other people could have noticed, or being fidgety/restless

Guidelines:
- Use only the information in the transcript.
- If there is no clear evidence for a symptom, treat that item as 0.
- If frequency is vague ("sometimes", "often"), choose the closest reasonable PHQ-style frequency internally.
- Do not give medical advice or diagnostic labels; you are only estimating a PHQ-8 total score for research.

Output requirement (very important):
- After reading the transcript, output only a single integer between 0 and 24, representing the PHQ-8 total score.
- Do not output any words, labels, explanations, or punctuation.
- Example of valid output: 7
- Invalid outputs: PHQ score: 7, The score is 7, {"score": 7}
""".strip()

In [None]:
# Hold out 97% of the participants as the evaluation set. The small remainder (train_df)
# is what gets rendered into the few-shot examples further down in this cell.
# random_state is fixed so the same participants are picked on every run.
train_df, test_df = train_test_split(
    lang_df,
    test_size=0.97,
    random_state=42
)

# --------------------------------
# 4. Build few-shot examples from *all* training data
# --------------------------------
def build_few_shot_block(df: pd.DataFrame, max_chars_per_example: int = 1500) -> str:
    """
    Render every row of ``df`` as a numbered, labeled example for a few-shot prompt.

    Args:
        df: Frame with a 'FullText' column (transcript) and a 'PHQ_Score' column (label).
        max_chars_per_example: Transcripts longer than this many characters are cut to
            this length and suffixed with a truncation marker.

    Returns:
        One string: an intro line, one block per row (numbered from 1, in row order),
        and a closing instruction, separated by newlines and stripped at both ends.
    """
    def _clip(text: str) -> str:
        # Keep the prompt bounded: cut overly long transcripts and mark the cut
        if len(text) <= max_chars_per_example:
            return text
        return text[:max_chars_per_example] + "... [TRUNCATED]"

    intro = "Here are some labeled examples:\n"
    closing = "Now, based on these examples, estimate the PHQ-8 total score for the following transcript."

    example_blocks = [
        f"Example {number}\n"
        f"---------\n"
        f"Transcript:\n"
        f"{_clip(str(transcript))}\n\n"
        f"Correct PHQ-8 total score: {int(phq)}\n"
        for number, (transcript, phq) in enumerate(zip(df["FullText"], df["PHQ_Score"]), start=1)
    ]

    return "\n".join([intro, *example_blocks, closing]).strip()

# few-shot = ALL training rows
few_shot_block = build_few_shot_block(train_df)

print(few_shot_block)

Unnamed: 0,FullText,Prompt_1,Prompt_2,Prompt_3
0,might have pulled something that I'm going to ...,NOT_SAD,NOT_SAD,NOT_SAD
1,when she's done she'll let you know alrighty t...,SAD,NOT_SAD,NOT_SAD
2,are you okay with yes doing all right from Pas...,SAD,NOT_SAD,NOT_SAD
3,and please are you okay sure I'm okay small to...,SAD,NOT_SAD,NOT_SAD
4,and now she's going to chat with you for a bit...,SAD,NOT_SAD,NOT_SAD


In [None]:
def classify_phq(transcript: str, system_prompt: str, few_shot_block: str) -> int:
    """
    Ask the loaded chat model for a PHQ-8 total score (0–24) for one transcript.

    The request is built with the tokenizer's chat template from:
      - a system message (``system_prompt``)
      - one user message containing the few-shot examples, the task description
        and the new transcript

    Relies on the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        transcript: Text of the conversation to score.
        system_prompt: System message describing the task and output format.
        few_shot_block: Pre-rendered labeled examples placed before the task text.

    Returns:
        The first run of digits in the model's reply as an int, clamped to 0–24.
        Returns 0 when the reply contains no digits, so a 0 may also mean
        "could not parse".
    """
    user_content = (
        few_shot_block
        + "\n\n"
        + "Task\n"
        + "----\n"
        + "You will now be given a new transcript of a conversation with one participant.\n"
          "Your task is to read the transcript and output only the PHQ-8 total score (0–24).\n\n"
        + "Transcript:\n"
        + f"{transcript}\n\n"
        + "Remember: reply with one integer between 0 and 24 and nothing else."
    )

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_content},
    ]

    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    # The rendered chat template already contains the model's special tokens, so they must
    # not be added a second time here (otherwise the prompt starts with a duplicated BOS).
    inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=4,  # a score needs only a couple of tokens
            do_sample=False,   # greedy decoding keeps runs repeatable
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # Keep only the generated continuation (generate() returns prompt + new tokens)
    gen_ids = output_ids[0][inputs["input_ids"].shape[1]:]
    gen_text = tokenizer.decode(gen_ids, skip_special_tokens=True).strip()

    match = re.search(r"\d+", gen_text)
    if match is None:
        return 0  # fallback if parsing fails
    return max(0, min(24, int(match.group())))  # clamp to the valid PHQ-8 range

Unnamed: 0,FullText,Prompt_1,Prompt_2,Prompt_3,PHQ_Score
0,might have pulled something that I'm going to ...,NOT_SAD,NOT_SAD,NOT_SAD,11
1,when she's done she'll let you know alrighty t...,SAD,NOT_SAD,NOT_SAD,2
2,are you okay with yes doing all right from Pas...,SAD,NOT_SAD,NOT_SAD,17
3,and please are you okay sure I'm okay small to...,SAD,NOT_SAD,NOT_SAD,14
4,and now she's going to chat with you for a bit...,SAD,NOT_SAD,NOT_SAD,9


In [None]:
import gc

chunk_size = 32   # try 32 → if OOM, drop to 16 or 8
test_texts = test_df["FullText"].tolist()
predicted_scores = []

# Score the held-out transcripts one at a time; the chunking only drives progress output
for chunk_start in range(0, len(test_texts), chunk_size):
    chunk = test_texts[chunk_start:chunk_start + chunk_size]
    start, end = chunk_start, chunk_start + len(chunk)
    print(f"Processing rows {start}–{end-1}...")

    for text in chunk:
        predicted_scores.append(classify_phq(text, system_prompt, few_shot_block))

        # extra GPU cleanup between calls to avoid out of memory issue
        if device == "cuda":
            torch.cuda.empty_cache()
            torch.cuda.reset_peak_memory_stats()

gc.collect()

# Build results: a copy of the test split with the predictions attached in row order
results_df = test_df.copy()
results_df["PHQ_Pred"] = predicted_scores

# Reorder columns: text, truth and prediction first, everything else after
leading_cols = ["FullText", "PHQ_Score", "PHQ_Pred"]
cols = leading_cols + [c for c in results_df.columns if c not in leading_cols]
results_df = results_df[cols]

results_df.head()

In [None]:
from scipy.stats import pearsonr

# y_true = actual PHQ-8 scores
# y_pred = estimated PHQ-8 scores from the model
y_true = results_df["PHQ_Score"].to_numpy(dtype=float)
y_pred = results_df["PHQ_Pred"].to_numpy(dtype=float)

# 1) Pearson correlation r
r, p_value = pearsonr(y_true, y_pred)
print(f"Pearson r: {r:.3f}")
print(f"p-value:  {p_value:.3e}")

# 2) Absolute relative error (RE) per participant
# RE_i = |y_pred_i - y_true_i| / |y_true_i|
# Need to be careful when y_true_i = 0.

# Option A: define RE with a small epsilon in the denominator
# NOTE(review): for a participant with y_true = 0 and a non-zero prediction this yields a
# value on the order of 1e8, which dominates the mean below - consider reporting the mean
# over participants with y_true > 0 instead.
eps = 1e-8
# The array is deliberately NOT named `re`: that name is the regex module used by
# classify_phq, and rebinding it here would break every later call to that function.
rel_err = np.abs(y_pred - y_true) / np.maximum(np.abs(y_true), eps)

# Attach to dataframe so you can inspect per-participant errors
results_df["RE"] = rel_err

# Summaries over all participants
mean_re = rel_err.mean()
median_re = np.median(rel_err)

print(f"Mean absolute relative error (RE):   {mean_re:.3f}")
print(f"Median absolute relative error (RE): {median_re:.3f}")