# Checks

In [1]:
import torch

# Report whether PyTorch can see a CUDA device; when it can, also print the
# name of device 0.
if not torch.cuda.is_available():
    print("GPU is not available.")
else:
    print("GPU is available!")
    print(f"GPU device name: {torch.cuda.get_device_name(0)}")


GPU is not available.


In [3]:
import os

def is_slurm_job():
    """Return True when a SLURM job variable is present in the environment.

    The check looks for either ``SLURM_JOB_ID`` or ``SLURM_TASKS_PER_NODE``
    among the keys of ``os.environ``; their values are not inspected.
    """
    slurm_markers = ('SLURM_JOB_ID', 'SLURM_TASKS_PER_NODE')
    return any(marker in os.environ for marker in slurm_markers)

# Tell the reader where this kernel is running. Under SLURM, also echo the
# job id and node list taken from the environment (printed as None if unset).
if not is_slurm_job():
    print("Running locally (not on a SLURM-managed cluster).")
else:
    print("Running on a SLURM-managed cluster.")
    for slurm_var in ("SLURM_JOB_ID", "SLURM_NODELIST"):
        print(f"{slurm_var}: {os.environ.get(slurm_var)}")

Running on a SLURM-managed cluster.
SLURM_JOB_ID: 951760
SLURM_NODELIST: atl1-1-01-005-19-0


# Instructions

- Change paths to files/folders 
    - The `scripts_path` (the path that contains the scripts from the GitHub repo)
        - **Ex**: `/home/hice1/mbibars3/scratch/vlm-debiasing/VLM-Debiasing-Project/scripts`
    - `csv_path`: the path to the csv file that came with the original data and lists the patients and their detailed labels
        - **Ex**: `"/home/hice1/mbibars3/scratch/vlm-debiasing/data/e-daic/original/labels/detailed_lables.csv"`
    - `main_dir`: the path to the main folder that contains the untarred patient data, structured as follows:
        - Patient1
            - Features
            - AUDIO.wav
        - Patient2
            - Features 
            - AUDIO.wav
        - **Ex**: `"/home/hice1/mbibars3/scratch/vlm-debiasing/data/e-daic/untarred"`
    - `path_to_save_features_csv`: the path where the final csv file is saved; it contains the split, labels, gender, and the path to each patient's audio-modality features
        - **Ex**: `"/home/hice1/mbibars3/scratch/vlm-debiasing/data/e-daic/data_paths.csv"`


In [5]:
# User-specific locations. The four paths below are what the rest of the
# notebook reads; they are built from two shared roots so that moving the
# project only requires editing one or two lines.
scratch_root = "/home/hice1/asubramanian91/scratch"
edaic_root = f"{scratch_root}/e-daic/data/e-daic"

# Folder holding the project's Python scripts (appended to sys.path later).
scripts_path = f"{scratch_root}/git/VLM-Debiasing-Project/scripts"
# Detailed labels csv that came with the original data.
csv_path = f"{edaic_root}/original/labels/detailed_lables.csv"
# Folder with one untarred sub-folder per patient.
main_dir = f"{edaic_root}/untarred"
# Destination of the final csv of feature paths written by the last cell.
path_to_save_features_csv = f"{edaic_root}/data_paths.csv"

# Imports

In [6]:
# Load IPython's autoreload extension (the reload mode itself is set in the next cell).
%load_ext autoreload
import sys
# Put the project's scripts folder on the import path so that
# `import extract_audio_features` in the next cell can be resolved.
sys.path.append(scripts_path)

In [8]:
%autoreload 2
import extract_audio_features
from transformers import AutoFeatureExtractor, ASTForAudioClassification
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


# Extracting Audio Features from .wav Files

## List Audio .wav Files
This section of the code creates a csv file that contains the paths to all .wav files with their labels

In [9]:
# Load the labels CSV
labels_df = pd.read_csv(csv_path)
labels_df.head()

Unnamed: 0,Participant,PHQ8_1_NoInterest,PHQ8_2_Depressed,PHQ8_3_Sleep,PHQ8_4_Tired,PHQ8_5_Appetite,PHQ8_6_Failure,PHQ8_7_Concentration,PHQ8_8_Psychomotor,Depression_severity,...,PCL-C_14_Irritability,PCL-C_15_Concentration,PCL-C_16_HyperAlert,PCL-C_17_Jumpy,PTSD_severity,gender,age,Depression_label,PTSD_label,split
0,300,0,0,1,0,1,0,0,0,2,...,2,2,2,1,25.0,male,33,0,0,dev
1,301,0,0,1,1,1,0,0,0,3,...,1,1,1,1,17.0,male,39,0,0,dev
2,302,1,1,0,1,0,1,0,0,4,...,1,1,1,1,28.0,male,25,0,0,train
3,303,0,0,0,0,0,0,0,0,0,...,1,1,1,1,17.0,female,41,0,0,train
4,304,0,1,1,2,2,0,0,0,6,...,1,2,2,1,20.0,female,22,0,0,train


In [10]:
# Alias only (no copy): `labels_csv` is the same DataFrame object as `labels_df`.
labels_csv = labels_df
# Label columns handed to `list_audio_files_paths` in the next cell.
cols = ["gender", "split", "PTSD_label","age","PTSD_severity"]

In [13]:
# Project helper: returns a DataFrame holding the columns listed in `cols`
# plus a `file_path` column (see the preview in the next cell).
wav_paths_df = extract_audio_features.list_audio_files_paths(main_dir, labels_csv, cols)

100%|██████████| 275/275 [00:00<00:00, 1013.12it/s]


In [14]:
# Preview the first rows of the .wav path table built above.
wav_paths_df.head()

Unnamed: 0,gender,split,PTSD_label,age,PTSD_severity,file_path
0,female,train,0,45,22.0,/home/hice1/asubramanian91/scratch/e-daic/data...
1,male,test,0,69,23.0,/home/hice1/asubramanian91/scratch/e-daic/data...
2,male,train,0,25,19.0,/home/hice1/asubramanian91/scratch/e-daic/data...
3,female,train,1,58,67.0,/home/hice1/asubramanian91/scratch/e-daic/data...
4,male,dev,0,33,39.0,/home/hice1/asubramanian91/scratch/e-daic/data...


This cell is optional: run it if you want to save the csv to `path_to_save` on your system.

In [19]:
# Destination folder and file name for the table of .wav paths.
path_to_save = "/home/hice1/asubramanian91/scratch/e-daic/data/e-daic"
file_name = "labels_audio_wavs.csv"
# Project helper; presumably writes `wav_paths_df` to path_to_save/file_name —
# TODO confirm against extract_audio_features.save_csv.
extract_audio_features.save_csv(wav_paths_df, path_to_save, file_name)

## Extract and Save AST features for all patients 

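Run this cell to load the saved csv from your system (for example, in a new session). Note that it also defines `wav_paths_csv`, which the feature-extraction cell below takes as input.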

In [21]:
# Location of the previously saved table of .wav paths.
path = "/home/hice1/asubramanian91/scratch/e-daic/data/e-daic/labels_audio_wavs.csv"
# Replaces any in-memory `wav_paths_df` with the version read from disk.
wav_paths_df = pd.read_csv(path)
# Column of .wav file paths; passed to `extract_features_AST` further down.
wav_paths_csv = wav_paths_df["file_path"]

In [22]:
# Load model and processor
model_name = "MIT/ast-finetuned-audioset-10-10-0.4593"
processor = AutoFeatureExtractor.from_pretrained(model_name)
model = ASTForAudioClassification.from_pretrained(model_name, output_hidden_states=True)

In [23]:
# output feature size: (1214, 768) without pooling

# Take the .wav paths from `wav_paths_df` in this cell. Previously
# `wav_paths_csv` was defined only in the csv-reload cell above, so skipping
# that cell made this call fail with a NameError. `wav_paths_df` exists either
# way (built by `list_audio_files_paths` or reloaded from disk).
wav_paths_csv = wav_paths_df["file_path"]

# `main_dir` is passed as the target path; presumably the features are written
# under it — TODO confirm against extract_audio_features.extract_features_AST.
target_path = main_dir
extract_audio_features.extract_features_AST(wav_paths_csv, target_path, processor, model, pooling=True)

100%|██████████| 275/275 [03:58<00:00,  1.15it/s]


# Creating a .csv for the features paths 
The following cells create a csv file that contains, for each patient, the path to the AST audio features together with the split, labels, and gender.

This csv should be used for training the Perceiver after it has been concatenated with the feature-path columns of the other modalities.

The final csv should be saved for easier loading.

In [24]:
# Add the new column with modified file paths
wav_paths_df["audio"] = wav_paths_df["file_path"].str.replace(".wav", ".npy", regex=False)
wav_paths_df.head()

Unnamed: 0,gender,split,PTSD_label,age,PTSD_severity,file_path,audio
0,female,train,0,45,22.0,/home/hice1/asubramanian91/scratch/e-daic/data...,/home/hice1/asubramanian91/scratch/e-daic/data...
1,male,test,0,69,23.0,/home/hice1/asubramanian91/scratch/e-daic/data...,/home/hice1/asubramanian91/scratch/e-daic/data...
2,male,train,0,25,19.0,/home/hice1/asubramanian91/scratch/e-daic/data...,/home/hice1/asubramanian91/scratch/e-daic/data...
3,female,train,1,58,67.0,/home/hice1/asubramanian91/scratch/e-daic/data...,/home/hice1/asubramanian91/scratch/e-daic/data...
4,male,dev,0,33,39.0,/home/hice1/asubramanian91/scratch/e-daic/data...,/home/hice1/asubramanian91/scratch/e-daic/data...


In [25]:
# Columns kept in the final table: split, the two PTSD labels, gender, and the
# path to the audio features. `.copy()` detaches the result from `wav_paths_df`.
feature_cols = ["split", "PTSD_severity", "PTSD_label", "gender", "audio"]
audio_features = wav_paths_df.loc[:, feature_cols].copy()
audio_features

Unnamed: 0,split,PTSD_severity,PTSD_label,gender,audio
0,train,22.0,0,female,/home/hice1/asubramanian91/scratch/e-daic/data...
1,test,23.0,0,male,/home/hice1/asubramanian91/scratch/e-daic/data...
2,train,19.0,0,male,/home/hice1/asubramanian91/scratch/e-daic/data...
3,train,67.0,1,female,/home/hice1/asubramanian91/scratch/e-daic/data...
4,dev,39.0,0,male,/home/hice1/asubramanian91/scratch/e-daic/data...
...,...,...,...,...,...
270,test,44.0,1,female,/home/hice1/asubramanian91/scratch/e-daic/data...
271,test,64.0,1,female,/home/hice1/asubramanian91/scratch/e-daic/data...
272,test,,1,male,/home/hice1/asubramanian91/scratch/e-daic/data...
273,train,17.0,0,male,/home/hice1/asubramanian91/scratch/e-daic/data...


In [26]:
# Write the final table to `path_to_save_features_csv`; index=False keeps the
# DataFrame's row index out of the file.
audio_features.to_csv(path_to_save_features_csv, index=False)