In [1]:
import sys

# Extra site-packages directory made importable for this kernel.
# NOTE(review): this is an absolute path into one specific conda environment. Mixing
# another environment's packages into the running kernel can surface version
# conflicts; launching the notebook from that environment is the safer option.
# TODO confirm this entry is still required.
EXTRA_SITE_PACKAGES = '/home/hice1/awagh31/scratch/miniconda3/envs/vlm-debiasing/lib/python3.12/site-packages'

# Guard the append so that re-running this cell does not add duplicate entries.
if EXTRA_SITE_PACKAGES not in sys.path:
    sys.path.append(EXTRA_SITE_PACKAGES)


In [34]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder

# Read the metadata table from CSV. Its 'audio', 'visual' and 'text' columns are used
# further down as paths to the saved embedding files, and 'PTSD_label' as the target.
metadata = pd.read_csv(
    filepath_or_buffer='/home/hice1/asubramanian91/scratch/e-daic/data/e-daic/undersampled_embeddings.csv'
)


In [35]:
# Accumulators filled by the loading loop: one feature vector per successfully loaded
# row, plus the metadata index of that row. Two separate list objects are created.
embeddings, valid_indices = [], []

In [36]:
import os
from sklearn.decomposition import PCA

In [37]:
def load_embedding(file_path):
    """Load one saved embedding from disk with ``np.load``.

    Args:
        file_path: Path of the embedding file to read.

    Returns:
        Whatever ``np.load(file_path, allow_pickle=True)`` produces for that file.

    Raises:
        FileNotFoundError: If nothing exists at ``file_path``.
    """
    if os.path.exists(file_path):
        # NOTE(review): allow_pickle=True lets np.load unpickle objects, which can run
        # arbitrary code if the file comes from an untrusted source.
        return np.load(file_path, allow_pickle=True)
    raise FileNotFoundError(f"Embedding file not found: {file_path}")

# Build one concatenated (audio + visual + text) feature vector per metadata row.
# Start from empty lists on every run: this cell only appends, so re-running it without
# also re-running the cell that created the lists would duplicate every sample.
embeddings = []
valid_indices = []

for idx, row in metadata.iterrows():
    try:
        # Load each modality and flatten it to 1-D so the three arrays can be joined
        # end to end regardless of the shape they were saved with.
        audio_emb = load_embedding(row['audio']).flatten()
        visual_emb = load_embedding(row['visual']).flatten()
        text_emb = load_embedding(row['text']).flatten()

        # Concatenate embeddings to form a multimodal embedding
        multimodal_emb = np.concatenate([audio_emb, visual_emb, text_emb])
    except (ValueError, IOError, FileNotFoundError) as e:
        # Best effort: report the row that failed and carry on with the rest.
        print(f"Skipping index {idx} due to loading error: {e}")
        continue

    # Keep the vector together with the metadata index label it came from, so the
    # matching labels can be selected later for exactly the rows that loaded.
    embeddings.append(multimodal_emb)
    valid_indices.append(idx)

# Stack the per-row vectors into a single (n_samples, n_features) array.
# np.stack raises ValueError if the vectors do not all have the same length.
if embeddings:
    multimodal_embeddings_array = np.stack(embeddings)
    print(f"Multimodal Embeddings Shape: {multimodal_embeddings_array.shape}")
else:
    # Nothing loaded: multimodal_embeddings_array is not (re)assigned in this branch.
    print("No valid embeddings were loaded.")



Multimodal Embeddings Shape: (210, 2236)


In [8]:
# Drops the first element of every `sub` item inside each entry of `embeddings`.
# NOTE(review): the loading loop flattens and concatenates each entry of `embeddings`
# into a 1-D vector, so `sub` would be a single element here and `sub[1:]` looks like
# it cannot work on the current data. This cell's execution count (8) is also lower
# than that of the surrounding cells, and no later visible cell reads
# `embeddings_modified`. Presumably left over from an earlier data layout — confirm
# before relying on it.
embeddings_modified = [[sub[1:] for sub in embedding] for embedding in embeddings]

In [38]:
# Feature matrix for the classifier. `multimodal_embeddings_array` is already a stacked
# array (built with np.stack), so a plain copy is all that is needed; calling np.vstack
# on it would only split it into rows and re-join them.
X = np.array(multimodal_embeddings_array)

In [39]:
# Keep only the metadata rows whose embeddings loaded. `valid_indices` holds index
# labels collected from `metadata.iterrows()`, so select by label with `.loc`;
# `.iloc` is positional and only agrees with the labels for a default 0..n-1 index.
metadata_filtered = metadata.loc[valid_indices]

# After filtering, extract the PTSD labels
y = metadata_filtered['PTSD_label'].values

# Verify that X and y have the same number of samples
print(f"Shape of X: {X.shape}, Shape of y: {y.shape}")
X = np.array(X)
y = np.array(y)

# Features and labels must describe the same rows. Truncating both to the shorter
# length would silently pair vectors with the wrong labels, so stop instead.
if X.shape[0] != y.shape[0]:
    raise ValueError(
        f"Mismatch in the number of samples between X and y: {X.shape[0]} vs {y.shape[0]}"
    )

num_samples = X.shape[0]
X_aligned = X
y_aligned = y

# 80/20 train/test split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X_aligned, y_aligned, test_size=0.2, random_state=42)

Shape of X: (210, 2236), Shape of y: (210,)


In [40]:

# Replace the gender column with integer codes when the column is present.
# Note that this overwrites the column in `metadata` itself.
if 'gender' in metadata.columns:
    le = LabelEncoder()
    gender_codes = le.fit_transform(metadata['gender'])
    metadata['gender'] = gender_codes

# Fit a 50-tree random forest with a fixed seed on the training split that was
# produced by the earlier train_test_split call.
rf_model = RandomForestClassifier(random_state=42, n_estimators=50).fit(X_train, y_train)

# Class predictions for the held-out test split.
y_pred = rf_model.predict(X_test)

In [41]:
# Report overall test accuracy, then the per-class precision / recall / F1 table.
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")

Accuracy: 0.6904761904761905
Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.97      0.81        29
           1       0.50      0.08      0.13        13

    accuracy                           0.69        42
   macro avg       0.60      0.52      0.47        42
weighted avg       0.64      0.69      0.60        42



In [42]:
!pip install fairlearn



In [43]:
# Preview the first five rows of the metadata table. By this point the gender column
# has already been overwritten with integer codes by the encoding cell.
metadata.head(n=5)

Unnamed: 0,Participant,gender,split,PTSD_label,age,PTSD_severity,audio,text,visual
0,411,0,train,0,59,17.0,/home/hice1/asubramanian91/scratch/e-daic/data...,/home/hice1/asubramanian91/scratch/e-daic/data...,/home/hice1/asubramanian91/scratch/e-daic/data...
1,353,0,train,1,35,55.0,/home/hice1/asubramanian91/scratch/e-daic/data...,/home/hice1/asubramanian91/scratch/e-daic/data...,/home/hice1/asubramanian91/scratch/e-daic/data...
2,427,0,train,0,23,26.0,/home/hice1/asubramanian91/scratch/e-daic/data...,/home/hice1/asubramanian91/scratch/e-daic/data...,/home/hice1/asubramanian91/scratch/e-daic/data...
3,402,0,dev,1,45,51.0,/home/hice1/asubramanian91/scratch/e-daic/data...,/home/hice1/asubramanian91/scratch/e-daic/data...,/home/hice1/asubramanian91/scratch/e-daic/data...
4,374,0,dev,0,36,19.0,/home/hice1/asubramanian91/scratch/e-daic/data...,/home/hice1/asubramanian91/scratch/e-daic/data...,/home/hice1/asubramanian91/scratch/e-daic/data...


In [45]:
from fairlearn.metrics import demographic_parity_difference

# Take the sensitive attribute for exactly the rows that produced embeddings
# (`valid_indices` holds their index labels). Using the full, unfiltered metadata would
# only line up with the labels if no row had been skipped during loading.
gender_valid = metadata.loc[valid_indices, 'gender'].values

# Repeat the earlier split on the gender values. With the same number of samples, the
# same test_size and the same random_state, train_test_split selects the same rows,
# which keeps gender_test row-aligned with y_test / y_pred.
gender_train, gender_test = train_test_split(gender_valid, test_size=0.2, random_state=42)

# Ensure that `gender_test`, `y_test`, and `y_pred` are aligned
print(f"Shape of gender_test: {gender_test.shape}")
print(f"Shape of y_test: {y_test.shape}")
print(f"Shape of y_pred: {y_pred.shape}")
if not (len(gender_test) == len(y_test) == len(y_pred)):
    raise ValueError(
        "gender_test, y_test and y_pred must have the same number of samples: "
        f"{len(gender_test)}, {len(y_test)}, {len(y_pred)}"
    )

# Calculate Demographic Parity Difference
# NOTE(review): the saved output of this cell ends in an AttributeError about
# `reset_index` on a numpy array. The cause is not visible here — possibly a package
# version mismatch in the environment; TODO confirm.
demographic_parity = demographic_parity_difference(
    y_true=y_test,
    y_pred=y_pred,
    sensitive_features=gender_test
)

print(f"Demographic Parity Difference: {demographic_parity}")

Shape of gender_test: (42,)
Shape of y_test: (42,)
Shape of y_pred: (42,)


AttributeError: 'numpy.ndarray' object has no attribute 'reset_index'

In [33]:
from fairlearn.metrics import MetricFrame, selection_rate

# Evaluate fairlearn's selection_rate metric on the test predictions, broken down by
# the gender value of each test sample.
metric_frame = MetricFrame(
    metrics=selection_rate,
    y_true=y_test,
    y_pred=y_pred,
    sensitive_features=gender_test,
)

# Per-group values of the metric, printed under a heading.
selection_rates = metric_frame.by_group
print("Selection Rates by Group:", selection_rates, sep="\n")


AttributeError: module 'numpy' has no attribute 'matrix'

In [23]:
# Selection rates of the two encoded gender groups, looked up explicitly by label.
group_a_rate = selection_rates.loc[1]
group_b_rate = selection_rates.loc[0]

# Ratio of the group-1 rate to the group-0 rate.
# NOTE(review): this is a directional ratio (group 1 over group 0), which can exceed 1;
# confirm that this is the intended definition of the parity ratio.
if group_b_rate > 0:  # Avoid division by zero
    demographic_parity_ratio = group_a_rate / group_b_rate
    print(f"Demographic Parity Ratio: {demographic_parity_ratio}")
else:
    # Make the undefined case explicit instead of leaving the name unset (or stale from
    # a previous run) and printing nothing.
    demographic_parity_ratio = float('nan')
    print("Demographic Parity Ratio is undefined: selection rate of group 0 is 0.")

Demographic Parity Ratio: 0.4473684210526316
