In [3]:
import pandas as pd
df = pd.read_parquet("../data/processed/all_user_combined_data_processed.parquet")

print(df.head())
print(df.columns)
print(df['split'].value_counts())

   clinical_f0  clinical_f1  clinical_f2  clinical_f3  clinical_f4  \
0     0.525162     0.673773     0.267560     0.322613     0.930104   
1    -0.607254    -0.749974     0.551444     0.162778    -1.219647   
2     0.119814    -0.359332     0.358457    -0.236400     0.107426   
3     0.974566     0.674282     0.563673    -0.225339     0.082728   
4     0.671426     0.493924     0.326073    -0.397610     0.153739   

   clinical_f5  clinical_f6  clinical_f7  clinical_f8  clinical_f9  ...  \
0    -0.113993    -0.391508     0.207020     0.554596    -0.350246  ...   
1    -0.001206     0.366103     0.101824     0.063507    -0.201003  ...   
2     0.857075     0.894455    -1.210330    -0.880923    -0.670373  ...   
3    -0.417215     0.192510     1.818101     1.111454    -0.521585  ...   
4    -0.022351    -0.190328     0.972940     0.280787    -0.207952  ...   

   audio02_f98  audio02_f99  audio02_f100       age  gender_male  Start_Time  \
0    -0.420073     0.513532      1.224791  1.172

In [4]:
# Separate data by split
train_df = df[df['split'] == 'train']
dev_df = df[df['split'] == 'dev']
test_df = df[df['split'] == 'test']

print(f"Train samples: {len(train_df)}")
print(f"Dev samples: {len(dev_df)}")
print(f"Test samples: {len(test_df)}")

Train samples: 99
Dev samples: 72
Test samples: 0


In [6]:
target_col = 'Depression_label'

clinical_cols = [col for col in df.columns if col.startswith('clinical_')]
audio_cols = [col for col in df.columns if col.startswith('audio')]
video_cols = [col for col in df.columns if col.startswith('video')]
text_cols = [col for col in df.columns if col.startswith('text')]
demo_cols = ['age', 'gender_male']

feature_cols = clinical_cols + audio_cols + video_cols + text_cols + demo_cols

# Now you can safely use target_col here:
X_train = train_df[feature_cols]
y_train = train_df[target_col]

X_dev = dev_df[feature_cols]
y_dev = dev_df[target_col]

X_test = test_df[feature_cols]
y_test = test_df[target_col]



In [13]:
print(X_train.shape)
print(y_train.shape)
print(X_dev.shape)
print(y_dev.shape)
print(X_test.shape)
print(y_test.shape)

(99, 1700)
(99,)
(72, 1700)
(72,)
(0, 1700)
(0,)


In [14]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score
from torch.utils.data import TensorDataset, DataLoader

# Convert numpy arrays or pandas to tensors
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32)

X_dev_tensor = torch.tensor(X_dev.values, dtype=torch.float32)
y_dev_tensor = torch.tensor(y_dev.values, dtype=torch.float32)

X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32)

# Create DataLoader
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
dev_dataset = TensorDataset(X_dev_tensor, y_dev_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=64)
test_loader = DataLoader(test_dataset, batch_size=64)


In [15]:
class MLP(nn.Module):
    def __init__(self, input_dim):
        super(MLP, self).__init__()
        self.network = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, 1),
            nn.Sigmoid()  # Output for binary classification
        )

    def forward(self, x):
        return self.network(x)


In [16]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = MLP(input_dim=X_train.shape[1]).to(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 20

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device).unsqueeze(1)

        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")


Epoch 1, Loss: 1.1154
Epoch 2, Loss: 0.6080
Epoch 3, Loss: 0.2444
Epoch 4, Loss: 0.0924
Epoch 5, Loss: 0.0292
Epoch 6, Loss: 0.0101
Epoch 7, Loss: 0.0026
Epoch 8, Loss: 0.0020
Epoch 9, Loss: 0.0005
Epoch 10, Loss: 0.0002
Epoch 11, Loss: 0.0001
Epoch 12, Loss: 0.0000
Epoch 13, Loss: 0.0000
Epoch 14, Loss: 0.0000
Epoch 15, Loss: 0.0000
Epoch 16, Loss: 0.0000
Epoch 17, Loss: 0.0000
Epoch 18, Loss: 0.0000
Epoch 19, Loss: 0.0000
Epoch 20, Loss: 0.0000


In [17]:
model.eval()
all_preds = []

with torch.no_grad():
    for X_batch, _ in test_loader:
        X_batch = X_batch.to(device)
        preds = model(X_batch)
        all_preds.extend(preds.cpu().numpy())

# Convert probabilities to class labels
y_pred = [1 if p >= 0.5 else 0 for p in all_preds]
acc = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {acc:.4f}")


Test Accuracy: nan


  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
