In [7]:
import os
import pandas as pd

# Scan every subject's processed CSV files and report those without a 'time' column.
# Results are collected in `missing_time_files` (full paths) and summarised at the end.
base_path = '../data'
missing_time_files = []

print(f"Starting to scan processed CSV files under {base_path}...\n")

# os.listdir returns entries in arbitrary order; sorting keeps the report reproducible.
for subject in sorted(os.listdir(base_path)):
    subject_path = os.path.join(base_path, subject)
    if not os.path.isdir(subject_path):
        continue

    for modality in ['audio', 'video', 'text', 'clinical']:
        processed_path = os.path.join(subject_path, modality, 'processed')
        # isdir rather than exists: a stray *file* named 'processed' would make listdir raise.
        if not os.path.isdir(processed_path):
            continue

        for filename in sorted(os.listdir(processed_path)):
            if not filename.endswith('.csv'):
                continue

            file_path = os.path.join(processed_path, filename)
            try:
                # nrows=0 parses only the header row, which is all this check needs.
                columns = pd.read_csv(file_path, nrows=0).columns
            except Exception as e:
                # Best-effort scan: report unreadable files and keep going.
                print(f"{subject} | {modality} | {filename} | Error reading file: {e}")
                continue

            if 'time' not in columns:
                print(f"{subject} | {modality} | {filename} | time column: NO - Missing time column!")
                missing_time_files.append(file_path)

print(f"\nTotal files missing 'time' column: {len(missing_time_files)}")
if missing_time_files:
    print("List of files missing 'time' column:")
    for f in missing_time_files:
        print(f)


Starting to scan processed CSV files under ../data...

300_P | text | 300_Transcript_reprocessed_scaled.csv | time column: NO - Missing time column!
300_P | clinical | processed_300_Transcript_biobert_features.csv | time column: NO - Missing time column!
302_P | text | 302_Transcript_reprocessed_scaled.csv | time column: NO - Missing time column!
302_P | clinical | processed_302_Transcript_biobert_features.csv | time column: NO - Missing time column!
308_P | text | 308_Transcript_reprocessed_scaled.csv | time column: NO - Missing time column!
308_P | clinical | processed_308_Transcript_biobert_features.csv | time column: NO - Missing time column!
301_P | text | 301_Transcript_reprocessed_scaled.csv | time column: NO - Missing time column!
301_P | clinical | processed_301_Transcript_biobert_features.csv | time column: NO - Missing time column!

Total files missing 'time' column: 8
List of files missing 'time' column:
../data/300_P/text/processed/300_Transcript_reprocessed_scaled.csv
../

In [3]:
import os
import pandas as pd
import numpy as np

# Input root (one sub-directory per subject) and output location of the combined table.
base_path = '../data'
save_path = '../data_combine'
final_combined_path = os.path.join(save_path, 'all_subjects_combined.csv')

os.makedirs(save_path, exist_ok=True)

print("Starting to combine features for all subjects into a single file...\n")

# Per-subject combined frames are accumulated here by the processing loop.
all_subjects_data = []

# Subject directories are those whose name ends in '_P'.
# Sorted because os.listdir order is arbitrary, which made the processing order
# (and the log) differ from run to run.
valid_subjects = sorted(
    entry for entry in os.listdir(base_path)
    if entry.endswith('_P') and os.path.isdir(os.path.join(base_path, entry))
)
total_subjects = len(valid_subjects)

def _first_csv(directory):
    """Return the path of the alphabetically first CSV in `directory`, or None if it holds none.

    Raises FileNotFoundError (from os.listdir) when `directory` does not exist.
    """
    names = sorted(name for name in os.listdir(directory) if name.endswith('.csv'))
    return os.path.join(directory, names[0]) if names else None


def map_by_time(df, times, start_col='Start_Time', end_col='End_Time'):
    """Look up, for every timestamp, the first row of `df` whose time window contains it.

    Parameters
    ----------
    df : DataFrame with `start_col` / `end_col` columns plus feature columns.
    times : 1-D sequence of timestamps to look up.
    start_col, end_col : names of the (inclusive) window boundary columns.

    Returns
    -------
    DataFrame with one row per entry of `times` and one column per feature column
    of `df` (window columns removed). Timestamps covered by no window yield a row
    of NaN. When several windows contain a timestamp, the first one in `df` order wins.
    """
    feature_cols = [col for col in df.columns if col not in (start_col, end_col)]
    times = np.asarray(times, dtype=float)
    starts = df[start_col].to_numpy(dtype=float)
    ends = df[end_col].to_numpy(dtype=float)

    if starts.size == 0:
        # No windows at all: every timestamp is unmatched (argmax would fail on an empty axis).
        first_match = np.full(times.shape[0], -1)
    else:
        # inside[i, j] is True when times[i] lies in [starts[j], ends[j]].
        # This is a len(times) x len(df) boolean matrix, so memory grows with both.
        inside = (starts[np.newaxis, :] <= times[:, np.newaxis]) & (times[:, np.newaxis] <= ends[np.newaxis, :])
        # argmax gives the position of the first True per row; -1 marks "no window matched".
        first_match = np.where(inside.any(axis=1), inside.argmax(axis=1), -1)

    # After reset_index the labels are 0..len(df)-1, so reindexing with -1 produces an all-NaN row.
    features = df[feature_cols].reset_index(drop=True)
    return features.reindex(first_match).reset_index(drop=True)


# Build one combined frame per subject; a failure in one subject is reported and skipped.
for subject in valid_subjects:
    subject_path = os.path.join(base_path, subject)

    try:
        print(f"Processing subject: {subject}")

        # Locate one processed CSV per modality before loading anything.
        csv_paths = {
            modality: _first_csv(os.path.join(subject_path, modality, 'processed'))
            for modality in ('audio', 'video', 'text', 'clinical')
        }
        if not all(csv_paths.values()):
            raise FileNotFoundError("One or more modality files missing.")

        # Load data
        audio_df = pd.read_csv(csv_paths['audio'])
        video_df = pd.read_csv(csv_paths['video'])
        text_df = pd.read_csv(csv_paths['text'])
        clinical_df = pd.read_csv(csv_paths['clinical'])

        # Check for required columns
        if 'time' not in audio_df.columns or 'time' not in video_df.columns:
            raise ValueError("Missing 'time' column in audio or video.")
        if 'Start_Time' not in text_df.columns or 'End_Time' not in text_df.columns:
            raise ValueError("Missing 'Start_Time'/'End_Time' in text.")
        if 'Start_Time' not in clinical_df.columns or 'End_Time' not in clinical_df.columns:
            raise ValueError("Missing 'Start_Time'/'End_Time' in clinical.")

        # Prefix feature columns with their modality so names stay unique after combining;
        # the time-key columns keep their names.
        audio_df = audio_df.rename(columns=lambda x: f'audio_{x}' if x != 'time' else x)
        video_df = video_df.rename(columns=lambda x: f'video_{x}' if x != 'time' else x)
        text_df = text_df.rename(columns=lambda x: f'text_{x}' if x not in ['Start_Time', 'End_Time'] else x)
        clinical_df = clinical_df.rename(columns=lambda x: f'clinical_{x}' if x not in ['Start_Time', 'End_Time'] else x)

        # Round the float join key to 3 decimals so audio and video timestamps compare equal.
        audio_df['time'] = audio_df['time'].astype(float).round(3)
        video_df['time'] = video_df['time'].astype(float).round(3)

        # Inner join: only timestamps present in both audio and video survive.
        # No suffixes needed, the modality prefixes already rule out column clashes.
        merged_df = pd.merge(audio_df, video_df, on='time')

        # Attach the text / clinical row whose [Start_Time, End_Time] window covers each timestamp.
        text_features = map_by_time(text_df, merged_df['time'])
        clinical_features = map_by_time(clinical_df, merged_df['time'])

        # Combine all features column-wise (all three frames share a 0..n-1 index).
        combined_df = pd.concat([
            merged_df.reset_index(drop=True),
            text_features,
            clinical_features
        ], axis=1)
        combined_df.insert(0, 'subject_id', subject)

        all_subjects_data.append(combined_df)

        print(f"✔ Done: {subject} ({combined_df.shape[0]} rows)\n")

    except Exception as e:
        # Deliberate best-effort: log the failing subject and continue with the rest.
        print(f"✘ Error processing {subject}: {e}\n")

# Save combined data with progress messages
if all_subjects_data:
    print("Saving combined data...\n")

    # Stack all subjects; columns missing for a subject are filled with NaN by concat.
    final_df = pd.concat(all_subjects_data, ignore_index=True)
    final_df = final_df.drop(columns=['time'], errors='ignore')

    # Rows are written subject by subject (in sorted subject_id order) so progress can be reported.
    subject_groups = final_df.groupby('subject_id')
    total_save = subject_groups.ngroups

    # A single handle for header and rows. newline='' is required when handing a text
    # handle to to_csv (otherwise Windows gets doubled line endings), and the explicit
    # encoding keeps header and rows consistent regardless of the platform default.
    with open(final_combined_path, 'w', newline='', encoding='utf-8') as f:
        # Zero-row slice -> writes only the header line.
        final_df.iloc[0:0].to_csv(f, index=False)

        for idx, (subject, subject_rows) in enumerate(subject_groups, start=1):
            # Write each subject's data without header and index
            subject_rows.to_csv(f, index=False, header=False)
            print(f"Saving progress: {idx}/{total_save} subjects saved")

    print("\nSaving complete.")
    display(final_df.head(20))
else:
    print("⚠ No valid subject data was processed.")


Starting to combine features for all subjects into a single file...

Processing subject: 300_P
✔ Done: 300_P (19458 rows)

Processing subject: 302_P
✔ Done: 302_P (22766 rows)

Processing subject: 308_P
✔ Done: 308_P (26031 rows)

Processing subject: 301_P
✔ Done: 301_P (24721 rows)

Saving combined data...

Saving progress: 1/4 subjects saved
Saving progress: 2/4 subjects saved
Saving progress: 3/4 subjects saved
Saving progress: 4/4 subjects saved

Saving complete.


Unnamed: 0,subject_id,audio_neuron_0,audio_neuron_1,audio_neuron_2,audio_neuron_3,audio_neuron_4,audio_neuron_5,audio_neuron_6,audio_neuron_7,audio_neuron_8,...,audio_col92,audio_col93,audio_col94,audio_col95,audio_col96,audio_col97,audio_col98,audio_col99,audio_col100,audio_col101
0,300_P,0.0,0.0,0.0,0.382615,0.14541,0.0,0.64715,0.0,0.0,...,,,,,,,,,,
1,300_P,0.0,0.0,0.0,0.445436,0.213644,0.0,0.529493,0.0,0.0,...,,,,,,,,,,
2,300_P,0.0,0.0,0.0,0.432049,0.047902,0.0,0.604765,0.0,0.0,...,,,,,,,,,,
3,300_P,0.0,0.0,0.0,0.49136,0.194395,0.0,0.485144,0.0,0.0,...,,,,,,,,,,
4,300_P,0.0,0.0,0.0,0.483452,0.107363,0.0,0.684674,0.0,0.0,...,,,,,,,,,,
5,300_P,0.0,0.0,0.0,0.388975,0.232173,0.0,0.717729,0.0,0.0,...,,,,,,,,,,
6,300_P,0.0,0.0,0.0,0.447071,0.381611,0.0,0.521644,0.0,0.0,...,,,,,,,,,,
7,300_P,0.0,0.0,0.0,0.336824,0.393669,0.0,0.638716,0.0,0.0,...,,,,,,,,,,
8,300_P,0.0,0.0,0.0,0.455114,0.371693,0.0,0.703352,0.0,0.0,...,,,,,,,,,,
9,300_P,0.0,0.0,0.0,0.234666,0.576909,0.0,0.639348,0.0,0.0,...,,,,,,,,,,


In [2]:
import pandas as pd

# Combined feature table to preview
final_combined_path = '../data_combine/all_subjects_combined.csv'

df = pd.read_csv(final_combined_path)

# Preview size: at most this many leading rows and leading columns are shown
num_rows = 50
num_cols = 100

# Slicing past the end is harmless, so narrower tables simply show every column
cols_to_show = df.columns[:num_cols]

# Leading rows of the selected columns
display_df = df[cols_to_show].head(num_rows)

# Rich table rendering (Jupyter / IPython)
display(display_df)

# Outside a notebook, a plain-text alternative is:
# print(display_df.to_string())


Unnamed: 0,subject_id,audio_neuron_0,audio_neuron_1,audio_neuron_2,audio_neuron_3,audio_neuron_4,audio_neuron_5,audio_neuron_6,audio_neuron_7,audio_neuron_8,...,audio_neuron_89,audio_neuron_90,audio_neuron_91,audio_neuron_92,audio_neuron_93,audio_neuron_94,audio_neuron_95,audio_neuron_96,audio_neuron_97,audio_neuron_98
0,300_P,0.0,0.0,0.0,0.382615,0.14541,0.0,0.64715,0.0,0.0,...,0.244159,0.0,0.396989,0.742742,0.0,0.0,0.0,0.0,0.0,0.0
1,300_P,0.0,0.0,0.0,0.445436,0.213644,0.0,0.529493,0.0,0.0,...,0.0,0.0,0.327531,0.665737,0.0,0.0,0.0,0.0,0.0,0.0
2,300_P,0.0,0.0,0.0,0.432049,0.047902,0.0,0.604765,0.0,0.0,...,0.201871,0.214847,0.372936,0.737832,0.0,0.0,0.0,0.0,0.0,0.0
3,300_P,0.0,0.0,0.0,0.49136,0.194395,0.0,0.485144,0.0,0.0,...,0.0,0.113611,0.310179,0.588511,0.0,0.0,0.0,0.0,0.0,0.0
4,300_P,0.0,0.0,0.0,0.483452,0.107363,0.0,0.684674,0.0,0.0,...,0.0,0.145585,0.35288,1.0,0.0,0.0,0.0,0.0,0.0,0.0
5,300_P,0.0,0.0,0.0,0.388975,0.232173,0.0,0.717729,0.0,0.0,...,0.0,0.137228,0.30211,0.858662,0.0,0.0,0.0,0.0,0.0,0.0
6,300_P,0.0,0.0,0.0,0.447071,0.381611,0.0,0.521644,0.0,0.0,...,0.0,0.0,0.62912,0.669236,0.0,0.0,0.0,0.0,0.0,0.0
7,300_P,0.0,0.0,0.0,0.336824,0.393669,0.0,0.638716,0.0,0.0,...,0.123428,0.490974,0.429628,0.571974,0.0,0.0,0.0,0.0,0.0,0.0
8,300_P,0.0,0.0,0.0,0.455114,0.371693,0.0,0.703352,0.0,0.0,...,0.0,0.501026,0.5839,0.585431,0.0,0.0,0.0,0.0,0.0,0.0
9,300_P,0.0,0.0,0.0,0.234666,0.576909,0.0,0.639348,0.0,0.0,...,0.055237,0.374177,0.846205,0.773636,0.0,0.773215,0.0,0.0,0.0,0.0


In [1]:
import pandas as pd

# Combined feature table to inspect
final_combined_path = '../data_combine/all_subjects_combined.csv'

df = pd.read_csv(final_combined_path)
column_names = set(df.columns)

# Report whether a 'label' column is present in the table
if 'label' in column_names:
    print("yes")

# Per-subject counts are impossible without the subject identifier
if 'subject_id' not in column_names:
    raise ValueError("Column 'subject_id' not found in the CSV file.")

# Number of rows per subject, ordered by subject id
subject_counts = df['subject_id'].value_counts().sort_index()

for subject_id, n_rows in subject_counts.items():
    print(f"{subject_id} - {n_rows} rows")


yes
300_P - 19458 rows
301_P - 24721 rows
302_P - 22766 rows
308_P - 26031 rows


In [10]:
import pandas as pd
import os

# Attach the per-subject depression label to the combined feature table and
# write the result back to the same CSV.
# Paths are relative to the directory the notebook is run from.
combined_csv_path = os.path.join('../data_combine', 'all_subjects_combined.csv')
labels_csv_path = os.path.join('../data', 'lables', 'processed', 'depression_labels.csv')

# Load combined features CSV
df = pd.read_csv(combined_csv_path)

# Load labels CSV and expose the label under the name 'label'
labels_df = pd.read_csv(labels_csv_path).rename(columns={'Depression_label': 'label'})

# This cell overwrites its own input, so on a second run `df` already carries the
# label columns. Drop them first; otherwise the merge yields 'label_x'/'label_y'
# and the 'label' lookup below raises KeyError.
stale_cols = [
    col for col in labels_df.columns
    if col not in ('id', 'subject_id') and col in df.columns
]
df = df.drop(columns=stale_cols)

# Left merge keeps every feature row. validate='many_to_one' raises if the label
# table lists an id more than once, which would otherwise silently duplicate rows.
df_labeled = pd.merge(
    df, labels_df,
    left_on='subject_id', right_on='id',
    how='left', validate='many_to_one',
)

# Drop the extra 'id' column (subject_id already identifies the subject)
df_labeled = df_labeled.drop(columns='id')

# Report subjects without a label before defaulting them, so an id-format mismatch
# between the two files does not go unnoticed.
unlabeled = df_labeled.loc[df_labeled['label'].isna(), 'subject_id'].unique()
if len(unlabeled):
    print(
        f"Warning: no label found for {len(unlabeled)} subject(s); defaulting to 0: "
        + ", ".join(map(str, unlabeled))
    )

# Fill missing labels (if any) with 0
# NOTE(review): 0 presumably means "not depressed" - confirm this default is intended.
df_labeled['label'] = df_labeled['label'].fillna(0).astype(int)

# Save updated CSV (overwrites the combined file in place)
df_labeled.to_csv(combined_csv_path, index=False)

print(f"Labels added and saved to {combined_csv_path}")
print(df_labeled[['subject_id', 'label']].drop_duplicates())


Labels added and saved to ../data_combine/all_subjects_combined.csv
      subject_id  label
0          300_P      0
19458      301_P      0
44179      302_P      0
66945      308_P      1
