In [27]:
import os
import pandas as pd

# Define base paths for all modalities
BASE_DIR = '../../data/interim/'  # Assuming you're running from 'interim' folder
MODALITY_FOLDERS = ['audio_features', 'clinical_features', 'text_features', 'video_features']

def check_parquet_files_shapes_and_columns(modality_path, modality_name):
    print(f"\n🔍 Checking modality: {modality_name} ({modality_path})")

    time_columns_to_check = {'time', 'Start_Time', 'End_Time'}

    for user_folder in sorted(os.listdir(modality_path)):
        user_path = os.path.join(modality_path, user_folder)
        if not os.path.isdir(user_path) or user_folder.startswith('.'):
            continue

        for file in sorted(os.listdir(user_path)):
            if file.endswith('.parquet'):
                file_path = os.path.join(user_path, file)
                try:
                    df = pd.read_parquet(file_path)
                    shape_str = f"Shape: {df.shape} (rows, columns)"

                    # Check if any of the time columns exist
                    existing_time_cols = [col for col in time_columns_to_check if col in df.columns]

                    if existing_time_cols:
                        print(f"📁 {user_folder} | 📄 {file} --> {shape_str} | Time columns: {existing_time_cols}")
                    else:
                        print(f"📁 {user_folder} | 📄 {file} --> {shape_str}")

                except Exception as e:
                    print(f"❌ Error reading {file_path}: {e}")

# Main loop to check all modalities
for modality in MODALITY_FOLDERS:
    modality_path = os.path.join(BASE_DIR, modality)
    if os.path.exists(modality_path):
        check_parquet_files_shapes_and_columns(modality_path, modality)
    else:
        print(f"⚠️ Folder not found: {modality_path}")


🔍 Checking modality: audio_features (../../data/interim/audio_features)
📁 301_P | 📄 processed_audio_features.parquet --> Shape: (8239, 101) (rows, columns) | Time columns: ['time']
📁 301_P | 📄 processed_audio_features02.parquet --> Shape: (8239, 101) (rows, columns) | Time columns: ['time']
📁 302_P | 📄 processed_audio_features.parquet --> Shape: (7575, 101) (rows, columns) | Time columns: ['time']
📁 302_P | 📄 processed_audio_features02.parquet --> Shape: (7575, 101) (rows, columns) | Time columns: ['time']

🔍 Checking modality: clinical_features (../../data/interim/clinical_features)
📁 301_P | 📄 clinical_features.parquet --> Shape: (72, 770) (rows, columns) | Time columns: ['End_Time', 'Start_Time']
📁 302_P | 📄 clinical_features.parquet --> Shape: (99, 770) (rows, columns) | Time columns: ['End_Time', 'Start_Time']

🔍 Checking modality: text_features (../../data/interim/text_features)
📁 301_P | 📄 text_features.parquet --> Shape: (72, 770) (rows, columns) | Time columns: ['End_Time', '

In [28]:
import os
import pandas as pd

# Base directories
BASE_DIR = '../../data/interim'
OUTPUT_ROOT = '../../data/processed'

MODALITY_PATHS = {
    'clinical': os.path.join(BASE_DIR, 'clinical_features'),
    'text': os.path.join(BASE_DIR, 'text_features'),
    'video': os.path.join(BASE_DIR, 'video_features'),
    'audio': os.path.join(BASE_DIR, 'audio_features'),
}

FILE_NAMES = {
    'clinical': 'clinical_features.parquet',
    'text': 'text_features.parquet',
    'video': 'processed_video_features01.parquet',
    'audio': ['processed_audio_features.parquet', 'processed_audio_features02.parquet'],
}

def average_pool(df, intervals_df):
    pooled_rows = []
    for _, interval in intervals_df.iterrows():
        mask = (df['time'] >= interval['Start_Time']) & (df['time'] <= interval['End_Time'])
        window_df = df.loc[mask].drop(columns=['time'], errors='ignore')
        if len(window_df) > 0:
            pooled_rows.append(window_df.mean(numeric_only=True))
        else:
            pooled_rows.append(pd.Series([float('nan')] * (df.shape[1]-1), index=df.columns.drop('time')))
    return pd.DataFrame(pooled_rows)

def combine_all_modalities(user_id):
    clinical_path = os.path.join(MODALITY_PATHS['clinical'], user_id, FILE_NAMES['clinical'])
    text_path = os.path.join(MODALITY_PATHS['text'], user_id, FILE_NAMES['text'])
    video_path = os.path.join(MODALITY_PATHS['video'], user_id, FILE_NAMES['video'])
    audio_paths = [os.path.join(MODALITY_PATHS['audio'], user_id, f) for f in FILE_NAMES['audio']]

    # Check existence
    if not os.path.isfile(clinical_path):
        print(f"⚠️ Missing clinical file: {clinical_path}")
        return
    if not os.path.isfile(text_path):
        print(f"⚠️ Missing text file: {text_path}")
        return
    if not os.path.isfile(video_path):
        print(f"⚠️ Missing video file: {video_path}")
        return
    for p in audio_paths:
        if not os.path.isfile(p):
            print(f"⚠️ Missing audio file: {p}")
            return

    try:
        clinical_df = pd.read_parquet(clinical_path)
        text_df = pd.read_parquet(text_path)
        video_df = pd.read_parquet(video_path)
        audio_dfs = [pd.read_parquet(p) for p in audio_paths]

        # Check rows alignment
        if clinical_df.shape[0] != text_df.shape[0]:
            print(f"❌ Row count mismatch for user {user_id}: clinical {clinical_df.shape[0]}, text {text_df.shape[0]}")
            return

        interval_cols = ['Start_Time', 'End_Time']
        intervals_df = clinical_df[interval_cols].copy()

        clinical_features = clinical_df.drop(columns=interval_cols)
        text_features = text_df.drop(columns=interval_cols)

        pooled_video_df = average_pool(video_df, intervals_df)
        pooled_audio_dfs = [average_pool(df, intervals_df) for df in audio_dfs]
        pooled_audio_df = pd.concat(pooled_audio_dfs, axis=1)

        combined_df = pd.concat([intervals_df, clinical_features, text_features, pooled_video_df, pooled_audio_df], axis=1)

        # Save combined dataframe
        user_output_dir = os.path.join(OUTPUT_ROOT, user_id)
        os.makedirs(user_output_dir, exist_ok=True)
        output_path = os.path.join(user_output_dir, 'combined_features.parquet')
        combined_df.to_parquet(output_path)

        print(f"\n✅ User: {user_id}")
        print(f"Combined shape: {combined_df.shape}")
        print(f"Saved combined data to: {output_path}")

    except Exception as e:
        print(f"❌ Error processing user {user_id}: {e}")

# Get list of users (assuming same users in clinical folder)
user_folders = sorted(os.listdir(MODALITY_PATHS['clinical']))
user_folders = [u for u in user_folders if os.path.isdir(os.path.join(MODALITY_PATHS['clinical'], u)) and not u.startswith('.')]

for user_id in user_folders:
    combine_all_modalities(user_id)



✅ User: 301_P
Combined shape: (72, 1838)
Saved combined data to: ../../data/processed/301_P/combined_features.parquet

✅ User: 302_P
Combined shape: (99, 1838)
Saved combined data to: ../../data/processed/302_P/combined_features.parquet


In [29]:
import os
import pandas as pd

PROCESSED_DIR = '../../data/processed'

def analyze_combined_file(user_id):
    file_path = os.path.join(PROCESSED_DIR, user_id, 'combined_features.parquet')
    if not os.path.isfile(file_path):
        print(f"⚠️ File not found for user {user_id}: {file_path}")
        return

    try:
        df = pd.read_parquet(file_path)

        print(f"\n📊 User: {user_id}")
        print(f"Shape: {df.shape}")

        # Null columns
        null_cols = df.columns[df.isnull().any()].tolist()
        if null_cols:
            print(f"Columns with NULL values: {null_cols}")
        else:
            print("No columns contain NULL values.")

        # Time columns detection (case-insensitive)
        time_cols = [col for col in df.columns if 'time' in col.lower()]
        if time_cols:
            print(f"Time-related columns found: {time_cols}")
        else:
            print("No time-related columns found.")

    except Exception as e:
        print(f"❌ Error loading/analyzing user {user_id}: {e}")

# List all users based on folders in processed directory
user_folders = sorted(os.listdir(PROCESSED_DIR))
user_folders = [u for u in user_folders if os.path.isdir(os.path.join(PROCESSED_DIR, u)) and not u.startswith('.')]

for user_id in user_folders:
    analyze_combined_file(user_id)



📊 User: 301_P
Shape: (72, 1838)
No columns contain NULL values.
Time-related columns found: ['Start_Time', 'End_Time']

📊 User: 302_P
Shape: (99, 1838)
No columns contain NULL values.
Time-related columns found: ['Start_Time', 'End_Time']


In [32]:
import os
import pandas as pd

PROCESSED_DIR = '../../data/processed'

def get_file_info(user_id):
    file_path = os.path.join(PROCESSED_DIR, user_id, 'combined_features.parquet')
    if not os.path.isfile(file_path):
        print(f"⚠️ File not found for user {user_id}: {file_path}")
        return None
    
    try:
        df = pd.read_parquet(file_path)
        file_size_mb = os.path.getsize(file_path) / (1024 * 1024)  # size in MB
        rows, cols = df.shape
        return {
            'User': user_id,
            'File Size (MB)': round(file_size_mb, 3),
            'Rows': rows,
            'Columns': cols
        }
    except Exception as e:
        print(f"❌ Error reading file for user {user_id}: {e}")
        return None

# Collect info for all users
user_folders = sorted(os.listdir(PROCESSED_DIR))
user_folders = [u for u in user_folders if os.path.isdir(os.path.join(PROCESSED_DIR, u)) and not u.startswith('.')]

all_info = []

for user_id in user_folders:
    info = get_file_info(user_id)
    if info:
        all_info.append(info)

# Print combined table
if all_info:
    summary_df = pd.DataFrame(all_info)
    print("\n📋 Summary of all users' feature files:")
    print(summary_df.to_string(index=False))
else:
    print("No user files found or readable.")



📋 Summary of all users' feature files:
 User  File Size (MB)  Rows  Columns
301_P           1.553    72     1838
302_P           1.765    99     1838


In [37]:
import os
import pandas as pd

PROCESSED_DIR = '../../data/processed'
OUTPUT_FILE = os.path.join(PROCESSED_DIR, 'all_user_combined_data.parquet')

def load_and_combine_all_users(base_dir):
    user_folders = sorted([
        u for u in os.listdir(base_dir)
        if os.path.isdir(os.path.join(base_dir, u)) and not u.startswith('.')
    ])

    all_dfs = []
    for user_id in user_folders:
        file_path = os.path.join(base_dir, user_id, 'combined_features.parquet')
        if not os.path.isfile(file_path):
            print(f"⚠️ File not found for user {user_id}: {file_path}")
            continue

        try:
            file_size_mb = os.path.getsize(file_path) / (1024 * 1024)
            df = pd.read_parquet(file_path)
            df['user_id'] = user_id  # Add user_id column
            all_dfs.append(df)
            print(f"✅ Loaded data for user: {user_id} | Shape: {df.shape} | File size: {file_size_mb:.2f} MB")
        except Exception as e:
            print(f"❌ Error loading data for user {user_id}: {e}")

    if all_dfs:
        combined_df = pd.concat(all_dfs, ignore_index=True, sort=False)
        print(f"\n🔗 Combined data shape: {combined_df.shape}")
        return combined_df
    else:
        print("❌ No data loaded. Combined dataframe is empty.")
        return pd.DataFrame()

def preview_combined_data(df, max_cols=10, max_rows=10):
    # Always include user_id and time-related columns (case-insensitive)
    must_include = [col for col in df.columns if col == 'user_id' or 'time' in col.lower()]
    # Other columns to fill the rest of max_cols (excluding must_include)
    other_cols = [col for col in df.columns if col not in must_include]
    
    # Limit number of other columns to max_cols - len(must_include)
    n_other = max(0, max_cols - len(must_include))
    selected_cols = must_include + other_cols[:n_other]

    print(f"\n📋 Preview of combined data (first {max_rows} rows, columns shown: {len(selected_cols)})")
    print(df[selected_cols].head(max_rows).to_string(index=False))

def main():
    combined_data = load_and_combine_all_users(PROCESSED_DIR)
    if not combined_data.empty:
        combined_data.to_parquet(OUTPUT_FILE, index=False)
        final_size_mb = os.path.getsize(OUTPUT_FILE) / (1024 * 1024)
        print(f"💾 Saved combined data to: {OUTPUT_FILE} | File size: {final_size_mb:.2f} MB")
        
        # Print preview with required columns
        preview_combined_data(combined_data, max_cols=10, max_rows=10)

if __name__ == "__main__":
    main()


✅ Loaded data for user: 301_P | Shape: (72, 1839) | File size: 1.55 MB
✅ Loaded data for user: 302_P | Shape: (99, 1839) | File size: 1.76 MB

🔗 Combined data shape: (171, 1839)
💾 Saved combined data to: ../../data/processed/all_user_combined_data.parquet | File size: 2.36 MB

📋 Preview of combined data (first 10 rows, columns shown: 10)
 Start_Time  End_Time user_id  clinical_f0  clinical_f1  clinical_f2  clinical_f3  clinical_f4  clinical_f5  clinical_f6
        0.8       7.0   301_P     0.406030     0.313151    -0.116788     0.061381    -0.090267    -0.005268     0.106040
       41.9      42.5   301_P     0.233028     0.043382    -0.064205     0.025771    -0.487324     0.013373     0.252081
       52.9      55.8   301_P     0.344104     0.117400    -0.099952    -0.063165    -0.242215     0.155228     0.353929
       59.7      60.7   301_P     0.474687     0.313247    -0.061940    -0.060701    -0.246777    -0.055384     0.218618
       63.4      64.2   301_P     0.428376     0.279073

In [47]:
import os

file_path = '../../data/labels/'

files = os.listdir(file_path)
print("Files in", file_path)
for f in files:
    print(f)


Files in ../../data/labels/
Detailed_PHQ8_Labels.csv
.ipynb_checkpoints
detailed_lables.csv


In [51]:
import pandas as pd

file_path = '../../data/labels/detailed_lables.csv'

# Load the CSV file
df = pd.read_csv(file_path)

# Select only the desired columns
selected_cols = ['Participant', 'age', 'gender', 'Depression_label','split']
df_selected = df[selected_cols]

# Show the first 10 rows as a table
print(df_selected.head(10).to_string(index=False))


 Participant  age gender  Depression_label split
         300   33   male                 0   dev
         301   39   male                 0   dev
         302   25   male                 0 train
         303   41 female                 0 train
         304   22 female                 0 train
         305   55   male                 0 train
         306   62 female                 0   dev
         307   23 female                 0 train
         308   40 female                 1 train
         309   19   male                 1 train


In [56]:
import os
import pandas as pd

PROCESSED_DIR = '../../data/processed'
LABELS_FILE = '../../data/labels/detailed_lables.csv'
OUTPUT_FILE = os.path.join(PROCESSED_DIR, 'all_user_combined_data.parquet')

def load_and_combine_all_users(base_dir):
    user_folders = sorted([
        u for u in os.listdir(base_dir)
        if os.path.isdir(os.path.join(base_dir, u)) and not u.startswith('.')
    ])

    all_dfs = []
    for user_id in user_folders:
        file_path = os.path.join(base_dir, user_id, 'combined_features.parquet')
        if not os.path.isfile(file_path):
            print(f"⚠️ File not found for user {user_id}: {file_path}")
            continue

        try:
            file_size_mb = os.path.getsize(file_path) / (1024 * 1024)
            df = pd.read_parquet(file_path)
            df['user_id'] = user_id  # Add user_id column
            all_dfs.append(df)
            print(f"✅ Loaded data for user: {user_id} | Shape: {df.shape} | File size: {file_size_mb:.2f} MB")
        except Exception as e:
            print(f"❌ Error loading data for user {user_id}: {e}")

    if all_dfs:
        combined_df = pd.concat(all_dfs, ignore_index=True, sort=False)
        print(f"\n🔗 Combined data shape: {combined_df.shape}")
        return combined_df
    else:
        print("❌ No data loaded. Combined dataframe is empty.")
        return pd.DataFrame()

def preview_data(df, max_cols=15, max_rows=10):
    # Priority columns to show first if they exist
    metadata_cols = ['age', 'gender', 'Depression_label', 'split']
    metadata_present = [col for col in metadata_cols if col in df.columns]

    user_time_cols = []
    if 'user_id' in df.columns:
        user_time_cols.append('user_id')
    user_time_cols += [col for col in df.columns if 'time' in col.lower()]

    # Other columns excluding the prioritized ones
    other_cols = [col for col in df.columns if col not in metadata_present + user_time_cols]

    # Limit number of other columns shown
    n_other = max(0, max_cols - len(metadata_present) - len(user_time_cols))

    selected_cols = metadata_present + user_time_cols + other_cols[:n_other]

    print(f"\n📋 Preview of data (first {max_rows} rows, columns shown: {len(selected_cols)})")
    print(df[selected_cols].head(max_rows).to_string(index=False))

def main():
    combined_data = load_and_combine_all_users(PROCESSED_DIR)
    if combined_data.empty:
        return

    # Load labels
    labels_df = pd.read_csv(LABELS_FILE)
    selected_cols = ['Participant', 'age', 'gender', 'Depression_label', 'split']
    labels_df = labels_df[selected_cols]

    # Prepare for merge: convert user_id like '301_P' to int Participant '301'
    combined_data['Participant'] = combined_data['user_id'].str.replace('_P', '').astype(int)

    # Merge on Participant
    merged_df = combined_data.merge(labels_df, on='Participant', how='left')

    # Drop Participant column after merge
    merged_df = merged_df.drop(columns=['Participant'])

    # Save merged dataframe
    merged_df.to_parquet(OUTPUT_FILE, index=False)
    final_size_mb = os.path.getsize(OUTPUT_FILE) / (1024 * 1024)
    print(f"💾 Saved final merged data to: {OUTPUT_FILE} | File size: {final_size_mb:.2f} MB")

    # Preview merged data with desired columns shown first
    preview_data(merged_df, max_cols=10, max_rows=100)

if __name__ == "__main__":
    main()


✅ Loaded data for user: 301_P | Shape: (72, 1839) | File size: 1.55 MB
✅ Loaded data for user: 302_P | Shape: (99, 1839) | File size: 1.76 MB

🔗 Combined data shape: (171, 1839)
💾 Saved final merged data to: ../../data/processed/all_user_combined_data.parquet | File size: 2.36 MB

📋 Preview of data (first 100 rows, columns shown: 10)
 age gender  Depression_label split user_id  Start_Time  End_Time  clinical_f0  clinical_f1  clinical_f2
  39   male                 0   dev   301_P         0.8       7.0     0.406030     0.313151    -0.116788
  39   male                 0   dev   301_P        41.9      42.5     0.233028     0.043382    -0.064205
  39   male                 0   dev   301_P        52.9      55.8     0.344104     0.117400    -0.099952
  39   male                 0   dev   301_P        59.7      60.7     0.474687     0.313247    -0.061940
  39   male                 0   dev   301_P        63.4      64.2     0.428376     0.279073    -0.105950
  39   male                 0   de