In [7]:
import os
import pandas as pd

# Base path relative to text_preprocess.ipynb
data_dir = os.path.join('..', 'data')

# Keep only the directory entries whose name ends with "_P" (one per subject)
subject_folders = list(filter(
    lambda name: name.endswith('_P') and os.path.isdir(os.path.join(data_dir, name)),
    os.listdir(data_dir),
))

# Preview the first rows of every subject's transcript, in sorted folder order
for subject in sorted(subject_folders):
    subject_id = subject.partition('_')[0]  # e.g., '300' from '300_P'
    transcript_path = os.path.join(data_dir, subject, 'text', f'{subject_id}_Transcript.csv')

    # Report a missing transcript and move on to the next subject
    if not os.path.exists(transcript_path):
        print(f"❌ Transcript file not found for {subject} at path: {transcript_path}")
        continue

    print(f"\n✅ Transcript for {subject}:\n")
    df = pd.read_csv(transcript_path)
    display(df.head())  # rich notebook rendering; print(df.head()) works outside Jupyter



✅ Transcript for 300_P:



Unnamed: 0,Start_Time,End_Time,Text,Confidence
0,14.3,15.1,so I'm going to,0.93421
1,20.3,21.1,interview in Spanish,0.60847
2,23.9,24.3,okay,0.690606
3,62.1,62.7,good,0.951897
4,68.8,69.8,Atlanta Georgia,0.987629



✅ Transcript for 301_P:



Unnamed: 0,Start_Time,End_Time,Text,Confidence
0,0.8,7.0,yeah there's also on Craigslist so that's why,0.883057
1,41.9,42.5,okay,0.960925
2,52.9,55.8,how are you doing today I'm doing good thank you,0.950963
3,59.7,60.7,I'm from Los Angeles,0.970176
4,63.4,64.2,I'm great,0.904099



✅ Transcript for 302_P:



Unnamed: 0,Start_Time,End_Time,Text,Confidence
0,2.1,3.2,just move around a little bit,0.906568
1,26.3,27.1,when you're finished,0.793796
2,58.2,59.1,how are you doing today,0.85979
3,59.6,61.0,I'm fine how about yourself,0.987629
4,66.6,67.3,where you from,0.911304



✅ Transcript for 308_P:



Unnamed: 0,Start_Time,End_Time,Text,Confidence
0,0.0,9.6,okay perfect so we just want to move around a ...,0.927091
1,11.7,13.6,all right now you got perfect,0.758649
2,17.4,18.0,okay,0.901707
3,20.0,21.0,good to go now,0.755682
4,24.9,31.1,when she's done talking to you when you go ah...,0.891598


In [9]:
"""
This script performs feature extraction from transcript CSV files for a multimodal depression detection system.

✔️ It reads subject-wise transcript files from the `text` folder.
✔️ Cleans and tokenizes each spoken text entry.
✔️ Extracts features using:
    - Word2Vec (semantic vector representations)
    - TF-IDF (term frequency-inverse document frequency)
    - N-grams (bi-grams and tri-grams)
✔️ Combines and reduces dimensionality using TruncatedSVD.
✔️ Applies StandardScaler to normalize the features.
✔️ Keeps the original Start_Time and End_Time for each entry.
✔️ Saves the final feature set (with timestamps) as a new CSV file 
   in the same `text` folder under the name: 
   `<original_filename>_processed_scaled.csv`
"""


import os
import pandas as pd
import numpy as np
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from gensim.models import Word2Vec
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler

# Download NLTK resources
# These back the two NLTK helpers imported just below and used by clean_text:
# word_tokenize (tokenizer data) and stopwords.words (stop-word lists).
nltk.download('punkt')
nltk.download('stopwords')

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Constants
BASE_DIR = "../data"  # Root folder scanned for "<id>_P" subject directories
VECTOR_SIZE = 100  # Dimensionality passed to Word2Vec (one w2v_<i> column per dimension)
SVD_COMPONENTS = 50  # Target final feature size

# Text cleaning function
def clean_text(text):
    """Normalize one transcript utterance into a list of content tokens.

    The text is lower-cased, every character that is not a letter, digit or
    whitespace is replaced by a space, and the result is tokenized with
    ``word_tokenize`` (falling back to ``str.split`` when NLTK raises
    ``LookupError``). English stop words and one-character tokens are dropped.

    Parameters
    ----------
    text : object
        Raw cell value. Anything that is not a ``str`` (for example the float
        NaN pandas produces for an empty cell) yields an empty list.

    Returns
    -------
    list of str
        The surviving tokens, in their original order.
    """
    if not isinstance(text, str):
        return []

    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9\s]", " ", text)

    try:
        tokens = word_tokenize(text)
    except LookupError:
        print("⚠️ Falling back to .split() due to NLTK error.")
        tokens = text.split()

    # Load the stop-word list once and keep it on the function object.
    # Calling stopwords.words() inside the filter re-built the list and scanned
    # it linearly for every single token; a cached frozenset gives the same
    # membership result at constant cost.
    stop_words = getattr(clean_text, "_stop_words", None)
    if stop_words is None:
        stop_words = frozenset(stopwords.words('english'))
        clean_text._stop_words = stop_words

    return [w for w in tokens if w not in stop_words and len(w) > 1]

# Word2Vec vectorization
def get_word2vec_features(tokens, model, vector_size=100):
    """Return the element-wise mean of the embeddings of the known tokens.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of one utterance.
    model : object
        Anything exposing ``model.wv`` that supports ``word in model.wv`` and
        ``model.wv[word]`` (here: the Word2Vec model trained by the caller).
    vector_size : int, default 100
        Length of the all-zero list returned when no token is in ``model.wv``.

    Returns
    -------
    list
        The averaged vector as a plain Python list, or ``[0] * vector_size``
        when none of the tokens has an embedding.
    """
    known_vectors = []
    for token in tokens:
        if token in model.wv:
            known_vectors.append(model.wv[token])

    # No token is covered by the model: fall back to a zero vector
    if len(known_vectors) == 0:
        return [0] * vector_size

    averaged = np.mean(known_vectors, axis=0)
    return averaged.tolist()

# Iterate through all subject folders
for subject in sorted(os.listdir(BASE_DIR)):  # sorted: deterministic processing order
    # Only subject folders (named "<id>_P") are processed
    if not subject.endswith("_P"):
        continue

    text_dir = os.path.join(BASE_DIR, subject, "text")
    if not os.path.exists(text_dir):
        print(f"🚫 Skipping {subject}, no text directory.")
        continue

    for file in sorted(os.listdir(text_dir)):
        # Previously written "*_processed_scaled.csv" outputs do not match this
        # suffix, so re-running the cell never feeds them back in.
        if not file.endswith("_Transcript.csv"):
            continue

        path = os.path.join(text_dir, file)
        df = pd.read_csv(path)

        # Prefer the 'Text' column; otherwise fall back to the third column.
        # NOTE(review): the positional fallback assumes a specific column layout — confirm.
        col = 'Text' if 'Text' in df.columns else df.columns[2]
        df['clean_text'] = df[col].apply(clean_text)
        # Drop utterances that have no tokens left after cleaning
        df = df[df['clean_text'].map(len) > 0]

        if df.empty:
            print(f"⚠️ Skipping {subject}, cleaned data is empty.")
            continue

        print(f"🔍 Processing {file} for {subject}...")

        # Train Word2Vec on this transcript only and average the word vectors per utterance
        # NOTE(review): with workers > 1 the embeddings are presumably not bit-for-bit
        # reproducible across runs — confirm against the gensim documentation.
        sentences = df['clean_text'].tolist()
        w2v_model = Word2Vec(sentences=sentences, vector_size=VECTOR_SIZE, window=5, min_count=1, workers=4)
        w2v_features = [get_word2vec_features(tokens, w2v_model, VECTOR_SIZE) for tokens in sentences]
        w2v_df = pd.DataFrame(w2v_features, columns=[f"w2v_{i}" for i in range(VECTOR_SIZE)])

        # Both vectorizers expect whitespace-joined strings; build them once
        documents = [" ".join(tokens) for tokens in sentences]

        # TF-IDF features
        tfidf = TfidfVectorizer(max_features=500)
        tfidf_matrix = tfidf.fit_transform(documents)
        tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf.get_feature_names_out())

        # N-gram features (bi-grams and tri-grams)
        ngram = CountVectorizer(ngram_range=(2, 3), max_features=200)
        try:
            ngram_matrix = ngram.fit_transform(documents)
            ngram_df = pd.DataFrame(ngram_matrix.toarray(), columns=ngram.get_feature_names_out())
        except ValueError:
            # Raised ("empty vocabulary") when no utterance contains at least two
            # tokens; continue with the remaining feature families instead of
            # aborting the whole run.
            print(f"⚠️ No bi-/tri-grams found in {file}; continuing without n-gram features.")
            ngram_df = pd.DataFrame(index=w2v_df.index)

        # Combine all feature sets (every frame carries the same fresh 0..n-1 index)
        all_features = pd.concat([w2v_df, tfidf_df, ngram_df], axis=1)

        # Dimensionality reduction using TruncatedSVD; a fixed random_state makes
        # the randomized solver reproducible between runs.
        reduced_components = min(SVD_COMPONENTS, all_features.shape[1])
        svd = TruncatedSVD(n_components=reduced_components, random_state=0)
        svd_features = svd.fit_transform(all_features)

        # Zero-pad up to SVD_COMPONENTS based on the width actually returned:
        # the SVD output may be narrower than requested (e.g. when a transcript
        # has fewer rows than components), and the column labels below require
        # exactly SVD_COMPONENTS columns.
        missing_components = SVD_COMPONENTS - svd_features.shape[1]
        if missing_components > 0:
            padding = np.zeros((svd_features.shape[0], missing_components))
            svd_features = np.hstack((svd_features, padding))

        # Normalize the final feature set
        scaled = StandardScaler().fit_transform(svd_features)

        # Include Start_Time and End_Time in the final DataFrame; the index is
        # reset so the rows line up with the freshly indexed feature frame.
        timestamp_df = df[['Start_Time', 'End_Time']].reset_index(drop=True)
        final_df = pd.DataFrame(scaled, columns=[f"feature_{i}" for i in range(SVD_COMPONENTS)])
        final_with_timestamps = pd.concat([timestamp_df, final_df], axis=1)

        # Save processed features next to the source transcript
        output_filename = file.replace(".csv", "_processed_scaled.csv")
        output_path = os.path.join(text_dir, output_filename)
        final_with_timestamps.to_csv(output_path, index=False)

        print(f"✅ Saved to {output_path}")
        print(f"📊 Final feature shape: {final_with_timestamps.shape}")


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/sachithdissanayaka/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sachithdissanayaka/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


🔍 Processing 300_Transcript.csv for 300_P...
✅ Saved to /Users/sachithdissanayaka/Documents/FYP/e19-fyp-depression-detection/data/300_P/text/300_Transcript_processed_scaled.csv
📊 Final feature shape: (68, 52)
🔍 Processing 302_Transcript.csv for 302_P...
✅ Saved to /Users/sachithdissanayaka/Documents/FYP/e19-fyp-depression-detection/data/302_P/text/302_Transcript_processed_scaled.csv
📊 Final feature shape: (94, 52)
🔍 Processing 308_Transcript.csv for 308_P...
✅ Saved to /Users/sachithdissanayaka/Documents/FYP/e19-fyp-depression-detection/data/308_P/text/308_Transcript_processed_scaled.csv
📊 Final feature shape: (144, 52)
🔍 Processing 301_Transcript.csv for 301_P...
✅ Saved to /Users/sachithdissanayaka/Documents/FYP/e19-fyp-depression-detection/data/301_P/text/301_Transcript_processed_scaled.csv
📊 Final feature shape: (69, 52)


In [14]:
import pandas as pd
import os
from sklearn.preprocessing import StandardScaler
import glob

def preprocess_text(input_directory, output_directory, unwanted_columns=[], rename_dict={}, fill_value=0, default_column_names=None):
    """
    Re-process every '*_Transcript_processed_scaled.csv' file in a directory.

    For each matching file the function, in this order:
    - treats 'Start_Time' / 'End_Time' (when present) as timestamp columns that
      are never renamed, never checked for 'unknown' and never scaled;
    - renames the remaining (feature) columns to default_column_names when that
      list has exactly one name per feature column;
    - drops every column listed in unwanted_columns that exists;
    - drops every feature column containing the value 'unknown' or 'unknow';
    - renames columns according to rename_dict (timestamps excluded);
    - fills missing values in all columns with fill_value;
    - standard-scales the float64/int64 columns other than the timestamps;
    - writes the result to output_directory, replacing the file-name suffix
      '_processed_scaled.csv' with '_reprocessed_scaled.csv'.

    Progress and shape information is printed along the way.

    Parameters
    ----------
    input_directory : str
        Folder searched for the input CSV files.
    output_directory : str
        Folder the results are written to; created when at least one input
        file is found.
    unwanted_columns : list of str, optional
        Column names to drop. None is treated as an empty list. The default
        list is never mutated.
    rename_dict : dict, optional
        Mapping old column name -> new column name. None is treated as an
        empty dict. The default dict is never mutated.
    fill_value : scalar, default 0
        Replacement for missing values.
    default_column_names : list of str or None
        New names for the feature columns, in column order.

    Returns
    -------
    None
    """
    # Tolerate explicit None for the collection arguments
    unwanted_columns = unwanted_columns or []
    rename_dict = rename_dict or {}

    if not os.path.exists(input_directory):
        print(f"❌ Error: Input directory {input_directory} does not exist!")
        return

    # Look for already processed CSV files for further processing
    text_csv_files = glob.glob(os.path.join(input_directory, '*_Transcript_processed_scaled.csv'))

    if not text_csv_files:
        print(f"⚠️ No '*_Transcript_processed_scaled.csv' files found in {input_directory}.")
        return

    print(f"\n🔍 Found text CSV files: {text_csv_files}")

    # Prepare output directory once; every file goes to the same place
    os.makedirs(output_directory, exist_ok=True)

    # Timestamp columns to keep unchanged if present
    timestamp_cols = ['Start_Time', 'End_Time']

    for text_file in text_csv_files:
        filename = os.path.basename(text_file)
        participant_id = filename.split('_')[0]  # e.g., '300' from '300_Transcript_processed_scaled.csv'

        print(f"\n🔍 Processing {filename} for {participant_id}_P...")

        df = pd.read_csv(text_file)
        print(f"Initial shape: {df.shape}")

        existing_timestamp_cols = [col for col in timestamp_cols if col in df.columns]

        # Feature columns exclude timestamps
        feature_cols = [col for col in df.columns if col not in existing_timestamp_cols]

        # Rename text feature columns if default_column_names provided
        if default_column_names is None:
            print(f"⚠️ No default_column_names provided, keeping original text feature column names.")
        elif len(default_column_names) == len(feature_cols):
            rename_map = dict(zip(feature_cols, default_column_names))
            df.rename(columns=rename_map, inplace=True)
            print(f"✅ Text feature columns renamed using default_column_names.")
            # Refresh the feature list: the 'unknown' filter below must look at
            # the NEW names, otherwise renamed columns are silently skipped.
            feature_cols = [col for col in df.columns if col not in existing_timestamp_cols]
        else:
            # Previously this mismatch was ignored without any message
            print(
                f"⚠️ default_column_names has {len(default_column_names)} names but "
                f"{len(feature_cols)} feature columns were found; keeping original names."
            )

        # Remove unwanted columns if exist
        cols_to_drop = [col for col in unwanted_columns if col in df.columns]
        if cols_to_drop:
            df.drop(columns=cols_to_drop, inplace=True)
            print(f"Removed unwanted columns: {cols_to_drop}")

        # Remove columns containing 'unknown' or 'unknow' anywhere
        cols_with_unknowns = [col for col in feature_cols if col in df.columns and df[col].isin(['unknown', 'unknow']).any()]
        if cols_with_unknowns:
            df.drop(columns=cols_with_unknowns, inplace=True)
            print(f"Removed columns with 'unknown' values: {cols_with_unknowns}")

        # Rename columns according to rename_dict (excluding timestamps)
        rename_dict_filtered = {k: v for k, v in rename_dict.items() if k in df.columns and k not in existing_timestamp_cols}
        if rename_dict_filtered:
            df.rename(columns=rename_dict_filtered, inplace=True)
            print(f"Renamed columns as per rename_dict: {rename_dict_filtered}")

        # Fill missing values
        df.fillna(fill_value, inplace=True)

        # Scale numeric feature columns (exclude timestamps)
        numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns.difference(existing_timestamp_cols)
        if not numeric_cols.empty:
            scaler = StandardScaler()
            df[numeric_cols] = scaler.fit_transform(df[numeric_cols])
            print(f"Scaled numeric text feature columns.")

        # Save with a new suffix to avoid overwriting original processed file
        output_filename = filename.replace('_processed_scaled.csv', '_reprocessed_scaled.csv')
        output_path = os.path.join(output_directory, output_filename)
        df.to_csv(output_path, index=False)

        print(f"✅ Saved to {output_path}")

        # Read the file back so the reported shape is that of what is on disk
        saved_df = pd.read_csv(output_path)
        print(f"📊 Final feature shape: {saved_df.shape}")


def process_all_text_participants(root_data_dir):
    """
    Run preprocess_text on the "text" sub-folder of every entry in root_data_dir.

    Entries whose name starts with '.' (e.g. .DS_Store, .ipynb_checkpoints)
    are ignored. For every other entry that has a "text" directory, the results
    are written to "<entry>/text/processed"; entries without one are reported
    and skipped.
    """
    visible_entries = (name for name in os.listdir(root_data_dir) if not name.startswith('.'))

    for participant_folder in visible_entries:
        text_dir = os.path.join(root_data_dir, participant_folder, "text")

        # Nothing to process for entries without a text directory
        if not os.path.isdir(text_dir):
            print(f"🚫 No text folder in: {participant_folder}")
            continue

        print(f"\n📂 Found text folder: {text_dir}")
        preprocess_text(
            input_directory=text_dir,
            output_directory=os.path.join(text_dir, "processed"),
            unwanted_columns=["unwanted_column"],  # placeholder; customize if needed
            rename_dict={"old_name": "new_name"},  # placeholder; customize if needed
            fill_value=0,
            default_column_names=None,  # pass a list to rename the feature columns
        )


# Entry point: re-process the text features of every participant folder under
# the data root. The guard keeps this from running if the code is imported.
if __name__ == "__main__":
    root_data_dir = "../data"  # relative path, resolved against the current working directory
    process_all_text_participants(root_data_dir)



📂 Found text folder: ../data/300_P/text

🔍 Found text CSV files: ['../data/300_P/text/300_Transcript_processed_scaled.csv']

🔍 Processing 300_Transcript_processed_scaled.csv for 300_P...
Initial shape: (68, 52)
⚠️ No default_column_names provided, keeping original text feature column names.
Scaled numeric text feature columns.
✅ Saved to ../data/300_P/text/processed/300_Transcript_reprocessed_scaled.csv
📊 Final feature shape: (68, 52)
🚫 No text folder in: lables

📂 Found text folder: ../data/302_P/text

🔍 Found text CSV files: ['../data/302_P/text/302_Transcript_processed_scaled.csv']

🔍 Processing 302_Transcript_processed_scaled.csv for 302_P...
Initial shape: (94, 52)
⚠️ No default_column_names provided, keeping original text feature column names.
Scaled numeric text feature columns.
✅ Saved to ../data/302_P/text/processed/302_Transcript_reprocessed_scaled.csv
📊 Final feature shape: (94, 52)

📂 Found text folder: ../data/308_P/text

🔍 Found text CSV files: ['../data/308_P/text/308_T