In [2]:
import os
import pandas as pd

# Data root, expressed relative to the location of text_preprocess.ipynb
data_dir = os.path.join('..', 'data')

# Participant folders are the directories whose names carry the `_P` suffix
subject_folders = list(filter(
    lambda name: os.path.isdir(os.path.join(data_dir, name)) and name.endswith('_P'),
    os.listdir(data_dir),
))

# Preview the first rows of every participant's transcript, in name order
for subject in sorted(subject_folders):
    # The numeric ID is everything before the first underscore ('300_P' -> '300')
    subject_id = subject.partition('_')[0]
    transcript_path = os.path.join(data_dir, subject, 'clinical', f'{subject_id}_Transcript.csv')

    # Report missing transcripts and move on to the next participant
    if not os.path.exists(transcript_path):
        print(f"❌ Transcript file not found for {subject} at path: {transcript_path}")
        continue

    print(f"\n✅ Transcript for {subject}:\n")
    df = pd.read_csv(transcript_path)
    display(df.head())  # rich notebook rendering; swap for print(df.head()) outside Jupyter



✅ Transcript for 300_P:



Unnamed: 0,Start_Time,End_Time,Text,Confidence
0,14.3,15.1,so I'm going to,0.93421
1,20.3,21.1,interview in Spanish,0.60847
2,23.9,24.3,okay,0.690606
3,62.1,62.7,good,0.951897
4,68.8,69.8,Atlanta Georgia,0.987629



✅ Transcript for 301_P:



Unnamed: 0,Start_Time,End_Time,Text,Confidence
0,0.8,7.0,yeah there's also on Craigslist so that's why,0.883057
1,41.9,42.5,okay,0.960925
2,52.9,55.8,how are you doing today I'm doing good thank you,0.950963
3,59.7,60.7,I'm from Los Angeles,0.970176
4,63.4,64.2,I'm great,0.904099



✅ Transcript for 302_P:



Unnamed: 0,Start_Time,End_Time,Text,Confidence
0,2.1,3.2,just move around a little bit,0.906568
1,26.3,27.1,when you're finished,0.793796
2,58.2,59.1,how are you doing today,0.85979
3,59.6,61.0,I'm fine how about yourself,0.987629
4,66.6,67.3,where you from,0.911304



✅ Transcript for 308_P:



Unnamed: 0,Start_Time,End_Time,Text,Confidence
0,0.0,9.6,okay perfect so we just want to move around a ...,0.927091
1,11.7,13.6,all right now you got perfect,0.758649
2,17.4,18.0,okay,0.901707
3,20.0,21.0,good to go now,0.755682
4,24.9,31.1,when she's done talking to you when you go ah...,0.891598


In [3]:
"""
This script extracts BioBERT sentence embeddings from clinical transcript segments for each subject. 
It reads each transcript CSV, uses BioBERT to embed the 'Text' column, and saves the output with
Start_Time, End_Time, and the 768-dimensional BioBERT features to a new CSV file in the same folder.
"""

import os
import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel

# --- Configuration ---------------------------------------------------------
# Hugging Face identifier of the BioBERT checkpoint to load
BIOBERT_MODEL = "dmis-lab/biobert-base-cased-v1.1"

# Root folder that holds one sub-folder per subject
BASE_DIR = "../data"

# Transcript rows whose Confidence is below this value are not embedded
CONFIDENCE_THRESHOLD = 0.7

# --- Model -----------------------------------------------------------------
# Run on the GPU when one is visible to PyTorch, otherwise on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load tokenizer and weights, place the model on the device, and switch it to
# evaluation mode for inference
tokenizer = AutoTokenizer.from_pretrained(BIOBERT_MODEL)
model = AutoModel.from_pretrained(BIOBERT_MODEL).to(device)
model.eval()

# Function to extract BioBERT embedding from text
def get_biobert_embedding(text):
    """Embed ``text`` with BioBERT and return the first-token ([CLS]) vector.

    The input is tokenized with truncation at 512 tokens, moved to the
    module-level ``device`` and passed through the module-level ``model``
    with gradient tracking disabled.

    Args:
        text: The text to embed.

    Returns:
        A NumPy array holding the last-layer hidden state at token position 0,
        with size-1 dimensions squeezed away.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    # Every tensor produced by the tokenizer must live on the same device as the model
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
        # Position 0 along the sequence axis is the first token of the input
        first_token = hidden_states[:, 0, :]
        return first_token.squeeze().cpu().numpy()

# Process each subject.
# Entries are visited in sorted order so that repeated runs process subjects
# (and print their logs) in the same, predictable sequence.
for subject in sorted(os.listdir(BASE_DIR)):
    if not subject.endswith("_P"):
        continue

    clinical_dir = os.path.join(BASE_DIR, subject, "clinical")
    # isdir rather than exists: a plain file named "clinical" must not reach os.listdir
    if not os.path.isdir(clinical_dir):
        print(f"🚫 Skipping {subject}, no clinical directory.")
        continue

    for file in sorted(os.listdir(clinical_dir)):
        if not file.endswith("_Transcript.csv"):
            continue

        path = os.path.join(clinical_dir, file)
        df = pd.read_csv(path)

        # Identify text column: prefer the named column, otherwise fall back to
        # the third column; a file with fewer than three columns cannot be used.
        if "Text" in df.columns:
            text_col = "Text"
        elif len(df.columns) > 2:
            text_col = df.columns[2]
        else:
            print(f"⚠️ Skipping {subject}, empty or invalid transcript.")
            continue

        if df.empty or df[text_col].isna().all():
            print(f"⚠️ Skipping {subject}, empty or invalid transcript.")
            continue

        print(f"🔍 Extracting BioBERT features for {subject} - {file}...")

        # Column presence is the same for every row, so check it once per file
        has_confidence = "Confidence" in df.columns
        has_start = "Start_Time" in df.columns
        has_end = "End_Time" in df.columns

        output_rows = []

        for _, row in tqdm(df.iterrows(), total=len(df), desc=subject):
            # Rows without a Confidence column are treated as fully confident
            confidence = row["Confidence"] if has_confidence else 1.0
            if confidence < CONFIDENCE_THRESHOLD:
                continue

            # A missing or blank utterance carries no content; embedding an empty
            # string would only add a meaningless feature row, so skip it.
            if pd.isna(row[text_col]):
                continue
            text = str(row[text_col])
            if not text.strip():
                continue

            start_time = row["Start_Time"] if has_start else None
            end_time = row["End_Time"] if has_end else None

            embedding = get_biobert_embedding(text)
            output_rows.append([start_time, end_time] + embedding.tolist())

        # Save final DataFrame with time and features
        if output_rows:
            # Derive the feature count from the embeddings themselves so the
            # column list always matches the model's hidden size.
            n_features = len(output_rows[0]) - 2
            feature_cols = [f"feature_{idx}" for idx in range(n_features)]
            output_df = pd.DataFrame(output_rows, columns=["Start_Time", "End_Time"] + feature_cols)

            output_path = os.path.join(clinical_dir, file.replace(".csv", "_biobert_features.csv"))
            output_df.to_csv(output_path, index=False)

            print(f"✅ Saved BioBERT features to: {output_path}")
            print(f"📊 Shape: {output_df.shape}")
        else:
            print(f"⚠️ No valid entries for {subject} - {file}")


🔍 Extracting BioBERT features for 300_P - 300_Transcript.csv...


300_P: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 77/77 [00:03<00:00, 20.14it/s]


✅ Saved BioBERT features to: ../data/300_P/clinical/300_Transcript_biobert_features.csv
📊 Shape: (70, 770)
🔍 Extracting BioBERT features for 302_P - 302_Transcript.csv...


302_P: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 99/99 [00:04<00:00, 19.80it/s]


✅ Saved BioBERT features to: ../data/302_P/clinical/302_Transcript_biobert_features.csv
📊 Shape: (96, 770)
🔍 Extracting BioBERT features for 308_P - 308_Transcript.csv...


308_P: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 149/149 [00:07<00:00, 19.44it/s]


✅ Saved BioBERT features to: ../data/308_P/clinical/308_Transcript_biobert_features.csv
📊 Shape: (143, 770)
🔍 Extracting BioBERT features for 301_P - 301_Transcript.csv...


301_P: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 72/72 [00:03<00:00, 18.97it/s]


✅ Saved BioBERT features to: ../data/301_P/clinical/301_Transcript_biobert_features.csv
📊 Shape: (70, 770)


In [5]:
import pandas as pd
import os
from sklearn.preprocessing import StandardScaler
import glob

def preprocess_clinical(input_directory, output_directory, unwanted_columns=None, rename_dict=None, fill_value=0, default_column_names=None):
    """
    Preprocess every '*_Transcript_biobert_features.csv' file in a directory.

    For each matching file the following steps run in order:
    1. Optionally rename the clinical feature columns (all columns except
       Start_Time / End_Time) using default_column_names.
    2. Drop the columns listed in unwanted_columns that exist in the file.
    3. Drop every remaining clinical feature column that contains the value
       'unknown' or 'unknow'.
    4. Rename columns according to rename_dict (timestamp columns excluded).
    5. Fill missing values in all remaining columns with fill_value.
    6. Standardize the float64/int64 columns other than the timestamps with
       StandardScaler, fitted separately on each file.
    7. Write the result to output_directory as 'processed_<original name>'.

    Args:
        input_directory: Directory searched (non-recursively) for feature CSVs.
        output_directory: Directory for processed files; created if missing.
        unwanted_columns: Column names to drop; None means drop nothing.
        rename_dict: Mapping old name -> new name; None means no renaming.
        fill_value: Value substituted for missing entries.
        default_column_names: Replacement names for the clinical feature
            columns; applied only when its length equals their count.

    Returns:
        None. Progress and problems are reported with print().
    """
    # None sentinels replace mutable default arguments
    unwanted_columns = [] if unwanted_columns is None else unwanted_columns
    rename_dict = {} if rename_dict is None else rename_dict

    if not os.path.exists(input_directory):
        print(f"❌ Error: The input directory {input_directory} does not exist!")
        return

    # Find all clinical CSV files with the BioBERT feature pattern
    clinical_csv_files = glob.glob(os.path.join(input_directory, '*_Transcript_biobert_features.csv'))

    if not clinical_csv_files:
        print(f"⚠️ No '_Transcript_biobert_features.csv' files found in {input_directory}.")
        return

    print(f"\n🔍 Found clinical CSV files: {clinical_csv_files}")

    # The output directory is the same for every file, so create it once
    os.makedirs(output_directory, exist_ok=True)

    timestamp_cols = ['Start_Time', 'End_Time']

    for clinical_file in clinical_csv_files:
        print(f"\n📄 Processing: {clinical_file}")
        df = pd.read_csv(clinical_file)
        print(f"Initial shape: {df.shape}")

        # Timestamp columns present in this file; excluded from renaming,
        # 'unknown' screening and scaling below
        existing_timestamp_cols = [col for col in timestamp_cols if col in df.columns]

        # Clinical feature columns as they are named in the raw file
        clinical_feature_cols = [col for col in df.columns if col not in existing_timestamp_cols]

        if default_column_names is None:
            print("⚠️ No default_column_names provided, keeping original clinical feature column names.")
        elif len(default_column_names) == len(clinical_feature_cols):
            df = df.rename(columns=dict(zip(clinical_feature_cols, default_column_names)))
            print("✅ Clinical feature columns renamed using default_column_names.")
        else:
            # Previously ignored silently; tell the user why nothing was renamed
            print(
                f"⚠️ default_column_names has {len(default_column_names)} entries but the file has "
                f"{len(clinical_feature_cols)} clinical feature columns; keeping original names."
            )

        # Remove unwanted columns
        cols_to_drop = [col for col in unwanted_columns if col in df.columns]
        df = df.drop(columns=cols_to_drop)
        if cols_to_drop:
            print(f"Removed unwanted columns: {cols_to_drop}")

        # Recompute the feature list: the rename and drop above may have changed
        # the column labels, and indexing df with stale names raises KeyError.
        clinical_feature_cols = [col for col in df.columns if col not in existing_timestamp_cols]

        # Remove columns that contain 'unknown' or 'unknow' values anywhere in the column
        cols_with_unknowns = [col for col in clinical_feature_cols if df[col].isin(['unknown', 'unknow']).any()]
        if cols_with_unknowns:
            df = df.drop(columns=cols_with_unknowns)
            print(f"Removed columns containing 'unknown' values: {cols_with_unknowns}")

        # Rename other columns as per rename_dict (excluding timestamps)
        rename_dict_filtered = {k: v for k, v in rename_dict.items() if k in df.columns and k not in existing_timestamp_cols}
        if rename_dict_filtered:
            df = df.rename(columns=rename_dict_filtered)
            print(f"Renamed columns as per rename_dict: {rename_dict_filtered}")

        # Fill missing values with fill_value (applies to every remaining column)
        df = df.fillna(fill_value)

        # Scale numeric clinical feature columns (exclude timestamps)
        numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns.difference(existing_timestamp_cols)
        if not numeric_cols.empty:
            scaler = StandardScaler()
            df[numeric_cols] = scaler.fit_transform(df[numeric_cols])
            print("Scaled numeric clinical feature columns.")

        output_path = os.path.join(output_directory, f"processed_{os.path.basename(clinical_file)}")
        df.to_csv(output_path, index=False)
        print(f"✅ Saved processed file: {output_path}")

        # Read the file back so the reported shape reflects what is on disk
        saved_df = pd.read_csv(output_path)
        print(f"Shape of saved file: {saved_df.shape}")


def process_all_clinical_participants(root_data_dir):
    """
    Run preprocess_clinical for every entry of root_data_dir that has a
    'clinical' sub-directory.

    Processed files are written to a 'processed' folder inside each
    'clinical' directory. Entries without a 'clinical' sub-directory are
    reported and skipped.
    """
    for entry_name in os.listdir(root_data_dir):
        clinical_dir = os.path.join(root_data_dir, entry_name, "clinical")

        # Guard clause: nothing to do for entries lacking a clinical folder
        if not os.path.isdir(clinical_dir):
            print(f"🚫 No clinical folder in: {entry_name}")
            continue

        print(f"\n📂 Found clinical folder: {clinical_dir}")

        # Placeholder options; adjust to the dataset at hand
        options = dict(
            unwanted_columns=["unwanted_column"],  # columns to drop
            rename_dict={"old_name": "new_name"},  # columns to rename
            fill_value=0,
            default_column_names=None,  # or a list of names for the clinical features
        )
        preprocess_clinical(
            input_directory=clinical_dir,
            output_directory=os.path.join(clinical_dir, "processed"),
            **options,
        )


# Entry point: preprocess the BioBERT feature files of every participant
# folder found under ../data.
if __name__ == "__main__":
    root_data_dir = "../data"
    process_all_clinical_participants(root_data_dir)



📂 Found clinical folder: ../data/300_P/clinical

🔍 Found clinical CSV files: ['../data/300_P/clinical/300_Transcript_biobert_features.csv']

📄 Processing: ../data/300_P/clinical/300_Transcript_biobert_features.csv
Initial shape: (70, 770)
⚠️ No default_column_names provided, keeping original clinical feature column names.
Scaled numeric clinical feature columns.
✅ Saved processed file: ../data/300_P/clinical/processed/processed_300_Transcript_biobert_features.csv
Shape of saved file: (70, 770)
🚫 No clinical folder in: .DS_Store
🚫 No clinical folder in: lables

📂 Found clinical folder: ../data/302_P/clinical

🔍 Found clinical CSV files: ['../data/302_P/clinical/302_Transcript_biobert_features.csv']

📄 Processing: ../data/302_P/clinical/302_Transcript_biobert_features.csv
Initial shape: (96, 770)
⚠️ No default_column_names provided, keeping original clinical feature column names.
Scaled numeric clinical feature columns.
✅ Saved processed file: ../data/302_P/clinical/processed/processed_