In [1]:
"""
Key Features:
- Iterates through user folders in the raw data path.
- Reads transcript CSV files containing text data.
- Uses Hugging Face's RoBERTa model to extract [CLS] token embeddings for each text segment.
- Handles empty texts, missing files, and batch processing errors gracefully.
- Saves the extracted 768-dimensional RoBERTa features into `.parquet` files.
- Preserves additional timing information (Start_Time, End_Time) if available.
"""


import os
import pandas as pd
import numpy as np
import torch
from transformers import RobertaTokenizer, RobertaModel, logging
from tqdm import tqdm
import warnings
import gc

logging.set_verbosity_error()
warnings.filterwarnings("ignore")

RAW_DATA_PATH = '../../data/raw'
SAVE_BASE_PATH = '../../data/interim/text_features'
BATCH_SIZE = 8
MAX_LENGTH = 512

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

print("Loading RoBERTa model...")
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaModel.from_pretrained('roberta-base')
model.to(device)
model.eval()

def get_roberta_embeddings(texts, batch_size=BATCH_SIZE):
    """Extract RoBERTa embeddings from texts with proper memory management"""
    if not texts:
        return np.array([]).reshape(0, 768)
    
    embeddings = []
    
    for i in tqdm(range(0, len(texts), batch_size), desc="Processing batches"):
        batch = texts[i:i + batch_size]
        
        # Handle empty strings
        batch = [text if text and text.strip() else "[EMPTY]" for text in batch]
        
        try:
            inputs = tokenizer(
                batch, 
                return_tensors='pt', 
                truncation=True, 
                padding=True, 
                max_length=MAX_LENGTH
            )
            inputs = {k: v.to(device) for k, v in inputs.items()}
            
            with torch.no_grad():
                outputs = model(**inputs)
                cls_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
                embeddings.append(cls_embeddings)
            
            # Clear memory
            del inputs, outputs
            if device.type == 'cuda':
                torch.cuda.empty_cache()
            gc.collect()
            
        except Exception as e:
            print(f"Error processing batch {i//batch_size + 1}: {e}")
            # Create zero embeddings for failed batch
            batch_size_actual = len(batch)
            zero_embeddings = np.zeros((batch_size_actual, 768))
            embeddings.append(zero_embeddings)
    
    return np.vstack(embeddings)

def process_user_data(user_path, save_dir, user_name):
    """Process a single user's data"""
    transcript_file = next(
        (f for f in os.listdir(user_path) if f.endswith('_Transcript.csv')), 
        None
    )
    
    if transcript_file is None:
        print(f"⚠️  No transcript file found for {user_name}")
        return False
    
    csv_path = os.path.join(user_path, transcript_file)
    
    try:
        # Check file size
        file_size_mb = os.path.getsize(csv_path) / (1024 * 1024)
        print(f"Processing {user_name} ({file_size_mb:.1f}MB)")
        
        df = pd.read_csv(csv_path)
        
        if 'Text' not in df.columns:
            print(f"⚠️  No 'Text' column found for {user_name}")
            return False
        
        # Get texts and handle NaN values
        texts = df['Text'].fillna('').astype(str).tolist()
        
        if not texts:
            print(f"⚠️  No text data found for {user_name}")
            return False
        
        print(f"Extracting features for {len(texts)} texts...")
        features = get_roberta_embeddings(texts)
        
        if features.size == 0:
            print(f"⚠️  No features extracted for {user_name}")
            return False
        
        # Create DataFrame with features
        df_features = pd.DataFrame(
            features, 
            columns=[f'roberta_{i}' for i in range(768)]
        )
        
        # Add Start_Time and End_Time if they exist in the original data
        if 'Start_Time' in df.columns:
            df_features['Start_Time'] = df['Start_Time'].reset_index(drop=True)
        else:
            print(f"⚠️  No 'Start_Time' column found for {user_name}")
            
        if 'End_Time' in df.columns:
            df_features['End_Time'] = df['End_Time'].reset_index(drop=True)
        else:
            print(f"⚠️  No 'End_Time' column found for {user_name}")
        
        # Save to parquet
        os.makedirs(save_dir, exist_ok=True)
        output_path = os.path.join(save_dir, 'text_features.parquet')
        df_features.to_parquet(output_path, index=False)
        
        print(f"✅ Saved RoBERTa features for {user_name} ({features.shape[0]} samples)")
        return True
        
    except Exception as e:
        print(f"❌ Error processing {user_name}: {e}")
        return False

def main():
    if not os.path.exists(RAW_DATA_PATH):
        print(f"❌ Raw data path does not exist: {RAW_DATA_PATH}")
        return
    
    users = [u for u in os.listdir(RAW_DATA_PATH) 
             if os.path.isdir(os.path.join(RAW_DATA_PATH, u))]
    
    if not users:
        print("❌ No user directories found")
        return
    
    print(f"Found {len(users)} users to process")
    
    successful = 0
    failed = 0
    
    for user in users:
        user_path = os.path.join(RAW_DATA_PATH, user, 'text')
        
        if not os.path.isdir(user_path):
            print(f"⚠️  Text directory not found for {user}")
            failed += 1
            continue
        
        save_dir = os.path.join(SAVE_BASE_PATH, user)
        
        if process_user_data(user_path, save_dir, user):
            successful += 1
        else:
            failed += 1
    
    print(f"\n📊 Processing complete:")
    print(f"✅ Successful: {successful}")
    print(f"❌ Failed: {failed}")
    print(f"📁 Total users: {len(users)}")

if __name__ == "__main__":
    main()

  from .autonotebook import tqdm as notebook_tqdm
  Referenced from: <FB2FD416-6C4D-3621-B677-61F07C02A3C5> /opt/anaconda3/envs/depression-nlp-fyp/lib/python3.9/site-packages/torchvision/image.so
  warn(


Using device: cpu
Loading RoBERTa model...
Found 3 users to process
Processing 302_P (0.0MB)
Extracting features for 99 texts...


Processing batches: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 13/13 [00:02<00:00,  5.16it/s]


✅ Saved RoBERTa features for 302_P (99 samples)
Processing 301_P (0.0MB)
Extracting features for 72 texts...


Processing batches: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:03<00:00,  2.73it/s]

✅ Saved RoBERTa features for 301_P (72 samples)
⚠️  No transcript file found for .ipynb_checkpoints

📊 Processing complete:
✅ Successful: 2
❌ Failed: 1
📁 Total users: 3





In [2]:
import pandas as pd
import os

# Path to saved features
FEATURES_BASE_PATH = '../../data/interim/text_features'
user_id = '302_P'  # Change this to your user ID

# Load the data
file_path = os.path.join(FEATURES_BASE_PATH, user_id, 'text_features.parquet')
df = pd.read_parquet(file_path)

print(f"Dataset shape: {df.shape}")
print(f"Total columns: {len(df.columns)}")

# Select first 5 and last 5 columns
first_5_cols = df.columns[:5].tolist()
last_5_cols = df.columns[-5:].tolist()
selected_cols = first_5_cols + last_5_cols

# Create a subset with selected columns and first 5 rows
subset = df[selected_cols].head(5)

print(f"\nFirst 5 columns: {first_5_cols}")
print(f"Last 5 columns: {last_5_cols}")

print(f"\nTable (first 5 rows, first 5 + last 5 columns):")
print(subset)

Dataset shape: (99, 770)
Total columns: 770

First 5 columns: ['roberta_0', 'roberta_1', 'roberta_2', 'roberta_3', 'roberta_4']
Last 5 columns: ['roberta_765', 'roberta_766', 'roberta_767', 'Start_Time', 'End_Time']

Table (first 5 rows, first 5 + last 5 columns):
   roberta_0  roberta_1  roberta_2  roberta_3  roberta_4  roberta_765  \
0  -0.064926   0.078818  -0.011347  -0.113043   0.029975    -0.069268   
1  -0.042070   0.085485  -0.017387  -0.116804   0.060461    -0.036051   
2  -0.049929   0.104638  -0.014817  -0.095213   0.051695    -0.067631   
3  -0.066134   0.092893  -0.003128  -0.094649   0.026224    -0.039669   
4  -0.039353   0.088576  -0.026071  -0.093649   0.039769    -0.059562   

   roberta_766  roberta_767  Start_Time  End_Time  
0    -0.069701    -0.008505         2.1       3.2  
1    -0.030257    -0.027823        26.3      27.1  
2    -0.045495    -0.002897        58.2      59.1  
3    -0.043128    -0.031936        59.6      61.0  
4    -0.019522    -0.029947        6

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import numpy as np

SAVE_BASE_PATH = '../../data/interim/text_features'

def load_all_features(base_path):
    """Load all user feature parquet files and concatenate them."""
    all_features = []
    for user in os.listdir(base_path):
        user_path = os.path.join(base_path, user, 'text_features.parquet')
        if os.path.isfile(user_path):
            df = pd.read_parquet(user_path)
            # Select only RoBERTa feature columns (768 dims)
            feature_cols = [col for col in df.columns if col.startswith('roberta_')]
            all_features.append(df[feature_cols])
        else:
            print(f"⚠️ No features file for user {user}")
    return pd.concat(all_features, ignore_index=True)

def plot_explained_variance(pca):
    plt.figure(figsize=(8,5))
    plt.plot(np.cumsum(pca.explained_variance_ratio_) * 100)
    plt.xlabel('Number of components')
    plt.ylabel('Cumulative explained variance (%)')
    plt.title('Explained Variance by PCA Components')
    plt.grid(True)
    plt.show()

def visualize_2d(pca_data, labels=None, title='PCA 2D projection'):
    plt.figure(figsize=(8,6))
    if labels is not None:
        scatter = plt.scatter(pca_data[:, 0], pca_data[:, 1], c=labels, cmap='viridis', alpha=0.7)
        plt.colorbar(scatter, label='Label')
    else:
        plt.scatter(pca_data[:, 0], pca_data[:, 1], alpha=0.7)
    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.title(title)
    plt.grid(True)
    plt.show()

def main():
    print("Loading all RoBERTa features...")
    features = load_all_features(SAVE_BASE_PATH)
    print(f"Loaded features shape: {features.shape}")
    
    
    features_np = features.values  # convert to numpy
    
    print("Applying PCA...")
    pca = PCA(n_components=50)  # or n_components=0.95 for variance threshold
    pca_result = pca.fit_transform(features_np)
    
    print(f"Explained variance by 50 components: {np.sum(pca.explained_variance_ratio_)*100:.2f}%")
    plot_explained_variance(pca)
    
    # Visualize first 2 principal components
    visualize_2d(pca_result[:, :2])
    
    pca_df = pd.DataFrame(
    pca_result, 
    columns=[f'pc_{i+1}' for i in range(pca_result.shape[1])]
    )
    
    pca_df.to_parquet(os.path.join(SAVE_BASE_PATH, 'pca_features.parquet'), index=False)

print(f"✅ Saved PCA features with shape {pca_df.shape} to {SAVE_BASE_PATH}")

if __name__ == "__main__":
    main()
