# Advanced Mental Health Text Classification

This notebook explores advanced techniques to improve upon the baseline SVM model.

## Techniques to Explore:
1. **Ensemble Methods** - Combine multiple models
2. **Feature Engineering** - Better text preprocessing
3. **Hyperparameter Tuning** - Optimize model parameters
4. **BERT/Transformers** - Use pre-trained language models (outlined as a next step at the end; not implemented in this notebook)
5. **Cross-Validation** - Better model evaluation

In [None]:
# Import libraries for advanced modeling
import pandas as pd
import numpy as np
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import warnings
warnings.filterwarnings('ignore')

# Load the cleaned dataset (same file as the baseline notebook).
# The path is relative, so it resolves against the kernel's working directory.
df = pd.read_csv("../data/cleaned_data.csv")
print(f"Dataset loaded: {df.shape}")
print(f"Class distribution:\n{df['target'].value_counts().sort_index()}")

# Features are the text column, labels are the 'target' column.
X = df['content']
y = df['target']
# Display names; later cells look them up by label value (class_names[label]),
# so targets are presumably the integers 0-4 in this order — TODO confirm against the data.
class_names = ["Stress", "Depression", "Bipolar", "Personality", "Anxiety"]

# 80/20 hold-out split, stratified on the label; random_state pins the split across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print(f"\nTraining samples: {len(X_train)}")
print(f"Testing samples: {len(X_test)}")
print("="*50)

In [None]:
# Advanced Feature Engineering
#
# Compares several TF-IDF configurations and keeps the best one for the rest
# of the notebook. Candidates are ranked with stratified cross-validation on
# the TRAINING split only: ranking them by test-set accuracy would leak the
# held-out data into model selection and make every later "test accuracy"
# optimistic.
print("=== ADVANCED FEATURE ENGINEERING ===")

# Imported here so this cell does not depend on an edit to the import cell.
from sklearn.pipeline import make_pipeline

# Create multiple TF-IDF vectorizers with different configurations
vectorizers = {
    'basic': TfidfVectorizer(max_features=5000, stop_words='english', ngram_range=(1,1)),
    'bigrams': TfidfVectorizer(max_features=5000, stop_words='english', ngram_range=(1,2)),
    'trigrams': TfidfVectorizer(max_features=7000, stop_words='english', ngram_range=(1,3)),
    'advanced': TfidfVectorizer(
        max_features=10000,
        stop_words='english',
        ngram_range=(1,2),
        min_df=2,
        max_df=0.9,
        sublinear_tf=True,
        norm='l2'
    )
}

# Same folds for every candidate so the scores are directly comparable.
selection_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Score each vectorizer with the baseline SVM. Wrapping both steps in a
# pipeline means the vocabulary/IDF weights are re-fit inside every fold, so
# validation documents never influence the features they are scored on.
vectorizer_results = {}
for name, vectorizer in vectorizers.items():
    print(f"\nTesting {name} vectorizer...")

    candidate = make_pipeline(vectorizer, LinearSVC(random_state=42, max_iter=2000))
    fold_scores = cross_val_score(candidate, X_train, y_train, cv=selection_cv, scoring='accuracy')

    accuracy = fold_scores.mean()
    vectorizer_results[name] = accuracy

    print(f"{name} CV accuracy: {accuracy:.4f} (+/- {fold_scores.std():.4f})")

# Select best vectorizer
best_vectorizer_name = max(vectorizer_results, key=vectorizer_results.get)
best_vectorizer = vectorizers[best_vectorizer_name]
print(f"\n🏆 BEST VECTORIZER: {best_vectorizer_name} (Accuracy: {vectorizer_results[best_vectorizer_name]:.4f})")

# Fit the winner once on the full training split; the test split is only
# transformed, never fitted on.
X_train_tfidf = best_vectorizer.fit_transform(X_train)
X_test_tfidf = best_vectorizer.transform(X_test)
print(f"Feature matrix shape: {X_train_tfidf.shape}")
print("="*50)

In [None]:
# Hyperparameter Tuning
#
# Runs an exhaustive grid search (3-fold CV, accuracy) for three model
# families on the TF-IDF training matrix and keeps, per family, the refit
# best estimator (tuned_models) and its mean CV score (best_scores).
print("=== HYPERPARAMETER TUNING ===")

# Define parameter grids for different models (keys must match models_to_tune)
param_grids = {
    'SVM': {
        'C': [0.1, 1, 10, 100],
        'max_iter': [1000, 2000, 3000],
        'dual': [False]  # Better for n_samples > n_features
    },
    'LogisticRegression': {
        'C': [0.1, 1, 10, 100],
        'max_iter': [500, 1000, 2000],
        # NOTE(review): recent scikit-learn releases appear to deprecate
        # 'liblinear' for multiclass targets — confirm against the installed version.
        'solver': ['liblinear', 'lbfgs']
    },
    'RandomForest': {
        'n_estimators': [100, 200, 300],
        'max_depth': [10, 20, None],
        'min_samples_split': [2, 5, 10]
    }
}

# Models to tune (unfitted prototypes; random_state fixed for repeatability)
models_to_tune = {
    'SVM': LinearSVC(random_state=42),
    'LogisticRegression': LogisticRegression(random_state=42),
    'RandomForest': RandomForestClassifier(random_state=42)
}

# Perform grid search for each model
tuned_models = {}   # model name -> best estimator found by the search
best_scores = {}    # model name -> mean CV accuracy of that estimator

for model_name, model in models_to_tune.items():
    print(f"\nTuning {model_name}...")
    
    # Grid search with cross-validation; n_jobs=-1 uses all available cores
    grid_search = GridSearchCV(
        model, 
        param_grids[model_name], 
        cv=3,  # 3-fold CV for speed
        scoring='accuracy',
        n_jobs=-1,
        verbose=1
    )
    
    # X_train_tfidf comes from the feature-engineering cell above
    grid_search.fit(X_train_tfidf, y_train)
    
    # Store results
    tuned_models[model_name] = grid_search.best_estimator_
    best_scores[model_name] = grid_search.best_score_
    
    print(f"Best {model_name} parameters: {grid_search.best_params_}")
    print(f"Best {model_name} CV score: {grid_search.best_score_:.4f}")

# Recap of the best CV score per model family
print(f"\n🎯 HYPERPARAMETER TUNING RESULTS:")
for model_name, score in best_scores.items():
    print(f"{model_name}: {score:.4f}")
print("="*50)

In [None]:
# Ensemble Methods
#
# Scores each tuned model on the test split, then builds three voting
# ensembles from those models, fits them on the training matrix and scores
# them the same way. The best of all six entries becomes best_model_name.
print("=== ENSEMBLE METHODS ===")

# Test tuned models individually first
individual_results = {}   # model name -> test accuracy
for model_name, model in tuned_models.items():
    pred = model.predict(X_test_tfidf)
    accuracy = accuracy_score(y_test, pred)
    individual_results[model_name] = accuracy
    print(f"Tuned {model_name} accuracy: {accuracy:.4f}")

# Create ensemble combinations
ensemble_configs = {
    # Majority vote over all three tuned models
    'Voting_Hard': VotingClassifier([
        ('svm', tuned_models['SVM']),
        ('lr', tuned_models['LogisticRegression']),
        ('rf', tuned_models['RandomForest'])
    ], voting='hard'),
    
    # Probability averaging; restricted to the two models listed here
    'Voting_Soft': VotingClassifier([
        ('lr', tuned_models['LogisticRegression']),
        ('rf', tuned_models['RandomForest'])
    ], voting='soft'),  # SVM doesn't support soft voting by default
    
    # Fixed SVM + LR pair; the name is a label, the pair is not chosen from the scores above
    'Best_Two': VotingClassifier([
        ('svm', tuned_models['SVM']),
        ('lr', tuned_models['LogisticRegression'])
    ], voting='hard')
}

# Train and evaluate ensemble models
ensemble_results = {}   # ensemble name -> test accuracy
print(f"\nEnsemble Results:")
for ensemble_name, ensemble in ensemble_configs.items():
    print(f"\nTraining {ensemble_name}...")
    ensemble.fit(X_train_tfidf, y_train)
    pred = ensemble.predict(X_test_tfidf)
    accuracy = accuracy_score(y_test, pred)
    ensemble_results[ensemble_name] = accuracy
    print(f"{ensemble_name} accuracy: {accuracy:.4f}")

# Find overall best model
# NOTE(review): the winner is picked by test-split accuracy, so best_accuracy
# is a selected maximum rather than an unbiased estimate.
all_results = {**individual_results, **ensemble_results}
best_model_name = max(all_results, key=all_results.get)
best_accuracy = all_results[best_model_name]

print(f"\n🏆 BEST OVERALL MODEL: {best_model_name}")
print(f"🎯 BEST ACCURACY: {best_accuracy:.4f}")
print("="*50)

In [None]:
# Model Interpretability Analysis
#
# Resolves the winning model (best_model / model_type are reused by later
# cells) and prints the features that drive its decisions:
#   * linear models (coef_): per-class strongest positive and negative weights
#   * tree ensembles such as RandomForest (feature_importances_): global ranking
#   * voting ensembles: nothing to report
print("=== MODEL INTERPRETABILITY ===")

# Use the best performing model for interpretability
if best_model_name in tuned_models:
    best_model = tuned_models[best_model_name]
    model_type = "individual"
else:
    best_model = ensemble_configs[best_model_name]
    model_type = "ensemble"

print(f"Analyzing {best_model_name} ({model_type} model)")

n_top_features = 10     # strongest positive features shown per class
n_bottom_features = 5   # strongest negative features shown per class

if model_type == "individual" and hasattr(best_model, 'coef_'):
    print(f"\n📊 FEATURE IMPORTANCE ANALYSIS")

    # Column i of the TF-IDF matrix corresponds to feature_names[i]
    feature_names = best_vectorizer.get_feature_names_out()
    coefficients = best_model.coef_

    # Rows of coef_ follow the model's classes_ order, so map each row through
    # classes_ instead of assuming it matches the position in class_names.
    # A binary linear model stores a single row, which describes classes_[1].
    fitted_labels = list(best_model.classes_)
    if coefficients.shape[0] != len(fitted_labels):
        fitted_labels = fitted_labels[1:]

    print(f"\nTop {n_top_features} most important features for each class:")

    for label, weights in zip(fitted_labels, coefficients):
        class_name = class_names[int(label)]
        print(f"\n🔍 {class_name.upper()}:")

        ranked_idx = np.argsort(weights)  # ascending by coefficient

        # Largest positive weights: strongest evidence FOR this class
        top_positive_idx = ranked_idx[-n_top_features:]
        print("  Most predictive words:")
        for idx in reversed(top_positive_idx):
            print(f"    {feature_names[idx]}: {weights[idx]:.3f}")

        # Most negative weights: strongest evidence AGAINST this class
        # (these are highly informative, not "least predictive")
        top_negative_idx = ranked_idx[:n_bottom_features]
        print("  Words most indicative of NOT this class:")
        for idx in top_negative_idx:
            print(f"    {feature_names[idx]}: {weights[idx]:.3f}")

elif model_type == "individual" and hasattr(best_model, 'feature_importances_'):
    print(f"\n📊 FEATURE IMPORTANCE ANALYSIS")

    # Tree-based models expose one global importance per feature (not per class)
    feature_names = best_vectorizer.get_feature_names_out()
    importances = best_model.feature_importances_

    print(f"\nTop {n_top_features} most important features overall:")
    for idx in np.argsort(importances)[::-1][:n_top_features]:
        print(f"    {feature_names[idx]}: {importances[idx]:.4f}")

elif model_type == "ensemble":
    print("Feature importance not available for ensemble models")
else:
    print(f"Feature importance not available for {best_model_name}")
print("="*50)

In [None]:
# Error Analysis
#
# Re-predicts the test split with the winning model, plots the confusion
# matrix, and prints per-class hit rates, the most frequent confusions and a
# few misclassified texts.
print("=== ERROR ANALYSIS ===")

# Individual models and ensembles expose the same predict() interface,
# so no branching on model_type is needed.
best_predictions = best_model.predict(X_test_tfidf)

# Compare as plain arrays so everything below is purely positional.
y_test_values = y_test.to_numpy()
is_wrong = best_predictions != y_test_values

# Pin the label set so the matrix always has one row/column per entry of
# class_names, even if a class never shows up in y_test or the predictions.
# (Labels are used as indices into class_names elsewhere in this notebook.)
class_labels = list(range(len(class_names)))
cm = confusion_matrix(y_test_values, best_predictions, labels=class_labels)

# Plot enhanced confusion matrix
plt.figure(figsize=(12, 8))
sns.heatmap(cm,
            annot=True,
            fmt="d",
            cmap="Blues",
            xticklabels=class_names,
            yticklabels=class_names,
            square=True)
plt.title(f'Confusion Matrix - {best_model_name}\nAccuracy: {best_accuracy:.4f}',
          fontsize=14, fontweight='bold')
plt.xlabel('Predicted Label', fontsize=12)
plt.ylabel('True Label', fontsize=12)
plt.tight_layout()
plt.show()

# Detailed error analysis
print(f"\n🔍 DETAILED ERROR ANALYSIS:")
print(f"Total test samples: {len(y_test)}")
print(f"Correct predictions: {(~is_wrong).sum()}")
print(f"Incorrect predictions: {is_wrong.sum()}")

# Per-class analysis: diagonal / row total, i.e. the share of each true class
# that was predicted correctly (recall). Empty classes yield nan, not a crash.
row_totals = cm.sum(axis=1)
per_class_accuracy = np.array([
    cm[k, k] / row_totals[k] if row_totals[k] else np.nan
    for k in range(len(class_names))
])
print(f"\nPer-class accuracy:")
for i, class_name in enumerate(class_names):
    print(f"  {class_name}: {per_class_accuracy[i]:.4f}")

# Most common misclassifications (off-diagonal cells, largest first)
print(f"\nMost common misclassifications:")
misclass_pairs = [
    (class_names[i], class_names[j], cm[i, j])
    for i in range(len(class_names))
    for j in range(len(class_names))
    if i != j and cm[i, j] > 0
]

misclass_pairs.sort(key=lambda x: x[2], reverse=True)
for true_label, pred_label, count in misclass_pairs[:10]:
    print(f"  {true_label} → {pred_label}: {count} cases")

# Sample misclassified examples (first few in test-split order)
print(f"\n📝 SAMPLE MISCLASSIFIED EXAMPLES:")
misclassified_indices = np.where(is_wrong)[0]
sample_size = min(5, len(misclassified_indices))

for i in range(sample_size):
    idx = misclassified_indices[i]  # position within the test split

    print(f"\nExample {i+1}:")
    print(f"  True class: {class_names[y_test.iloc[idx]]}")
    print(f"  Predicted: {class_names[best_predictions[idx]]}")
    print(f"  Text: {X_test.iloc[idx][:200]}...")
print("="*50)

In [None]:
# Cross-Validation Analysis
#
# Re-scores the three entries with the highest test accuracy using stratified
# k-fold CV on the training matrix, then plots mean +/- std per model.
print("=== CROSS-VALIDATION ANALYSIS ===")

# Perform stratified k-fold cross-validation
cv_folds = 5
skf = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)

# Test top 3 models with CV
top_models = dict(sorted(all_results.items(), key=lambda x: x[1], reverse=True)[:3])

cv_results = {}   # model name -> {'mean', 'std', 'scores'}
for model_name in top_models:
    print(f"\nCross-validating {model_name}...")

    if model_name in tuned_models:
        model = tuned_models[model_name]
    else:
        model = ensemble_configs[model_name]

    # NOTE(review): X_train_tfidf was vectorized on the whole training split,
    # so each fold's vocabulary/IDF has seen its validation rows; scores may
    # be slightly optimistic.
    cv_scores = cross_val_score(model, X_train_tfidf, y_train, cv=skf, scoring='accuracy')

    cv_results[model_name] = {
        'mean': cv_scores.mean(),
        'std': cv_scores.std(),
        'scores': cv_scores
    }

    # The printed interval is +/- two standard deviations across folds
    print(f"  CV Accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")
    print(f"  Individual folds: {[f'{score:.4f}' for score in cv_scores]}")

# Visualize CV results (error bars are one standard deviation)
plt.figure(figsize=(12, 6))
model_names = list(cv_results.keys())
means = [cv_results[name]['mean'] for name in model_names]
stds = [cv_results[name]['std'] for name in model_names]

plt.bar(model_names, means, yerr=stds, capsize=5, alpha=0.7, color=['skyblue', 'lightcoral', 'lightgreen'])
# Derive the fold count from cv_folds so the title cannot drift from the setting
plt.title(f'Cross-Validation Results ({cv_folds}-Fold)', fontsize=14, fontweight='bold')
plt.ylabel('Accuracy', fontsize=12)
plt.xlabel('Models', fontsize=12)
plt.xticks(rotation=45)
plt.grid(axis='y', alpha=0.3)

# Add value labels just above each error bar
for i, (mean, std) in enumerate(zip(means, stds)):
    plt.text(i, mean + std + 0.01, f'{mean:.3f}', ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.show()

print(f"\n🎯 MOST STABLE MODEL (lowest std): {min(cv_results.keys(), key=lambda x: cv_results[x]['std'])}")
print(f"🏆 HIGHEST CV ACCURACY: {max(cv_results.keys(), key=lambda x: cv_results[x]['mean'])}")
print("="*50)

In [None]:
# Model Comparison Summary & Deployment
#
# Builds one comparison table (baseline + every model scored above), reports
# the change versus the baseline, and persists the winning model together
# with the vectorizer it was trained on.
print("=== FINAL MODEL COMPARISON & DEPLOYMENT ===")

import os
from datetime import datetime

# Test accuracy of the baseline SVM, carried over from the previous notebook
baseline_accuracy = 0.8129

# Collect all rows first and build the frame once (no concat inside the loop).
summary_rows = [
    {'Model': 'Baseline SVM (from previous notebook)', 'Test Accuracy': baseline_accuracy, 'CV Mean': 'N/A', 'CV Std': 'N/A'},
]
for model_name, test_acc in all_results.items():
    # Only the top models were cross-validated; the rest show 'N/A'
    cv_entry = cv_results.get(model_name, {})
    summary_rows.append({
        'Model': model_name,
        'Test Accuracy': test_acc,
        'CV Mean': cv_entry.get('mean', 'N/A'),
        'CV Std': cv_entry.get('std', 'N/A'),
    })

# Sort by test accuracy
results_summary = pd.DataFrame(summary_rows).sort_values('Test Accuracy', ascending=False)
print("📊 COMPREHENSIVE MODEL COMPARISON:")
print(results_summary.to_string(index=False))

# Calculate improvement. The '+' format flag prints the real sign, so a
# regression shows as "-0.0123" instead of the malformed "+-0.0123".
best_improvement = best_accuracy - baseline_accuracy
print(f"\n🚀 IMPROVEMENT OVER BASELINE:")
print(f"  Baseline SVM: {baseline_accuracy:.4f}")
print(f"  Best Model ({best_model_name}): {best_accuracy:.4f}")
print(f"  Improvement: {best_improvement:+.4f} ({best_improvement*100:.2f}%)")

# Save the best model
print(f"\n💾 SAVING BEST MODEL...")

# Create models directory
os.makedirs("../models", exist_ok=True)

# Timestamped names keep earlier exports from being overwritten
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
model_filename = f"../models/advanced_{best_model_name.lower().replace(' ', '_')}_model_{timestamp}.pkl"
vectorizer_filename = f"../models/advanced_tfidf_vectorizer_{timestamp}.pkl"

# best_model is already the fitted estimator for both individual and ensemble winners
final_model = best_model

joblib.dump(final_model, model_filename)
joblib.dump(best_vectorizer, vectorizer_filename)

print(f"✅ Model saved: {model_filename}")
print(f"✅ Vectorizer saved: {vectorizer_filename}")

# Create prediction function for the advanced model
def predict_mental_health_advanced(text, model=None, vectorizer=None, labels=None, model_name=None):
    """Classify one text and report a per-class confidence score.

    Args:
        text: Raw input string to classify.
        model: Fitted classifier exposing ``predict``. Defaults to the
            notebook's ``final_model``.
        vectorizer: Fitted vectorizer exposing ``transform``. Defaults to the
            notebook's ``best_vectorizer``.
        labels: Class display names, indexed by integer label value. Defaults
            to the notebook's ``class_names``.
        model_name: Name reported under ``model_used``. Defaults to the
            notebook's ``best_model_name``.

    Returns:
        dict with keys ``predicted_class`` (display name), ``class_number``
        (raw predicted label), ``confidence_scores`` (display name -> score)
        and ``model_used``.

    Confidence scores come from, in order of preference:
        1. ``predict_proba`` when the model offers it;
        2. a softmax over ``decision_function`` margins (a ranking aid, not a
           calibrated probability);
        3. a fixed placeholder (0.8 for the predicted class, the remaining 0.2
           shared equally by the others) when the model exposes neither.
    """
    # Optional arguments fall back to the notebook globals at call time, so the
    # original one-argument call keeps working unchanged.
    model = final_model if model is None else model
    vectorizer = best_vectorizer if vectorizer is None else vectorizer
    labels = class_names if labels is None else labels
    model_name = best_model_name if model_name is None else model_name

    text_tfidf = vectorizer.transform([text])
    prediction = model.predict(text_tfidf)[0]

    # Score columns follow the model's classes_ order; translate each fitted
    # label to its display name rather than assuming positions line up.
    fitted_classes = getattr(model, 'classes_', None)
    if fitted_classes is None:
        score_names = list(labels)
    else:
        score_names = [labels[int(label)] for label in fitted_classes]

    # Get confidence scores (different methods for different models)
    if hasattr(model, 'predict_proba'):
        probabilities = model.predict_proba(text_tfidf)[0]
        confidence_scores = dict(zip(score_names, probabilities))
    elif hasattr(model, 'decision_function'):
        decision_scores = np.asarray(model.decision_function(text_tfidf)[0], dtype=float)
        # Softmax; subtracting the max first keeps exp() from overflowing
        exp_scores = np.exp(decision_scores - np.max(decision_scores))
        probabilities = exp_scores / np.sum(exp_scores)
        confidence_scores = dict(zip(score_names, probabilities))
    else:
        # Fallback (e.g. hard-voting ensembles). The leftover 0.2 is split
        # across the other classes so the scores sum to 1; assigning 0.2 to
        # each of them would total 1.6 with five classes.
        n_other = len(labels) - 1
        other_share = 0.2 / n_other if n_other > 0 else 0.0
        confidence_scores = {class_name: other_share for class_name in labels}
        confidence_scores[labels[prediction]] = 0.8 if n_other > 0 else 1.0

    return {
        'predicted_class': labels[prediction],
        'class_number': prediction,
        'confidence_scores': confidence_scores,
        'model_used': model_name
    }

# Smoke-test the saved model on one hand-written sentence
test_text = "I've been feeling really anxious lately and having panic attacks"
result = predict_mental_health_advanced(test_text)

print(f"\n🧪 TESTING ADVANCED MODEL:")
print(f"Text: '{test_text}'")
print(f"Predicted: {result['predicted_class']}")
print(f"Model Used: {result['model_used']}")
print(f"Confidence Scores:")
for class_name, score in result['confidence_scores'].items():
    print(f"  {class_name}: {score:.3f}")

print(f"\n🎉 ADVANCED MODEL READY!")
print(f"Final Performance: {best_accuracy:.4f} accuracy")
# The '+' format flag prints the real sign; a hard-coded "+" would render a
# drop below the baseline as "+-1.23%".
print(f"Improvement: {best_improvement*100:+.2f}% over baseline")
print("="*50)

In [None]:
# Technique 1: Ensemble Methods
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Prepare data
# Same arguments as the split in the first cell (test_size, random_state,
# stratify), so the train/test variables are rebuilt from df rather than changed.
X = df['content']
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Vectorize
# NOTE: this rebinds X_train_tfidf / X_test_tfidf, replacing the matrices that
# were built with best_vectorizer earlier in the notebook.
tfidf = TfidfVectorizer(max_features=5000, stop_words='english', ngram_range=(1,2))
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Create ensemble: majority (hard) vote over four untuned classifiers
ensemble = VotingClassifier([
    ('lr', LogisticRegression(max_iter=500, random_state=42)),
    ('nb', MultinomialNB()),
    ('svm', LinearSVC(random_state=42)),
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42))
], voting='hard')

# Fit on the training matrix and score on the held-out test matrix
ensemble.fit(X_train_tfidf, y_train)
ensemble_pred = ensemble.predict(X_test_tfidf)
ensemble_accuracy = accuracy_score(y_test, ensemble_pred)

print(f"Ensemble Accuracy: {ensemble_accuracy:.4f}")
# 0.8129 is the baseline SVM test accuracy quoted elsewhere in this notebook
print(f"Improvement over baseline: {ensemble_accuracy - 0.8129:.4f}")

In [None]:
# Technique 2: Hyperparameter Tuning
from sklearn.model_selection import GridSearchCV

# Tune SVM parameters
# Grid over the regularisation strength C and the solver iteration cap.
svm_params = {
    'C': [0.1, 1, 10, 100],
    'max_iter': [1000, 2000, 3000]
}

# 5-fold grid search on whichever X_train_tfidf was assigned last
# (the "Technique 1" cell above rebinds it).
svm_grid = GridSearchCV(LinearSVC(random_state=42), svm_params, cv=5, scoring='accuracy')
svm_grid.fit(X_train_tfidf, y_train)

print(f"Best SVM parameters: {svm_grid.best_params_}")
print(f"Best SVM CV score: {svm_grid.best_score_:.4f}")

# Test best model on the held-out test matrix
best_svm = svm_grid.best_estimator_
best_svm_pred = best_svm.predict(X_test_tfidf)
best_svm_accuracy = accuracy_score(y_test, best_svm_pred)
print(f"Tuned SVM Test Accuracy: {best_svm_accuracy:.4f}")

In [None]:
# Technique 3: Better Feature Engineering
from sklearn.feature_extraction.text import TfidfVectorizer
import re

def advanced_text_preprocessing(text):
    """Normalise one document before TF-IDF vectorisation.

    Steps, in order: lowercase; drop URLs, @mentions and #hashtags; drop every
    character that is not a-z or whitespace; collapse runs of whitespace.

    Args:
        text: The document. ``None`` and float NaN (how pandas represents an
            empty CSV cell) are treated as an empty document; any other
            non-string value is converted with ``str()`` first.

    Returns:
        The cleaned string, possibly empty.
    """
    # Missing cells would otherwise fail on .lower(); NaN is the only float
    # that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Convert to lowercase
    text = text.lower()

    # Remove URLs, mentions, hashtags
    text = re.sub(r'http\S+|www\S+|@\w+|#\w+', '', text)

    # Keep only letters and spaces (digits, punctuation and apostrophes are
    # removed without inserting a space, so "don't" becomes "dont")
    text = re.sub(r'[^a-z\s]', '', text)

    # Remove extra whitespace
    text = ' '.join(text.split())

    return text

# Apply preprocessing to every document of both splits
X_train_clean = X_train.apply(advanced_text_preprocessing)
X_test_clean = X_test.apply(advanced_text_preprocessing)

# Advanced TF-IDF
tfidf_advanced = TfidfVectorizer(
    max_features=10000,    # More features
    stop_words='english',
    ngram_range=(1, 3),    # Include trigrams
    min_df=3,
    max_df=0.9,
    sublinear_tf=True     # Use log normalization
)

# Fit the vocabulary on the training split only; the test split is just transformed
X_train_advanced = tfidf_advanced.fit_transform(X_train_clean)
X_test_advanced = tfidf_advanced.transform(X_test_clean)

# Test with advanced features (same SVM settings as the vectorizer comparison cell)
svm_advanced = LinearSVC(random_state=42, max_iter=2000)
svm_advanced.fit(X_train_advanced, y_train)
advanced_pred = svm_advanced.predict(X_test_advanced)
advanced_accuracy = accuracy_score(y_test, advanced_pred)

print(f"Advanced Features Accuracy: {advanced_accuracy:.4f}")
print(f"Feature matrix shape: {X_train_advanced.shape}")

## Next: BERT/Transformer Models

For even better performance, consider:
1. **Hugging Face Transformers**
2. **Fine-tuned BERT models**
3. **Mental health specific pre-trained models**

This would require additional libraries:
```bash
pip install transformers torch
```