# Model Improvement: Advanced Techniques for Higher Accuracy

This notebook implements several advanced techniques to improve model accuracy:
1. Multiple model comparison (Logistic Regression, Random Forest, XGBoost, SVM)
2. Hyperparameter tuning with GridSearchCV
3. Ensemble methods (Voting Classifier)
4. Feature selection
5. Cross-validation for robust evaluation
6. Better feature engineering


## 1. Import Libraries


In [1]:
import pandas as pd
import numpy as np
import re
import string
import pickle
import os
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, classification_report
from sklearn.feature_selection import SelectKBest, chi2
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

# Download NLTK resources
try:
    nltk.download('stopwords', quiet=True)
    nltk.download('wordnet', quiet=True)
    nltk.download('omw-1.4', quiet=True)
except:
    pass

print("Libraries imported successfully!")


Libraries imported successfully!


## 2. Preprocessing Function


In [3]:
# Preprocessing function
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    if pd.isna(text) or text is None:
        return ''
    text = str(text).lower()
    text = re.sub(r"http\S+|www\S+", '', text)
    text = re.sub(r"<.*?>", " ", text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    words = text.split()
    words = [w for w in words if w.isalpha()]
    words = [w for w in words if w not in stop_words]
    words = [lemmatizer.lemmatize(w) for w in words]
    return ' '.join(words)


## 3. Load and Preprocess Data


## 3.1. Data Visualization - Label Distribution


In [None]:
# Visualize label distribution
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Count plot
label_counts = df['is_depression'].value_counts()
axes[0].bar(['No Depression', 'Depression'], label_counts.values, color=['#4caf50', '#f44336'], alpha=0.7)
axes[0].set_title('Label Distribution (Count)', fontsize=14, fontweight='bold')
axes[0].set_ylabel('Count', fontsize=12)
axes[0].set_xlabel('Label', fontsize=12)
for i, v in enumerate(label_counts.values):
    axes[0].text(i, v + 50, str(v), ha='center', fontweight='bold', fontsize=12)

# Pie chart
label_props = df['is_depression'].value_counts(normalize=True)
colors = ['#4caf50', '#f44336']
axes[1].pie(label_props.values, labels=['No Depression', 'Depression'], autopct='%1.1f%%', 
            colors=colors, startangle=90, textprops={'fontsize': 12, 'fontweight': 'bold'})
axes[1].set_title('Label Distribution (Percentage)', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

print(f"Dataset is {'balanced' if abs(label_props[0] - label_props[1]) < 0.1 else 'imbalanced'}")


## 3.2. Text Length Analysis


In [None]:
# Calculate text lengths
df['text_length'] = df['clean_text'].apply(lambda x: len(str(x).split()))
df['char_length'] = df['clean_text'].apply(lambda x: len(str(x)))

# Create visualizations
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# 1. Word count distribution by label
for label in [0, 1]:
    label_name = 'No Depression' if label == 0 else 'Depression'
    data = df[df['is_depression'] == label]['text_length']
    axes[0, 0].hist(data, bins=50, alpha=0.6, label=label_name, 
                    color='#4caf50' if label == 0 else '#f44336', edgecolor='black')
axes[0, 0].set_xlabel('Word Count', fontsize=12)
axes[0, 0].set_ylabel('Frequency', fontsize=12)
axes[0, 0].set_title('Word Count Distribution by Label', fontsize=14, fontweight='bold')
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=0.3)

# 2. Box plot comparison
df_box = df[['is_depression', 'text_length']].copy()
df_box['is_depression'] = df_box['is_depression'].map({0: 'No Depression', 1: 'Depression'})
sns.boxplot(data=df_box, x='is_depression', y='text_length', ax=axes[0, 1], 
            palette=['#4caf50', '#f44336'])
axes[0, 1].set_title('Word Count Distribution (Box Plot)', fontsize=14, fontweight='bold')
axes[0, 1].set_xlabel('Label', fontsize=12)
axes[0, 1].set_ylabel('Word Count', fontsize=12)
axes[0, 1].grid(True, alpha=0.3)

# 3. Average text length by label
avg_lengths = df.groupby('is_depression')['text_length'].mean()
bars = axes[1, 0].bar(['No Depression', 'Depression'], avg_lengths.values, 
                      color=['#4caf50', '#f44336'], alpha=0.7, edgecolor='black')
axes[1, 0].set_title('Average Word Count by Label', fontsize=14, fontweight='bold')
axes[1, 0].set_ylabel('Average Word Count', fontsize=12)
axes[1, 0].set_xlabel('Label', fontsize=12)
for i, v in enumerate(avg_lengths.values):
    axes[1, 0].text(i, v + 2, f'{v:.1f}', ha='center', fontweight='bold', fontsize=12)
axes[1, 0].grid(True, alpha=0.3, axis='y')

# 4. Character length distribution
for label in [0, 1]:
    label_name = 'No Depression' if label == 0 else 'Depression'
    data = df[df['is_depression'] == label]['char_length']
    axes[1, 1].hist(data, bins=50, alpha=0.6, label=label_name,
                    color='#4caf50' if label == 0 else '#f44336', edgecolor='black')
axes[1, 1].set_xlabel('Character Count', fontsize=12)
axes[1, 1].set_ylabel('Frequency', fontsize=12)
axes[1, 1].set_title('Character Count Distribution by Label', fontsize=14, fontweight='bold')
axes[1, 1].legend()
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("\n=== Text Length Statistics ===")
print(f"\nOverall Statistics:")
print(df['text_length'].describe())
print(f"\nBy Label:")
print(df.groupby('is_depression')['text_length'].describe())


## 3.3. Word Frequency Analysis & Word Clouds


In [None]:
# Analyze most common words by label
def get_top_words(texts, n=20):
    """Get top N words from a list of texts"""
    all_words = []
    for text in texts:
        words = str(text).lower().split()
        all_words.extend(words)
    word_freq = Counter(all_words)
    return word_freq.most_common(n)

# Get top words for each class
depression_texts = df[df['is_depression'] == 1]['processed_text']
no_depression_texts = df[df['is_depression'] == 0]['processed_text']

top_depression_words = get_top_words(depression_texts, 20)
top_no_depression_words = get_top_words(no_depression_texts, 20)

# Visualize top words
fig, axes = plt.subplots(1, 2, figsize=(16, 8))

# Depression words
dep_words, dep_counts = zip(*top_depression_words)
axes[0].barh(range(len(dep_words)), dep_counts, color='#f44336', alpha=0.7)
axes[0].set_yticks(range(len(dep_words)))
axes[0].set_yticklabels(dep_words, fontsize=10)
axes[0].set_xlabel('Frequency', fontsize=12)
axes[0].set_title('Top 20 Words - Depression Class', fontsize=14, fontweight='bold')
axes[0].invert_yaxis()
axes[0].grid(True, alpha=0.3, axis='x')

# No Depression words
no_dep_words, no_dep_counts = zip(*top_no_depression_words)
axes[1].barh(range(len(no_dep_words)), no_dep_counts, color='#4caf50', alpha=0.7)
axes[1].set_yticks(range(len(no_dep_words)))
axes[1].set_yticklabels(no_dep_words, fontsize=10)
axes[1].set_xlabel('Frequency', fontsize=12)
axes[1].set_title('Top 20 Words - No Depression Class', fontsize=14, fontweight='bold')
axes[1].invert_yaxis()
axes[1].grid(True, alpha=0.3, axis='x')

plt.tight_layout()
plt.show()

# Word Clouds
print("\nGenerating Word Clouds...")
fig, axes = plt.subplots(1, 2, figsize=(18, 8))

# Depression word cloud
depression_text = ' '.join(depression_texts.astype(str))
wordcloud_dep = WordCloud(width=800, height=400, background_color='white', 
                          colormap='Reds', max_words=100).generate(depression_text)
axes[0].imshow(wordcloud_dep, interpolation='bilinear')
axes[0].axis('off')
axes[0].set_title('Word Cloud - Depression Class', fontsize=16, fontweight='bold', pad=20)

# No Depression word cloud
no_depression_text = ' '.join(no_depression_texts.astype(str))
wordcloud_no_dep = WordCloud(width=800, height=400, background_color='white', 
                            colormap='Greens', max_words=100).generate(no_depression_text)
axes[1].imshow(wordcloud_no_dep, interpolation='bilinear')
axes[1].axis('off')
axes[1].set_title('Word Cloud - No Depression Class', fontsize=16, fontweight='bold', pad=20)

plt.tight_layout()
plt.show()


In [4]:
# Set base directory
base_dir = 'D:/mental_health_detector'

# Load data
data_path = os.path.join(base_dir, 'data/processed/depression_dataset_processed.csv')
raw_data_path = os.path.join(base_dir, 'data/raw/depression_dataset_reddit_cleaned.csv')

if os.path.exists(data_path):
    df = pd.read_csv(data_path)
    if 'processed_text' not in df.columns:
        df['processed_text'] = df['clean_text'].apply(preprocess_text)
else:
    df = pd.read_csv(raw_data_path)
    df['processed_text'] = df['clean_text'].apply(preprocess_text)

# Clean data
df = df[df['processed_text'].notna() & (df['processed_text'].str.strip() != '')]
df = df[df['is_depression'].notna()]

print(f"Dataset shape: {df.shape}")
print(f"Label distribution:")
print(df['is_depression'].value_counts())


Dataset shape: (7730, 4)
Label distribution:
is_depression
0    3900
1    3830
Name: count, dtype: int64


## 4. Enhanced Feature Engineering


## 7.1. Model Comparison Visualization


In [None]:
# Visualize model comparison results
if 'results' in locals() and len(results) > 0:
    model_names = list(results.keys())
    cv_scores = [results[name]['mean_score'] for name in model_names]
    cv_stds = [results[name]['std_score'] for name in model_names]
    
    fig, ax = plt.subplots(figsize=(14, 8))
    bars = ax.barh(model_names, cv_scores, xerr=cv_stds, color=['#2196F3', '#4caf50', '#ff9800', '#9c27b0', '#e91e63', '#00bcd4'], 
                   alpha=0.7, edgecolor='black', capsize=5)
    ax.set_xlabel('Cross-Validation Accuracy', fontsize=12)
    ax.set_title('Model Comparison (5-Fold CV)', fontsize=16, fontweight='bold')
    ax.set_xlim([0.85, 1.0])
    ax.grid(True, alpha=0.3, axis='x')
    
    # Add value labels
    for i, (name, score, std) in enumerate(zip(model_names, cv_scores, cv_stds)):
        ax.text(score + std + 0.005, i, f'{score:.4f} (±{std:.4f})', 
               va='center', fontweight='bold', fontsize=10)
    
    # Highlight best model
    best_idx = np.argmax(cv_scores)
    bars[best_idx].set_color('#f44336')
    bars[best_idx].set_edgecolor('black')
    bars[best_idx].set_linewidth(3)
    
    plt.tight_layout()
    plt.show()
    
    print(f"\n=== Model Comparison Summary ===")
    for name in model_names:
        print(f"{name}: {results[name]['mean_score']:.4f} (±{results[name]['std_score']:.4f})")


In [5]:
# Enhanced TF-IDF with better parameters
vectorizer = TfidfVectorizer(
    max_features=8000,  # Increased from 5000
    ngram_range=(1, 2),
    min_df=2,  # Ignore terms that appear in less than 2 documents
    max_df=0.95,  # Ignore terms that appear in more than 95% of documents
    sublinear_tf=True  # Apply sublinear tf scaling (1 + log(tf))
)

processed_texts = df['processed_text'].fillna('').astype(str).tolist()
X_tfidf = vectorizer.fit_transform(processed_texts)
y = df['is_depression'].values

print(f"Feature matrix shape: {X_tfidf.shape}")
print(f"Number of features: {X_tfidf.shape[1]}")


Feature matrix shape: (7730, 8000)
Number of features: 8000


## 5. Feature Selection (Optional - can improve accuracy by removing noise)


In [6]:
# Feature selection - select top k features using chi2
# This can help remove noise and improve accuracy
k_best = 6000  # Select top 6000 features
selector = SelectKBest(chi2, k=min(k_best, X_tfidf.shape[1]))
X_selected = selector.fit_transform(X_tfidf, y)

print(f"Original features: {X_tfidf.shape[1]}")
print(f"Selected features: {X_selected.shape[1]}")

# Use selected features
X = X_selected
# Or use all features: X = X_tfidf


Original features: 8000
Selected features: 6000


## 6. Train-Test Split


## 10.1. Final Model Performance Visualization


In [None]:
# Create comprehensive performance visualizations for best model
best_model_name = max(final_results, key=lambda x: final_results[x]['accuracy'])
best_result = final_results[best_model_name]

fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# 1. Confusion Matrix
y_pred_best = best_result['model'].predict(X_test)
cm = confusion_matrix(y_test, y_pred_best)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes[0, 0],
            xticklabels=['No Depression', 'Depression'],
            yticklabels=['No Depression', 'Depression'],
            cbar_kws={'label': 'Count'})
axes[0, 0].set_title(f'Confusion Matrix - {best_model_name}', fontsize=14, fontweight='bold')
axes[0, 0].set_ylabel('True Label', fontsize=12)
axes[0, 0].set_xlabel('Predicted Label', fontsize=12)

# Add percentages
cm_percent = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] * 100
for i in range(2):
    for j in range(2):
        axes[0, 0].text(j+0.5, i+0.7, f'{cm_percent[i, j]:.1f}%', 
                       ha='center', va='center', fontsize=10, color='red', fontweight='bold')

# 2. ROC Curve
y_pred_proba_best = best_result['model'].predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba_best)
roc_auc = roc_auc_score(y_test, y_pred_proba_best)

axes[0, 1].plot(fpr, tpr, color='#2196F3', lw=2, label=f'ROC curve (AUC = {roc_auc:.3f})')
axes[0, 1].plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--', label='Random Classifier')
axes[0, 1].set_xlim([0.0, 1.0])
axes[0, 1].set_ylim([0.0, 1.05])
axes[0, 1].set_xlabel('False Positive Rate', fontsize=12)
axes[0, 1].set_ylabel('True Positive Rate', fontsize=12)
axes[0, 1].set_title(f'ROC Curve - {best_model_name}', fontsize=14, fontweight='bold')
axes[0, 1].legend(loc="lower right")
axes[0, 1].grid(True, alpha=0.3)

# 3. Metrics Comparison
metrics = {
    'Accuracy': best_result['accuracy'],
    'Precision': best_result['precision'],
    'Recall': best_result['recall'],
    'F1-Score': best_result['f1']
}
bars = axes[1, 0].bar(metrics.keys(), metrics.values(), 
                     color=['#4caf50', '#2196F3', '#ff9800', '#9c27b0'], alpha=0.7, edgecolor='black')
axes[1, 0].set_ylim([0, 1])
axes[1, 0].set_ylabel('Score', fontsize=12)
axes[1, 0].set_title(f'Performance Metrics - {best_model_name}', fontsize=14, fontweight='bold')
axes[1, 0].grid(True, alpha=0.3, axis='y')
for i, (key, value) in enumerate(metrics.items()):
    axes[1, 0].text(i, value + 0.02, f'{value:.3f}', ha='center', fontweight='bold', fontsize=11)

# 4. Probability Distribution
y_pred_proba_depression = y_pred_proba_best
y_pred_proba_no_depression = 1 - y_pred_proba_best

axes[1, 1].hist(y_pred_proba_no_depression[y_test == 0], bins=30, alpha=0.6, 
                label='No Depression (True)', color='#4caf50', edgecolor='black')
axes[1, 1].hist(y_pred_proba_depression[y_test == 1], bins=30, alpha=0.6, 
                label='Depression (True)', color='#f44336', edgecolor='black')
axes[1, 1].axvline(x=0.5, color='black', linestyle='--', linewidth=2, label='Decision Threshold (0.5)')
axes[1, 1].set_xlabel('Predicted Probability', fontsize=12)
axes[1, 1].set_ylabel('Frequency', fontsize=12)
axes[1, 1].set_title('Prediction Probability Distribution', fontsize=14, fontweight='bold')
axes[1, 1].legend()
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print(f"\n=== {best_model_name} Performance Summary ===")
print(f"ROC-AUC Score: {roc_auc:.4f}")
print(f"Accuracy: {best_result['accuracy']:.4f}")
print(f"Precision: {best_result['precision']:.4f}")
print(f"Recall: {best_result['recall']:.4f}")
print(f"F1-Score: {best_result['f1']:.4f}")


## 10.2. All Models Performance Comparison


In [None]:
# Compare all final models side by side
if len(final_results) > 1:
    model_names = list(final_results.keys())
    accuracies = [final_results[name]['accuracy'] for name in model_names]
    precisions = [final_results[name]['precision'] for name in model_names]
    recalls = [final_results[name]['recall'] for name in model_names]
    f1_scores = [final_results[name]['f1'] for name in model_names]
    
    x = np.arange(len(model_names))
    width = 0.2
    
    fig, ax = plt.subplots(figsize=(14, 8))
    bars1 = ax.bar(x - 1.5*width, accuracies, width, label='Accuracy', color='#4caf50', alpha=0.7, edgecolor='black')
    bars2 = ax.bar(x - 0.5*width, precisions, width, label='Precision', color='#2196F3', alpha=0.7, edgecolor='black')
    bars3 = ax.bar(x + 0.5*width, recalls, width, label='Recall', color='#ff9800', alpha=0.7, edgecolor='black')
    bars4 = ax.bar(x + 1.5*width, f1_scores, width, label='F1-Score', color='#9c27b0', alpha=0.7, edgecolor='black')
    
    ax.set_ylabel('Score', fontsize=12)
    ax.set_title('Model Performance Comparison', fontsize=16, fontweight='bold')
    ax.set_xticks(x)
    ax.set_xticklabels(model_names, fontsize=11)
    ax.legend(fontsize=11)
    ax.set_ylim([0, 1.05])
    ax.grid(True, alpha=0.3, axis='y')
    
    # Add value labels
    for bars in [bars1, bars2, bars3, bars4]:
        for bar in bars:
            height = bar.get_height()
            ax.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                   f'{height:.3f}', ha='center', va='bottom', fontsize=9, fontweight='bold')
    
    plt.tight_layout()
    plt.show()
    
    # Create comparison table
    comparison_df = pd.DataFrame({
        'Model': model_names,
        'Accuracy': accuracies,
        'Precision': precisions,
        'Recall': recalls,
        'F1-Score': f1_scores
    })
    print("\n=== Model Performance Comparison Table ===")
    print(comparison_df.to_string(index=False))


In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training samples: {X_train.shape[0]}")
print(f"Test samples: {X_test.shape[0]}")


Training samples: 6184
Test samples: 1546


## 7. Model Comparison - Test Multiple Algorithms


In [8]:
# Define models to test
models = {
    'Logistic Regression': LogisticRegression(max_iter=2000, random_state=42, class_weight='balanced'),
    'Random Forest': RandomForestClassifier(n_estimators=200, max_depth=20, random_state=42, class_weight='balanced', n_jobs=-1),
    'XGBoost': xgb.XGBClassifier(n_estimators=200, max_depth=6, learning_rate=0.1, random_state=42, eval_metric='logloss'),
    'SVM': SVC(kernel='linear', probability=True, random_state=42, class_weight='balanced'),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=200, max_depth=5, learning_rate=0.1, random_state=42),
    'Naive Bayes': MultinomialNB(alpha=0.1)
}

# Evaluate each model with cross-validation
results = {}
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

print("Evaluating models with 5-fold cross-validation...\n")
for name, model in models.items():
    scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='accuracy', n_jobs=-1)
    results[name] = {
        'mean_score': scores.mean(),
        'std_score': scores.std(),
        'model': model
    }
    print(f"{name}: {scores.mean():.4f} (+/- {scores.std()*2:.4f})")

# Find best model
best_model_name = max(results, key=lambda x: results[x]['mean_score'])
print(f"\nBest model: {best_model_name} with CV accuracy: {results[best_model_name]['mean_score']:.4f}")


Evaluating models with 5-fold cross-validation...

Logistic Regression: 0.9531 (+/- 0.0141)
Random Forest: 0.8984 (+/- 0.0144)
XGBoost: 0.9544 (+/- 0.0125)
SVM: 0.9559 (+/- 0.0149)
Gradient Boosting: 0.9486 (+/- 0.0180)
Naive Bayes: 0.9125 (+/- 0.0197)

Best model: SVM with CV accuracy: 0.9559


## 8. Hyperparameter Tuning for Best Model


In [9]:
# Hyperparameter tuning for XGBoost (usually performs best)
print("Performing hyperparameter tuning for XGBoost...")

param_grid = {
    'n_estimators': [200, 300],
    'max_depth': [5, 6, 7],
    'learning_rate': [0.05, 0.1],
    'subsample': [0.8, 1.0]
}

xgb_base = xgb.XGBClassifier(random_state=42, eval_metric='logloss')
grid_search = GridSearchCV(
    xgb_base, param_grid, cv=3, scoring='accuracy', n_jobs=-1, verbose=1
)

grid_search.fit(X_train, y_train)

print(f"\nBest parameters: {grid_search.best_params_}")
print(f"Best CV score: {grid_search.best_score_:.4f}")

best_xgb = grid_search.best_estimator_


Performing hyperparameter tuning for XGBoost...
Fitting 3 folds for each of 24 candidates, totalling 72 fits

Best parameters: {'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 300, 'subsample': 1.0}
Best CV score: 0.9546


## 9. Ensemble Model - Voting Classifier


In [10]:
# Create ensemble of best performing models
ensemble = VotingClassifier(
    estimators=[
        ('xgb', best_xgb if 'best_xgb' in locals() else xgb.XGBClassifier(n_estimators=200, max_depth=6, learning_rate=0.1, random_state=42, eval_metric='logloss')),
        ('rf', RandomForestClassifier(n_estimators=200, max_depth=20, random_state=42, class_weight='balanced', n_jobs=-1)),
        ('lr', LogisticRegression(max_iter=2000, random_state=42, class_weight='balanced', C=1.0))
    ],
    voting='soft',  # Use probability voting
    n_jobs=-1
)

print("Training ensemble model...")
ensemble.fit(X_train, y_train)

# Evaluate ensemble
ensemble_scores = cross_val_score(ensemble, X_train, y_train, cv=cv, scoring='accuracy', n_jobs=-1)
print(f"Ensemble CV accuracy: {ensemble_scores.mean():.4f} (+/- {ensemble_scores.std()*2:.4f})")


Training ensemble model...
Ensemble CV accuracy: 0.9575 (+/- 0.0120)


## 10. Final Model Evaluation on Test Set


In [11]:
# Train final models and evaluate on test set
final_models = {
    'Best XGBoost': best_xgb if 'best_xgb' in locals() else xgb.XGBClassifier(n_estimators=300, max_depth=6, learning_rate=0.1, random_state=42, eval_metric='logloss'),
    'Ensemble': ensemble
}

final_results = {}

for name, model in final_models.items():
    # Train
    model.fit(X_train, y_train)
    
    # Predict
    y_pred = model.predict(X_test)
    
    # Metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')
    
    final_results[name] = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'model': model
    }
    
    print(f"\n=== {name} Performance ===")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")
    print(f"\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print(f"\nClassification Report:")
    print(classification_report(y_test, y_pred))



=== Best XGBoost Performance ===
Accuracy: 0.9599
Precision: 0.9770
Recall: 0.9413
F1-Score: 0.9588

Confusion Matrix:
[[763  17]
 [ 45 721]]

Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.98      0.96       780
           1       0.98      0.94      0.96       766

    accuracy                           0.96      1546
   macro avg       0.96      0.96      0.96      1546
weighted avg       0.96      0.96      0.96      1546


=== Ensemble Performance ===
Accuracy: 0.9638
Precision: 0.9876
Recall: 0.9386
F1-Score: 0.9625

Confusion Matrix:
[[771   9]
 [ 47 719]]

Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.99      0.96       780
           1       0.99      0.94      0.96       766

    accuracy                           0.96      1546
   macro avg       0.97      0.96      0.96      1546
weighted avg       0.96      0.96      0.96      1546



## 11. Save Best Model


In [12]:
# Find best model
best_final_name = max(final_results, key=lambda x: final_results[x]['accuracy'])
best_final_model = final_results[best_final_name]['model']

print(f"Best model: {best_final_name}")
print(f"Accuracy: {final_results[best_final_name]['accuracy']:.4f}")

# Save model and vectorizer
models_dir = os.path.join(base_dir, 'models')
os.makedirs(models_dir, exist_ok=True)

# Save improved model
model_path = os.path.join(models_dir, 'mental_health_model_improved.pkl')
with open(model_path, 'wb') as f:
    pickle.dump(best_final_model, f)
print(f"\nModel saved to {model_path}")

# Save vectorizer
vectorizer_path = os.path.join(models_dir, 'tfidf_vectorizer_improved.pkl')
with open(vectorizer_path, 'wb') as f:
    pickle.dump(vectorizer, f)
print(f"Vectorizer saved to {vectorizer_path}")

# Save selector if used
if 'selector' in locals():
    selector_path = os.path.join(models_dir, 'feature_selector.pkl')
    with open(selector_path, 'wb') as f:
        pickle.dump(selector, f)
    print(f"Feature selector saved to {selector_path}")


Best model: Ensemble
Accuracy: 0.9638

Model saved to D:/mental_health_detector\models\mental_health_model_improved.pkl
Vectorizer saved to D:/mental_health_detector\models\tfidf_vectorizer_improved.pkl
Feature selector saved to D:/mental_health_detector\models\feature_selector.pkl
