# Model Improvement: Advanced Techniques for Higher Accuracy

This notebook implements several advanced techniques to improve model accuracy:
1. Multiple model comparison (Logistic Regression, Random Forest, XGBoost, SVM)
2. Hyperparameter tuning with GridSearchCV
3. Ensemble methods (Voting Classifier)
4. Feature selection
5. Cross-validation for robust evaluation
6. Better feature engineering


## 1. Import Libraries


In [1]:
import pandas as pd
import numpy as np
import re
import string
import pickle
import os
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, classification_report
from sklearn.feature_selection import SelectKBest, chi2
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

# Download NLTK resources
try:
    nltk.download('stopwords', quiet=True)
    nltk.download('wordnet', quiet=True)
    nltk.download('omw-1.4', quiet=True)
except:
    pass

print("Libraries imported successfully!")


Libraries imported successfully!


## 2. Preprocessing Function


In [3]:
# Preprocessing function
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    if pd.isna(text) or text is None:
        return ''
    text = str(text).lower()
    text = re.sub(r"http\S+|www\S+", '', text)
    text = re.sub(r"<.*?>", " ", text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    words = text.split()
    words = [w for w in words if w.isalpha()]
    words = [w for w in words if w not in stop_words]
    words = [lemmatizer.lemmatize(w) for w in words]
    return ' '.join(words)


## 3. Load and Preprocess Data


In [4]:
# Set base directory
base_dir = 'D:/mental_health_detector'

# Load data
data_path = os.path.join(base_dir, 'data/processed/depression_dataset_processed.csv')
raw_data_path = os.path.join(base_dir, 'data/raw/depression_dataset_reddit_cleaned.csv')

if os.path.exists(data_path):
    df = pd.read_csv(data_path)
    if 'processed_text' not in df.columns:
        df['processed_text'] = df['clean_text'].apply(preprocess_text)
else:
    df = pd.read_csv(raw_data_path)
    df['processed_text'] = df['clean_text'].apply(preprocess_text)

# Clean data
df = df[df['processed_text'].notna() & (df['processed_text'].str.strip() != '')]
df = df[df['is_depression'].notna()]

print(f"Dataset shape: {df.shape}")
print(f"Label distribution:")
print(df['is_depression'].value_counts())


Dataset shape: (7730, 4)
Label distribution:
is_depression
0    3900
1    3830
Name: count, dtype: int64


## 4. Enhanced Feature Engineering


In [5]:
# Enhanced TF-IDF with better parameters
vectorizer = TfidfVectorizer(
    max_features=8000,  # Increased from 5000
    ngram_range=(1, 2),
    min_df=2,  # Ignore terms that appear in less than 2 documents
    max_df=0.95,  # Ignore terms that appear in more than 95% of documents
    sublinear_tf=True  # Apply sublinear tf scaling (1 + log(tf))
)

processed_texts = df['processed_text'].fillna('').astype(str).tolist()
X_tfidf = vectorizer.fit_transform(processed_texts)
y = df['is_depression'].values

print(f"Feature matrix shape: {X_tfidf.shape}")
print(f"Number of features: {X_tfidf.shape[1]}")


Feature matrix shape: (7730, 8000)
Number of features: 8000


## 5. Feature Selection (Optional - can improve accuracy by removing noise)


In [6]:
# Feature selection - select top k features using chi2
# This can help remove noise and improve accuracy
k_best = 6000  # Select top 6000 features
selector = SelectKBest(chi2, k=min(k_best, X_tfidf.shape[1]))
X_selected = selector.fit_transform(X_tfidf, y)

print(f"Original features: {X_tfidf.shape[1]}")
print(f"Selected features: {X_selected.shape[1]}")

# Use selected features
X = X_selected
# Or use all features: X = X_tfidf


Original features: 8000
Selected features: 6000


## 6. Train-Test Split


In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training samples: {X_train.shape[0]}")
print(f"Test samples: {X_test.shape[0]}")


Training samples: 6184
Test samples: 1546


## 7. Model Comparison - Test Multiple Algorithms


In [8]:
# Define models to test
models = {
    'Logistic Regression': LogisticRegression(max_iter=2000, random_state=42, class_weight='balanced'),
    'Random Forest': RandomForestClassifier(n_estimators=200, max_depth=20, random_state=42, class_weight='balanced', n_jobs=-1),
    'XGBoost': xgb.XGBClassifier(n_estimators=200, max_depth=6, learning_rate=0.1, random_state=42, eval_metric='logloss'),
    'SVM': SVC(kernel='linear', probability=True, random_state=42, class_weight='balanced'),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=200, max_depth=5, learning_rate=0.1, random_state=42),
    'Naive Bayes': MultinomialNB(alpha=0.1)
}

# Evaluate each model with cross-validation
results = {}
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

print("Evaluating models with 5-fold cross-validation...\n")
for name, model in models.items():
    scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='accuracy', n_jobs=-1)
    results[name] = {
        'mean_score': scores.mean(),
        'std_score': scores.std(),
        'model': model
    }
    print(f"{name}: {scores.mean():.4f} (+/- {scores.std()*2:.4f})")

# Find best model
best_model_name = max(results, key=lambda x: results[x]['mean_score'])
print(f"\nBest model: {best_model_name} with CV accuracy: {results[best_model_name]['mean_score']:.4f}")


Evaluating models with 5-fold cross-validation...

Logistic Regression: 0.9531 (+/- 0.0141)
Random Forest: 0.8984 (+/- 0.0144)
XGBoost: 0.9544 (+/- 0.0125)
SVM: 0.9559 (+/- 0.0149)
Gradient Boosting: 0.9486 (+/- 0.0180)
Naive Bayes: 0.9125 (+/- 0.0197)

Best model: SVM with CV accuracy: 0.9559


## 8. Hyperparameter Tuning for Best Model


In [9]:
# Hyperparameter tuning for XGBoost (usually performs best)
print("Performing hyperparameter tuning for XGBoost...")

param_grid = {
    'n_estimators': [200, 300],
    'max_depth': [5, 6, 7],
    'learning_rate': [0.05, 0.1],
    'subsample': [0.8, 1.0]
}

xgb_base = xgb.XGBClassifier(random_state=42, eval_metric='logloss')
grid_search = GridSearchCV(
    xgb_base, param_grid, cv=3, scoring='accuracy', n_jobs=-1, verbose=1
)

grid_search.fit(X_train, y_train)

print(f"\nBest parameters: {grid_search.best_params_}")
print(f"Best CV score: {grid_search.best_score_:.4f}")

best_xgb = grid_search.best_estimator_


Performing hyperparameter tuning for XGBoost...
Fitting 3 folds for each of 24 candidates, totalling 72 fits

Best parameters: {'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 300, 'subsample': 1.0}
Best CV score: 0.9546


## 9. Ensemble Model - Voting Classifier


In [10]:
# Create ensemble of best performing models
ensemble = VotingClassifier(
    estimators=[
        ('xgb', best_xgb if 'best_xgb' in locals() else xgb.XGBClassifier(n_estimators=200, max_depth=6, learning_rate=0.1, random_state=42, eval_metric='logloss')),
        ('rf', RandomForestClassifier(n_estimators=200, max_depth=20, random_state=42, class_weight='balanced', n_jobs=-1)),
        ('lr', LogisticRegression(max_iter=2000, random_state=42, class_weight='balanced', C=1.0))
    ],
    voting='soft',  # Use probability voting
    n_jobs=-1
)

print("Training ensemble model...")
ensemble.fit(X_train, y_train)

# Evaluate ensemble
ensemble_scores = cross_val_score(ensemble, X_train, y_train, cv=cv, scoring='accuracy', n_jobs=-1)
print(f"Ensemble CV accuracy: {ensemble_scores.mean():.4f} (+/- {ensemble_scores.std()*2:.4f})")


Training ensemble model...
Ensemble CV accuracy: 0.9575 (+/- 0.0120)


## 10. Final Model Evaluation on Test Set


In [11]:
# Train final models and evaluate on test set
final_models = {
    'Best XGBoost': best_xgb if 'best_xgb' in locals() else xgb.XGBClassifier(n_estimators=300, max_depth=6, learning_rate=0.1, random_state=42, eval_metric='logloss'),
    'Ensemble': ensemble
}

final_results = {}

for name, model in final_models.items():
    # Train
    model.fit(X_train, y_train)
    
    # Predict
    y_pred = model.predict(X_test)
    
    # Metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')
    
    final_results[name] = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'model': model
    }
    
    print(f"\n=== {name} Performance ===")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")
    print(f"\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print(f"\nClassification Report:")
    print(classification_report(y_test, y_pred))



=== Best XGBoost Performance ===
Accuracy: 0.9599
Precision: 0.9770
Recall: 0.9413
F1-Score: 0.9588

Confusion Matrix:
[[763  17]
 [ 45 721]]

Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.98      0.96       780
           1       0.98      0.94      0.96       766

    accuracy                           0.96      1546
   macro avg       0.96      0.96      0.96      1546
weighted avg       0.96      0.96      0.96      1546


=== Ensemble Performance ===
Accuracy: 0.9638
Precision: 0.9876
Recall: 0.9386
F1-Score: 0.9625

Confusion Matrix:
[[771   9]
 [ 47 719]]

Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.99      0.96       780
           1       0.99      0.94      0.96       766

    accuracy                           0.96      1546
   macro avg       0.97      0.96      0.96      1546
weighted avg       0.96      0.96      0.96      1546



## 11. Save Best Model


In [12]:
# Find best model
best_final_name = max(final_results, key=lambda x: final_results[x]['accuracy'])
best_final_model = final_results[best_final_name]['model']

print(f"Best model: {best_final_name}")
print(f"Accuracy: {final_results[best_final_name]['accuracy']:.4f}")

# Save model and vectorizer
models_dir = os.path.join(base_dir, 'models')
os.makedirs(models_dir, exist_ok=True)

# Save improved model
model_path = os.path.join(models_dir, 'mental_health_model_improved.pkl')
with open(model_path, 'wb') as f:
    pickle.dump(best_final_model, f)
print(f"\nModel saved to {model_path}")

# Save vectorizer
vectorizer_path = os.path.join(models_dir, 'tfidf_vectorizer_improved.pkl')
with open(vectorizer_path, 'wb') as f:
    pickle.dump(vectorizer, f)
print(f"Vectorizer saved to {vectorizer_path}")

# Save selector if used
if 'selector' in locals():
    selector_path = os.path.join(models_dir, 'feature_selector.pkl')
    with open(selector_path, 'wb') as f:
        pickle.dump(selector, f)
    print(f"Feature selector saved to {selector_path}")


Best model: Ensemble
Accuracy: 0.9638

Model saved to D:/mental_health_detector\models\mental_health_model_improved.pkl
Vectorizer saved to D:/mental_health_detector\models\tfidf_vectorizer_improved.pkl
Feature selector saved to D:/mental_health_detector\models\feature_selector.pkl
