# Mental Health Detection Model

This notebook contains a complete pipeline for detecting mental health issues (specifically depression) from text/tweets.

## Features:
- Text preprocessing pipeline
- TF-IDF vectorization
- Logistic Regression model (95.6% accuracy)
- Prediction function for new tweets
- Model saving and loading capabilities


In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import re
import string
import pickle
import os
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, classification_report

# Download NLTK resources (only needed once; later runs reuse the local copy).
# Each resource is attempted independently so one failure does not skip the
# others, and failures are reported instead of being silently swallowed.
# The download stays best-effort: the data may already be installed locally.
for _resource in ('stopwords', 'wordnet', 'omw-1.4'):
    try:
        # nltk.download returns False (rather than raising) when it cannot fetch.
        if not nltk.download(_resource, quiet=True):
            print(f"Warning: NLTK resource '{_resource}' could not be downloaded; "
                  "continuing in case it is already installed.")
    except Exception as exc:  # e.g. network/permission errors
        print(f"Warning: NLTK download of '{_resource}' failed: {exc}")

print("Libraries imported successfully!")


## 1. Text Preprocessing Function

This function cleans and preprocesses text data to prepare it for model training and prediction.


In [None]:
# Shared preprocessing resources, created once when this cell runs and then
# read by preprocess_text on every call.
stop_words = {*stopwords.words('english')}   # set -> fast membership checks
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """
    Clean a raw text value and return it as a space-joined string of tokens.

    Steps, in order:
    1. Convert to lowercase
    2. Remove URLs (anything starting with "http" or "www")
    3. Replace HTML tags with a space
    4. Remove ASCII punctuation
    5. Drop tokens that are not purely alphabetic (numbers, mixed tokens)
    6. Remove stopwords
    7. Lemmatize the remaining words

    Missing values (None or a float NaN) yield an empty string; without this
    guard they would be stringified and leak the tokens "none"/"nan" into the
    features.

    Args:
        text: Input text to preprocess. Non-string values are converted
            with str().

    Returns:
        str: Preprocessed text (may be empty if nothing survives cleaning)
    """
    # Guard against missing values before str() turns them into real words.
    # `text != text` is True only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()                                   # Lowercase
    text = re.sub(r"http\S+|www\S+", '', text)                 # Remove URLs
    text = re.sub(r"<.*?>", " ", text)                         # Remove HTML tags
    # NOTE(review): punctuation is stripped before the stopword filter, so a
    # contraction such as "can't" reaches the filter as "cant"; whether that
    # still matches an entry in stop_words depends on the NLTK list - confirm
    # this ordering is intended.
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation

    # Keep alphabetic, non-stopword tokens only, then lemmatize them.
    words = [w for w in text.split() if w.isalpha() and w not in stop_words]
    words = [lemmatizer.lemmatize(w) for w in words]
    return ' '.join(words)

# Smoke test: print one sample sentence before and after cleaning.
test_text = "I'm feeling really down today. Can't stop thinking about negative things. https://example.com"
for heading, render in (("Original:", str), ("Processed:", preprocess_text)):
    print(heading, render(test_text))


## 2. Load and Prepare Data


In [None]:
# Set base directory (adjust if needed)
base_dir = 'D:/mental_health_detector'  # Change this to your project path if different

# Candidate dataset locations: a previously processed file, and the raw file
# used as a fallback.
data_path = os.path.join(base_dir, 'data/processed/depression_dataset_processed.csv')
raw_data_path = os.path.join(base_dir, 'data/raw/depression_dataset_reddit_cleaned.csv')

# Check if processed data exists, otherwise use raw data
if os.path.exists(data_path):
    df = pd.read_csv(data_path)
    print(f"Loaded processed dataset: {df.shape}")
    # If processed_text column exists, use it; otherwise process clean_text
    if 'processed_text' in df.columns:
        print("Using existing processed_text column")
    else:
        print("Processing clean_text column...")
        # fillna('') keeps missing posts from being stringified into a "nan" token.
        df['processed_text'] = df['clean_text'].fillna('').apply(preprocess_text)
else:
    # Load raw data and process it
    print(f"Processed data not found. Loading raw data from {raw_data_path}")
    df = pd.read_csv(raw_data_path)
    print(f"Raw dataset shape: {df.shape}")
    df['processed_text'] = df['clean_text'].fillna('').apply(preprocess_text)

# A post that preprocesses to an empty string is written to CSV as an empty
# cell and read back as NaN; the TF-IDF step further down rejects NaN
# documents, so normalise missing values to empty strings here.
df['processed_text'] = df['processed_text'].fillna('')

# Display dataset info
print(f"\nDataset shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")
print(f"\nLabel distribution:")
print(df['is_depression'].value_counts())
print(f"\nLabel proportions:")
print(df['is_depression'].value_counts(normalize=True))
print(f"\nFirst few rows:")
df.head()


## 3. Feature Extraction with TF-IDF


In [None]:
# Build the TF-IDF vectorizer: unigrams + bigrams, capped at 5000 features.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)

# Learn the vocabulary/IDF weights and vectorize every document in one pass.
# NOTE(review): the vectorizer is fitted on the full dataset before the
# train/test split in the next section, so test documents influence the
# vocabulary and IDF weights - confirm this is acceptable for the reported
# metrics.
X_tfidf = vectorizer.fit_transform(df['processed_text'])
y = df['is_depression']

feature_names = vectorizer.get_feature_names_out()
print(f"TF-IDF matrix shape: {X_tfidf.shape}")
print(f"Number of features: {len(feature_names)}")
print(f"\nSample feature names: {feature_names[:20]}")


## 4. Split Data into Train and Test Sets


In [None]:
# Hold out 20% of the rows for testing; stratify so both splits keep the
# same label proportions, and fix the seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print(f"Training samples: {X_train.shape[0]}")
print(f"Test samples: {X_test.shape[0]}")

# Report the class balance of each split.
for split_name, split_labels in (("train", y_train), ("test", y_test)):
    print(f"\nLabel distribution in {split_name}:")
    print(split_labels.value_counts(normalize=True).to_dict())


## 5. Train the Model


In [None]:
# Fit a Logistic Regression classifier on the training split.
model = LogisticRegression(random_state=42, max_iter=1000)
print("Training model...")
model.fit(X_train, y_train)
print("Model training completed!")

# Score the held-out split.
y_pred = model.predict(X_test)

# Headline metrics; average='binary' reports precision/recall/F1 for the
# positive class only.
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')

print(f"\n=== Model Performance ===")
for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1-Score", f1),
):
    print(f"{metric_name}: {metric_value:.4f}")

# Per-class breakdown followed by the raw confusion counts.
print(f"\n=== Classification Report ===")
print(classification_report(y_test, y_pred))

print(f"\n=== Confusion Matrix ===")
print(confusion_matrix(y_test, y_pred))


## 6. Save Model and Vectorizer


In [None]:
# Project root; keep this identical to the value used when loading the data.
base_dir = 'D:/mental_health_detector'  # Change this to your project path if different

# Make sure the output folder exists before writing into it.
models_dir = os.path.join(base_dir, 'models')
os.makedirs(models_dir, exist_ok=True)

model_path = os.path.join(models_dir, 'mental_health_model.pkl')
vectorizer_path = os.path.join(models_dir, 'tfidf_vectorizer.pkl')

# Pickle the classifier first, then the fitted vectorizer. Only these two
# objects are persisted; preprocess_text itself is not saved and must be
# defined wherever the artifacts are loaded.
for artifact, artifact_path, artifact_name in (
    (model, model_path, "Model"),
    (vectorizer, vectorizer_path, "Vectorizer"),
):
    with open(artifact_path, 'wb') as f:
        pickle.dump(artifact, f)
    print(f"{artifact_name} saved to {artifact_path}")

print("\nModel and vectorizer saved successfully!")


## 7. Prediction Function for New Tweets

This function can be used to predict mental health status from a new tweet or text.


In [None]:
def predict_mental_health(text, model=None, vectorizer=None, return_probability=False, base_dir='D:/mental_health_detector'):
    """
    Predict mental health status from a text/tweet.

    The text is cleaned with preprocess_text, vectorized with the TF-IDF
    vectorizer and classified by the model. Label 1 is reported as
    "Depression detected"; any other label as "No depression detected".

    Args:
        text (str): Input text/tweet to analyze
        model: Trained model (if None, loads from saved file)
        vectorizer: Trained vectorizer (if None, loads from saved file)
        return_probability (bool): If True, also returns per-class probabilities
        base_dir (str): Base directory path for loading saved models

    Returns:
        dict: 'text' (the original input), 'prediction' (int label), 'label'
        (human-readable string), 'confidence' (probability of the predicted
        class as a percentage rounded to 2 decimals, plain Python float) and,
        when return_probability is True, 'probabilities' with the
        'No depression' / 'Depression' percentages.

    Raises:
        FileNotFoundError: If an artifact must be loaded but is missing
            under base_dir.
    """
    # Load model and vectorizer if not provided.
    # SECURITY NOTE: pickle.load can execute arbitrary code contained in the
    # file - only point base_dir at artifacts you created or trust.
    if model is None:
        model_path = os.path.join(base_dir, 'models/mental_health_model.pkl')
        with open(model_path, 'rb') as f:
            model = pickle.load(f)

    if vectorizer is None:
        vectorizer_path = os.path.join(base_dir, 'models/tfidf_vectorizer.pkl')
        with open(vectorizer_path, 'rb') as f:
            vectorizer = pickle.load(f)

    # Preprocess the text
    processed_text = preprocess_text(text)

    # Transform to TF-IDF features (the vectorizer expects an iterable of docs)
    text_tfidf = vectorizer.transform([processed_text])

    # Make prediction
    prediction = model.predict(text_tfidf)[0]
    probability = model.predict_proba(text_tfidf)[0]

    # predict_proba columns are ordered like model.classes_, so look up each
    # class's column by position instead of assuming label value == column.
    # Falls back to the positional assumption if the model has no classes_.
    classes = list(getattr(model, 'classes_', (0, 1)))

    def _column_of(class_label, fallback):
        return classes.index(class_label) if class_label in classes else fallback

    # Map prediction to label
    label = "Depression detected" if prediction == 1 else "No depression detected"
    # float() keeps NumPy scalar types out of the returned dict.
    confidence = float(probability[_column_of(prediction, prediction)]) * 100

    result = {
        'text': text,
        'prediction': int(prediction),
        'label': label,
        'confidence': round(confidence, 2)
    }

    if return_probability:
        result['probabilities'] = {
            'No depression': round(float(probability[_column_of(0, 0)]) * 100, 2),
            'Depression': round(float(probability[_column_of(1, 1)]) * 100, 2)
        }

    return result

# Run the single-text predictor over a handful of hand-written examples.
print("=== Testing Prediction Function ===\n")

test_tweets = [
    "I'm feeling really down today. Can't stop thinking about negative things. Life feels meaningless.",
    "Had a great day today! Went for a walk and met some friends. Feeling happy and energized!",
    "I don't want to get out of bed. Everything feels hopeless and I can't see a way out.",
    "Just finished a productive day at work. Looking forward to the weekend!",
    "I've been having suicidal thoughts lately. I don't know what to do anymore."
]

for tweet in test_tweets:
    # No model/vectorizer is passed, so the saved artifacts are loaded from disk.
    result = predict_mental_health(tweet, return_probability=True)
    print(
        f"Tweet: {tweet[:80]}...",
        f"Prediction: {result['label']}",
        f"Confidence: {result['confidence']}%",
        f"Probabilities: {result['probabilities']}",
        "-" * 80,
        sep="\n",
    )


## 8. Interactive Prediction

Use this cell to test your own tweets or text.


In [None]:
# Edit this string to try the classifier on your own text.
your_tweet = "I'm feeling great today! Everything is going well."

# Classify it, asking for the per-class probabilities as well.
result = predict_mental_health(your_tweet, return_probability=True)

# Render a small framed report.
divider = "=" * 80
print(divider)
print("MENTAL HEALTH DETECTION RESULT")
print(divider)
print(f"\nInput Text: {result['text']}")
print(f"\nPrediction: {result['label']}")
print(f"Confidence: {result['confidence']}%")
print(f"\nDetailed Probabilities:")
for label, prob in result['probabilities'].items():
    print(f"  {label}: {prob}%")
print(divider)


## 9. Batch Prediction Function

For predicting multiple tweets at once.


In [None]:
def predict_batch(texts, model=None, vectorizer=None, base_dir='D:/mental_health_detector'):
    """
    Predict mental health status for multiple texts.

    Every text is cleaned with preprocess_text, all texts are vectorized in
    one call, and the model classifies them together. Label 1 is reported as
    "Depression detected"; any other label as "No depression detected".

    Args:
        texts: Iterable of texts/tweets to analyze (list, tuple, generator,
            ...). A single string is treated as one text rather than being
            iterated character by character.
        model: Trained model (if None, loads from saved file)
        vectorizer: Trained vectorizer (if None, loads from saved file)
        base_dir (str): Base directory path for loading saved models

    Returns:
        list: One dict per input text, in input order, with 'text',
        'prediction' (int), 'label', 'confidence' (percentage, rounded to 2
        decimals) and 'probabilities' ('No depression' / 'Depression'
        percentages). An empty input yields an empty list.

    Raises:
        FileNotFoundError: If an artifact must be loaded but is missing
            under base_dir.
    """
    # Normalise the input: wrap a lone string, and materialise any iterable so
    # it can be traversed twice (once to preprocess, once to build results).
    texts = [texts] if isinstance(texts, str) else list(texts)

    # Nothing to classify: return before touching the disk or the model, which
    # cannot score a zero-row input.
    if not texts:
        return []

    # Load model and vectorizer if not provided.
    # SECURITY NOTE: pickle.load can execute arbitrary code contained in the
    # file - only point base_dir at artifacts you created or trust.
    if model is None:
        model_path = os.path.join(base_dir, 'models/mental_health_model.pkl')
        with open(model_path, 'rb') as f:
            model = pickle.load(f)

    if vectorizer is None:
        vectorizer_path = os.path.join(base_dir, 'models/tfidf_vectorizer.pkl')
        with open(vectorizer_path, 'rb') as f:
            vectorizer = pickle.load(f)

    # Preprocess all texts
    processed_texts = [preprocess_text(text) for text in texts]

    # Transform to TF-IDF features
    texts_tfidf = vectorizer.transform(processed_texts)

    # Make predictions
    predictions = model.predict(texts_tfidf)
    probabilities = model.predict_proba(texts_tfidf)

    # predict_proba columns are ordered like model.classes_, so look up each
    # class's column by position instead of assuming label value == column.
    # Falls back to the positional assumption if the model has no classes_.
    classes = list(getattr(model, 'classes_', (0, 1)))

    def _column_of(class_label, fallback):
        return classes.index(class_label) if class_label in classes else fallback

    negative_col = _column_of(0, 0)
    positive_col = _column_of(1, 1)

    # Format results
    results = []
    for text, prediction, row in zip(texts, predictions, probabilities):
        label = "Depression detected" if prediction == 1 else "No depression detected"
        # float() keeps NumPy scalar types out of the returned dicts.
        confidence = float(row[_column_of(prediction, prediction)]) * 100

        results.append({
            'text': text,
            'prediction': int(prediction),
            'label': label,
            'confidence': round(confidence, 2),
            'probabilities': {
                'No depression': round(float(row[negative_col]) * 100, 2),
                'Depression': round(float(row[positive_col]) * 100, 2)
            }
        })

    return results

# Example: Batch prediction
sample_tweets = [
    "Feeling really sad and hopeless today",
    "Great weather today! Going to the park",
    "I can't find motivation to do anything",
    "Excited about my new project!"
]

# No model/vectorizer is passed, so the saved artifacts are loaded from disk.
batch_results = predict_batch(sample_tweets)

print("=== Batch Prediction Results ===\n")
for i, result in enumerate(batch_results, 1):
    print(f"{i}. {result['text']}")
    # The arrow is written as an escape (U+2192) so it cannot be corrupted
    # again by a wrong file encoding.
    print(f"   \u2192 {result['label']} (Confidence: {result['confidence']}%)")
    print()


## 10. Model Summary

This notebook provides a complete pipeline for mental health detection:

1. **Preprocessing**: Text cleaning, normalization, and feature extraction
2. **Model Training**: Logistic Regression with TF-IDF features
3. **Model Performance**: ~95.6% accuracy on test set
4. **Prediction Functions**: 
   - Single tweet prediction
   - Batch prediction
   - Probability scores included

### Usage:
- Use `predict_mental_health(text)` for single predictions
- Use `predict_batch(texts)` for multiple predictions
- Both functions can work with saved models or accept model/vectorizer as parameters

### Model Files:
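- Model: `<base_dir>/models/mental_health_model.pkl`
- Vectorizer: `<base_dir>/models/tfidf_vectorizer.pkl`

Here `base_dir` is the project path set in sections 2 and 6 (default `D:/mental_health_detector`); the prediction functions accept it as a parameter.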

### Note:
This model is trained on Reddit posts and may need fine-tuning for Twitter-specific language patterns.
