In [None]:
import pandas as pd
import numpy as np
import random

# Seed both RNGs so the shuffle below is reproducible.
# DataFrame.sample() is called without random_state, so it draws from
# NumPy's global RNG, which np.random.seed() controls.
random.seed(42)
np.random.seed(42)

# Sample data for mental health classification (20 texts per category)
depression_texts = [
    "I feel so empty inside, nothing seems to matter anymore",
    "I can't get out of bed, everything feels pointless",
    "I'm tired all the time and have no energy for anything",
    "I feel like I'm drowning in sadness and can't escape",
    "Nothing brings me joy anymore, I feel numb",
    "I hate myself and feel worthless every day",
    "I can't stop crying and feel hopeless about the future",
    "Sleep is my only escape from this overwhelming sadness",
    "I feel like a burden to everyone around me",
    "Life feels meaningless and I don't see the point",
    "I've lost interest in everything I used to enjoy",
    "I feel trapped in this dark place with no way out",
    "Every day is a struggle just to exist",
    "I feel so alone even when surrounded by people",
    "I can't remember the last time I felt genuinely happy",
    "Everything feels like too much effort lately",
    "I feel disconnected from myself and others",
    "The world seems gray and colorless to me",
    "I constantly feel guilty about everything",
    "I have no motivation to do anything anymore"
]

anxiety_texts = [
    "My heart is racing and I can't calm down",
    "I'm constantly worried about everything that could go wrong",
    "I feel like I'm having a panic attack right now",
    "What if something terrible happens to my family?",
    "I can't stop overthinking every little detail",
    "My mind is racing with worst-case scenarios",
    "I feel like everyone is judging me constantly",
    "I'm terrified of making mistakes or failing",
    "I can't breathe properly, my chest feels tight",
    "What if I embarrass myself in front of everyone?",
    "I'm shaking and sweating for no reason",
    "I feel like something bad is about to happen",
    "I can't concentrate because of all these worries",
    "My stomach is in knots from anxiety",
    "I avoid social situations because they make me panic",
    "I keep checking things over and over again",
    "What if I'm not good enough for this job?",
    "I feel like I'm losing control of everything",
    "I'm scared of being alone with my thoughts",
    "Every phone call makes me anxious about bad news"
]

neutral_texts = [
    "I had a pretty good day at work today",
    "Just finished watching a great movie with friends",
    "Planning to go for a walk in the park this weekend",
    "Learning to cook a new recipe, it's challenging but fun",
    "Had an interesting conversation with my neighbor",
    "Reading a fascinating book about science",
    "The weather is nice today, perfect for outdoor activities",
    "Just completed my morning workout routine",
    "Enjoying my coffee while listening to music",
    "Had a productive meeting with my team",
    "Trying out a new restaurant this evening",
    "My garden is looking beautiful this season",
    "Just caught up with an old friend over video call",
    "Working on organizing my home office space",
    "Planning a weekend trip to visit family",
    "Attended an interesting online workshop today",
    "My cat did something amusing this morning",
    "Finished reading the news and having breakfast",
    "Looking forward to the upcoming holiday season",
    "Had a normal day, nothing special happened"
]

# Create dataset: one record per text, tagged with its category name and an
# integer code.
# NOTE: the codes are assigned by hand (Depression=0, Anxiety=1, Neutral=2).
# They do NOT follow the alphabetical order that sklearn's LabelEncoder
# produces (Anxiety=0, Depression=1, Neutral=2), so never mix the two.
labelled_groups = [
    ('Depression', 0, depression_texts),
    ('Anxiety', 1, anxiety_texts),
    ('Neutral', 2, neutral_texts),
]
data = [
    {'text': text, 'label': label, 'label_encoded': code}
    for label, code, texts in labelled_groups
    for text in texts
]

# Create DataFrame
df = pd.DataFrame(data)

# Shuffle the rows and renumber the index from 0
df = df.sample(frac=1).reset_index(drop=True)

# Save to CSV. The target folder is created first so the script also works
# on a fresh checkout where 'data/' does not exist yet.
import os

os.makedirs('data', exist_ok=True)
df.to_csv('data/raw_data.csv', index=False)

print("Dataset created successfully!")
print(f"Total samples: {len(df)}")
print("Class distribution:")
print(df['label'].value_counts())
print("\nFirst 5 rows:")
print(df.head())

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
import pickle
import os

# Download required NLTK data.
# Keys are the lookup paths understood by nltk.data.find(); values are the
# package names passed to nltk.download().
# 'punkt_tab' is listed because recent NLTK releases load the word_tokenize
# models from it rather than from 'punkt'. On older releases that do not know
# the package, the download is expected to report an error without raising
# (NOTE(review): confirm against the NLTK version pinned for this project).
_NLTK_RESOURCES = {
    'tokenizers/punkt': 'punkt',
    'tokenizers/punkt_tab': 'punkt_tab',
    'corpora/stopwords': 'stopwords',
    'corpora/wordnet': 'wordnet',
}

for _resource_path, _package_name in _NLTK_RESOURCES.items():
    try:
        # Raises LookupError when the resource is not installed locally
        nltk.data.find(_resource_path)
    except LookupError:
        nltk.download(_package_name)

class TextPreprocessor:
    """Clean raw chat text and turn it into TF-IDF features and encoded labels.

    The instance owns four pieces of state:

    * ``stop_words``    - set of English stop words from NLTK
    * ``lemmatizer``    - NLTK WordNet lemmatizer
    * ``tfidf``         - TfidfVectorizer (max 5000 features, uni- and bigrams)
    * ``label_encoder`` - sklearn LabelEncoder for the class names

    ``tfidf`` and ``label_encoder`` can be persisted with
    :meth:`save_preprocessor` and restored with :meth:`load_preprocessor`.
    """

    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()
        self.tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
        self.label_encoder = LabelEncoder()

    def clean_text(self, text):
        """Return a lower-cased copy of ``text`` containing only letters and single spaces.

        URLs, @mentions, #hashtags, digits and punctuation are removed.
        Anything that is not a ``str`` (for example the NaN pandas uses for
        an empty CSV cell, or ``None``) is treated as empty text and yields
        ``""`` instead of raising.
        """
        if not isinstance(text, str):
            return ''

        # Convert to lowercase
        text = text.lower()

        # Remove URLs
        text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

        # Remove mentions and hashtags
        text = re.sub(r'@\w+|#\w+', '', text)

        # Remove special characters and digits (apostrophes included, so
        # "can't" becomes "cant")
        text = re.sub(r'[^a-zA-Z\s]', '', text)

        # Collapse runs of whitespace and trim both ends
        text = re.sub(r'\s+', ' ', text).strip()

        return text

    def tokenize_and_lemmatize(self, text):
        """Tokenize ``text``, drop stop words and tokens of 1-2 characters, lemmatize the rest.

        Returns the surviving lemmas joined by single spaces.
        """
        tokens = word_tokenize(text)

        lemmas = [
            self.lemmatizer.lemmatize(token)
            for token in tokens
            if token not in self.stop_words and len(token) > 2
        ]

        return ' '.join(lemmas)

    def preprocess_dataset(self, df):
        """Return a cleaned copy of ``df`` with two extra text columns.

        Reads the ``text`` column and adds ``cleaned_text`` (output of
        :meth:`clean_text`) and ``processed_text`` (output of
        :meth:`tokenize_and_lemmatize`). Rows whose processed text is empty
        are dropped and the index is renumbered from 0.

        The caller's DataFrame is left untouched.
        """
        print("Starting text preprocessing...")

        # Work on a copy so the new columns do not leak into the caller's frame
        df = df.copy()

        # Clean text
        df['cleaned_text'] = df['text'].apply(self.clean_text)

        # Tokenize and lemmatize
        df['processed_text'] = df['cleaned_text'].apply(self.tokenize_and_lemmatize)

        # Remove empty texts
        df = df[df['processed_text'].str.len() > 0].reset_index(drop=True)

        print(f"Preprocessing completed. {len(df)} samples remaining.")

        return df

    def fit_transform_features(self, texts):
        """Fit the TF-IDF vectorizer on ``texts`` and return the resulting feature matrix."""
        print("Fitting TF-IDF vectorizer...")
        X = self.tfidf.fit_transform(texts)
        print(f"Feature matrix shape: {X.shape}")
        return X

    def transform_features(self, texts):
        """Transform ``texts`` with the already fitted TF-IDF vectorizer."""
        return self.tfidf.transform(texts)

    def fit_labels(self, labels):
        """Fit the label encoder on ``labels`` and return their integer codes."""
        return self.label_encoder.fit_transform(labels)

    def transform_labels(self, labels):
        """Encode ``labels`` with the already fitted label encoder."""
        return self.label_encoder.transform(labels)

    def inverse_transform_labels(self, encoded_labels):
        """Convert integer codes back to the original label names."""
        return self.label_encoder.inverse_transform(encoded_labels)

    def save_preprocessor(self, path='model/'):
        """Pickle the TF-IDF vectorizer and label encoder into directory ``path``.

        Writes ``tfidf_vectorizer.pkl`` and ``label_encoder.pkl``. The
        directory is created when missing; an empty ``path`` means the
        current working directory.
        """
        if path:
            os.makedirs(path, exist_ok=True)

        # Save TF-IDF vectorizer
        with open(os.path.join(path, 'tfidf_vectorizer.pkl'), 'wb') as f:
            pickle.dump(self.tfidf, f)

        # Save label encoder
        with open(os.path.join(path, 'label_encoder.pkl'), 'wb') as f:
            pickle.dump(self.label_encoder, f)

        print("Preprocessor objects saved successfully!")

    def load_preprocessor(self, path='model/'):
        """Replace ``tfidf`` and ``label_encoder`` with the objects pickled in ``path``.

        Raises whatever ``open`` / ``pickle.load`` raise when a file is
        missing or unreadable.
        """
        # SECURITY: pickle.load executes code embedded in the file. Only load
        # files written by save_preprocessor() from a trusted location.
        with open(os.path.join(path, 'tfidf_vectorizer.pkl'), 'rb') as f:
            self.tfidf = pickle.load(f)

        with open(os.path.join(path, 'label_encoder.pkl'), 'rb') as f:
            self.label_encoder = pickle.load(f)

        print("Preprocessor objects loaded successfully!")

def main():
    """Run the preprocessing pipeline from raw CSV to saved artefacts.

    Reads 'data/raw_data.csv', cleans the text, fits the TF-IDF vectorizer
    and the label encoder, writes 'data/processed_data.csv', pickles the
    fitted objects and prints a short summary.

    Returns:
        (X, y, preprocessor): the TF-IDF feature matrix, the encoded labels
        and the fitted TextPreprocessor.
    """
    # Both output folders must exist before anything is written
    for folder in ('data', 'model'):
        os.makedirs(folder, exist_ok=True)

    print("Loading dataset...")
    raw_frame = pd.read_csv('data/raw_data.csv')
    print(f"Loaded {len(raw_frame)} samples")

    pipeline = TextPreprocessor()
    clean_frame = pipeline.preprocess_dataset(raw_frame)

    # Fit on the processed text / label columns
    features = pipeline.fit_transform_features(clean_frame['processed_text'])
    targets = pipeline.fit_labels(clean_frame['label'])

    # Persist the processed table and the fitted vectorizer/encoder
    clean_frame.to_csv('data/processed_data.csv', index=False)
    pipeline.save_preprocessor()

    class_names = list(pipeline.label_encoder.classes_)
    feature_count = features.shape[1]

    print("\nPreprocessing Summary:")
    print(f"- Original samples: {len(raw_frame)}")
    print(f"- Processed samples: {len(clean_frame)}")
    print(f"- Feature dimensions: {feature_count}")
    print(f"- Classes: {class_names}")

    return features, targets, pipeline

if __name__ == "__main__":
    X, y, preprocessor = main()

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import precision_recall_fscore_support
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import os
from data_preprocessing import TextPreprocessor

class MentalHealthClassifier:
    """Train, compare, tune, evaluate and persist classifiers for the chat dataset.

    Typical call order: ``load_data`` -> ``split_data`` -> ``train_models``
    -> ``select_best_model`` -> ``hyperparameter_tuning`` ->
    ``evaluate_model`` -> ``save_model``.
    """

    def __init__(self):
        # Candidate estimators, keyed by the name used in reports and tuning
        self.models = {
            'logistic_regression': LogisticRegression(random_state=42, max_iter=1000),
            'random_forest': RandomForestClassifier(random_state=42, n_estimators=100),
            'svm': SVC(random_state=42, probability=True)
        }
        # Set by select_best_model() and replaced by hyperparameter_tuning()
        self.best_model = None
        self.best_model_name = None
        self.preprocessor = TextPreprocessor()

    def _require_model(self):
        """Return the selected model, raising RuntimeError if none was selected yet."""
        if self.best_model is None:
            raise RuntimeError(
                "No model selected yet; run train_models() and select_best_model() first"
            )
        return self.best_model

    def load_data(self):
        """Load the processed CSV and the fitted preprocessor objects.

        Returns:
            (X, y, df): TF-IDF features, integer labels and the DataFrame
            read from 'data/processed_data.csv'.
        """
        print("Loading preprocessed data...")

        # Load processed dataset
        df = pd.read_csv('data/processed_data.csv')

        # Load preprocessor objects
        self.preprocessor.load_preprocessor()

        # Transform features
        X = self.preprocessor.transform_features(df['processed_text'])

        # Encode the label names with the fitted LabelEncoder. The CSV's
        # 'label_encoded' column is deliberately not used: its codes can
        # disagree with label_encoder.classes_, which the classification
        # report and inverse_transform_labels() rely on, and that would
        # silently swap class names.
        y = self.preprocessor.transform_labels(df['label'])

        return X, y, df

    def split_data(self, X, y, test_size=0.2, val_size=0.2):
        """Split into stratified train / validation / test sets.

        ``test_size`` and ``val_size`` are fractions of the full dataset.

        Returns:
            X_train, X_val, X_test, y_train, y_val, y_test
        """
        # First split: train+val and test
        X_temp, X_test, y_temp, y_test = train_test_split(
            X, y, test_size=test_size, random_state=42, stratify=y
        )

        # Second split: train and val. val_size is relative to the full
        # dataset, so rescale it to the part that is left.
        val_size_adjusted = val_size / (1 - test_size)
        X_train, X_val, y_train, y_val = train_test_split(
            X_temp, y_temp, test_size=val_size_adjusted, random_state=42, stratify=y_temp
        )

        print("Data split completed:")
        print(f"- Training set: {X_train.shape[0]} samples")
        print(f"- Validation set: {X_val.shape[0]} samples")
        print(f"- Test set: {X_test.shape[0]} samples")

        return X_train, X_val, X_test, y_train, y_val, y_test

    def train_models(self, X_train, y_train, X_val, y_val):
        """Fit every candidate model and collect its scores.

        Returns:
            dict mapping model name to a dict with keys 'model',
            'train_accuracy', 'val_accuracy', 'cv_mean', 'cv_std' and
            'val_predictions'.
        """
        results = {}

        print("\nTraining models...")

        for name, model in self.models.items():
            print(f"\nTraining {name}...")

            # Train model
            model.fit(X_train, y_train)

            # Predictions
            train_pred = model.predict(X_train)
            val_pred = model.predict(X_val)

            # Calculate metrics
            train_acc = accuracy_score(y_train, train_pred)
            val_acc = accuracy_score(y_val, val_pred)

            # 5-fold cross-validation accuracy on the training set
            cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')

            results[name] = {
                'model': model,
                'train_accuracy': train_acc,
                'val_accuracy': val_acc,
                'cv_mean': cv_scores.mean(),
                'cv_std': cv_scores.std(),
                'val_predictions': val_pred
            }

            print(f"- Training Accuracy: {train_acc:.4f}")
            print(f"- Validation Accuracy: {val_acc:.4f}")
            print(f"- CV Score: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

        return results

    def select_best_model(self, results, X_val, y_val):
        """Pick the model with the highest validation accuracy.

        ``results`` is the dict returned by train_models(). ``X_val`` and
        ``y_val`` are accepted for interface compatibility and not used.
        On a tie the model listed first in ``results`` wins.

        Returns:
            (best_model, best_name)

        Raises:
            ValueError: if ``results`` is empty.
        """
        if not results:
            raise ValueError("results is empty; train at least one model first")

        print("\nModel Comparison:")
        print("-" * 60)

        for name, result in results.items():
            print(f"{name:20s}: {result['val_accuracy']:.4f}")

        # max() returns the first maximal entry, and still works when every
        # accuracy is 0.0 (a '> 0' running comparison would select nothing).
        best_name = max(results, key=lambda name: results[name]['val_accuracy'])
        best_score = results[best_name]['val_accuracy']

        self.best_model = results[best_name]['model']
        self.best_model_name = best_name

        print(f"\nBest model: {best_name} (Validation Accuracy: {best_score:.4f})")

        return self.best_model, best_name

    def hyperparameter_tuning(self, X_train, y_train):
        """Grid-search the selected model and keep the best estimator.

        Raises:
            ValueError: if no search grid is defined for ``best_model_name``
                (including the case where no model was selected yet).
        """
        print(f"\nPerforming hyperparameter tuning for {self.best_model_name}...")

        param_grids = {
            'logistic_regression': {
                'C': [0.1, 1, 10, 100],
                'penalty': ['l1', 'l2'],
                'solver': ['liblinear']
            },
            'random_forest': {
                'n_estimators': [50, 100, 200],
                'max_depth': [None, 10, 20],
                'min_samples_split': [2, 5, 10]
            },
            'svm': {
                'C': [0.1, 1, 10],
                'kernel': ['rbf', 'linear'],
                'gamma': ['scale', 'auto']
            },
        }

        if self.best_model_name not in param_grids:
            raise ValueError(
                f"No hyperparameter grid defined for model {self.best_model_name!r}"
            )

        grid_search = GridSearchCV(
            self._require_model(), param_grids[self.best_model_name],
            cv=5, scoring='accuracy', n_jobs=-1
        )

        grid_search.fit(X_train, y_train)

        self.best_model = grid_search.best_estimator_

        print(f"Best parameters: {grid_search.best_params_}")
        print(f"Best CV score: {grid_search.best_score_:.4f}")

    def evaluate_model(self, X_test, y_test):
        """Report accuracy, per-class metrics and the confusion matrix on the test set.

        Returns:
            (y_pred, y_pred_proba): predicted labels and class probabilities.

        Raises:
            RuntimeError: if no model was selected yet.
        """
        model = self._require_model()

        print("\nFinal Model Evaluation:")
        print("=" * 50)

        # Predictions
        y_pred = model.predict(X_test)
        y_pred_proba = model.predict_proba(X_test)

        # Accuracy
        test_acc = accuracy_score(y_test, y_pred)
        print(f"Test Accuracy: {test_acc:.4f}")

        # Pass the full list of label ids so the report and the matrix keep
        # one row per class even when a class is missing from this split
        # (otherwise target_names would not line up).
        class_names = self.preprocessor.label_encoder.classes_
        label_ids = list(range(len(class_names)))

        print("\nClassification Report:")
        print(classification_report(
            y_test, y_pred, labels=label_ids, target_names=class_names
        ))

        # Confusion matrix
        cm = confusion_matrix(y_test, y_pred, labels=label_ids)
        self.plot_confusion_matrix(cm, class_names)

        return y_pred, y_pred_proba

    def plot_confusion_matrix(self, cm, class_names):
        """Draw ``cm`` as a heatmap, save it to 'model/confusion_matrix.png' and show it."""
        fig = plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                   xticklabels=class_names, yticklabels=class_names)
        plt.title('Confusion Matrix')
        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')
        plt.tight_layout()

        # The method can be called on its own, so make sure the folder exists
        os.makedirs('model', exist_ok=True)
        plt.savefig('model/confusion_matrix.png', dpi=300, bbox_inches='tight')
        plt.show()
        # Release the figure so repeated evaluations do not pile up open figures
        plt.close(fig)

        print("Confusion matrix saved as 'model/confusion_matrix.png'")

    def save_model(self):
        """Write the selected model and a small info dict into 'model/'.

        Raises:
            RuntimeError: if no model was selected yet.
        """
        model = self._require_model()
        os.makedirs('model', exist_ok=True)

        model_path = 'model/mental_health_classifier.pkl'
        joblib.dump(model, model_path)

        # Save model info
        model_info = {
            'model_name': self.best_model_name,
            'model_params': model.get_params(),
            'classes': list(self.preprocessor.label_encoder.classes_)
        }

        joblib.dump(model_info, 'model/model_info.pkl')

        print(f"Model saved as '{model_path}'")
        print("Model info saved as 'model/model_info.pkl'")

    def predict_single_text(self, text):
        """Classify one raw text.

        Returns:
            (predicted_label, prob_dict): the predicted class name and a
            dict mapping each class name to its predicted probability.

        Raises:
            RuntimeError: if no model was selected yet.
        """
        model = self._require_model()

        # Preprocess text
        cleaned_text = self.preprocessor.clean_text(text)
        processed_text = self.preprocessor.tokenize_and_lemmatize(cleaned_text)

        # Transform to features
        X = self.preprocessor.transform_features([processed_text])

        # Predict
        prediction = model.predict(X)[0]
        probabilities = model.predict_proba(X)[0]

        # Convert to label
        predicted_label = self.preprocessor.inverse_transform_labels([prediction])[0]

        # predict_proba columns follow model.classes_, so name them from
        # that array instead of assuming the encoder's ordering.
        class_labels = self.preprocessor.inverse_transform_labels(model.classes_)
        prob_dict = dict(zip(class_labels, probabilities))

        return predicted_label, prob_dict

def main():
    """Run the full training workflow and finish with a demo prediction.

    Steps: load data, split it, train and compare the candidate models,
    tune the winner, evaluate it on the test set, save it, then classify
    one hard-coded sentence and print the class probabilities.
    """
    # The model folder receives the plot and the saved model files
    os.makedirs('model', exist_ok=True)

    classifier = MentalHealthClassifier()

    features, targets, _frame = classifier.load_data()
    splits = classifier.split_data(features, targets)
    X_train, X_val, X_test, y_train, y_val, y_test = splits

    # Train every candidate, then keep the one that validates best
    training_results = classifier.train_models(X_train, y_train, X_val, y_val)
    classifier.select_best_model(training_results, X_val, y_val)

    classifier.hyperparameter_tuning(X_train, y_train)
    classifier.evaluate_model(X_test, y_test)
    classifier.save_model()

    # Smoke-test the trained model on a single sentence
    print("\n" + "="*50)
    print("Testing Single Prediction:")
    test_text = "I feel so anxious about everything and can't stop worrying"
    predicted_label, probabilities = classifier.predict_single_text(test_text)

    print(f"Text: '{test_text}'")
    print(f"Predicted: {predicted_label}")
    print("Probabilities:")
    for class_name in probabilities:
        print(f"  {class_name}: {probabilities[class_name]:.4f}")

if __name__ == "__main__":
    main()

In [None]:
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from model_inference import MentalHealthPredictor
import os
import time
from datetime import datetime

# Page configuration: browser-tab title/icon, wide layout, sidebar open
_PAGE_SETTINGS = {
    "page_title": "Mental Health Chat Classifier",
    "page_icon": "🧠",
    "layout": "wide",
    "initial_sidebar_state": "expanded",
}
st.set_page_config(**_PAGE_SETTINGS)

# Custom CSS: header styles, one coloured box per predicted category
# (.depression / .anxiety / .neutral) and three confidence text colours
_CUSTOM_CSS = """
<style>
    .main-header {
        font-size: 3rem;
        color: #1f77b4;
        text-align: center;
        margin-bottom: 2rem;
    }
    .subheader {
        font-size: 1.5rem;
        color: #333;
        margin-bottom: 1rem;
    }
    .prediction-box {
        padding: 1rem;
        border-radius: 10px;
        margin: 1rem 0;
    }
    .depression {
        background-color: #ffebee;
        border-left: 5px solid #f44336;
    }
    .anxiety {
        background-color: #fff3e0;
        border-left: 5px solid #ff9800;
    }
    .neutral {
        background-color: #e8f5e8;
        border-left: 5px solid #4caf50;
    }
    .confidence-high {
        color: #4caf50;
        font-weight: bold;
    }
    .confidence-medium {
        color: #ff9800;
        font-weight: bold;
    }
    .confidence-low {
        color: #f44336;
        font-weight: bold;
    }
</style>
"""
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)

@st.cache_resource
def _create_predictor():
    """Build the predictor; Streamlit caches the returned object across reruns.

    Exceptions propagate to the caller, so a failed attempt stores nothing
    in the cache and the next rerun tries again.
    """
    return MentalHealthPredictor()


def load_predictor():
    """Return the cached model predictor, or None when it cannot be loaded.

    On failure two error messages are shown in the app. The failure itself
    is not cached, so the app recovers on the next rerun once the model
    files exist (caching this function directly would pin the None result
    until the cache is cleared).
    """
    try:
        return _create_predictor()
    except Exception as e:  # UI boundary: report any load failure to the user
        st.error(f"Error loading model: {str(e)}")
        st.error("Please make sure you have trained the model by running train_model.py first")
        return None


# Keep the cache-reset hook that a cache_resource-decorated function exposes
load_predictor.clear = _create_predictor.clear

def get_confidence_class(confidence):
    """Map a confidence score to the CSS class that colours it.

    Returns "confidence-high" for scores of at least 0.7,
    "confidence-medium" for scores of at least 0.5 and
    "confidence-low" for everything else.
    """
    # Checked from the highest threshold down; the first match wins
    thresholds = (
        (0.7, "confidence-high"),
        (0.5, "confidence-medium"),
    )
    for minimum, css_class in thresholds:
        if confidence >= minimum:
            return css_class
    return "confidence-low"

def create_probability_chart(probabilities):
    """Build a bar chart of class probabilities.

    Args:
        probabilities: dict mapping class name to probability (0..1).

    Returns:
        A plotly Figure with one bar per class, labelled with the
        probability as a percentage.
    """
    # Colours are looked up by class name so each category keeps the colour
    # used elsewhere in this app (red/orange/green) regardless of the order
    # or number of keys in `probabilities`. Unknown classes get grey.
    category_colors = {
        'Depression': '#f44336',
        'Anxiety': '#ff9800',
        'Neutral': '#4caf50',
    }
    fallback_color = '#9e9e9e'

    labels = list(probabilities.keys())
    values = list(probabilities.values())
    bar_colors = [category_colors.get(label, fallback_color) for label in labels]

    fig = go.Figure(data=[
        go.Bar(
            x=labels,
            y=values,
            marker_color=bar_colors,
            text=[f'{v:.2%}' for v in values],
            textposition='auto',
        )
    ])

    fig.update_layout(
        title="Class Probabilities",
        xaxis_title="Mental Health Categories",
        yaxis_title="Probability",
        yaxis=dict(range=[0, 1]),  # fixed scale so charts are comparable
        height=400,
        showlegend=False
    )

    return fig

def _preview(text, limit=50):
    """Return ``text`` as a string, cut to ``limit`` characters with '...' only if it was cut."""
    text = str(text)
    if len(text) <= limit:
        return text
    return text[:limit] + '...'


def _render_header():
    """Draw the page title and the one-paragraph description."""
    st.markdown('<h1 class="main-header">🧠 Mental Health Chat Classifier</h1>', unsafe_allow_html=True)

    st.markdown("""
    <div style="text-align: center; margin-bottom: 2rem;">
        <p style="font-size: 1.2rem; color: #666;">
        Analyze chat messages to identify potential mental health indicators
        </p>
        <p style="font-size: 1rem; color: #888;">
        <strong>Categories:</strong> Depression, Anxiety, Neutral
        </p>
    </div>
    """, unsafe_allow_html=True)


def _render_sidebar(predictor):
    """Draw the 'About' text and, when the predictor exposes it, the model info."""
    st.sidebar.title("📊 About")
    st.sidebar.info("""
    This app uses machine learning to classify chat messages into mental health categories:

    🔴 **Depression**: Signs of sadness, hopelessness, or low mood

    🟠 **Anxiety**: Signs of worry, panic, or nervousness

    🟢 **Neutral**: Normal conversation without mental health indicators

    **Note**: This tool is for educational purposes only and should not replace professional mental health advice.
    """)

    # Model info is optional: shown only if the attribute exists and is non-empty
    model_info = getattr(predictor, 'model_info', None)
    if model_info:
        st.sidebar.subheader("🤖 Model Info")
        st.sidebar.write(f"**Algorithm**: {model_info['model_name'].replace('_', ' ').title()}")
        st.sidebar.write(f"**Classes**: {len(model_info['classes'])}")


def _render_single_message(predictor):
    """Draw the single-message form and, after a click, the analysis of that message."""
    user_input = st.text_area(
        "Type your message here:",
        height=150,
        placeholder="e.g., I feel really anxious about my upcoming presentation and can't stop worrying about it..."
    )

    if not st.button("🔍 Analyze Message", type="primary"):
        return

    if not user_input.strip():
        st.warning("Please enter a message to analyze.")
        return

    with st.spinner("Analyzing message..."):
        result = predictor.predict(user_input)

    if result['error']:
        st.error(f"Error: {result['error']}")
        return

    st.subheader("📋 Analysis Results")

    predicted_class = result['predicted_class']
    confidence = result['confidence']

    # The lower-cased class name doubles as the CSS class of the result box
    box_class = predicted_class.lower()
    confidence_class = get_confidence_class(confidence)

    st.markdown(f"""
    <div class="prediction-box {box_class}">
        <h3>Predicted Category: {predicted_class}</h3>
        <p class="{confidence_class}">Confidence: {confidence:.2%}</p>
    </div>
    """, unsafe_allow_html=True)

    fig = create_probability_chart(result['probabilities'])
    st.plotly_chart(fig, use_container_width=True)

    with st.expander("🔬 Feature Analysis"):
        analysis = predictor.analyze_text_features(user_input)
        if 'error' in analysis:
            # Tell the user why the section is empty instead of showing nothing
            st.warning(f"Feature analysis unavailable: {analysis['error']}")
        else:
            st.write("**Top contributing words/phrases:**")
            for i, feature in enumerate(analysis['top_features'][:10], 1):
                st.write(f"{i}. **{feature['feature']}** (Score: {feature['tfidf_score']:.3f})")


def _render_batch_messages(predictor):
    """Draw the multi-message form and, after a click, the chart and table of results."""
    st.subheader("📝 Batch Analysis")

    multiple_input = st.text_area(
        "Enter multiple messages (one per line):",
        height=200,
        placeholder="I feel so anxious about everything\nHad a great day today\nI can't get out of bed anymore"
    )

    if not st.button("🔍 Analyze All Messages", type="primary"):
        return

    # One message per non-blank line
    messages = [msg.strip() for msg in multiple_input.split('\n') if msg.strip()]
    if not messages:
        st.warning("Please enter messages to analyze.")
        return

    with st.spinner(f"Analyzing {len(messages)} messages..."):
        results = predictor.predict_batch(messages)

    st.subheader("📊 Batch Analysis Results")

    valid_results = [r for r in results if not r['error']]
    if not valid_results:
        st.error("No valid messages could be processed.")
        return

    category_counts = pd.Series(
        [r['predicted_class'] for r in valid_results]
    ).value_counts()

    # `color` must be given for color_discrete_map to take effect;
    # without it Plotly falls back to its default colour sequence.
    fig_pie = px.pie(
        values=category_counts.values,
        names=category_counts.index,
        color=category_counts.index,
        title="Distribution of Predicted Categories",
        color_discrete_map={
            'Depression': '#f44336',
            'Anxiety': '#ff9800',
            'Neutral': '#4caf50'
        }
    )
    st.plotly_chart(fig_pie, use_container_width=True)

    st.subheader("📋 Detailed Results")
    rows = []
    for i, result in enumerate(results, 1):
        failed = bool(result['error'])
        rows.append({
            'Message #': i,
            'Text': _preview(result.get('original_text', 'N/A')),
            'Prediction': 'Error' if failed else result['predicted_class'],
            # Always a string so the column has a single type
            'Confidence': 'N/A' if failed else f"{result['confidence']:.2%}",
            'Error': result['error'] if failed else None,
        })

    st.dataframe(pd.DataFrame(rows), use_container_width=True)


def _render_side_panel(predictor):
    """Draw the right-hand column: sample predictions, tips and the disclaimer."""
    st.subheader("📈 Quick Stats")

    # Sample predictions for demo
    sample_texts = [
        "I feel anxious about my presentation",
        "Having a wonderful day today",
        "I feel hopeless and empty inside"
    ]

    if st.button("🎲 Try Sample Predictions"):
        st.subheader("Sample Analysis")
        for i, text in enumerate(sample_texts, 1):
            result = predictor.predict(text)
            if not result['error']:
                st.write(f"**{i}.** _{text}_")
                st.write(f"→ **{result['predicted_class']}** ({result['confidence']:.1%})")
                st.write("---")

    st.subheader("💡 Tips for Better Results")
    st.info("""
    - Use complete sentences
    - Include emotional context
    - Be specific about feelings
    - Avoid very short messages
    - Use natural language
    """)

    st.warning("""
    ⚠️ **Disclaimer**: This tool is for educational purposes only.

    If you're experiencing mental health issues, please consult with a qualified mental health professional.

    🆘 **Crisis Resources:**
    - National Suicide Prevention Lifeline: 988
    - Crisis Text Line: Text HOME to 741741
    """)


def main():
    """Render the whole Streamlit page.

    Layout: header, sidebar, then two columns. The wide left column holds
    the single-message or batch form (chosen with a radio button); the
    narrow right column holds sample predictions, tips and the disclaimer.
    Rendering stops right after the header when the predictor cannot be
    loaded.
    """
    _render_header()

    predictor = load_predictor()
    if predictor is None:
        st.stop()

    _render_sidebar(predictor)

    col1, col2 = st.columns([2, 1])

    with col1:
        st.subheader("💬 Enter Your Message")

        input_method = st.radio("Choose input method:", ["Single Message", "Multiple Messages"])
        if input_method == "Single Message":
            _render_single_message(predictor)
        else:
            _render_batch_messages(predictor)

    with col2:
        _render_side_panel(predictor)

if __name__ == "__main__":
    main()

In [None]:
# Mental Health Chat Classifier - Training Notebook

# Cell 1: Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import warnings
# Suppress every warning for the rest of the session to keep notebook output
# short. NOTE(review): this also hides deprecation and convergence warnings.
warnings.filterwarnings('ignore')

# Set style for plots. The Matplotlib style sheet is applied first and the
# seaborn palette second; keep this order, since both calls change plot defaults.
# NOTE(review): 'seaborn-v0_8' presumably needs a Matplotlib release that
# ships this style name - confirm against the pinned version.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

print("Libraries imported successfully!")

# Cell 2: Generate Sample Data
import sys

# Make modules in the working directory importable; guarded so re-running
# the cell does not keep appending the same entry.
if '.' not in sys.path:
    sys.path.append('.')

# Run the generator script once, in this notebook's namespace, so the names
# it defines are available here and the CSV is rewritten on every run of the
# cell. A preceding star-import of the same module would only run the script
# a second time. The file is opened with a context manager so the handle is
# closed.
# NOTE: exec runs the file's contents as code; only use it on this project's
# own script.
with open('generate_sample_data.py', encoding='utf-8') as _script:
    exec(compile(_script.read(), 'generate_sample_data.py', 'exec'))
print("Sample data generated!")

# Cell 3: Load and Explore Data
# Read the CSV produced by the data-generation cell
df = pd.read_csv('data/raw_data.csv')

print("Dataset Overview:")
print(f"Shape: {df.shape}")
print(f"\nColumns: {df.columns.tolist()}")
print("\nClass Distribution:")
print(df['label'].value_counts())

# Print the first three texts of every category, in order of first appearance
print("\nSample texts by category:")
for category in df['label'].unique():
    print(f"\n{category} examples:")
    first_texts = df.loc[df['label'] == category, 'text'].head(3)
    for position, sample_text in enumerate(first_texts, start=1):
        print(f"  {position}. {sample_text}")

# Cell 4: Data Visualization
# 2x2 grid: class counts, text length, word count, encoded-label share
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
(ax_classes, ax_length), (ax_words, ax_encoded) = axes

# Top-left: number of samples per class
class_counts = df['label'].value_counts()
class_counts.plot.bar(ax=ax_classes, color=['#ff6b6b', '#4ecdc4', '#45b7d1'])
ax_classes.set_title('Class Distribution')
ax_classes.set_xlabel('Mental Health Category')
ax_classes.set_ylabel('Count')
ax_classes.tick_params(axis='x', rotation=45)

# Top-right: character length of each text, grouped by class
df['text_length'] = df['text'].str.len()
df.boxplot(column='text_length', by='label', ax=ax_length)
ax_length.set_title('Text Length Distribution by Category')
ax_length.set_xlabel('Mental Health Category')
ax_length.set_ylabel('Text Length (characters)')

# Bottom-left: overlaid word-count histograms, one per class
df['word_count'] = df['text'].str.split().str.len()
for category in df['label'].unique():
    word_counts = df.loc[df['label'] == category, 'word_count']
    ax_words.hist(word_counts, alpha=0.7, label=category, bins=10)
ax_words.set_title('Word Count Distribution')
ax_words.set_xlabel('Number of Words')
ax_words.set_ylabel('Frequency')
ax_words.legend()

# Bottom-right: share of each encoded label
# NOTE(review): assumes raw_data.csv carries a 'label_encoded' column - confirm
encoded_counts = df['label_encoded'].value_counts()
encoded_counts.plot.pie(ax=ax_encoded, autopct='%1.1f%%')
ax_encoded.set_title('Label Encoding Distribution')

plt.tight_layout()
plt.show()

# Cell 5: Text Preprocessing
from data_preprocessing import TextPreprocessor

# Build the preprocessor and hand it a copy of the raw frame
preprocessor = TextPreprocessor()
df_processed = preprocessor.preprocess_dataset(df.copy())

# Show the first row before and after each preprocessing stage
first_raw_row = df.iloc[0]
first_processed_row = df_processed.iloc[0]

print("Preprocessing completed!")
print(f"Original text example: '{first_raw_row['text']}'")
print(f"Cleaned text: '{first_processed_row['cleaned_text']}'")
print(f"Processed text: '{first_processed_row['processed_text']}'")

# Cell 6: Feature Engineering
# Vectorize the processed text and encode the labels
X = preprocessor.fit_transform_features(df_processed['processed_text'])
y = preprocessor.fit_labels(df_processed['label'])

n_samples, n_features = X.shape
sparsity_percent = (1.0 - X.nnz / (n_samples * n_features)) * 100

print(f"Feature matrix shape: {X.shape}")
print(f"Number of unique features: {n_features}")
print(f"Sparsity: {sparsity_percent:.2f}%")

# Rank vocabulary terms by their mean TF-IDF weight over all documents
feature_names = preprocessor.tfidf.get_feature_names_out()
tfidf_scores = X.mean(axis=0).A1
top_features = sorted(
    zip(feature_names, tfidf_scores),
    key=lambda pair: pair[1],
    reverse=True,
)

print("\nTop 20 TF-IDF features:")
for rank, (term, mean_score) in enumerate(top_features[:20], start=1):
    print(f"{rank:2d}. {term:15s}: {mean_score:.4f}")

# Cell 7: Train-Test Split
# Stratified 80/20 split with a fixed seed
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")

# Decode the labels so the per-class counts of each split are readable
train_labels = preprocessor.inverse_transform_labels(y_train)
test_labels = preprocessor.inverse_transform_labels(y_test)

for split_name, split_labels in (("Training", train_labels), ("Test", test_labels)):
    print(f"\n{split_name} set distribution:")
    print(pd.Series(split_labels).value_counts())

# Cell 8: Model Training and Comparison
# Candidate classifiers, all seeded for reproducibility
models = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Random Forest': RandomForestClassifier(random_state=42, n_estimators=100),
    'SVM': SVC(random_state=42, probability=True)
}

results = {}

print("Training and evaluating models...")
print("=" * 50)

for name, model in models.items():
    print(f"\nTraining {name}...")

    # Fit on the training split, then score both splits
    model.fit(X_train, y_train)
    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)
    train_acc = accuracy_score(y_train, train_pred)
    test_acc = accuracy_score(y_test, test_pred)

    # 5-fold cross-validated accuracy on the training split
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    cv_mean = cv_scores.mean()
    cv_std = cv_scores.std()

    results[name] = {
        'model': model,
        'train_accuracy': train_acc,
        'test_accuracy': test_acc,
        'cv_mean': cv_mean,
        'cv_std': cv_std,
        'test_predictions': test_pred
    }

    print(f"  Training Accuracy: {train_acc:.4f}")
    print(f"  Test Accuracy: {test_acc:.4f}")
    print(f"  CV Score: {cv_mean:.4f} (+/- {cv_std * 2:.4f})")

# Cell 9: Model Comparison Visualization
# One row per model, one column per metric
comparison_df = pd.DataFrame([
    {
        'Model': model_name,
        'Training Accuracy': metrics['train_accuracy'],
        'Test Accuracy': metrics['test_accuracy'],
        'CV Mean': metrics['cv_mean'],
        'CV Std': metrics['cv_std'],
    }
    for model_name, metrics in results.items()
])

print("\nModel Comparison:")
print(comparison_df.round(4))

# Two panels: grouped accuracy bars and CV accuracy with error bars
fig, axes = plt.subplots(1, 2, figsize=(15, 6))
accuracy_ax, cv_ax = axes

# Left panel: three bars per model, shifted by one bar width each
x = np.arange(len(results))
width = 0.25

accuracy_ax.bar(x - width, comparison_df['Training Accuracy'], width, label='Training', alpha=0.8)
accuracy_ax.bar(x, comparison_df['Test Accuracy'], width, label='Test', alpha=0.8)
accuracy_ax.bar(x + width, comparison_df['CV Mean'], width, label='CV Mean', alpha=0.8)

accuracy_ax.set_xlabel('Models')
accuracy_ax.set_ylabel('Accuracy')
accuracy_ax.set_title('Model Performance Comparison')
accuracy_ax.set_xticks(x)
accuracy_ax.set_xticklabels(comparison_df['Model'], rotation=45)
accuracy_ax.legend()
accuracy_ax.grid(True, alpha=0.3)

# Right panel: mean CV accuracy, error bar = one standard deviation
cv_ax.bar(
    comparison_df['Model'],
    comparison_df['CV Mean'],
    yerr=comparison_df['CV Std'],
    capsize=5,
    alpha=0.8,
)
cv_ax.set_ylabel('Cross-Validation Accuracy')
cv_ax.set_title('Cross-Validation Performance')
cv_ax.tick_params(axis='x', rotation=45)
cv_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Cell 10: Select Best Model and Hyperparameter Tuning
# Select the best model by its cross-validation score on the training data.
# Choosing by test accuracy would leak the test set into model selection and
# make the final test accuracy reported later optimistically biased.
best_model_name = max(results, key=lambda name: results[name]['cv_mean'])
best_model = results[best_model_name]['model']

print(f"Best performing model: {best_model_name}")
print(f"CV accuracy: {results[best_model_name]['cv_mean']:.4f}")
print(f"Test accuracy: {results[best_model_name]['test_accuracy']:.4f}")

# Hyperparameter tuning for the best model
print(f"\nPerforming hyperparameter tuning for {best_model_name}...")

# Search space per candidate model; keys match the names used in `results`.
param_grids = {
    'Logistic Regression': {
        'C': [0.1, 1, 10, 100],
        'penalty': ['l1', 'l2'],
        # liblinear supports both the l1 and l2 penalties listed above
        'solver': ['liblinear']
    },
    'Random Forest': {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10]
    },
    'SVM': {
        'C': [0.1, 1, 10],
        'kernel': ['rbf', 'linear'],
        'gamma': ['scale', 'auto']
    }
}

# Fail loudly instead of hitting an undefined `param_grid` further down.
if best_model_name not in param_grids:
    raise ValueError(f"No hyperparameter grid defined for model: {best_model_name}")
param_grid = param_grids[best_model_name]

# Grid search (5-fold CV on the training split only)
grid_search = GridSearchCV(
    best_model, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1
)

grid_search.fit(X_train, y_train)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best CV score: {grid_search.best_score_:.4f}")

# Update best model
best_model_tuned = grid_search.best_estimator_

# Cell 11: Final Model Evaluation
print("Final Model Evaluation")
print("=" * 50)

# Hard predictions and class probabilities from the tuned model
y_pred = best_model_tuned.predict(X_test)
y_pred_proba = best_model_tuned.predict_proba(X_test)

# Overall accuracy on the held-out split
final_accuracy = accuracy_score(y_test, y_pred)
print(f"Final Test Accuracy: {final_accuracy:.4f}")

# Per-class precision / recall / F1, labelled with the decoded class names
class_names = preprocessor.label_encoder.classes_
report_text = classification_report(y_test, y_pred, target_names=class_names)
print("\nClassification Report:")
print(report_text)

# Raw confusion counts (rows = true class, columns = predicted class)
cm = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(cm)

# Cell 12: Confusion Matrix Visualization
plt.figure(figsize=(10, 8))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.title('Confusion Matrix - Final Model')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()

# Spell out every cell of the matrix: diagonal = correct, off-diagonal = errors
print("\nConfusion Matrix Analysis:")
for i, true_class in enumerate(class_names):
    for j, pred_class in enumerate(class_names):
        count = cm[i, j]
        if i != j:
            print(f"✗ {true_class} misclassified as {pred_class}: {count} samples")
        else:
            print(f"✓ Correctly classified {true_class}: {count} samples")

# Cell 13: Feature Importance Analysis
# Derive one importance value per TF-IDF feature from whichever attribute the
# tuned model exposes; models with neither attribute are skipped.
if hasattr(best_model_tuned, 'coef_'):
    # For linear models (Logistic Regression, SVM with linear kernel).
    # coef_ may be a scipy sparse matrix (linear SVC fitted on sparse input);
    # its mean() then returns a (1, n_features) matrix, so flatten explicitly
    # to get a 1-D array in both the dense and the sparse case.
    mean_abs_coef = abs(best_model_tuned.coef_).mean(axis=0)
    feature_importance = np.asarray(mean_abs_coef).ravel()
elif hasattr(best_model_tuned, 'feature_importances_'):
    # For tree-based models (Random Forest)
    feature_importance = best_model_tuned.feature_importances_
else:
    feature_importance = None

if feature_importance is not None:
    # Get feature names
    feature_names = preprocessor.tfidf.get_feature_names_out()

    # Create feature importance DataFrame, most important first
    feature_df = pd.DataFrame({
        'feature': feature_names,
        'importance': feature_importance
    }).sort_values('importance', ascending=False)

    print("Top 20 Most Important Features:")
    print(feature_df.head(20))

    # Plot top features as horizontal bars
    plt.figure(figsize=(12, 8))
    top_features = feature_df.head(15)
    plt.barh(range(len(top_features)), top_features['importance'])
    plt.yticks(range(len(top_features)), top_features['feature'])
    plt.xlabel('Feature Importance')
    plt.title('Top 15 Most Important Features')
    # Largest bar at the top
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.show()

# Cell 14: Prediction Examples
print("Prediction Examples on Test Set:")
print("=" * 50)

# Recover the original row positions of the test samples.
# X_test.nonzero() only yields row numbers *inside* X_test (of non-zero TF-IDF
# entries), so it cannot be used to look texts up in df_processed. Splitting a
# plain index array with the same parameters as the split in Cell 7
# (test_size, random_state, stratify) reproduces the identical partition.
_, test_positions = train_test_split(
    np.arange(X.shape[0]), test_size=0.2, random_state=42, stratify=y
)

# Guard against the two splits drifting apart if Cell 7 is ever changed.
if not np.array_equal(np.asarray(y)[test_positions], np.asarray(y_test)):
    raise RuntimeError(
        "Recomputed test indices do not match y_test; "
        "keep the split parameters in sync with Cell 7"
    )

# Show at most five examples, fewer if the test set is smaller.
n_examples = min(5, len(test_positions))
test_indices = test_positions[:n_examples]
test_texts = df_processed.iloc[test_indices]['text'].values
test_true_labels = preprocessor.inverse_transform_labels(y_test[:n_examples])
test_pred_labels = preprocessor.inverse_transform_labels(y_pred[:n_examples])
test_probabilities = y_pred_proba[:n_examples]

for i in range(n_examples):
    text = test_texts[i]
    # Truncate long texts to 100 characters for display
    suffix = '...' if len(text) > 100 else ''
    print(f"\nExample {i+1}:")
    print(f"Text: '{text[:100]}{suffix}'")
    print(f"True Label: {test_true_labels[i]}")
    print(f"Predicted: {test_pred_labels[i]}")
    print("Probabilities:")
    for j, class_name in enumerate(class_names):
        print(f"  {class_name}: {test_probabilities[i][j]:.3f}")
    print("-" * 40)

# Cell 15: Save Model and Preprocessor
import os
import joblib

# Metadata stored next to the model
n_model_features = X_train.shape[1]
model_info = {
    'model_name': best_model_name.lower().replace(' ', '_'),
    'model_params': best_model_tuned.get_params(),
    'classes': list(class_names),
    'final_accuracy': final_accuracy,
    'best_cv_score': grid_search.best_score_,
    'feature_count': n_model_features
}

# Make sure the target directory exists before writing anything
os.makedirs('model', exist_ok=True)

# Persist the classifier, the preprocessor components, then the metadata
joblib.dump(best_model_tuned, 'model/mental_health_classifier.pkl')
preprocessor.save_preprocessor('model/')
joblib.dump(model_info, 'model/model_info.pkl')

print("Model and preprocessor saved successfully!")
print(f"Model accuracy: {final_accuracy:.4f}")
print(f"Model type: {best_model_name}")
print(f"Feature count: {n_model_features}")

# Cell 16: Test Inference Pipeline
from model_inference import MentalHealthPredictor

# Test the complete inference pipeline
print("Testing Inference Pipeline:")
print("=" * 30)

try:
    predictor = MentalHealthPredictor()

    # Test examples
    test_examples = [
        "I feel so anxious about my upcoming presentation and can't stop worrying",
        "Had an amazing day with friends, feeling really positive",
        "I can't get out of bed and feel completely hopeless about everything"
    ]

    # Count predictions that came back with an error so the final verdict
    # reflects what actually happened instead of always claiming success.
    failed_predictions = 0

    for i, text in enumerate(test_examples, 1):
        result = predictor.predict(text)
        print(f"\nTest {i}:")
        print(f"Text: '{text}'")
        if result['error']:
            failed_predictions += 1
            print(f"Error: {result['error']}")
        else:
            print(f"Predicted: {result['predicted_class']}")
            print(f"Confidence: {result['confidence']:.2%}")
            print("Probabilities:")
            for label, prob in result['probabilities'].items():
                print(f"  {label}: {prob:.3f}")

    if failed_predictions:
        print(f"\n✗ Inference pipeline returned errors for "
              f"{failed_predictions} of {len(test_examples)} examples")
    else:
        print("\n✓ Inference pipeline working correctly!")

except Exception as e:
    # Notebook boundary: report any failure instead of aborting the run
    print(f"✗ Error in inference pipeline: {e}")

# Cell 17: Model Summary and Next Steps
print("\n" + "="*60)
print("🧠 MENTAL HEALTH CHAT CLASSIFIER - TRAINING COMPLETE")
print("="*60)

# Headline numbers of the tuned model
print("\n📊 FINAL RESULTS:")
print(f"   • Model Type: {best_model_name}")
print(f"   • Test Accuracy: {final_accuracy:.2%}")
print(f"   • Cross-Validation Score: {grid_search.best_score_:.2%}")
print(f"   • Feature Count: {X_train.shape[1]}")
print(f"   • Training Samples: {X_train.shape[0]}")
print(f"   • Test Samples: {X_test.shape[0]}")

print("\n🎯 CLASS PERFORMANCE:")
# One-vs-rest precision / recall / F1 for every encoded class
for i, class_name in enumerate(class_names):
    is_true = (y_test == i)
    is_pred = (y_pred == i)

    # True positives, false positives, false negatives
    tp = np.sum(is_true & is_pred)
    fp = np.sum(~is_true & is_pred)
    fn = np.sum(is_true & ~is_pred)

    # Fall back to 0 whenever a denominator would be zero
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

    print(f"   • {class_name:10s}: Precision={precision:.3f}, Recall={recall:.3f}, F1={f1:.3f}")

# Static closing sections, printed line by line
closing_lines = (
    "\n📁 SAVED FILES:",
    "   • model/mental_health_classifier.pkl",
    "   • model/tfidf_vectorizer.pkl",
    "   • model/label_encoder.pkl",
    "   • model/model_info.pkl",
    "   • data/processed_data.csv",
    "\n🚀 NEXT STEPS:",
    "   1. Run the Streamlit app: streamlit run app.py",
    "   2. Test with your own text samples",
    "   3. Deploy to Streamlit Cloud or other platforms",
    "   4. Collect more data to improve performance",
    "   5. Try advanced models (BERT, RoBERTa)",
    "\n⚠️  IMPORTANT REMINDERS:",
    "   • This is for educational purposes only",
    "   • Not a substitute for professional mental health care",
    "   • Always include appropriate disclaimers in deployment",
    "\n✨ Training notebook completed successfully! ✨",
)
for closing_line in closing_lines:
    print(closing_line)

In [None]:
#!/usr/bin/env python3
"""
Mental Health Chat Classifier - Complete Pipeline Runner

This script runs the entire pipeline from data generation to model training.
Perfect for setting up the project from scratch.

Usage:
    python run_pipeline.py [--skip-data] [--quick-train] [--test-only]

Arguments:
    --skip-data: Skip data generation if data already exists
    --quick-train: Use faster training settings (fewer CV folds, smaller grid search)
    --test-only: Only run inference testing (assumes model exists)
"""

import os
import sys
import argparse
import time
from datetime import datetime

def print_banner():
    """Print the boxed project banner to stdout."""
    # Leading empty row and trailing indented row reproduce the blank lines
    # that surround the box.
    banner_rows = (
        "",
        "    ╔══════════════════════════════════════════════════════════╗",
        "    ║              🧠 Mental Health Chat Classifier             ║",
        "    ║                     Pipeline Runner                      ║",
        "    ╚══════════════════════════════════════════════════════════╝",
        "    ",
    )
    print("\n".join(banner_rows))

def print_step(step_num, step_name, description=""):
    """Print a step header framed by two 60-character rules.

    Args:
        step_num: Number shown after "STEP".
        step_name: Step title; printed in upper case.
        description: Optional one-line description; omitted when empty.
    """
    rule = "=" * 60
    header_lines = [f"\n{rule}", f"STEP {step_num}: {step_name.upper()}"]
    if description:
        header_lines.append(f"Description: {description}")
    header_lines.append(rule)
    print("\n".join(header_lines))

def check_dependencies():
    """Check that every required package can be imported.

    Prints one ✓/✗ line per package and, when something is missing, an
    installation hint.

    Returns:
        True when all packages import successfully, False otherwise.
    """
    import importlib

    print_step(0, "Dependency Check", "Verifying required packages are installed")

    # Maps the pip distribution name (shown to the user) to the module name
    # that is actually importable. They differ for scikit-learn, whose module
    # is `sklearn`; importing 'scikit-learn' can never succeed.
    required_packages = {
        'pandas': 'pandas',
        'numpy': 'numpy',
        'scikit-learn': 'sklearn',
        'nltk': 'nltk',
        'matplotlib': 'matplotlib',
        'seaborn': 'seaborn',
        'joblib': 'joblib',
        'streamlit': 'streamlit',
    }

    missing_packages = []

    for package, module_name in required_packages.items():
        try:
            importlib.import_module(module_name)
        except ImportError:
            missing_packages.append(package)
            print(f"✗ {package}")
        else:
            print(f"✓ {package}")

    if missing_packages:
        print(f"\n❌ Missing packages: {', '.join(missing_packages)}")
        print("Please install them using: pip install -r requirements.txt")
        return False

    print("\n✅ All dependencies satisfied!")
    return True

def create_directories():
    """Create the project directories if they do not exist yet.

    Returns:
        True once all directories exist. The pipeline runner treats a falsy
        step result as a failure, so an implicit ``None`` return would abort
        the whole pipeline right after this step.

    Raises:
        OSError: Propagated from ``os.makedirs`` when a directory cannot be
            created.
    """
    print_step(1, "Setup", "Creating project directories")

    directories = ['data', 'model', 'notebooks']

    for directory in directories:
        # exist_ok=True makes the call a no-op for directories already present
        os.makedirs(directory, exist_ok=True)
        print(f"✓ Created/verified directory: {directory}/")

    print("✅ Directory structure ready!")
    return True

def generate_data(skip_if_exists=False):
    """Generate the sample dataset by running ``generate_sample_data.py``.

    Args:
        skip_if_exists: When True and ``data/raw_data.csv`` is already
            present, skip generation and report success.

    Returns:
        True when the dataset file exists afterwards (or generation was
        skipped), False when generation failed or produced no file.
    """
    import runpy

    print_step(2, "Data Generation", "Creating sample mental health text dataset")

    if skip_if_exists and os.path.exists('data/raw_data.csv'):
        print("⏭️  Data already exists, skipping generation...")
        return True

    try:
        print("📝 Generating sample mental health text data...")
        # Run the script in its own fresh module namespace. A bare
        # exec(open(...).read()) inside a function leaves the file handle
        # open and splits the script's names between this module's globals
        # and the function's locals, so functions or comprehensions defined
        # in the script cannot see its own top-level variables.
        # run_name='__main__' keeps any `if __name__ == "__main__":` guard
        # in the script active.
        runpy.run_path('generate_sample_data.py', run_name='__main__')

        # Verify data was created
        if not os.path.exists('data/raw_data.csv'):
            print("❌ Failed to create dataset file")
            return False

        import pandas as pd
        df = pd.read_csv('data/raw_data.csv')
        print("✅ Dataset created successfully!")
        print(f"   • Total samples: {len(df)}")
        print(f"   • Classes: {df['label'].unique().tolist()}")
        # to_dict() yields plain Python ints, which print cleanly
        print(f"   • Distribution: {df['label'].value_counts().to_dict()}")
        return True

    except Exception as e:
        # Step boundary: report the failure and let the runner stop cleanly
        print(f"❌ Error generating data: {str(e)}")
        return False

def preprocess_data():
    """Run the preprocessing module and report the resulting feature matrix.

    Returns:
        True when preprocessing finished, False when any exception was raised.
    """
    print_step(3, "Data Preprocessing", "Cleaning and preparing text data")

    try:
        print("🧹 Cleaning and preprocessing text data...")
        from data_preprocessing import main as preprocess_main
        # The third element (the fitted preprocessor) is not needed here
        features, labels, _ = preprocess_main()

        print("✅ Data preprocessing completed!")
        print(f"   • Feature matrix shape: {features.shape}")
        print(f"   • Processed samples: {len(labels)}")
        print(f"   • Feature count: {features.shape[1]}")
    except Exception as err:
        print(f"❌ Error in preprocessing: {str(err)}")
        return False
    else:
        return True

def train_model(quick_mode=False):
    """Run the training module and report the saved model's metadata.

    Args:
        quick_mode: When True, a notice about quick training is printed.
            NOTE(review): the flag is not forwarded to the training entry
            point, so it currently looks like it only affects the message -
            confirm against train_model.main.

    Returns:
        True when the model file exists after training, False when it is
        missing or any exception was raised.
    """
    print_step(4, "Model Training", "Training and evaluating ML models")

    model_path = 'model/mental_health_classifier.pkl'

    try:
        print("🤖 Training machine learning models...")

        if quick_mode:
            print("⚡ Quick training mode enabled - faster but less thorough")

        # Import lazily so a missing training module is reported as a step error
        from train_model import main as train_main
        train_main()

        # Guard clause: training must have produced the model file
        if not os.path.exists(model_path):
            print("❌ Model file not found after training")
            return False

        print("✅ Model training completed successfully!")

        # Read back the metadata saved alongside the model
        import joblib
        model_info = joblib.load('model/model_info.pkl')
        print(f"   • Best model: {model_info['model_name']}")
        print(f"   • Test accuracy: {model_info['final_accuracy']:.2%}")
        print(f"   • Feature count: {model_info['feature_count']}")

        return True

    except Exception as err:
        print(f"❌ Error in model training: {str(err)}")
        return False

def test_inference():
    """Run three sample texts through the predictor and print the results.

    Returns:
        True when every prediction succeeded, False on the first prediction
        that reports an error or when any exception was raised.
    """
    print_step(5, "Inference Testing", "Testing model prediction capabilities")

    try:
        print("🧪 Testing model inference pipeline...")

        from model_inference import MentalHealthPredictor

        predictor = MentalHealthPredictor()

        # One sample sentence per expected category
        test_examples = (
            "I feel so anxious and worried about everything",
            "Having a wonderful day with friends today",
            "I feel hopeless and can't find any motivation",
        )

        print("\n📋 Test Predictions:")
        for number, sample in enumerate(test_examples, start=1):
            outcome = predictor.predict(sample)

            # Stop at the first failed prediction
            if outcome['error']:
                print(f"   {number}. Error: {outcome['error']}")
                return False

            print(f"   {number}. '{sample[:40]}...'")
            print(f"      → {outcome['predicted_class']} ({outcome['confidence']:.1%})")

        print("\n✅ Inference pipeline working correctly!")
        return True

    except Exception as err:
        print(f"❌ Error in inference testing: {str(err)}")
        return False

def check_app_readiness():
    """Verify that app.py and all saved model artifacts are present.

    Returns:
        True when every required file exists, False on the first missing
        file or when any exception was raised.
    """
    print_step(6, "App Readiness", "Verifying Streamlit app can be launched")

    # Artifacts checked after app.py itself
    model_artifacts = (
        'model/mental_health_classifier.pkl',
        'model/tfidf_vectorizer.pkl',
        'model/label_encoder.pkl',
        'model/model_info.pkl',
    )

    try:
        if not os.path.exists('app.py'):
            print("❌ app.py not found")
            return False

        for file_path in model_artifacts:
            if os.path.exists(file_path):
                print(f"✓ {file_path}")
                continue
            print(f"❌ Required file missing: {file_path}")
            return False

        print("\n✅ Streamlit app is ready to launch!")
        print("🚀 Run the app with: streamlit run app.py")
        return True

    except Exception as err:
        print(f"❌ Error checking app readiness: {str(err)}")
        return False

def print_summary(start_time, success=True):
    """Print the pipeline's outcome, elapsed time and completion timestamp.

    Args:
        start_time: ``time.time()`` value captured when the pipeline started.
        success: Whether the pipeline finished without a failing step; the
            next-steps and generated-files sections are printed only on
            success.
    """
    elapsed = time.time() - start_time
    rule = '=' * 60

    print(f"\n{rule}")
    print("🏁 PIPELINE EXECUTION SUMMARY")
    print(rule)

    print("✅ Pipeline completed successfully!" if success
          else "❌ Pipeline failed - see errors above")

    print(f"⏱️  Total execution time: {elapsed:.1f} seconds")
    print(f"🕐 Completed at: {datetime.now():%Y-%m-%d %H:%M:%S}")

    # Nothing more to show for a failed run
    if not success:
        return

    follow_up_lines = (
        "\n🎉 Next Steps:",
        "   1. Launch the web app: streamlit run app.py",
        "   2. Open your browser to: http://localhost:8501",
        "   3. Test the classifier with your own text!",
        "   4. Explore the Jupyter notebook: notebooks/train_model.ipynb",
        "\n📁 Generated Files:",
        "   • data/raw_data.csv - Original dataset",
        "   • data/processed_data.csv - Cleaned dataset",
        "   • model/*.pkl - Trained model files",
        "   • model/confusion_matrix.png - Model evaluation",
    )
    for follow_up_line in follow_up_lines:
        print(follow_up_line)

def main():
    """Parse the command-line flags and run the pipeline steps in order.

    Execution stops at the first step that returns a falsy value, raises, or
    is interrupted with Ctrl+C; the summary is printed in every case.

    Returns:
        0 when every executed step succeeded, 1 otherwise.
    """
    parser = argparse.ArgumentParser(description="Mental Health Chat Classifier Pipeline")
    cli_flags = (
        ('--skip-data', 'Skip data generation if data already exists'),
        ('--quick-train', 'Use faster training settings'),
        ('--test-only', 'Only run inference testing (assumes model exists)'),
    )
    for flag, help_text in cli_flags:
        parser.add_argument(flag, action='store_true', help=help_text)

    args = parser.parse_args()

    start_time = time.time()
    print_banner()

    # Data generation, preprocessing and training are skipped with --test-only
    if args.test_only:
        build_steps = []
    else:
        build_steps = [
            ("Generate Data", lambda: generate_data(args.skip_data)),
            ("Preprocess Data", lambda: preprocess_data()),
            ("Train Model", lambda: train_model(args.quick_train)),
        ]

    steps = [
        ("Dependency Check", lambda: check_dependencies()),
        ("Setup Directories", lambda: create_directories()),
        *build_steps,
        ("Test Inference", lambda: test_inference()),
        ("Check App Readiness", lambda: check_app_readiness()),
    ]

    # Run the steps until one of them reports failure
    success = True
    for step_name, step_func in steps:
        try:
            success = bool(step_func())
        except KeyboardInterrupt:
            print(f"\n\n⚠️  Pipeline interrupted by user")
            success = False
        except Exception as err:
            print(f"\n❌ Unexpected error in {step_name}: {str(err)}")
            success = False
        if not success:
            break

    print_summary(start_time, success)

    return 0 if success else 1

if __name__ == "__main__":
    sys.exit(main())