# Training

In [None]:
import json
import os
import pickle
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

In [None]:
class MentalHealthOrchestrator:
    """Classify a free-text question into one of several mental-health topics.

    The pipeline is a TF-IDF vectorizer (unigrams and bigrams) feeding a
    logistic-regression classifier. Training questions are read from
    ``<data_dir>/<class_name>_pairs.json``, one file per entry in
    ``self.classes``; the class name doubles as the label.
    """

    def __init__(self, data_dir="qa_pair"):
        """Create an untrained orchestrator.

        Args:
            data_dir: Directory containing the ``<class_name>_pairs.json``
                files read by ``load_data``.
        """
        self.data_dir = data_dir
        self.vectorizer = TfidfVectorizer(
            max_features=5000,
            ngram_range=(1, 2),
            min_df=2,
            max_df=0.95
        )
        # `multi_class` is deliberately not passed: scikit-learn deprecated it
        # in 1.5, and the default lbfgs solver already fits a multinomial
        # model when there are more than two classes.
        self.classifier = LogisticRegression(
            max_iter=1000,
            random_state=42
        )
        # These spellings are used verbatim to build the data file names in
        # load_data() and become the labels stored in saved models, so they
        # must not be "corrected" here on their own.
        self.classes = ['anxity', 'bipolar', 'depresion', 'ocd', 'schiz']
        self._is_trained = False  # Flipped by train() and load_model()

    def load_data(self):
        """Load the questions of every class from its JSON file.

        A missing file is reported with a warning and skipped. Within a file,
        only dict records that carry a ``'question'`` key are used; anything
        else is ignored.

        Returns:
            tuple: ``(questions, labels)`` — two parallel lists, where
            ``labels[i]`` is the class name whose file supplied
            ``questions[i]``.
        """
        questions = []
        labels = []

        for class_name in self.classes:
            file_path = Path(self.data_dir) / f"{class_name}_pairs.json"

            if not file_path.exists():
                print(f"Warning: {file_path} not found, skipping...")
                continue

            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            # Extract questions only
            for item in data:
                # Require a dict: on a bare string, `'question' in item` is a
                # substring test and item['question'] would then raise.
                if isinstance(item, dict) and 'question' in item:
                    questions.append(item['question'])
                    labels.append(class_name)

        return questions, labels

    def train(self, test_size=0.2):
        """Fit the vectorizer and classifier, then report hold-out metrics.

        Args:
            test_size: Passed to ``train_test_split`` as the share of samples
                held out for evaluation. The split is stratified by label.

        Returns:
            The accuracy score measured on the held-out split.

        Raises:
            ValueError: If ``load_data`` returns no samples.
        """
        print("Loading data...")
        X, y = self.load_data()

        if len(X) == 0:
            raise ValueError(f"No data found in {self.data_dir}. Please check your data directory.")

        print(f"Total samples: {len(X)}")
        # Convert numpy scalars to plain str/int so the printed dict reads the
        # same regardless of how the installed numpy formats scalar reprs.
        unique_labels, label_counts = np.unique(y, return_counts=True)
        class_distribution = {
            str(label): int(count)
            for label, count in zip(unique_labels, label_counts)
        }
        print(f"Class distribution: {class_distribution}")

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=42, stratify=y
        )

        # Vectorize: the vocabulary is learned from the training split only
        print("\nVectorizing text...")
        X_train_vec = self.vectorizer.fit_transform(X_train)
        X_test_vec = self.vectorizer.transform(X_test)

        # Train classifier
        print("Training classifier...")
        self.classifier.fit(X_train_vec, y_train)

        # Mark as trained
        self._is_trained = True

        # Evaluate (accuracy is computed once and reused for print and return)
        y_pred = self.classifier.predict(X_test_vec)
        accuracy = accuracy_score(y_test, y_pred)

        print("\n" + "="*50)
        print("TRAINING RESULTS")
        print("="*50)
        print(f"\nAccuracy: {accuracy:.4f}")
        print("\nClassification Report:")
        print(classification_report(y_test, y_pred))

        return accuracy

    def predict(self, text):
        """Predict the class of a single text.

        Args:
            text: The question to classify.

        Returns:
            tuple: ``(prediction, prob_dict)`` where ``prediction`` is the
            predicted label and ``prob_dict`` maps each entry of the
            classifier's ``classes_`` to its probability as a float.

        Raises:
            ValueError: If the model has been neither trained nor loaded.
        """
        if not self._is_trained:
            raise ValueError("Model must be trained or loaded before making predictions!")

        text_vec = self.vectorizer.transform([text])
        prediction = self.classifier.predict(text_vec)[0]
        probabilities = self.classifier.predict_proba(text_vec)[0]

        # Create probability dictionary
        prob_dict = {
            class_name: float(prob)
            for class_name, prob in zip(self.classifier.classes_, probabilities)
        }

        return prediction, prob_dict

    def save_model(self, filepath="orchestrator_model.pkl"):
        """Pickle the vectorizer, classifier, class list and trained flag.

        Args:
            filepath: Destination file for the pickle.

        Raises:
            ValueError: If the model has not been trained or loaded yet.
        """
        if not self._is_trained:
            raise ValueError("Cannot save model before training! Call train() first.")

        model_data = {
            'vectorizer': self.vectorizer,
            'classifier': self.classifier,
            'classes': self.classes,
            'is_trained': self._is_trained
        }
        with open(filepath, 'wb') as f:
            pickle.dump(model_data, f)
        print(f"\n✅ Model saved to {filepath}")

    def load_model(self, filepath="orchestrator_model.pkl"):
        """Restore a model previously written by ``save_model``.

        The loaded payload is validated before any attribute is replaced, so a
        rejected file leaves this instance exactly as it was.

        Args:
            filepath: Pickle file to read.

        Raises:
            KeyError: If the payload lacks ``'vectorizer'``, ``'classifier'``
                or ``'classes'``.
            ValueError: If the stored vectorizer has no ``vocabulary_``, i.e.
                it was never fitted.
        """
        # SECURITY: pickle.load can execute arbitrary code embedded in the
        # file. Only load model files from a source you trust.
        with open(filepath, 'rb') as f:
            model_data = pickle.load(f)

        vectorizer = model_data['vectorizer']
        classifier = model_data['classifier']
        classes = model_data['classes']

        # Verify the vectorizer is fitted before touching instance state
        if not hasattr(vectorizer, 'vocabulary_'):
            raise ValueError("Loaded model's vectorizer is not fitted! The saved model is corrupted.")

        self.vectorizer = vectorizer
        self.classifier = classifier
        self.classes = classes
        self._is_trained = model_data.get('is_trained', True)

        print(f"✅ Model loaded from {filepath}")
        print(f"   - Vocabulary size: {len(self.vectorizer.vocabulary_)}")
        print(f"   - Classes: {self.classes}")





Loading data...
Total samples: 23200
Class distribution: {'anxity': 3189, 'bipolar': 6609, 'depresion': 4695, 'ocd': 6529, 'schiz': 2178}

Vectorizing text...
Training classifier...





TRAINING RESULTS

Accuracy: 0.8688

Classification Report:
              precision    recall  f1-score   support

      anxity       0.93      0.85      0.89       638
     bipolar       0.85      0.90      0.87      1322
   depresion       0.82      0.83      0.82       939
         ocd       0.89      0.92      0.91      1306
       schiz       0.88      0.74      0.81       435

    accuracy                           0.87      4640
   macro avg       0.87      0.85      0.86      4640
weighted avg       0.87      0.87      0.87      4640


✅ Model saved to mental_health_orchestrator.pkl

SAMPLE PREDICTIONS

Question: I feel worried all the time and can't stop thinking about bad things
Predicted class: ocd
Probabilities: {'anxity': 0.10047023350094901, 'bipolar': 0.17999654937198964, 'depresion': 0.21551832515582658, 'ocd': 0.4165474659561722, 'schiz': 0.08746742601506254}

Question: I have extreme mood swings from very happy to very sad
Predicted class: depresion
Probabilities: {'a

In [None]:
# Build the orchestrator over the qa_pair/ directory, fit it with a 20%
# hold-out split, then write the fitted artifacts to disk for later loading.
orchestrator = MentalHealthOrchestrator("qa_pair")
orchestrator.train(0.2)
orchestrator.save_model(filepath="mental_health_orchestrator.pkl")

# Inference

In [3]:
# Use the same data_dir as the training cell ("qa_pair"). load_model() does
# not read it, but a later train() call on this instance would look there.
orchestrator = MentalHealthOrchestrator(data_dir="qa_pair")
orchestrator.load_model("mental_health_orchestrator.pkl")
# predict() returns the predicted label and a {class: probability} dict.
prediction, probabilities = orchestrator.predict("What is the symptoms of bipolar and how can we cure it ?")

✅ Model loaded from mental_health_orchestrator.pkl
   - Vocabulary size: 5000
   - Classes: ['anxity', 'bipolar', 'depresion', 'ocd', 'schiz']


In [4]:
# Show the predicted label, then the per-class probabilities, one per line.
print(
    f"Prediction: {prediction}",
    f"Probabilities: {probabilities}",
    sep="\n",
)

Prediction: bipolar
Probabilities: {'anxity': 0.028585588003056262, 'bipolar': 0.7638895196520479, 'depresion': 0.13106621966574447, 'ocd': 0.05898440070542949, 'schiz': 0.017474271973722016}
