<a href="https://colab.research.google.com/github/aymenchibouti/newversion/blob/master/claude_enhanced_model_rendom_30082025.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install shap
!pip install lime
!pip install xgboost
!pip install lightgbm
!pip install tensorflow


In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
COMPLETE ENHANCED MACHINE LEARNING PIPELINE FOR DROPOUT PREDICTION
==================================================================

This is the main execution script that combines all advanced techniques:
- Data preprocessing and feature engineering
- Multiple sampling strategies (SMOTE, ADASYN, Undersampling)
- Variational Autoencoder for feature extraction
- Comprehensive model comparison (RandomForest, XGBoost, LightGBM, etc.)
- Advanced hyperparameter tuning
- Model interpretation (SHAP, LIME, Permutation Importance)
- Model persistence and deployment utilities

Usage: python complete_pipeline.py

Author: Enhanced ML Pipeline
Date: 2024
"""

import os
import sys
import time
import warnings
from datetime import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# Import all required libraries
try:
    # Core ML libraries
    from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
    from sklearn.preprocessing import StandardScaler, MinMaxScaler
    from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.neural_network import MLPClassifier
    from sklearn.svm import SVC
    from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                               confusion_matrix, classification_report, roc_auc_score, roc_curve)

    # Imbalanced data handling
    from imblearn.over_sampling import SMOTE, ADASYN
    from imblearn.under_sampling import RandomUnderSampler

    # Advanced models
    import xgboost as xgb
    import lightgbm as lgb

    # Model interpretation
    try:
        import shap
        SHAP_AVAILABLE = True
    except ImportError:
        print("Warning: SHAP not available. Install with: pip install shap")
        SHAP_AVAILABLE = False

    try:
        import lime
        import lime.lime_tabular
        LIME_AVAILABLE = True
    except ImportError:
        print("Warning: LIME not available. Install with: pip install lime")
        LIME_AVAILABLE = False

    # Deep learning for VAE
    try:
        import tensorflow as tf
        from tensorflow import keras
        from tensorflow.keras import layers
        TF_AVAILABLE = True
    except ImportError:
        print("Warning: TensorFlow not available. Install with: pip install tensorflow")
        TF_AVAILABLE = False

    # Model persistence
    import joblib
    import pickle

except ImportError as e:
    print(f"Error importing required libraries: {e}")
    print("Please install required packages:")
    print("pip install scikit-learn imbalanced-learn xgboost lightgbm shap lime tensorflow joblib")
    sys.exit(1)

class CompleteMLPipeline:
    """Complete enhanced machine learning pipeline"""

    def __init__(self, data_path='model1_210_features_spliting.csv', output_dir='ml_results'):
        """Initialise pipeline state and create the output directory tree.

        Args:
            data_path: Path of the CSV file that ``load_and_explore_data``
                reads later.
            output_dir: Directory that receives the ``models``, ``plots`` and
                ``reports`` sub-directories. Missing parent directories are
                created as well.
        """
        self.data_path = data_path
        self.output_dir = Path(output_dir)

        # Initialize attributes
        self.data = None
        self.X = None
        self.y = None
        self.feature_names = None
        self.scaler = StandardScaler()
        self.results = {}
        self.best_model = None
        self.best_score = 0

        # parents=True lets callers pass a nested path such as 'runs/exp1'
        # without having to create the intermediate directories first.
        self.output_dir.mkdir(parents=True, exist_ok=True)
        for subdir in ('models', 'plots', 'reports'):
            (self.output_dir / subdir).mkdir(exist_ok=True)

        print("🚀 Enhanced ML Pipeline initialized")
        print(f"📁 Output directory: {self.output_dir.absolute()}")

    def load_and_explore_data(self):
        """Load and perform initial data exploration"""
        print("\\n" + "="*60)
        print("📊 DATA LOADING AND EXPLORATION")
        print("="*60)

        # Load data
        if not os.path.exists(self.data_path):
            raise FileNotFoundError(f"Data file not found: {self.data_path}")

        print(f"Loading data from: {self.data_path}")
        self.data = pd.read_csv(self.data_path)

        # Basic info
        print(f"Dataset shape: {self.data.shape}")
        print(f"Memory usage: {self.data.memory_usage(deep=True).sum() / 1024**2:.1f} MB")

        # Remove non-feature columns
        cols_to_drop = ['username', 'course_id', 'enrollment_id']
        existing_cols = [col for col in cols_to_drop if col in self.data.columns]
        if existing_cols:
            self.data = self.data.drop(columns=existing_cols)
            print(f"Removed columns: {existing_cols}")

        # Handle missing values
        missing_before = self.data.isnull().sum().sum()
        if missing_before > 0:
            print(f"Missing values found: {missing_before}")
            self.data = self.data.fillna(0)
            print("Missing values filled with 0")

        # Verify target column
        if 'dropout' not in self.data.columns:
            raise KeyError("Target column 'dropout' not found")

        # Prepare features and target
        self.X = self.data.drop(columns=['dropout'])
        self.y = self.data['dropout']
        self.feature_names = self.X.columns.tolist()

        # Target distribution
        target_counts = self.y.value_counts()
        target_pct = self.y.value_counts(normalize=True) * 100

        print(f"\\nTarget distribution:")
        print(f"  No dropout (0): {target_counts[0]:,} ({target_pct[0]:.1f}%)")
        print(f"  Dropout (1): {target_counts[1]:,} ({target_pct[1]:.1f}%)")
        print(f"  Imbalance ratio: {target_counts[0] / target_counts[1]:.1f}:1")

        # Create visualization
        self._create_eda_plots()

        return self.X, self.y

    def _create_eda_plots(self):
        """Create, save and show the exploratory data analysis figures.

        Writes two images to ``<output_dir>/plots``:

        * ``eda_analysis.png`` - class balance pie chart, the 20 features with
          the highest absolute correlation to ``dropout``, and histograms of
          the first four feature columns (the remaining four panels of the
          2x3 grid).
        * ``correlation_heatmap.png`` - lower-triangle correlation heatmap of
          the 15 most correlated features plus the target.
        """
        print("Creating EDA visualizations...")

        fig, axes = plt.subplots(2, 3, figsize=(20, 12))
        fig.suptitle('Exploratory Data Analysis', fontsize=16, fontweight='bold')

        # 1. Target distribution
        # value_counts() orders by frequency, so sort by class label and derive
        # the wedge labels from the index; with fixed labels the two wedges are
        # mislabelled whenever class 1 is the majority.
        class_labels = {0: 'No Dropout', 1: 'Dropout'}
        target_counts = self.y.value_counts().sort_index()
        axes[0, 0].pie(target_counts.values,
                      labels=[class_labels.get(cls, str(cls)) for cls in target_counts.index],
                      autopct='%1.1f%%', startangle=90, colors=['lightblue', 'lightcoral'])
        axes[0, 0].set_title('Target Distribution')

        # 2. Feature correlation with target
        numeric_data = pd.concat([self.X, self.y], axis=1)
        # Constant columns yield NaN correlations; drop them so they cannot
        # end up among the plotted "top" features.
        correlations = (
            numeric_data.corr()['dropout']
            .drop('dropout')
            .abs()
            .dropna()
            .sort_values(ascending=False)[:20]
        )

        axes[0, 1].barh(range(len(correlations)), correlations.values, color='skyblue')
        axes[0, 1].set_yticks(range(len(correlations)))
        axes[0, 1].set_yticklabels([name[:15] + '...' if len(name) > 15 else name for name in correlations.index], fontsize=8)
        axes[0, 1].set_title('Top 20 Features Correlated with Dropout')
        axes[0, 1].set_xlabel('Absolute Correlation')
        # Strongest correlation at the top, like the other ranking plots.
        axes[0, 1].invert_yaxis()

        # 3. Sample feature distributions: panels 2..5 of the flattened grid
        # are free, so exactly the first four features get a histogram.
        for ax, feature in zip(axes.flat[2:], self.X.columns):
            ax.hist(self.X[feature], bins=30, alpha=0.7, color='lightgreen', edgecolor='black')
            ax.set_title(f'{feature[:20]}...' if len(feature) > 20 else feature, fontsize=10)
            ax.set_xlabel('Value')
            ax.set_ylabel('Frequency')

        plt.tight_layout()
        plt.savefig(self.output_dir / 'plots' / 'eda_analysis.png', dpi=300, bbox_inches='tight')
        plt.show()
        # Release the figure; otherwise it stays registered with pyplot.
        plt.close(fig)

        # Correlation heatmap for top features
        top_features = correlations.head(15).index.tolist() + ['dropout']
        heatmap_fig = plt.figure(figsize=(12, 10))
        correlation_matrix = numeric_data[top_features].corr()

        # Hide the upper triangle (including the diagonal): it is redundant.
        mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))
        sns.heatmap(correlation_matrix, mask=mask, annot=True, cmap='coolwarm',
                   center=0, square=True, fmt='.2f', cbar_kws={"shrink": .8})
        plt.title('Correlation Heatmap - Top Features + Target')
        plt.tight_layout()
        plt.savefig(self.output_dir / 'plots' / 'correlation_heatmap.png', dpi=300, bbox_inches='tight')
        plt.show()
        plt.close(heatmap_fig)

    def prepare_datasets(self):
        """Build the train/test variants used for model comparison.

        The data is split 80/20 (stratified, ``random_state=42``) first; the
        scaler is then fitted on the training part only and applied to both
        parts, so no test-set statistics leak into training. On top of the
        scaled split the method derives SMOTE, ADASYN and random-undersampling
        variants of the training data (the test data is shared by all
        variants) and, when TensorFlow is importable, VAE-encoded versions of
        every variant.

        Returns:
            dict: ``{name: {'X_train', 'X_test', 'y_train', 'y_test',
            'description'}}``. A variant whose sampler fails is reported and
            left out.
        """
        print("\n" + "="*60)
        print("🔄 DATASET PREPARATION WITH MULTIPLE SAMPLING STRATEGIES")
        print("="*60)

        # Initial train-test split (done before scaling to avoid leakage)
        X_train_raw, X_test_raw, y_train, y_test = train_test_split(
            self.X, self.y, test_size=0.2, random_state=42, stratify=self.y
        )

        # Scale features: fit on the training rows only
        print("Scaling features...")
        X_train = self.scaler.fit_transform(X_train_raw)
        X_test = self.scaler.transform(X_test_raw)

        print(f"Train set: {X_train.shape[0]:,} samples")
        print(f"Test set: {X_test.shape[0]:,} samples")

        datasets = {}

        # 1. Original dataset (imbalanced)
        datasets['original'] = {
            'X_train': X_train,
            'X_test': X_test,
            'y_train': y_train,
            'y_test': y_test,
            'description': 'Original imbalanced dataset'
        }
        print("✓ Original dataset prepared")

        # The neighbour count of SMOTE/ADASYN is bounded by the size of the
        # class being synthesised, i.e. the minority class (not necessarily
        # the class labelled 1).
        minority_count = int(y_train.value_counts().min())
        n_neighbors = min(5, minority_count - 1)

        # 2.-4. Resampled variants: (key, label, sampler class, kwargs, description)
        sampler_specs = [
            ('smote', 'SMOTE', SMOTE, {'k_neighbors': n_neighbors},
             'SMOTE oversampled dataset'),
            ('adasyn', 'ADASYN', ADASYN, {'n_neighbors': n_neighbors},
             'ADASYN oversampled dataset'),
            ('undersample', 'Undersampling', RandomUnderSampler, {},
             'Random undersampled dataset'),
        ]
        for key, label, sampler_cls, sampler_kwargs, description in sampler_specs:
            try:
                sampler = sampler_cls(random_state=42, **sampler_kwargs)
                X_resampled, y_resampled = sampler.fit_resample(X_train, y_train)
            except Exception as e:
                # Best effort: one failing sampler must not abort the others.
                print(f"Warning: {label} failed - {e}")
                continue
            datasets[key] = {
                'X_train': X_resampled,
                'X_test': X_test,
                'y_train': y_resampled,
                'y_test': y_test,
                'description': description
            }
            print(f"✓ {label} dataset prepared: {X_resampled.shape[0]:,} samples")

        # Apply Variational Autoencoder if TensorFlow is available
        if TF_AVAILABLE:
            vae_datasets = self._apply_vae_feature_extraction(datasets)
            datasets.update(vae_datasets)
        else:
            print("⚠️ TensorFlow not available, skipping VAE feature extraction")

        return datasets

    def _apply_vae_feature_extraction(self, datasets, latent_dim=50):
        """Apply VAE for feature extraction"""
        print("\\n🧠 Applying Variational Autoencoder for feature extraction...")

        vae_datasets = {}

        for name, dataset in datasets.items():
            if 'vae' in name:  # Skip already processed VAE datasets
                continue

            try:
                print(f"  Processing {name} dataset with VAE...")

                # Build VAE
                input_dim = dataset['X_train'].shape[1]
                vae = self._build_vae(input_dim, latent_dim)

                # Train VAE
                history = vae.fit(
                    dataset['X_train'], dataset['X_train'],
                    epochs=50,
                    batch_size=128,
                    validation_split=0.2,
                    verbose=0
                )

                # Extract features using encoder
                encoder = keras.Model(vae.input, vae.get_layer('z_mean').output)
                X_train_vae = encoder.predict(dataset['X_train'], verbose=0)
                X_test_vae = encoder.predict(dataset['X_test'], verbose=0)

                vae_datasets[f"{name}_vae"] = {
                    'X_train': X_train_vae,
                    'X_test': X_test_vae,
                    'y_train': dataset['y_train'],
                    'y_test': dataset['y_test'],
                    'description': f"{dataset['description']} + VAE features ({latent_dim}D)"
                }

                print(f"    ✓ VAE features extracted: {latent_dim} dimensions")

            except Exception as e:
                print(f"    ❌ VAE failed for {name}: {e}")
                continue

        return vae_datasets

    def _build_vae(self, input_dim, latent_dim, output_activation='linear'):
        """Build and compile a dense Variational Autoencoder.

        Architecture: ``input -> 128 -> 64 -> (z_mean, z_log_var) -> sample
        -> 64 -> 128 -> input_dim``, with batch normalisation after every
        hidden layer and dropout (0.2) in the encoder. The loss (MSE
        reconstruction scaled by ``input_dim`` plus a KL term) is attached
        through ``add_loss``, so the model is compiled without a ``loss``
        argument.

        Args:
            input_dim: Number of input features.
            latent_dim: Size of the latent space; the layers are named
                ``z_mean`` and ``z_log_var`` so callers can retrieve them.
            output_activation: Activation of the reconstruction layer. The
                default is linear because the pipeline feeds standardised
                (zero-mean, unbounded) features, which a sigmoid output
                confined to (0, 1) cannot reproduce.

        Returns:
            The compiled Keras model mapping inputs to reconstructions.
        """
        # Encoder
        inputs = keras.Input(shape=(input_dim,))
        h1 = layers.Dense(128, activation='relu')(inputs)
        h1 = layers.BatchNormalization()(h1)
        h1 = layers.Dropout(0.2)(h1)

        h2 = layers.Dense(64, activation='relu')(h1)
        h2 = layers.BatchNormalization()(h2)
        h2 = layers.Dropout(0.2)(h2)

        z_mean = layers.Dense(latent_dim, name='z_mean')(h2)
        z_log_var = layers.Dense(latent_dim, name='z_log_var')(h2)

        # Reparameterization trick: z = mean + sigma * eps with eps ~ N(0, 1)
        def sampling(args):
            mean, log_var = args
            batch = tf.shape(mean)[0]
            dim = tf.shape(mean)[1]
            epsilon = tf.keras.backend.random_normal(shape=(batch, dim))
            return mean + tf.exp(0.5 * log_var) * epsilon

        z = layers.Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_var])

        # Decoder
        decoder_h1 = layers.Dense(64, activation='relu')(z)
        decoder_h1 = layers.BatchNormalization()(decoder_h1)

        decoder_h2 = layers.Dense(128, activation='relu')(decoder_h1)
        decoder_h2 = layers.BatchNormalization()(decoder_h2)

        outputs = layers.Dense(input_dim, activation=output_activation)(decoder_h2)

        # VAE model
        vae = keras.Model(inputs, outputs)

        # VAE loss
        # NOTE(review): attaching a loss built from symbolic tensors via
        # add_loss is a functional-API pattern; confirm it is supported by the
        # Keras version installed in the target environment.
        reconstruction_loss = keras.losses.mse(inputs, outputs)
        reconstruction_loss *= input_dim
        kl_loss = 1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var)
        # The KL term is averaged over batch and latent dimensions.
        kl_loss = tf.reduce_mean(kl_loss) * -0.5
        vae_loss = tf.reduce_mean(reconstruction_loss + kl_loss)
        vae.add_loss(vae_loss)

        vae.compile(optimizer='adam')
        return vae

    def train_models(self, datasets):
        """Train multiple models with comprehensive hyperparameter tuning"""
        print("\\n" + "="*60)
        print("🤖 MODEL TRAINING WITH HYPERPARAMETER OPTIMIZATION")
        print("="*60)

        # Define model configurations
        model_configs = {
            'RandomForest': {
                'model': RandomForestClassifier(random_state=42, n_jobs=-1),
                'params': {
                    'n_estimators': [100, 200, 300, 500],
                    'max_depth': [10, 20, 30, None],
                    'min_samples_split': [2, 5, 10, 15],
                    'min_samples_leaf': [1, 2, 4, 8],
                    'max_features': ['sqrt', 'log2', None],
                    'bootstrap': [True, False],
                    'class_weight': ['balanced', 'balanced_subsample', None]
                }
            },
            'XGBoost': {
                'model': xgb.XGBClassifier(random_state=42, eval_metric='logloss'),
                'params': {
                    'n_estimators': [100, 200, 300],
                    'max_depth': [3, 6, 10],
                    'learning_rate': [0.01, 0.1, 0.2],
                    'subsample': [0.8, 0.9, 1.0],
                    'colsample_bytree': [0.8, 0.9, 1.0],
                    'scale_pos_weight': [1, 3, 5]
                }
            },
            'LightGBM': {
                'model': lgb.LGBMClassifier(random_state=42, verbose=-1),
                'params': {
                    'n_estimators': [100, 200, 300],
                    'max_depth': [3, 6, 10],
                    'learning_rate': [0.01, 0.1, 0.2],
                    'num_leaves': [31, 50, 100],
                    'feature_fraction': [0.8, 0.9, 1.0],
                    'bagging_fraction': [0.8, 0.9, 1.0],
                    'scale_pos_weight': [1, 3, 5]
                }
            },
            'ExtraTrees': {
                'model': ExtraTreesClassifier(random_state=42, n_jobs=-1),
                'params': {
                    'n_estimators': [100, 200, 300],
                    'max_depth': [10, 20, 30, None],
                    'min_samples_split': [2, 5, 10],
                    'min_samples_leaf': [1, 2, 4],
                    'max_features': ['sqrt', 'log2', None],
                    'class_weight': ['balanced', None]
                }
            }
        }

        results = {}

        for dataset_name, dataset in datasets.items():
            print(f"\\n📊 Processing {dataset_name} dataset...")
            print(f"   Description: {dataset['description']}")

            results[dataset_name] = {}
            X_train = dataset['X_train']
            X_test = dataset['X_test']
            y_train = dataset['y_train']
            y_test = dataset['y_test']

            for model_name, config in model_configs.items():
                print(f"\\n  🔧 Training {model_name}...")

                try:
                    # Hyperparameter tuning
                    start_time = time.time()

                    random_search = RandomizedSearchCV(
                        estimator=config['model'],
                        param_distributions=config['params'],
                        n_iter=100,  # Reduced for faster execution
                        cv=3,  # 3-fold CV for speed
                        scoring='f1',
                        random_state=42,
                        n_jobs=-1,
                        verbose=0
                    )

                    random_search.fit(X_train, y_train)
                    best_model = random_search.best_estimator_

                    training_time = time.time() - start_time

                    # Predictions
                    y_pred = best_model.predict(X_test)
                    y_pred_proba = best_model.predict_proba(X_test)[:, 1]

                    # Calculate metrics
                    metrics = {
                        'accuracy': accuracy_score(y_test, y_pred),
                        'precision': precision_score(y_test, y_pred, zero_division=0),
                        'recall': recall_score(y_test, y_pred, zero_division=0),
                        'f1': f1_score(y_test, y_pred, zero_division=0),
                        'roc_auc': roc_auc_score(y_test, y_pred_proba)
                    }

                    results[dataset_name][model_name] = {
                        'model': best_model,
                        'best_params': random_search.best_params_,
                        'best_cv_score': random_search.best_score_,
                        'metrics': metrics,
                        'predictions': y_pred,
                        'probabilities': y_pred_proba,
                        'y_test': y_test,
                        'training_time': training_time
                    }

                    # Track best model
                    if metrics['f1'] > self.best_score:
                        self.best_score = metrics['f1']
                        self.best_model = {
                            'name': model_name,
                            'dataset': dataset_name,
                            'model': best_model,
                            'metrics': metrics,
                            'params': random_search.best_params_
                        }

                    print(f"    ✓ F1 Score: {metrics['f1']:.4f} (CV: {random_search.best_score_:.4f})")
                    print(f"    ⏱️ Training time: {training_time:.1f}s")

                except Exception as e:
                    print(f"    ❌ Error training {model_name}: {e}")
                    continue

        self.results = results
        print(f"\\n🏆 Best model: {self.best_model['name']} on {self.best_model['dataset']} (F1: {self.best_score:.4f})")
        return results

    def create_comprehensive_visualizations(self):
        """Produce all result figures for the trained models.

        Runs, in order, the model comparison heatmap, the detailed analysis of
        the best model and the feature importance analysis. The SHAP/LIME
        interpretation step runs only when at least one of the two libraries
        could be imported.
        """
        print("\n" + "="*60)
        print("📊 CREATING COMPREHENSIVE VISUALIZATIONS")
        print("="*60)

        # 1. Model comparison heatmap
        self._plot_model_comparison_heatmap()

        # 2. Best model detailed analysis
        self._plot_best_model_analysis()

        # 3. Feature importance analysis
        self._plot_feature_importance()

        # 4. Model interpretation (SHAP/LIME)
        if SHAP_AVAILABLE or LIME_AVAILABLE:
            self._create_model_interpretation()

    def _plot_model_comparison_heatmap(self):
        """Plot model comparison heatmap"""
        print("Creating model comparison heatmap...")

        # Prepare data for heatmap
        models = []
        datasets = []
        f1_scores = []

        for dataset_name, dataset_results in self.results.items():
            for model_name, result in dataset_results.items():
                models.append(model_name)
                datasets.append(dataset_name)
                f1_scores.append(result['metrics']['f1'])

        if not f1_scores:
            print("No results available for comparison")
            return

        # Create pivot table
        comparison_df = pd.DataFrame({
            'Model': models,
            'Dataset': datasets,
            'F1_Score': f1_scores
        })

        pivot_df = comparison_df.pivot(index='Model', columns='Dataset', values='F1_Score')

        # Create heatmap
        plt.figure(figsize=(12, 8))
        sns.heatmap(pivot_df, annot=True, cmap='YlOrRd', center=0.5,
                   square=True, fmt='.3f', cbar_kws={"shrink": .8})
        plt.title('Model Performance Comparison (F1 Scores)', fontsize=16, fontweight='bold')
        plt.xlabel('Dataset', fontweight='bold')
        plt.ylabel('Model', fontweight='bold')
        plt.tight_layout()
        plt.savefig(self.output_dir / 'plots' / 'model_comparison_heatmap.png', dpi=300, bbox_inches='tight')
        plt.show()

    def _plot_best_model_analysis(self):
        """Create detailed analysis of the best model"""
        if not self.best_model:
            print("No best model available for detailed analysis")
            return

        print(f"Creating detailed analysis for best model: {self.best_model['name']}")

        # Get best model results
        dataset_name = self.best_model['dataset']
        model_name = self.best_model['name']
        result = self.results[dataset_name][model_name]

        y_test = result['y_test']
        y_pred = result['predictions']
        y_pred_proba = result['probabilities']

        # Create subplots
        fig, axes = plt.subplots(2, 3, figsize=(20, 12))
        fig.suptitle(f'Best Model Analysis: {model_name} on {dataset_name}', fontsize=16, fontweight='bold')

        # 1. Confusion Matrix
        cm = confusion_matrix(y_test, y_pred)
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes[0, 0],
                   xticklabels=['No Dropout', 'Dropout'],
                   yticklabels=['No Dropout', 'Dropout'])
        axes[0, 0].set_title('Confusion Matrix')
        axes[0, 0].set_ylabel('True Label')
        axes[0, 0].set_xlabel('Predicted Label')

        # 2. ROC Curve
        from sklearn.metrics import roc_curve
        fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
        roc_auc = roc_auc_score(y_test, y_pred_proba)
        axes[0, 1].plot(fpr, tpr, color='darkorange', lw=2,
                       label=f'ROC curve (AUC = {roc_auc:.3f})')
        axes[0, 1].plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        axes[0, 1].set_xlim([0.0, 1.0])
        axes[0, 1].set_ylim([0.0, 1.05])
        axes[0, 1].set_xlabel('False Positive Rate')
        axes[0, 1].set_ylabel('True Positive Rate')
        axes[0, 1].set_title('ROC Curve')
        axes[0, 1].legend(loc="lower right")
        axes[0, 1].grid(True, alpha=0.3)

        # 3. Prediction Distribution
        axes[0, 2].hist(y_pred_proba[y_test == 0], bins=30, alpha=0.7, label='No Dropout', color='blue')
        axes[0, 2].hist(y_pred_proba[y_test == 1], bins=30, alpha=0.7, label='Dropout', color='red')
        axes[0, 2].set_xlabel('Prediction Probability')
        axes[0, 2].set_ylabel('Frequency')
        axes[0, 2].set_title('Prediction Probability Distribution')
        axes[0, 2].legend()
        axes[0, 2].grid(True, alpha=0.3)

        # 4. Performance Metrics
        metrics = result['metrics']
        metric_names = list(metrics.keys())
        metric_values = list(metrics.values())
        bars = axes[1, 0].bar(metric_names, metric_values,
                             color=['blue', 'green', 'orange', 'red', 'purple'])
        axes[1, 0].set_title('Performance Metrics')
        axes[1, 0].set_ylabel('Score')
        axes[1, 0].set_ylim(0, 1.1)
        axes[1, 0].tick_params(axis='x', rotation=45)

        # Add values on bars
        for bar in bars:
            height = bar.get_height()
            axes[1, 0].text(bar.get_x() + bar.get_width()/2., height + 0.01,
                           f'{height:.3f}', ha='center', va='bottom')

        # 5. Feature Importance (if available)
        model = result['model']
        if hasattr(model, 'feature_importances_'):
            importances = model.feature_importances_
            indices = np.argsort(importances)[::-1][:15]  # Top 15 features

            feature_names_short = [self.feature_names[i][:15] + '...'
                                 if len(self.feature_names[i]) > 15
                                 else self.feature_names[i] for i in indices]

            axes[1, 1].barh(range(len(indices)), importances[indices], color='lightblue')
            axes[1, 1].set_yticks(range(len(indices)))
            axes[1, 1].set_yticklabels(feature_names_short, fontsize=8)
            axes[1, 1].set_title('Top 15 Feature Importances')
            axes[1, 1].set_xlabel('Importance')
            axes[1, 1].invert_yaxis()
            axes[1, 1].grid(True, alpha=0.3)
        else:
            axes[1, 1].text(0.5, 0.5, 'Feature importance\\nnot available',
                           ha='center', va='center', transform=axes[1, 1].transAxes)
            axes[1, 1].set_title('Feature Importance')

        # 6. Classification Report (as text)
        report = classification_report(y_test, y_pred)
        axes[1, 2].text(0.01, 0.99, report, transform=axes[1, 2].transAxes,
                       fontsize=9, verticalalignment='top', fontfamily='monospace')
        axes[1, 2].set_title('Classification Report')
        axes[1, 2].axis('off')

        plt.tight_layout()
        plt.savefig(self.output_dir / 'plots' / 'best_model_analysis.png', dpi=300, bbox_inches='tight')
        plt.show()

    def _plot_feature_importance(self):
        """Plot comprehensive feature importance analysis"""
        if not self.best_model or not hasattr(self.best_model['model'], 'feature_importances_'):
            print("Feature importance not available for the best model")
            return

        print("Creating feature importance analysis...")

        model = self.best_model['model']
        importances = model.feature_importances_

        # Create feature importance DataFrame
        feature_importance_df = pd.DataFrame({
            'feature': self.feature_names,
            'importance': importances
        }).sort_values('importance', ascending=False)

        # Create visualizations
        fig, axes = plt.subplots(2, 2, figsize=(20, 12))
        fig.suptitle('Feature Importance Analysis', fontsize=16, fontweight='bold')

        # 1. Top 20 features
        top_20 = feature_importance_df.head(20)
        axes[0, 0].barh(range(len(top_20)), top_20['importance'], color='lightblue')
        axes[0, 0].set_yticks(range(len(top_20)))
        axes[0, 0].set_yticklabels([name[:20] + '...' if len(name) > 20 else name for name in top_20['feature']], fontsize=8)
        axes[0, 0].set_title('Top 20 Most Important Features')
        axes[0, 0].set_xlabel('Importance')
        axes[0, 0].invert_yaxis()
        axes[0, 0].grid(True, alpha=0.3)

        # 2. Feature importance distribution
        axes[0, 1].hist(importances, bins=30, alpha=0.7, color='lightgreen', edgecolor='black')
        axes[0, 1].set_xlabel('Feature Importance')
        axes[0, 1].set_ylabel('Number of Features')
        axes[0, 1].set_title('Distribution of Feature Importances')
        axes[0, 1].grid(True, alpha=0.3)

        # 3. Cumulative importance
        cumulative_importance = np.cumsum(np.sort(importances)[::-1])
        axes[1, 0].plot(range(1, len(cumulative_importance) + 1), cumulative_importance, 'b-', linewidth=2)
        axes[1, 0].axhline(y=0.8, color='r', linestyle='--', label='80% Threshold')
        axes[1, 0].axhline(y=0.9, color='orange', linestyle='--', label='90% Threshold')
        axes[1, 0].set_xlabel('Number of Features')
        axes[1, 0].set_ylabel('Cumulative Importance')
        axes[1, 0].set_title('Cumulative Feature Importance')
        axes[1, 0].legend()
        axes[1, 0].grid(True, alpha=0.3)

        # 4. Daily activity pattern (if applicable)
        daily_features = feature_importance_df[feature_importance_df['feature'].str.contains('day_')]
        if not daily_features.empty:
            # Extract day numbers and activity types
            daily_features['day'] = daily_features['feature'].str.extract(r'day_(\\d+)')[0].astype(float)
            daily_features['activity'] = daily_features['feature'].str.extract(r'day_\\d+_(.+)')[0]

            # Group by day and sum importances
            daily_importance = daily_features.groupby('day')['importance'].sum().reset_index()

            axes[1, 1].plot(daily_importance['day'], daily_importance['importance'], 'o-', linewidth=2, markersize=6)
            axes[1, 1].set_xlabel('Day')
            axes[1, 1].set_ylabel('Total Importance')
            axes[1, 1].set_title('Feature Importance by Day')
            axes[1, 1].grid(True, alpha=0.3)
        else:
            axes[1, 1].text(0.5, 0.5, 'No daily activity\\nfeatures found',
                           ha='center', va='center', transform=axes[1, 1].transAxes)
            axes[1, 1].set_title('Daily Pattern Analysis')

        plt.tight_layout()
        plt.savefig(self.output_dir / 'plots' / 'feature_importance_analysis.png', dpi=300, bbox_inches='tight')
        plt.show()

        # Save feature importance data
        feature_importance_df.to_csv(self.output_dir / 'reports' / 'feature_importance.csv', index=False)
        print(f"Feature importance saved to: {self.output_dir / 'reports' / 'feature_importance.csv'}")

    def _create_model_interpretation(self):
        """Create model interpretation using SHAP and LIME"""
        if not self.best_model:
            return

        print("Creating model interpretation visualizations...")

        # Get best model data
        dataset_name = self.best_model['dataset']
        model_name = self.best_model['name']

        # Find the dataset
        dataset = None
        for name, ds in self.prepare_datasets().items():
            if name == dataset_name:
                dataset = ds
                break

        if not dataset:
            print("Could not find dataset for interpretation")
            return

        model = self.best_model['model']
        X_test_sample = dataset['X_test'][:100]  # Use sample for speed

        # SHAP Analysis
        if SHAP_AVAILABLE:
            try:
                print("  Creating SHAP visualizations...")

                # Create explainer based on model type
                if hasattr(model, 'tree_'):
                    explainer = shap.TreeExplainer(model)
                else:
                    explainer = shap.Explainer(model, X_test_sample)

                shap_values = explainer.shap_values(X_test_sample)

                # Handle different SHAP value formats
                if isinstance(shap_values, list):
                    shap_values = shap_values[1]  # For binary classification

                # Summary plot
                plt.figure()
                shap.summary_plot(shap_values, X_test_sample,
                                feature_names=self.feature_names, show=False, max_display=20)
                plt.title("SHAP Summary Plot (Top 20 Features)")
                plt.tight_layout()
                plt.savefig(self.output_dir / 'plots' / 'shap_summary.png', dpi=300, bbox_inches='tight')
                plt.show()

                # Waterfall plot for first instance
                if hasattr(shap, 'waterfall_plot'):
                    plt.figure()
                    shap.waterfall_plot(explainer.expected_value, shap_values[0], X_test_sample[0],
                                      feature_names=self.feature_names, max_display=15, show=False)
                    plt.title("SHAP Waterfall Plot (First Instance)")
                    plt.tight_layout()
                    plt.savefig(self.output_dir / 'plots' / 'shap_waterfall.png', dpi=300, bbox_inches='tight')
                    plt.show()

            except Exception as e:
                print(f"  SHAP analysis failed: {e}")

        # LIME Analysis
        if LIME_AVAILABLE:
            try:
                print("  Creating LIME visualizations...")

                # Create LIME explainer
                explainer = lime.lime_tabular.LimeTabularExplainer(
                    dataset['X_train'],
                    feature_names=self.feature_names,
                    class_names=['No Dropout', 'Dropout'],
                    mode='classification'
                )

                # Explain a few instances
                for idx in [0, 1, 2]:
                    if idx >= len(X_test_sample):
                        break

                    exp = explainer.explain_instance(
                        X_test_sample[idx],
                        model.predict_proba,
                        num_features=15,
                        top_labels=2
                    )

                    fig = exp.as_pyplot_figure()
                    plt.title(f"LIME Explanation - Instance {idx}")
                    plt.tight_layout()
                    plt.savefig(self.output_dir / 'plots' / f'lime_explanation_{idx}.png', dpi=300, bbox_inches='tight')
                    plt.show()

                    # Save HTML report
                    exp.save_to_file(self.output_dir / 'reports' / f'lime_explanation_{idx}.html')

            except Exception as e:
                print(f"  LIME analysis failed: {e}")

    def save_best_model(self):
        """Save the best model and related components"""
        if not self.best_model:
            print("No best model to save")
            return

        print("\\n" + "="*60)
        print("💾 SAVING BEST MODEL AND COMPONENTS")
        print("="*60)

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        # Save model
        model_path = self.output_dir / 'models' / f'best_model_{timestamp}.joblib'
        joblib.dump(self.best_model['model'], model_path)
        print(f"✓ Model saved: {model_path}")

        # Save scaler
        scaler_path = self.output_dir / 'models' / f'scaler_{timestamp}.joblib'
        joblib.dump(self.scaler, scaler_path)
        print(f"✓ Scaler saved: {scaler_path}")

        # Save metadata
        metadata = {
            'model_name': self.best_model['name'],
            'dataset': self.best_model['dataset'],
            'metrics': self.best_model['metrics'],
            'hyperparameters': self.best_model['params'],
            'feature_names': self.feature_names,
            'best_score': self.best_score,
            'timestamp': timestamp,
            'model_path': str(model_path),
            'scaler_path': str(scaler_path)
        }

        metadata_path = self.output_dir / 'models' / f'model_metadata_{timestamp}.pkl'
        with open(metadata_path, 'wb') as f:
            pickle.dump(metadata, f)
        print(f"✓ Metadata saved: {metadata_path}")

        # Create results summary
        self._create_results_summary(timestamp)

        print(f"\\n🎉 Best model summary:")
        print(f"   Model: {self.best_model['name']}")
        print(f"   Dataset: {self.best_model['dataset']}")
        print(f"   F1 Score: {self.best_score:.4f}")
        print(f"   All files saved in: {self.output_dir.absolute()}")

        return model_path, scaler_path, metadata_path

    def _create_results_summary(self, timestamp):
        """Create comprehensive results summary"""
        summary = {
            'execution_timestamp': timestamp,
            'best_model': self.best_model,
            'all_results': {}
        }

        # Add all results with simplified structure
        for dataset_name, dataset_results in self.results.items():
            summary['all_results'][dataset_name] = {}
            for model_name, result in dataset_results.items():
                summary['all_results'][dataset_name][model_name] = {
                    'metrics': result['metrics'],
                    'best_params': result['best_params'],
                    'best_cv_score': result['best_cv_score'],
                    'training_time': result['training_time']
                }

        # Save as JSON
        import json
        summary_path = self.output_dir / 'reports' / f'results_summary_{timestamp}.json'
        with open(summary_path, 'w') as f:
            json.dump(summary, f, indent=2, default=str)

        print(f"✓ Results summary saved: {summary_path}")

        # Create CSV for easy viewing
        results_list = []
        for dataset_name, dataset_results in self.results.items():
            for model_name, result in dataset_results.items():
                row = {
                    'dataset': dataset_name,
                    'model': model_name,
                    'training_time': result['training_time'],
                    **result['metrics']
                }
                results_list.append(row)

        results_df = pd.DataFrame(results_list)
        results_csv_path = self.output_dir / 'reports' / f'results_comparison_{timestamp}.csv'
        results_df.to_csv(results_csv_path, index=False)
        print(f"✓ Results comparison saved: {results_csv_path}")

    def run_complete_pipeline(self):
        """Run the complete enhanced ML pipeline"""
        start_time = time.time()

        print("🚀" * 20)
        print("ENHANCED MACHINE LEARNING PIPELINE FOR DROPOUT PREDICTION")
        print("🚀" * 20)
        print(f"Start time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

        try:
            # Step 1: Load and explore data
            self.load_and_explore_data()

            # Step 2: Prepare datasets with different sampling strategies
            datasets = self.prepare_datasets()

            # Step 3: Train models with hyperparameter tuning
            self.train_models(datasets)

            # Step 4: Create comprehensive visualizations
            self.create_comprehensive_visualizations()

            # Step 5: Save best model and create reports
            self.save_best_model()

        except Exception as e:
            print(f"❌ Pipeline failed: {e}")
            import traceback
            traceback.print_exc()
            return None

        total_time = time.time() - start_time

        print("\\n" + "🎉" * 20)
        print("PIPELINE COMPLETED SUCCESSFULLY!")
        print("🎉" * 20)
        print(f"Total execution time: {total_time/60:.1f} minutes")
        print(f"Best model: {self.best_model['name']} on {self.best_model['dataset']}")
        print(f"Best F1 Score: {self.best_score:.4f}")
        print(f"All results saved in: {self.output_dir.absolute()}")

        return self.results, self.best_model

def main():
    """Run the pipeline on the default CSV file in the current directory.

    Returns:
        A ``(pipeline, results, best_model)`` tuple. All three entries are
        ``None`` when the data file is missing; ``results`` and
        ``best_model`` are ``None`` when the pipeline run itself failed.
    """
    # Check if data file exists
    data_file = 'model1_210_features_spliting.csv'
    if not os.path.exists(data_file):
        print(f"❌ Data file '{data_file}' not found!")
        print("Please ensure the CSV file is in the current directory.")
        # Same shape as the success path so callers can always unpack.
        return None, None, None

    # Initialize and run pipeline
    pipeline = CompleteMLPipeline(data_path=data_file)
    outcome = pipeline.run_complete_pipeline()
    if outcome is None:
        # run_complete_pipeline() returns None when a stage raised.
        return pipeline, None, None

    results, best_model = outcome
    return pipeline, results, best_model

if __name__ == "__main__":
    # Run the complete pipeline. main() can return None (e.g. when the data
    # file is missing), so fall back to placeholders instead of failing on
    # the tuple unpacking.
    pipeline, results, best_model = main() or (None, None, None)