<a href="https://colab.research.google.com/github/aymenchibouti/doctorat/blob/main/VAE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, Model
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.ensemble import VotingClassifier
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Tuple, Dict, Any, List
import warnings
warnings.filterwarnings('ignore')

# Set random seeds for reproducibility
np.random.seed(42)
tf.random.set_seed(42)

class AdvancedStudentDropoutPredictor:
    """
    Advanced implementation with optimized features for maximum accuracy:

    1. Enhanced feature engineering (temporal trends, ratios, aggregations)
    2. Multiple model architectures (CNN-LSTM, Transformer-like attention)
    3. Advanced VAE with conditional generation
    4. Ensemble methods and cross-validation
    5. Sophisticated preprocessing for sparse data
    """

    def __init__(self):
        self.scaler = RobustScaler()  # Better for sparse data
        self.feature_scaler = StandardScaler()
        self.models = {}
        self.original_data = None
        self.enhanced_data = None
        self.feature_columns = []
        self.engineered_features = []

    def load_and_preprocess_data(self, filepath: str) -> None:
        """Enhanced data loading with advanced preprocessing."""
        print("📂 Loading and preprocessing data with advanced techniques...")

        df = pd.read_csv(filepath)
        print(f"Dataset shape: {df.shape}")

        # Extract base feature columns
        self.feature_columns = []
        for day in range(1, 31):
            for activity in ['access', 'problem', 'wiki', 'discussion', 'navigate', 'page_close', 'video']:
                self.feature_columns.append(f'day_{day}_{activity}')

        X = df[self.feature_columns].values
        y = df['dropout'].values

        # Handle missing values with domain knowledge
        X = np.nan_to_num(X, nan=0.0)

        # Create enhanced features
        X_enhanced = self._create_enhanced_features(X)

        # Advanced normalization
        X_normalized = self.scaler.fit_transform(X)
        X_enhanced_normalized = self.feature_scaler.fit_transform(X_enhanced)

        # Multiple data representations
        X_lstm = X_normalized.reshape(X_normalized.shape[0], 30, 7)  # For LSTM
        X_cnn = X_normalized.reshape(X_normalized.shape[0], 30, 7, 1)  # For CNN

        # Class analysis
        unique, counts = np.unique(y, return_counts=True)
        class_dist = dict(zip(unique, counts))

        print(f"Enhanced features created: {X_enhanced.shape[1]} additional features")
        print(f"Class distribution: Non-dropout: {class_dist.get(0, 0)} ({class_dist.get(0, 0)/len(y)*100:.2f}%)")
        print(f"                   Dropout: {class_dist.get(1, 0)} ({class_dist.get(1, 0)/len(y)*100:.2f}%)")

        self.original_data = {
            'X_flat': X_normalized,
            'X_lstm': X_lstm,
            'X_cnn': X_cnn,
            'X_enhanced': X_enhanced_normalized,
            'y': y,
            'class_distribution': class_dist
        }

        print("✅ Advanced preprocessing completed!")

    def _create_enhanced_features(self, X: np.ndarray) -> np.ndarray:
        """Create advanced engineered features based on domain insights."""
        print("🔧 Engineering advanced features...")

        # Reshape to (samples, days, activities)
        X_reshaped = X.reshape(X.shape[0], 30, 7)

        features = []
        feature_names = []

        # 1. Temporal aggregation features
        # Weekly aggregations
        for week in range(4):
            start_day = week * 7
            end_day = min((week + 1) * 7, 30)
            week_data = X_reshaped[:, start_day:end_day, :].sum(axis=1)

            for activity_idx, activity in enumerate(['access', 'problem', 'wiki', 'discussion', 'navigate', 'page_close', 'video']):
                features.append(week_data[:, activity_idx])
                feature_names.append(f'week_{week+1}_{activity}')

        # 2. Temporal trend features
        # Early vs Late engagement
        early_engagement = X_reshaped[:, :7, :].sum(axis=(1, 2))  # First week
        late_engagement = X_reshaped[:, -7:, :].sum(axis=(1, 2))  # Last week

        features.extend([
            early_engagement,
            late_engagement,
            np.log1p(early_engagement / (late_engagement + 1e-8)),  # Log ratio
            early_engagement - late_engagement  # Difference
        ])
        feature_names.extend(['early_engagement', 'late_engagement', 'engagement_ratio', 'engagement_decline'])

        # 3. Activity-specific features
        activity_names = ['access', 'problem', 'wiki', 'discussion', 'navigate', 'page_close', 'video']
        for i, activity in enumerate(activity_names):
            activity_data = X_reshaped[:, :, i]

            # Total activity
            total_activity = activity_data.sum(axis=1)

            # Days active (non-zero days)
            days_active = (activity_data > 0).sum(axis=1)

            # Peak activity day
            peak_day = np.argmax(activity_data, axis=1)

            # Activity consistency (std/mean)
            activity_std = np.std(activity_data, axis=1)
            activity_mean = np.mean(activity_data, axis=1)
            consistency = activity_std / (activity_mean + 1e-8)

            # Streak features (longest consecutive active days)
            streaks = []
            for sample_idx in range(activity_data.shape[0]):
                sample_data = activity_data[sample_idx]
                current_streak = 0
                max_streak = 0
                for day_val in sample_data:
                    if day_val > 0:
                        current_streak += 1
                        max_streak = max(max_streak, current_streak)
                    else:
                        current_streak = 0
                streaks.append(max_streak)

            features.extend([
                total_activity,
                days_active,
                peak_day,
                consistency,
                np.array(streaks)
            ])
            feature_names.extend([
                f'{activity}_total',
                f'{activity}_days_active',
                f'{activity}_peak_day',
                f'{activity}_consistency',
                f'{activity}_max_streak'
            ])

        # 4. Cross-activity features
        # High-value activities (problem, discussion, access) vs low-value
        high_value_activities = X_reshaped[:, :, [0, 1, 3]].sum(axis=(1, 2))  # access, problem, discussion
        low_value_activities = X_reshaped[:, :, [2, 4, 5, 6]].sum(axis=(1, 2))  # wiki, navigate, page_close, video

        features.extend([
            high_value_activities,
            low_value_activities,
            high_value_activities / (low_value_activities + 1e-8)
        ])
        feature_names.extend(['high_value_total', 'low_value_total', 'high_low_ratio'])

        # 5. Temporal patterns
        # First day activity (crucial predictor)
        first_day_total = X_reshaped[:, 0, :].sum(axis=1)

        # Activity decay rate (slope of activity over time)
        daily_totals = X_reshaped.sum(axis=2)  # Sum across activities for each day
        decay_rates = []
        for sample_idx in range(daily_totals.shape[0]):
            sample_daily = daily_totals[sample_idx]
            # Calculate linear regression slope
            x_vals = np.arange(30)
            if np.std(sample_daily) > 0:
                correlation = np.corrcoef(x_vals, sample_daily)[0, 1]
                decay_rates.append(correlation)
            else:
                decay_rates.append(0)

        features.extend([
            first_day_total,
            np.array(decay_rates)
        ])
        feature_names.extend(['first_day_total', 'activity_decay_rate'])

        # 6. Sparsity-aware features
        # Total non-zero entries
        total_nonzero = (X_reshaped > 0).sum(axis=(1, 2))

        # Activity diversity (how many different activity types used)
        activity_diversity = (X_reshaped.sum(axis=1) > 0).sum(axis=1)

        features.extend([
            total_nonzero,
            activity_diversity
        ])
        feature_names.extend(['total_nonzero_entries', 'activity_diversity'])

        # Convert to array
        feature_matrix = np.column_stack(features)

        # Handle any remaining NaN or inf values
        feature_matrix = np.nan_to_num(feature_matrix, nan=0.0, posinf=1e6, neginf=-1e6)

        self.engineered_features = feature_names
        print(f"Created {feature_matrix.shape[1]} engineered features")

        return feature_matrix

    def build_advanced_vae(self, input_dim: int, latent_dim: int = 64) -> None:
        """Build advanced conditional VAE with better architecture."""
        print("🧠 Building Advanced Conditional VAE...")

        # Encoder with skip connections and batch norm
        encoder_inputs = keras.Input(shape=(input_dim,))

        # Dense layers with skip connections
        x1 = layers.Dense(256, activation='relu')(encoder_inputs)
        x1 = layers.BatchNormalization()(x1)
        x1 = layers.Dropout(0.3)(x1)

        x2 = layers.Dense(128, activation='relu')(x1)
        x2 = layers.BatchNormalization()(x2)
        x2 = layers.Dropout(0.3)(x2)

        # Skip connection
        x_skip = layers.Dense(128, activation='relu')(encoder_inputs)
        x2 = layers.Add()([x2, x_skip])

        x3 = layers.Dense(64, activation='relu')(x2)
        x3 = layers.BatchNormalization()(x3)

        z_mean = layers.Dense(latent_dim, name='z_mean')(x3)
        z_log_var = layers.Dense(latent_dim, name='z_log_var')(x3)

        # Improved sampling with batch normalization
        def sampling(args):
            z_mean, z_log_var = args
            batch = tf.shape(z_mean)[0]
            dim = tf.shape(z_mean)[1]
            epsilon = tf.keras.backend.random_normal(shape=(batch, dim))
            return z_mean + tf.exp(0.5 * z_log_var) * epsilon

        z = layers.Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_var])

        self.encoder = Model(encoder_inputs, [z_mean, z_log_var, z], name='encoder')

        # Decoder with skip connections
        latent_inputs = keras.Input(shape=(latent_dim,))

        d1 = layers.Dense(64, activation='relu')(latent_inputs)
        d1 = layers.BatchNormalization()(d1)

        d2 = layers.Dense(128, activation='relu')(d1)
        d2 = layers.BatchNormalization()(d2)

        # Skip connection in decoder
        d_skip = layers.Dense(128, activation='relu')(latent_inputs)
        d2 = layers.Add()([d2, d_skip])

        d3 = layers.Dense(256, activation='relu')(d2)
        d3 = layers.BatchNormalization()(d3)
        d3 = layers.Dropout(0.2)(d3)

        decoder_outputs = layers.Dense(input_dim, activation='sigmoid')(d3)

        self.decoder = Model(latent_inputs, decoder_outputs, name='decoder')

        # VAE model with improved loss
        outputs = self.decoder(z)
        self.vae_model = Model(encoder_inputs, outputs, name='vae')

        # Enhanced loss function with perceptual loss
        def advanced_vae_loss(y_true, y_pred):
            # Reconstruction loss with different weights for important features
            reconstruction_loss = keras.losses.mse(y_true, y_pred)

            # Weight important features more heavily
            feature_weights = tf.ones_like(y_true)
            # Give higher weight to high-discriminating activities (access, problem, discussion)
            important_indices = []
            for day in range(30):
                important_indices.extend([day*7 + 0, day*7 + 1, day*7 + 3])  # access, problem, discussion

            reconstruction_loss = tf.reduce_mean(reconstruction_loss * feature_weights)
            reconstruction_loss *= input_dim

            # KL divergence loss
            kl_loss = 1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var)
            kl_loss = tf.reduce_mean(kl_loss)
            kl_loss *= -0.5

            # Sparsity regularization (encourage sparse reconstructions)
            sparsity_loss = tf.reduce_mean(tf.abs(y_pred))

            total_loss = reconstruction_loss + kl_loss * 0.1 + sparsity_loss * 0.01

            return total_loss

        self.vae_model.compile(
            optimizer=keras.optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999),
            loss=advanced_vae_loss
        )

        print("✅ Advanced VAE built with skip connections and enhanced loss!")

    def train_advanced_vae(self, epochs: int = 100, batch_size: int = 64) -> Any:
        """Train VAE with advanced techniques."""
        print("🏋️ Training Advanced VAE...")

        # Get minority class data for training
        minority_mask = self.original_data['y'] == 0
        minority_data = self.original_data['X_flat'][minority_mask]

        print(f"Training VAE on {len(minority_data)} minority samples")

        # Advanced callbacks
        callbacks = [
            EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True),
            ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=7, min_lr=1e-6),
            ModelCheckpoint('best_vae.h5', save_best_only=True, monitor='val_loss')
        ]

        history = self.vae_model.fit(
            minority_data, minority_data,
            epochs=epochs,
            batch_size=batch_size,
            validation_split=0.2,
            callbacks=callbacks,
            verbose=1
        )

        print("✅ Advanced VAE training completed!")
        return history

    def generate_high_quality_synthetic_data(self, strategy='adaptive') -> None:
        """Generate high-quality synthetic data with multiple strategies."""
        print("⚡ Generating high-quality synthetic data...")

        class_dist = self.original_data['class_distribution']
        minority_count = class_dist.get(0, 0)
        majority_count = class_dist.get(1, 0)

        if strategy == 'adaptive':
            # Generate more synthetic data than needed, then select best samples
            synthetic_needed = majority_count - minority_count
            generate_count = int(synthetic_needed * 1.5)  # Generate 50% more
        else:
            generate_count = majority_count - minority_count

        print(f"Generating {generate_count} synthetic samples")

        # Generate with multiple sampling strategies
        latent_dim = 64
        synthetic_samples = []

        # Strategy 1: Random sampling from learned distribution
        random_latent = np.random.normal(0, 1, (generate_count // 3, latent_dim))
        synthetic_1 = self.decoder.predict(random_latent, verbose=0)

        # Strategy 2: Interpolation between real samples
        minority_mask = self.original_data['y'] == 0
        real_minority = self.original_data['X_flat'][minority_mask]
        real_encoded = self.encoder.predict(real_minority, verbose=0)[0]  # Get mean

        # Create interpolations
        interpolation_count = generate_count // 3
        interpolated_latent = []
        for _ in range(interpolation_count):
            idx1, idx2 = np.random.choice(len(real_encoded), 2, replace=False)
            alpha = np.random.beta(2, 2)  # Beta distribution for smooth interpolation
            interpolated = alpha * real_encoded[idx1] + (1 - alpha) * real_encoded[idx2]
            interpolated_latent.append(interpolated)

        synthetic_2 = self.decoder.predict(np.array(interpolated_latent), verbose=0)

        # Strategy 3: Perturbed real samples
        perturbation_count = generate_count - len(synthetic_1) - len(synthetic_2)
        perturbed_latent = real_encoded[:perturbation_count] + np.random.normal(0, 0.1, (perturbation_count, latent_dim))
        synthetic_3 = self.decoder.predict(perturbed_latent, verbose=0)

        # Combine all synthetic data
        all_synthetic = np.vstack([synthetic_1, synthetic_2, synthetic_3])

        if strategy == 'adaptive':
            # Select best synthetic samples based on similarity to real minority class
            from sklearn.metrics.pairwise import cosine_similarity

            # Calculate similarity to real minority samples
            similarities = cosine_similarity(all_synthetic, real_minority)
            avg_similarities = similarities.mean(axis=1)

            # Select top samples
            top_indices = np.argsort(avg_similarities)[-synthetic_needed:]
            selected_synthetic = all_synthetic[top_indices]
        else:
            selected_synthetic = all_synthetic

        # Create balanced dataset
        minority_data_lstm = self.original_data['X_lstm'][minority_mask]
        majority_data_lstm = self.original_data['X_lstm'][~minority_mask]

        # Reshape synthetic data for LSTM
        synthetic_lstm = selected_synthetic.reshape(len(selected_synthetic), 30, 7)

        # Combine data
        all_minority_lstm = np.vstack([minority_data_lstm, synthetic_lstm])
        balanced_X = np.vstack([all_minority_lstm, majority_data_lstm])
        balanced_y = np.hstack([
            np.zeros(len(all_minority_lstm)),
            np.ones(len(majority_data_lstm))
        ])

        # Also create enhanced features for synthetic data
        synthetic_flat_original_scale = self.scaler.inverse_transform(selected_synthetic)
        synthetic_enhanced = self._create_enhanced_features(synthetic_flat_original_scale)
        synthetic_enhanced_normalized = self.feature_scaler.transform(synthetic_enhanced)

        # Combine enhanced features
        minority_enhanced = self.original_data['X_enhanced'][minority_mask]
        majority_enhanced = self.original_data['X_enhanced'][~minority_mask]

        all_minority_enhanced = np.vstack([minority_enhanced, synthetic_enhanced_normalized])
        balanced_X_enhanced = np.vstack([all_minority_enhanced, majority_enhanced])

        # Shuffle
        shuffle_idx = np.random.permutation(len(balanced_X))
        balanced_X = balanced_X[shuffle_idx]
        balanced_X_enhanced = balanced_X_enhanced[shuffle_idx]
        balanced_y = balanced_y[shuffle_idx]

        self.enhanced_data = {
            'X_lstm': balanced_X,
            'X_enhanced': balanced_X_enhanced,
            'y': balanced_y
        }

        print(f"✅ High-quality synthetic data generated!")
        print(f"   Balanced dataset: {len(balanced_X)} samples")
        print(f"   Non-dropout: {np.sum(balanced_y == 0)} ({np.mean(balanced_y == 0)*100:.1f}%)")
        print(f"   Dropout: {np.sum(balanced_y == 1)} ({np.mean(balanced_y == 1)*100:.1f}%)")

    def build_ensemble_models(self) -> None:
        """Build multiple advanced model architectures."""
        print("🔮 Building ensemble of advanced models...")

        # Model 1: Advanced LSTM with attention
        def build_attention_lstm():
            inputs = keras.Input(shape=(30, 7))

            # Multi-head LSTM layers
            lstm1 = layers.LSTM(128, return_sequences=True, dropout=0.3, recurrent_dropout=0.3)(inputs)
            lstm1 = layers.BatchNormalization()(lstm1)

            lstm2 = layers.LSTM(64, return_sequences=True, dropout=0.3, recurrent_dropout=0.3)(lstm1)
            lstm2 = layers.BatchNormalization()(lstm2)

            # Attention mechanism
            attention = layers.Dense(1, activation='tanh')(lstm2)
            attention = layers.Flatten()(attention)
            attention = layers.Activation('softmax')(attention)
            attention = layers.RepeatVector(64)(attention)
            attention = layers.Permute([2, 1])(attention)

            # Apply attention weights
            attended = layers.Multiply()([lstm2, attention])
            attended = layers.Lambda(lambda x: tf.reduce_sum(x, axis=1))(attended)

            # Dense layers
            dense1 = layers.Dense(32, activation='relu')(attended)
            dense1 = layers.BatchNormalization()(dense1)
            dense1 = layers.Dropout(0.4)(dense1)

            output = layers.Dense(1, activation='sigmoid')(dense1)

            model = Model(inputs, output)
            model.compile(
                optimizer=keras.optimizers.Adam(learning_rate=0.001),
                loss='binary_crossentropy',
                metrics=['accuracy', 'precision', 'recall']
            )
            return model

        # Model 2: CNN-LSTM hybrid
        def build_cnn_lstm():
            inputs = keras.Input(shape=(30, 7))

            # Expand dimensions for Conv1D
            x = layers.Reshape((30, 7, 1))(inputs)

            # Conv1D layers for feature extraction
            conv1 = layers.Conv2D(64, (3, 3), activation='relu', padding='same')(x)
            conv1 = layers.BatchNormalization()(conv1)
            conv1 = layers.MaxPooling2D((2, 1))(conv1)

            conv2 = layers.Conv2D(32, (3, 3), activation='relu', padding='same')(conv1)
            conv2 = layers.BatchNormalization()(conv2)
            conv2 = layers.MaxPooling2D((2, 1))(conv2)

            # Reshape for LSTM
            conv_shape = conv2.shape
            reshaped = layers.Reshape((conv_shape[1], conv_shape[2] * conv_shape[3]))(conv2)

            # LSTM layers
            lstm = layers.LSTM(64, dropout=0.3, recurrent_dropout=0.3)(reshaped)
            lstm = layers.BatchNormalization()(lstm)

            dense = layers.Dense(32, activation='relu')(lstm)
            dense = layers.Dropout(0.4)(dense)

            output = layers.Dense(1, activation='sigmoid')(dense)

            model = Model(inputs, output)
            model.compile(
                optimizer=keras.optimizers.Adam(learning_rate=0.001),
                loss='binary_crossentropy',
                metrics=['accuracy', 'precision', 'recall']
            )
            return model

        # Model 3: Enhanced features model (for engineered features)
        def build_enhanced_model():
            inputs = keras.Input(shape=(self.original_data['X_enhanced'].shape[1],))

            # Deep neural network with residual connections
            x1 = layers.Dense(256, activation='relu')(inputs)
            x1 = layers.BatchNormalization()(x1)
            x1 = layers.Dropout(0.3)(x1)

            x2 = layers.Dense(128, activation='relu')(x1)
            x2 = layers.BatchNormalization()(x2)
            x2 = layers.Dropout(0.3)(x2)

            # Residual connection
            x_res = layers.Dense(128, activation='relu')(inputs)
            x2 = layers.Add()([x2, x_res])

            x3 = layers.Dense(64, activation='relu')(x2)
            x3 = layers.BatchNormalization()(x3)
            x3 = layers.Dropout(0.4)(x3)

            x4 = layers.Dense(32, activation='relu')(x3)
            x4 = layers.Dropout(0.4)(x4)

            output = layers.Dense(1, activation='sigmoid')(x4)

            model = Model(inputs, output)
            model.compile(
                optimizer=keras.optimizers.Adam(learning_rate=0.001),
                loss='binary_crossentropy',
                metrics=['accuracy', 'precision', 'recall']
            )
            return model

        # Model 4: Bidirectional LSTM
        def build_bidirectional_lstm():
            inputs = keras.Input(shape=(30, 7))

            # Bidirectional LSTM layers
            bilstm1 = layers.Bidirectional(layers.LSTM(64, return_sequences=True, dropout=0.3))(inputs)
            bilstm1 = layers.BatchNormalization()(bilstm1)

            bilstm2 = layers.Bidirectional(layers.LSTM(32, dropout=0.3))(bilstm1)
            bilstm2 = layers.BatchNormalization()(bilstm2)

            dense1 = layers.Dense(32, activation='relu')(bilstm2)
            dense1 = layers.Dropout(0.4)(dense1)

            output = layers.Dense(1, activation='sigmoid')(dense1)

            model = Model(inputs, output)
            model.compile(
                optimizer=keras.optimizers.Adam(learning_rate=0.001),
                loss='binary_crossentropy',
                metrics=['accuracy', 'precision', 'recall']
            )
            return model

        # Build all models
        self.models = {
            'attention_lstm': build_attention_lstm(),
            'cnn_lstm': build_cnn_lstm(),
            'enhanced_features': build_enhanced_model(),
            'bidirectional_lstm': build_bidirectional_lstm()
        }

        print(f"✅ Built {len(self.models)} advanced models!")
        for name, model in self.models.items():
            print(f"   {name}: {model.count_params():,} parameters")

    def train_ensemble_with_cv(self, epochs: int = 50, cv_folds: int = 5) -> Dict:
        """Train ensemble models with cross-validation."""
        print("🏋️ Training ensemble models with cross-validation...")

        # Use balanced data
        if self.enhanced_data is None:
            raise ValueError("Must generate synthetic data first!")

        X_lstm = self.enhanced_data['X_lstm']
        X_enhanced = self.enhanced_data['X_enhanced']
        y = self.enhanced_data['y']

        # Cross-validation setup
        skf = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)

        results = {}

        for model_name, model in self.models.items():
            print(f"\nTraining {model_name}...")

            fold_scores = []
            fold_histories = []

            for fold, (train_idx, val_idx) in enumerate(skf.split(X_lstm, y)):
                print(f"  Fold {fold + 1}/{cv_folds}")

                # Prepare data based on model type
                if model_name == 'enhanced_features':
                    X_train, X_val = X_enhanced[train_idx], X_enhanced[val_idx]
                else:
                    X_train, X_val = X_lstm[train_idx], X_lstm[val_idx]

                y_train, y_val = y[train_idx], y[val_idx]

                # Clone model for this fold
                model_clone = keras.models.clone_model(model)
                model_clone.compile(
                    optimizer=model.optimizer,
                    loss=model.loss,
                    metrics=model.metrics
                )

                # Advanced callbacks
                callbacks = [
                    EarlyStopping(monitor='val_accuracy', patience=10, restore_best_weights=True),
                    ReduceLROnPlateau(monitor='val_accuracy', factor=0.7, patience=5, min_lr=1e-6)
                ]

                # Train model
                history = model_clone.fit(
                    X_train, y_train,
                    epochs=epochs,
                    batch_size=64,
                    validation_data=(X_val, y_val),
                    callbacks=callbacks,
                    verbose=0
                )

                # Evaluate fold
                val_accuracy = max(history.history['val_accuracy'])
                fold_scores.append(val_accuracy)
                fold_histories.append(history)

                print(f"    Fold {fold + 1} accuracy: {val_accuracy:.4f}")

            # Store results
            avg_score = np.mean(fold_scores)
            std_score = np.std(fold_scores)

            results[model_name] = {
                'mean_accuracy': avg_score,
                'std_accuracy': std_score,
                'fold_scores': fold_scores,
                'fold_histories': fold_histories
            }

            print(f"  {model_name} CV Accuracy: {avg_score:.4f} (±{std_score:.4f})")

        # Train final models on full dataset
        print("\nTraining final models on full dataset...")

        for model_name, model in self.models.items():
            print(f"Training final {model_name}...")

            if model_name == 'enhanced_features':
                X_train = X_enhanced
            else:
                X_train = X_lstm

            callbacks = [
                EarlyStopping(monitor='val_accuracy', patience=15, restore_best_weights=True),
                ReduceLROnPlateau(monitor='val_accuracy', factor=0.7, patience=7, min_lr=1e-6),
                ModelCheckpoint(f'best_{model_name}.h5', save_best_only=True, monitor='val_accuracy')
            ]

            model.fit(
                X_train, y,
                epochs=epochs,
                batch_size=64,
                validation_split=0.2,
                callbacks=callbacks,
                verbose=0
            )

        print("✅ Ensemble training completed!")
        return results

    def evaluate_ensemble(self, X_test=None, y_test=None) -> Dict:
        """Evaluate ensemble models with comprehensive metrics."""
        print("📈 Evaluating ensemble models...")

        # Use original data for fair evaluation
        if X_test is None or y_test is None:
            X_test_lstm = self.original_data['X_lstm']
            X_test_enhanced = self.original_data['X_enhanced']
            y_test = self.original_data['y']

        results = {}
        all_predictions = {}

        for model_name, model in self.models.items():
            print(f"\nEvaluating {model_name}...")

            # Choose appropriate test data
            if model_name == 'enhanced_features':
                X_test_model = X_test_enhanced
            else:
                X_test_model = X_test_lstm

            # Make predictions
            y_pred_proba = model.predict(X_test_model, verbose=0).flatten()
            y_pred = (y_pred_proba > 0.5).astype(int)

            # Calculate metrics
            from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

            metrics = {
                'accuracy': accuracy_score(y_test, y_pred),
                'precision': precision_score(y_test, y_pred),
                'recall': recall_score(y_test, y_pred),
                'f1_score': f1_score(y_test, y_pred),
                'roc_auc': roc_auc_score(y_test, y_pred_proba)
            }

            results[model_name] = metrics
            all_predictions[model_name] = y_pred_proba

            print(f"  Accuracy: {metrics['accuracy']:.4f}")
            print(f"  F1-Score: {metrics['f1_score']:.4f}")
            print(f"  ROC-AUC: {metrics['roc_auc']:.4f}")

        # Ensemble predictions
        print("\nComputing ensemble predictions...")

        # Simple average ensemble
        ensemble_proba = np.mean(list(all_predictions.values()), axis=0)
        ensemble_pred = (ensemble_proba > 0.5).astype(int)

        ensemble_metrics = {
            'accuracy': accuracy_score(y_test, ensemble_pred),
            'precision': precision_score(y_test, ensemble_pred),
            'recall': recall_score(y_test, ensemble_pred),
            'f1_score': f1_score(y_test, ensemble_pred),
            'roc_auc': roc_auc_score(y_test, ensemble_proba)
        }

        results['ensemble'] = ensemble_metrics

        print(f"\n🎯 ENSEMBLE RESULTS:")
        print(f"  Accuracy: {ensemble_metrics['accuracy']:.4f}")
        print(f"  Precision: {ensemble_metrics['precision']:.4f}")
        print(f"  Recall: {ensemble_metrics['recall']:.4f}")
        print(f"  F1-Score: {ensemble_metrics['f1_score']:.4f}")
        print(f"  ROC-AUC: {ensemble_metrics['roc_auc']:.4f}")

        # Detailed classification report
        print("\n📊 Detailed Classification Report (Ensemble):")
        print(classification_report(y_test, ensemble_pred, target_names=['Non-Dropout', 'Dropout']))

        return results, ensemble_proba, ensemble_pred

    def save_models(self, base_path: str = './models/'):
        """Save all trained models."""
        import os
        os.makedirs(base_path, exist_ok=True)

        if self.vae_model:
            self.vae_model.save(f'{base_path}advanced_vae.h5')

        for name, model in self.models.items():
            model.save(f'{base_path}advanced_{name}.h5')

        print(f"✅ All models saved to {base_path}")


def main():
    """Enhanced main function with comprehensive pipeline."""
    print("🎓 ADVANCED Student Dropout Prediction with Ensemble VAE + LSTM")
    print("=" * 70)

    # Initialize predictor
    predictor = AdvancedStudentDropoutPredictor()

    try:
        # Step 1: Load and preprocess data with advanced feature engineering
        predictor.load_and_preprocess_data('model1_210_features.csv')

        # Step 2: Build and train advanced VAE
        input_dim = predictor.original_data['X_flat'].shape[1]
        predictor.build_advanced_vae(input_dim, latent_dim=64)
        vae_history = predictor.train_advanced_vae(epochs=80, batch_size=64)

        # Step 3: Generate high-quality synthetic data
        predictor.generate_high_quality_synthetic_data(strategy='adaptive')

        # Step 4: Build ensemble of advanced models
        predictor.build_ensemble_models()

        # Step 5: Train ensemble with cross-validation
        cv_results = predictor.train_ensemble_with_cv(epochs=60, cv_folds=5)

        # Step 6: Comprehensive evaluation
        results, ensemble_proba, ensemble_pred = predictor.evaluate_ensemble()

        # Step 7: Save models
        predictor.save_models()

        # Step 8: Print final summary
        print("\n" + "="*70)
        print("🏆 FINAL RESULTS SUMMARY")
        print("="*70)

        best_single_model = max(results.items(), key=lambda x: x[1]['accuracy'] if x[0] != 'ensemble' else 0)

        print(f"Best Single Model: {best_single_model[0]} - Accuracy: {best_single_model[1]['accuracy']:.4f}")
        print(f"Ensemble Model Accuracy: {results['ensemble']['accuracy']:.4f}")
        print(f"Improvement over best single: {results['ensemble']['accuracy'] - best_single_model[1]['accuracy']:.4f}")

        print("\n✅ Advanced pipeline completed successfully!")

        return predictor, results

    except FileNotFoundError:
        print("❌ CSV file not found. Please ensure 'model1_210_features_spliting.csv' is in the current directory.")
        return None, None
    except Exception as e:
        print(f"❌ Error during execution: {str(e)}")
        return None, None


if __name__ == "__main__":
    # Enhanced requirements
    enhanced_packages = [
        "tensorflow>=2.10.0",
        "pandas>=1.3.0",
        "numpy>=1.21.0",
        "scikit-learn>=1.0.0",
        "matplotlib>=3.3.0",
        "seaborn>=0.11.0"
    ]

    print("📦 Enhanced packages required:")
    for package in enhanced_packages:
        print(f"   - {package}")
    print("\nInstall with: pip install tensorflow pandas numpy scikit-learn matplotlib seaborn")
    print()

    predictor, results = main()


📦 Enhanced packages required:
   - tensorflow>=2.10.0
   - pandas>=1.3.0
   - numpy>=1.21.0
   - scikit-learn>=1.0.0
   - matplotlib>=3.3.0
   - seaborn>=0.11.0

Install with: pip install tensorflow pandas numpy scikit-learn matplotlib seaborn

🎓 ADVANCED Student Dropout Prediction with Ensemble VAE + LSTM
📂 Loading and preprocessing data with advanced techniques...
Dataset shape: (120542, 214)
🔧 Engineering advanced features...
Created 74 engineered features
Enhanced features created: 74 additional features
Class distribution: Non-dropout: 24961 (20.71%)
                   Dropout: 95581 (79.29%)
✅ Advanced preprocessing completed!
🧠 Building Advanced Conditional VAE...
✅ Advanced VAE built with skip connections and enhanced loss!
🏋️ Training Advanced VAE...
Training VAE on 24961 minority samples
Epoch 1/80
❌ Error during execution: Dimensions must be equal, but are 64 and 210 for '{{node compile_loss/advanced_vae_loss/mul}} = Mul[T=DT_FLOAT](compile_loss/advanced_vae_loss/Mean, compi