<a href="https://colab.research.google.com/github/aymenchibouti/doctorat/blob/main/VAELSTMDropout.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, Model, optimizers, losses
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import warnings
warnings.filterwarnings('ignore')

class VAELSTMDropoutPredictor:
    def __init__(self, sequence_length=30, features_per_day=7, latent_dim=32):
        """
        Initialize the VAE-LSTM Dropout Predictor

        Args:
            sequence_length (int): Number of days in sequence (30)
            features_per_day (int): Number of features per day (7)
            latent_dim (int): Latent dimension for VAE
        """
        self.sequence_length = sequence_length
        self.features_per_day = features_per_day
        self.latent_dim = latent_dim
        self.input_shape = (sequence_length, features_per_day)

        # Models
        self.vae = None
        self.encoder = None
        self.decoder = None
        self.lstm_model = None

        # Data
        self.scaler = StandardScaler()
        self.original_data = None
        self.augmented_data = None
        self.history = {}

    def load_and_preprocess_data(self, csv_file_path):
        """
        Load and preprocess the student activity data

        Args:
            csv_file_path (str): Path to the CSV file
        """
        print("Loading and preprocessing data...")

        # Load data
        df = pd.read_csv(csv_file_path)
        print(f"Loaded {len(df)} samples with {len(df.columns)} columns")

        # Extract feature columns (day_1_access to day_30_video)
        feature_columns = []
        for day in range(1, 31):  # 30 days
            feature_columns.extend([
                f'day_{day}_access',
                f'day_{day}_problem',
                f'day_{day}_wiki',
                f'day_{day}_discussion',
                f'day_{day}_navigate',
                f'day_{day}_page_close',
                f'day_{day}_video'
            ])

        # Extract features and labels
        X = df[feature_columns].values
        y = df['dropout'].values

        # Reshape data for LSTM (samples, timesteps, features)
        X_sequences = X.reshape(-1, self.sequence_length, self.features_per_day)

        # Normalize features
        X_flat = X_sequences.reshape(-1, self.sequence_length * self.features_per_day)
        X_normalized = self.scaler.fit_transform(X_flat)
        X_sequences = X_normalized.reshape(-1, self.sequence_length, self.features_per_day)

        # Analyze class distribution
        unique, counts = np.unique(y, return_counts=True)
        class_distribution = dict(zip(unique, counts))
        print(f"Class distribution: {class_distribution}")

        # Separate by class
        dropout_indices = np.where(y == 1)[0]
        no_dropout_indices = np.where(y == 0)[0]

        self.dropout_sequences = X_sequences[dropout_indices]
        self.no_dropout_sequences = X_sequences[no_dropout_indices]

        print(f"Dropout samples: {len(self.dropout_sequences)}")
        print(f"No dropout samples: {len(self.no_dropout_sequences)}")
        print(f"Imbalance ratio: {len(self.dropout_sequences) / len(self.no_dropout_sequences):.2f}")

        self.original_data = {
            'X': X_sequences,
            'y': y,
            'class_distribution': class_distribution
        }

        return X_sequences, y

    def build_vae(self):
        """
        Build Variational Autoencoder for data augmentation
        """
        print("Building VAE model...")

        # Encoder
        encoder_inputs = layers.Input(shape=self.input_shape, name='encoder_input')
        x = layers.Flatten()(encoder_inputs)
        x = layers.Dense(256, activation='relu')(x)
        x = layers.Dense(128, activation='relu')(x)

        # Latent space
        z_mean = layers.Dense(self.latent_dim, name='z_mean')(x)
        z_log_var = layers.Dense(self.latent_dim, name='z_log_var')(x)

        # Sampling function
        def sampling(args):
            z_mean, z_log_var = args
            batch = tf.shape(z_mean)[0]
            dim = tf.shape(z_mean)[1]
            epsilon = tf.keras.backend.random_normal(shape=(batch, dim))
            return z_mean + tf.exp(0.5 * z_log_var) * epsilon

        z = layers.Lambda(sampling, output_shape=(self.latent_dim,), name='z')([z_mean, z_log_var])

        # Create encoder model
        self.encoder = Model(encoder_inputs, [z_mean, z_log_var, z], name='encoder')

        # Decoder
        decoder_inputs = layers.Input(shape=(self.latent_dim,), name='decoder_input')
        x = layers.Dense(128, activation='relu')(decoder_inputs)
        x = layers.Dense(256, activation='relu')(x)
        x = layers.Dense(self.sequence_length * self.features_per_day, activation='sigmoid')(x)
        decoder_outputs = layers.Reshape(self.input_shape)(x)

        # Create decoder model
        self.decoder = Model(decoder_inputs, decoder_outputs, name='decoder')

        # VAE model
        vae_outputs = self.decoder(self.encoder(encoder_inputs)[2])
        self.vae = Model(encoder_inputs, vae_outputs, name='vae')

        # VAE loss function
        def vae_loss(x, x_decoded_mean):
            reconstruction_loss = losses.mse(tf.keras.backend.flatten(x),
                                           tf.keras.backend.flatten(x_decoded_mean))
            reconstruction_loss *= self.sequence_length * self.features_per_day

            kl_loss = 1 + z_log_var - tf.keras.backend.square(z_mean) - tf.keras.backend.exp(z_log_var)
            kl_loss = tf.keras.backend.sum(kl_loss, axis=-1)
            kl_loss *= -0.5

            return tf.keras.backend.mean(reconstruction_loss + kl_loss)

        self.vae.compile(optimizer='adam', loss=vae_loss)

        print("VAE model built successfully!")
        return self.vae

    def train_vae(self, epochs=100, batch_size=32):
        """
        Train the VAE on minority class data

        Args:
            epochs (int): Number of training epochs
            batch_size (int): Batch size for training
        """
        print("Training VAE on minority class (no dropout)...")

        # Use minority class for VAE training
        minority_data = self.no_dropout_sequences

        # Callbacks
        early_stopping = EarlyStopping(patience=15, restore_best_weights=True)
        reduce_lr = ReduceLROnPlateau(factor=0.5, patience=10)

        # Train VAE
        vae_history = self.vae.fit(
            minority_data, minority_data,
            epochs=epochs,
            batch_size=batch_size,
            validation_split=0.2,
            callbacks=[early_stopping, reduce_lr],
            verbose=1
        )

        self.history['vae'] = vae_history.history
        print("VAE training completed!")

        return vae_history

    def generate_synthetic_data(self, num_samples):
        """
        Generate synthetic data using trained VAE

        Args:
            num_samples (int): Number of synthetic samples to generate
        """
        print(f"Generating {num_samples} synthetic samples...")

        # Sample from latent space
        random_latent_vectors = np.random.normal(size=(num_samples, self.latent_dim))

        # Add some structure by sampling around the learned distribution
        if len(self.no_dropout_sequences) > 0:
            # Encode some real samples to get latent distribution
            z_mean, z_log_var, _ = self.encoder.predict(self.no_dropout_sequences[:100])

            # Sample around the mean of the latent distribution
            latent_mean = np.mean(z_mean, axis=0)
            latent_std = np.std(z_mean, axis=0)

            # Generate samples around the learned distribution
            random_latent_vectors = np.random.normal(
                loc=latent_mean,
                scale=latent_std * 1.5,  # Add some variability
                size=(num_samples, self.latent_dim)
            )

        # Decode to generate synthetic data
        synthetic_data = self.decoder.predict(random_latent_vectors)

        print(f"Generated {len(synthetic_data)} synthetic samples")
        return synthetic_data

    def create_balanced_dataset(self):
        """
        Create balanced dataset using original + synthetic data
        """
        print("Creating balanced dataset...")

        # Calculate how many synthetic samples needed
        majority_count = len(self.dropout_sequences)
        minority_count = len(self.no_dropout_sequences)
        samples_needed = majority_count - minority_count

        if samples_needed > 0:
            # Generate synthetic data
            synthetic_data = self.generate_synthetic_data(samples_needed)

            # Combine original and synthetic data
            balanced_X = np.vstack([
                self.original_data['X'],  # Original data
                synthetic_data  # Synthetic minority samples
            ])

            # Create corresponding labels
            balanced_y = np.hstack([
                self.original_data['y'],  # Original labels
                np.zeros(samples_needed)  # Synthetic minority labels (0 = no dropout)
            ])

        else:
            print("Dataset is already balanced or minority is larger")
            balanced_X = self.original_data['X']
            balanced_y = self.original_data['y']

        print(f"Balanced dataset size: {len(balanced_X)}")
        print(f"Class distribution after balancing: {np.unique(balanced_y, return_counts=True)}")

        self.augmented_data = {
            'X': balanced_X,
            'y': balanced_y,
            'synthetic_samples': samples_needed if samples_needed > 0 else 0
        }

        return balanced_X, balanced_y

    def build_lstm_model(self):
        """
        Build LSTM model for dropout prediction
        """
        print("Building LSTM model...")

        model = keras.Sequential([
            # First LSTM layer
            layers.LSTM(64,
                       return_sequences=True,
                       input_shape=self.input_shape,
                       dropout=0.2,
                       recurrent_dropout=0.2),

            # Second LSTM layer
            layers.LSTM(32,
                       return_sequences=False,
                       dropout=0.2,
                       recurrent_dropout=0.2),

            # Dense layers
            layers.Dense(16, activation='relu'),
            layers.Dropout(0.3),
            layers.Dense(8, activation='relu'),
            layers.Dropout(0.2),
            layers.Dense(1, activation='sigmoid')
        ])

        model.compile(
            optimizer=optimizers.Adam(learning_rate=0.001),
            loss='binary_crossentropy',
            metrics=['accuracy', 'precision', 'recall']
        )

        self.lstm_model = model
        print("LSTM model built successfully!")

        return model

    def train_lstm(self, epochs=100, batch_size=64):
        """
        Train LSTM model on balanced dataset

        Args:
            epochs (int): Number of training epochs
            batch_size (int): Batch size for training
        """
        print("Training LSTM model...")

        if self.augmented_data is None:
            print("Creating balanced dataset first...")
            self.create_balanced_dataset()

        X = self.augmented_data['X']
        y = self.augmented_data['y']

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )

        # Callbacks
        early_stopping = EarlyStopping(patience=20, restore_best_weights=True)
        reduce_lr = ReduceLROnPlateau(factor=0.5, patience=10)

        # Train model
        lstm_history = self.lstm_model.fit(
            X_train, y_train,
            epochs=epochs,
            batch_size=batch_size,
            validation_split=0.2,
            callbacks=[early_stopping, reduce_lr],
            verbose=1
        )

        self.history['lstm'] = lstm_history.history

        # Evaluate on test set
        test_predictions = self.lstm_model.predict(X_test)
        test_predictions_binary = (test_predictions > 0.5).astype(int).flatten()

        # Calculate metrics
        self.test_metrics = {
            'accuracy': accuracy_score(y_test, test_predictions_binary),
            'precision': precision_score(y_test, test_predictions_binary),
            'recall': recall_score(y_test, test_predictions_binary),
            'f1_score': f1_score(y_test, test_predictions_binary)
        }

        print("\nLSTM training completed!")
        print("\nTest Set Performance:")
        for metric, value in self.test_metrics.items():
            print(f"{metric.capitalize()}: {value:.4f}")

        return lstm_history, X_test, y_test, test_predictions

    def plot_training_history(self):
        """
        Plot training history for both VAE and LSTM
        """
        fig, axes = plt.subplots(2, 2, figsize=(15, 10))

        # VAE Loss
        if 'vae' in self.history:
            axes[0, 0].plot(self.history['vae']['loss'], label='Training Loss')
            axes[0, 0].plot(self.history['vae']['val_loss'], label='Validation Loss')
            axes[0, 0].set_title('VAE Training Loss')
            axes[0, 0].set_xlabel('Epoch')
            axes[0, 0].set_ylabel('Loss')
            axes[0, 0].legend()
            axes[0, 0].grid(True)

        # LSTM Loss
        if 'lstm' in self.history:
            axes[0, 1].plot(self.history['lstm']['loss'], label='Training Loss')
            axes[0, 1].plot(self.history['lstm']['val_loss'], label='Validation Loss')
            axes[0, 1].set_title('LSTM Training Loss')
            axes[0, 1].set_xlabel('Epoch')
            axes[0, 1].set_ylabel('Loss')
            axes[0, 1].legend()
            axes[0, 1].grid(True)

        # LSTM Accuracy
        if 'lstm' in self.history:
            axes[1, 0].plot(self.history['lstm']['accuracy'], label='Training Accuracy')
            axes[1, 0].plot(self.history['lstm']['val_accuracy'], label='Validation Accuracy')
            axes[1, 0].set_title('LSTM Training Accuracy')
            axes[1, 0].set_xlabel('Epoch')
            axes[1, 0].set_ylabel('Accuracy')
            axes[1, 0].legend()
            axes[1, 0].grid(True)

        # Model Performance Metrics
        if hasattr(self, 'test_metrics'):
            metrics_names = list(self.test_metrics.keys())
            metrics_values = list(self.test_metrics.values())

            axes[1, 1].bar(metrics_names, metrics_values)
            axes[1, 1].set_title('Test Set Performance Metrics')
            axes[1, 1].set_ylabel('Score')
            axes[1, 1].set_ylim(0, 1)

            # Add value labels on bars
            for i, v in enumerate(metrics_values):
                axes[1, 1].text(i, v + 0.01, f'{v:.3f}', ha='center')

        plt.tight_layout()
        plt.show()

    def plot_confusion_matrix(self, y_true, y_pred):
        """
        Plot confusion matrix
        """
        y_pred_binary = (y_pred > 0.5).astype(int).flatten()
        cm = confusion_matrix(y_true, y_pred_binary)

        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                    xticklabels=['No Dropout', 'Dropout'],
                    yticklabels=['No Dropout', 'Dropout'])
        plt.title('Confusion Matrix')
        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')
        plt.show()

        # Print classification report
        print("\nClassification Report:")
        print(classification_report(y_true, y_pred_binary,
                                  target_names=['No Dropout', 'Dropout']))

    def predict_dropout(self, student_sequences):
        """
        Predict dropout probability for new student data

        Args:
            student_sequences: Array of shape (n_students, 30, 7)

        Returns:
            Dropout probabilities
        """
        if self.lstm_model is None:
            raise ValueError("LSTM model not trained yet!")

        # Normalize the input sequences
        sequences_flat = student_sequences.reshape(-1, self.sequence_length * self.features_per_day)
        sequences_normalized = self.scaler.transform(sequences_flat)
        sequences_reshaped = sequences_normalized.reshape(-1, self.sequence_length, self.features_per_day)

        # Predict
        predictions = self.lstm_model.predict(sequences_reshaped)

        return predictions.flatten()

    def run_complete_pipeline(self, csv_file_path, vae_epochs=100, lstm_epochs=100):
        """
        Run the complete VAE-LSTM pipeline

        Args:
            csv_file_path (str): Path to the CSV file
            vae_epochs (int): Epochs for VAE training
            lstm_epochs (int): Epochs for LSTM training
        """
        print("="*50)
        print("STARTING VAE-LSTM DROPOUT PREDICTION PIPELINE")
        print("="*50)

        # Step 1: Load and preprocess data
        X, y = self.load_and_preprocess_data(csv_file_path)

        # Step 2: Build and train VAE
        self.build_vae()
        self.train_vae(epochs=vae_epochs)

        # Step 3: Create balanced dataset
        balanced_X, balanced_y = self.create_balanced_dataset()

        # Step 4: Build and train LSTM
        self.build_lstm_model()
        lstm_history, X_test, y_test, test_predictions = self.train_lstm(epochs=lstm_epochs)

        # Step 5: Visualize results
        print("\n" + "="*50)
        print("TRAINING COMPLETED - GENERATING VISUALIZATIONS")
        print("="*50)

        self.plot_training_history()
        self.plot_confusion_matrix(y_test, test_predictions)

        print("\n" + "="*50)
        print("PIPELINE COMPLETED SUCCESSFULLY!")
        print("="*50)

        return self.test_metrics

# Example usage
if __name__ == "__main__":
    # Initialize the predictor
    predictor = VAELSTMDropoutPredictor(
        sequence_length=30,
        features_per_day=7,
        latent_dim=32
    )

    # Run the complete pipeline
    # Replace 'your_data.csv' with the actual path to your CSV file
    csv_file_path = 'model1_210_features.csv'

    try:
        final_metrics = predictor.run_complete_pipeline(
            csv_file_path=csv_file_path,
            vae_epochs=50,  # Reduce for faster training
            lstm_epochs=50
        )

        print("\nFinal Model Performance:")
        for metric, value in final_metrics.items():
            print(f"{metric}: {value:.4f}")

    except FileNotFoundError:
        print(f"Error: Could not find the file '{csv_file_path}'")
        print("Please make sure the CSV file is in the correct location.")

    except Exception as e:
        print(f"An error occurred: {str(e)}")

    # Example of making predictions on new data
    # Uncomment the following lines if you want to test predictions
    """
    # Create some dummy test data (replace with real data)
    test_sequences = np.random.rand(5, 30, 7)  # 5 students, 30 days, 7 features

    # Make predictions
    dropout_probabilities = predictor.predict_dropout(test_sequences)

    print("\nSample Predictions:")
    for i, prob in enumerate(dropout_probabilities):
        risk_level = "High" if prob > 0.7 else "Medium" if prob > 0.3 else "Low"
        print(f"Student {i+1}: {prob:.3f} probability ({risk_level} risk)")
    """

STARTING VAE-LSTM DROPOUT PREDICTION PIPELINE
Loading and preprocessing data...
Loaded 120542 samples with 214 columns
Class distribution: {np.int64(0): np.int64(24961), np.int64(1): np.int64(95581)}
Dropout samples: 95581
No dropout samples: 24961
Imbalance ratio: 3.83
Building VAE model...
VAE model built successfully!
Training VAE on minority class (no dropout)...
Epoch 1/50
An error occurred: Tried to convert 'x' to a tensor and failed. Error: A KerasTensor cannot be used as input to a TensorFlow function. A KerasTensor is a symbolic placeholder for a shape and dtype, used when constructing Keras Functional models or Keras Functions. You can only use it as input to a Keras layer or a Keras operation (from the namespaces `keras.layers` and `keras.operations`). You are likely doing something like:

```
x = Input(...)
...
tf_fn(x)  # Invalid.
```

What you should do instead is wrap `tf_fn` in a layer:

```
class MyLayer(Layer):
    def call(self, x):
        return tf_fn(x)

x = MyLay

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, Model, optimizers, losses
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import warnings
warnings.filterwarnings('ignore')

class VAELSTMDropoutPredictor:
    def __init__(self, sequence_length=30, features_per_day=7, latent_dim=32):
        """
        Initialize the VAE-LSTM Dropout Predictor

        Args:
            sequence_length (int): Number of days in sequence (30)
            features_per_day (int): Number of features per day (7)
            latent_dim (int): Latent dimension for VAE
        """
        self.sequence_length = sequence_length
        self.features_per_day = features_per_day
        self.latent_dim = latent_dim
        self.input_shape = (sequence_length, features_per_day)

        # Models
        self.vae = None
        self.encoder = None
        self.decoder = None
        self.lstm_model = None

        # Data
        self.scaler = StandardScaler()
        self.original_data = None
        self.augmented_data = None
        self.history = {}

    def load_and_preprocess_data(self, csv_file_path):
        """
        Load and preprocess the student activity data

        Args:
            csv_file_path (str): Path to the CSV file
        """
        print("Loading and preprocessing data...")

        # Load data
        df = pd.read_csv(csv_file_path)
        print(f"Loaded {len(df)} samples with {len(df.columns)} columns")

        # Extract feature columns (day_1_access to day_30_video)
        feature_columns = []
        for day in range(1, 31):  # 30 days
            feature_columns.extend([
                f'day_{day}_access',
                f'day_{day}_problem',
                f'day_{day}_wiki',
                f'day_{day}_discussion',
                f'day_{day}_navigate',
                f'day_{day}_page_close',
                f'day_{day}_video'
            ])

        # Extract features and labels
        X = df[feature_columns].values
        y = df['dropout'].values

        # Reshape data for LSTM (samples, timesteps, features)
        X_sequences = X.reshape(-1, self.sequence_length, self.features_per_day)

        # Normalize features
        X_flat = X_sequences.reshape(-1, self.sequence_length * self.features_per_day)
        X_normalized = self.scaler.fit_transform(X_flat)
        X_sequences = X_normalized.reshape(-1, self.sequence_length, self.features_per_day)

        # Analyze class distribution
        unique, counts = np.unique(y, return_counts=True)
        class_distribution = dict(zip(unique, counts))
        print(f"Class distribution: {class_distribution}")

        # Separate by class
        dropout_indices = np.where(y == 1)[0]
        no_dropout_indices = np.where(y == 0)[0]

        self.dropout_sequences = X_sequences[dropout_indices]
        self.no_dropout_sequences = X_sequences[no_dropout_indices]

        print(f"Dropout samples: {len(self.dropout_sequences)}")
        print(f"No dropout samples: {len(self.no_dropout_sequences)}")
        print(f"Imbalance ratio: {len(self.dropout_sequences) / len(self.no_dropout_sequences):.2f}")

        self.original_data = {
            'X': X_sequences,
            'y': y,
            'class_distribution': class_distribution
        }

        return X_sequences, y

    def build_vae(self):
        """
        Build Variational Autoencoder for data augmentation
        """
        print("Building VAE model...")

        # Custom sampling layer
        class Sampling(layers.Layer):
            def call(self, inputs):
                z_mean, z_log_var = inputs
                batch = tf.shape(z_mean)[0]
                dim = tf.shape(z_mean)[1]
                epsilon = tf.random.normal(shape=(batch, dim))
                return z_mean + tf.exp(0.5 * z_log_var) * epsilon

        # Encoder
        encoder_inputs = layers.Input(shape=self.input_shape, name='encoder_input')
        x = layers.Flatten()(encoder_inputs)
        x = layers.Dense(256, activation='relu')(x)
        x = layers.Dense(128, activation='relu')(x)

        # Latent space
        z_mean = layers.Dense(self.latent_dim, name='z_mean')(x)
        z_log_var = layers.Dense(self.latent_dim, name='z_log_var')(x)

        # Sampling
        z = Sampling()([z_mean, z_log_var])

        # Create encoder model
        self.encoder = Model(encoder_inputs, [z_mean, z_log_var, z], name='encoder')

        # Decoder
        decoder_inputs = layers.Input(shape=(self.latent_dim,), name='decoder_input')
        x = layers.Dense(128, activation='relu')(decoder_inputs)
        x = layers.Dense(256, activation='relu')(x)
        x = layers.Dense(self.sequence_length * self.features_per_day, activation='sigmoid')(x)
        decoder_outputs = layers.Reshape(self.input_shape)(x)

        # Create decoder model
        self.decoder = Model(decoder_inputs, decoder_outputs, name='decoder')

        # VAE model
        vae_outputs = self.decoder(self.encoder(encoder_inputs)[2])
        self.vae = Model(encoder_inputs, vae_outputs, name='vae')

        # Custom VAE loss using subclassing
        class VAE(Model):
            def __init__(self, encoder, decoder, **kwargs):
                super(VAE, self).__init__(**kwargs)
                self.encoder = encoder
                self.decoder = decoder
                self.total_loss_tracker = keras.metrics.Mean(name="total_loss")
                self.reconstruction_loss_tracker = keras.metrics.Mean(name="reconstruction_loss")
                self.kl_loss_tracker = keras.metrics.Mean(name="kl_loss")

            @property
            def metrics(self):
                return [
                    self.total_loss_tracker,
                    self.reconstruction_loss_tracker,
                    self.kl_loss_tracker,
                ]

            def train_step(self, data):
                with tf.GradientTape() as tape:
                    z_mean, z_log_var, z = self.encoder(data)
                    reconstruction = self.decoder(z)

                    # Reconstruction loss
                    reconstruction_loss = tf.reduce_mean(
                        tf.reduce_sum(
                            keras.losses.binary_crossentropy(data, reconstruction), axis=(1, 2)
                        )
                    )

                    # KL divergence loss
                    kl_loss = -0.5 * (1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var))
                    kl_loss = tf.reduce_mean(tf.reduce_sum(kl_loss, axis=1))

                    total_loss = reconstruction_loss + kl_loss

                grads = tape.gradient(total_loss, self.trainable_weights)
                self.optimizer.apply_gradients(zip(grads, self.trainable_weights))

                self.total_loss_tracker.update_state(total_loss)
                self.reconstruction_loss_tracker.update_state(reconstruction_loss)
                self.kl_loss_tracker.update_state(kl_loss)

                return {
                    "loss": self.total_loss_tracker.result(),
                    "reconstruction_loss": self.reconstruction_loss_tracker.result(),
                    "kl_loss": self.kl_loss_tracker.result(),
                }

            def call(self, inputs):
                z_mean, z_log_var, z = self.encoder(inputs)
                return self.decoder(z)

        # Create the custom VAE model
        self.vae = VAE(self.encoder, self.decoder)
        self.vae.compile(optimizer='adam')

        print("VAE model built successfully!")
        return self.vae

    def train_vae(self, epochs=100, batch_size=32):
        """
        Train the VAE on minority class data

        Args:
            epochs (int): Number of training epochs
            batch_size (int): Batch size for training
        """
        print("Training VAE on minority class (no dropout)...")

        # Use minority class for VAE training
        minority_data = self.no_dropout_sequences

        # Callbacks
        early_stopping = EarlyStopping(
            monitor='loss',
            patience=15,
            restore_best_weights=True,
            verbose=1
        )
        reduce_lr = ReduceLROnPlateau(
            monitor='loss',
            factor=0.5,
            patience=10,
            verbose=1
        )

        # Train VAE
        vae_history = self.vae.fit(
            minority_data,
            epochs=epochs,
            batch_size=batch_size,
            validation_split=0.2,
            callbacks=[early_stopping, reduce_lr],
            verbose=1
        )

        self.history['vae'] = vae_history.history
        print("VAE training completed!")

        return vae_history

    def generate_synthetic_data(self, num_samples):
        """
        Generate synthetic data using trained VAE

        Args:
            num_samples (int): Number of synthetic samples to generate
        """
        print(f"Generating {num_samples} synthetic samples...")

        # Sample from latent space
        random_latent_vectors = np.random.normal(size=(num_samples, self.latent_dim))

        # Add some structure by sampling around the learned distribution
        if len(self.no_dropout_sequences) > 0:
            # Encode some real samples to get latent distribution
            z_mean, z_log_var, _ = self.encoder.predict(self.no_dropout_sequences[:100])

            # Sample around the mean of the latent distribution
            latent_mean = np.mean(z_mean, axis=0)
            latent_std = np.std(z_mean, axis=0)

            # Generate samples around the learned distribution
            random_latent_vectors = np.random.normal(
                loc=latent_mean,
                scale=latent_std * 1.5,  # Add some variability
                size=(num_samples, self.latent_dim)
            )

        # Decode to generate synthetic data
        synthetic_data = self.decoder.predict(random_latent_vectors)

        print(f"Generated {len(synthetic_data)} synthetic samples")
        return synthetic_data

    def create_balanced_dataset(self):
        """
        Create balanced dataset using original + synthetic data
        """
        print("Creating balanced dataset...")

        # Calculate how many synthetic samples needed
        majority_count = len(self.dropout_sequences)
        minority_count = len(self.no_dropout_sequences)
        samples_needed = majority_count - minority_count

        if samples_needed > 0:
            # Generate synthetic data
            synthetic_data = self.generate_synthetic_data(samples_needed)

            # Combine original and synthetic data
            balanced_X = np.vstack([
                self.original_data['X'],  # Original data
                synthetic_data  # Synthetic minority samples
            ])

            # Create corresponding labels
            balanced_y = np.hstack([
                self.original_data['y'],  # Original labels
                np.zeros(samples_needed)  # Synthetic minority labels (0 = no dropout)
            ])

        else:
            print("Dataset is already balanced or minority is larger")
            balanced_X = self.original_data['X']
            balanced_y = self.original_data['y']

        print(f"Balanced dataset size: {len(balanced_X)}")
        print(f"Class distribution after balancing: {np.unique(balanced_y, return_counts=True)}")

        self.augmented_data = {
            'X': balanced_X,
            'y': balanced_y,
            'synthetic_samples': samples_needed if samples_needed > 0 else 0
        }

        return balanced_X, balanced_y

    def build_lstm_model(self):
        """
        Build LSTM model for dropout prediction
        """
        print("Building LSTM model...")

        model = keras.Sequential([
            # First LSTM layer
            layers.LSTM(64,
                       return_sequences=True,
                       input_shape=self.input_shape,
                       dropout=0.2,
                       recurrent_dropout=0.2),

            # Second LSTM layer
            layers.LSTM(32,
                       return_sequences=False,
                       dropout=0.2,
                       recurrent_dropout=0.2),

            # Dense layers
            layers.Dense(16, activation='relu'),
            layers.Dropout(0.3),
            layers.Dense(8, activation='relu'),
            layers.Dropout(0.2),
            layers.Dense(1, activation='sigmoid')
        ])

        model.compile(
            optimizer=optimizers.Adam(learning_rate=0.001),
            loss='binary_crossentropy',
            metrics=['accuracy', 'precision', 'recall']
        )

        self.lstm_model = model
        print("LSTM model built successfully!")

        return model

    def train_lstm(self, epochs=100, batch_size=64):
        """
        Train LSTM model on balanced dataset

        Args:
            epochs (int): Number of training epochs
            batch_size (int): Batch size for training
        """
        print("Training LSTM model...")

        if self.augmented_data is None:
            print("Creating balanced dataset first...")
            self.create_balanced_dataset()

        X = self.augmented_data['X']
        y = self.augmented_data['y']

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )

        # Callbacks
        early_stopping = EarlyStopping(patience=20, restore_best_weights=True)
        reduce_lr = ReduceLROnPlateau(factor=0.5, patience=10)

        # Train model
        lstm_history = self.lstm_model.fit(
            X_train, y_train,
            epochs=epochs,
            batch_size=batch_size,
            validation_split=0.2,
            callbacks=[early_stopping, reduce_lr],
            verbose=1
        )

        self.history['lstm'] = lstm_history.history

        # Evaluate on test set
        test_predictions = self.lstm_model.predict(X_test)
        test_predictions_binary = (test_predictions > 0.5).astype(int).flatten()

        # Calculate metrics
        self.test_metrics = {
            'accuracy': accuracy_score(y_test, test_predictions_binary),
            'precision': precision_score(y_test, test_predictions_binary),
            'recall': recall_score(y_test, test_predictions_binary),
            'f1_score': f1_score(y_test, test_predictions_binary)
        }

        print("\nLSTM training completed!")
        print("\nTest Set Performance:")
        for metric, value in self.test_metrics.items():
            print(f"{metric.capitalize()}: {value:.4f}")

        return lstm_history, X_test, y_test, test_predictions

    def plot_training_history(self):
        """
        Plot training history for both VAE and LSTM
        """
        fig, axes = plt.subplots(2, 2, figsize=(15, 10))

        # VAE Loss
        if 'vae' in self.history:
            axes[0, 0].plot(self.history['vae']['loss'], label='Training Loss')
            if 'val_loss' in self.history['vae']:
                axes[0, 0].plot(self.history['vae']['val_loss'], label='Validation Loss')
            axes[0, 0].set_title('VAE Training Loss')
            axes[0, 0].set_xlabel('Epoch')
            axes[0, 0].set_ylabel('Loss')
            axes[0, 0].legend()
            axes[0, 0].grid(True)

        # LSTM Loss
        if 'lstm' in self.history:
            axes[0, 1].plot(self.history['lstm']['loss'], label='Training Loss')
            axes[0, 1].plot(self.history['lstm']['val_loss'], label='Validation Loss')
            axes[0, 1].set_title('LSTM Training Loss')
            axes[0, 1].set_xlabel('Epoch')
            axes[0, 1].set_ylabel('Loss')
            axes[0, 1].legend()
            axes[0, 1].grid(True)

        # LSTM Accuracy
        if 'lstm' in self.history:
            axes[1, 0].plot(self.history['lstm']['accuracy'], label='Training Accuracy')
            axes[1, 0].plot(self.history['lstm']['val_accuracy'], label='Validation Accuracy')
            axes[1, 0].set_title('LSTM Training Accuracy')
            axes[1, 0].set_xlabel('Epoch')
            axes[1, 0].set_ylabel('Accuracy')
            axes[1, 0].legend()
            axes[1, 0].grid(True)

        # Model Performance Metrics
        if hasattr(self, 'test_metrics'):
            metrics_names = list(self.test_metrics.keys())
            metrics_values = list(self.test_metrics.values())

            axes[1, 1].bar(metrics_names, metrics_values)
            axes[1, 1].set_title('Test Set Performance Metrics')
            axes[1, 1].set_ylabel('Score')
            axes[1, 1].set_ylim(0, 1)

            # Add value labels on bars
            for i, v in enumerate(metrics_values):
                axes[1, 1].text(i, v + 0.01, f'{v:.3f}', ha='center')

        plt.tight_layout()
        plt.show()

    def plot_confusion_matrix(self, y_true, y_pred):
        """
        Plot confusion matrix
        """
        y_pred_binary = (y_pred > 0.5).astype(int).flatten()
        cm = confusion_matrix(y_true, y_pred_binary)

        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                    xticklabels=['No Dropout', 'Dropout'],
                    yticklabels=['No Dropout', 'Dropout'])
        plt.title('Confusion Matrix')
        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')
        plt.show()

        # Print classification report
        print("\nClassification Report:")
        print(classification_report(y_true, y_pred_binary,
                                  target_names=['No Dropout', 'Dropout']))

    def predict_dropout(self, student_sequences):
        """
        Predict dropout probability for new student data

        Args:
            student_sequences: Array of shape (n_students, 30, 7)

        Returns:
            Dropout probabilities
        """
        if self.lstm_model is None:
            raise ValueError("LSTM model not trained yet!")

        # Normalize the input sequences
        sequences_flat = student_sequences.reshape(-1, self.sequence_length * self.features_per_day)
        sequences_normalized = self.scaler.transform(sequences_flat)
        sequences_reshaped = sequences_normalized.reshape(-1, self.sequence_length, self.features_per_day)

        # Predict
        predictions = self.lstm_model.predict(sequences_reshaped)

        return predictions.flatten()

    def run_complete_pipeline(self, csv_file_path, vae_epochs=100, lstm_epochs=100):
        """
        Run the complete VAE-LSTM pipeline

        Args:
            csv_file_path (str): Path to the CSV file
            vae_epochs (int): Epochs for VAE training
            lstm_epochs (int): Epochs for LSTM training
        """
        print("="*50)
        print("STARTING VAE-LSTM DROPOUT PREDICTION PIPELINE")
        print("="*50)

        # Step 1: Load and preprocess data
        X, y = self.load_and_preprocess_data(csv_file_path)

        # Step 2: Build and train VAE
        self.build_vae()
        self.train_vae(epochs=vae_epochs)

        # Step 3: Create balanced dataset
        balanced_X, balanced_y = self.create_balanced_dataset()

        # Step 4: Build and train LSTM
        self.build_lstm_model()
        lstm_history, X_test, y_test, test_predictions = self.train_lstm(epochs=lstm_epochs)

        # Step 5: Visualize results
        print("\n" + "="*50)
        print("TRAINING COMPLETED - GENERATING VISUALIZATIONS")
        print("="*50)

        self.plot_training_history()
        self.plot_confusion_matrix(y_test, test_predictions)

        print("\n" + "="*50)
        print("PIPELINE COMPLETED SUCCESSFULLY!")
        print("="*50)

        return self.test_metrics

# Example usage
if __name__ == "__main__":
    # Initialize the predictor
    predictor = VAELSTMDropoutPredictor(
        sequence_length=30,
        features_per_day=7,
        latent_dim=32
    )

    # Run the complete pipeline
    # Replace 'your_data.csv' with the actual path to your CSV file
    csv_file_path = 'model1_210_features.csv'

    try:
        final_metrics = predictor.run_complete_pipeline(
            csv_file_path=csv_file_path,
            vae_epochs=50,  # Reduce for faster training
            lstm_epochs=50
        )

        print("\nFinal Model Performance:")
        for metric, value in final_metrics.items():
            print(f"{metric}: {value:.4f}")

    except FileNotFoundError:
        print(f"Error: Could not find the file '{csv_file_path}'")
        print("Please make sure the CSV file is in the correct location.")

    except Exception as e:
        print(f"An error occurred: {str(e)}")

    # Example of making predictions on new data
    # Uncomment the following lines if you want to test predictions
    """
    # Create some dummy test data (replace with real data)
    test_sequences = np.random.rand(5, 30, 7)  # 5 students, 30 days, 7 features

    # Make predictions
    dropout_probabilities = predictor.predict_dropout(test_sequences)

    print("\nSample Predictions:")
    for i, prob in enumerate(dropout_probabilities):
        risk_level = "High" if prob > 0.7 else "Medium" if prob > 0.3 else "Low"
        print(f"Student {i+1}: {prob:.3f} probability ({risk_level} risk)")
    """

STARTING VAE-LSTM DROPOUT PREDICTION PIPELINE
Loading and preprocessing data...
Loaded 120542 samples with 214 columns
Class distribution: {np.int64(0): np.int64(24961), np.int64(1): np.int64(95581)}
Dropout samples: 95581
No dropout samples: 24961
Imbalance ratio: 3.83
Building VAE model...
VAE model built successfully!
Training VAE on minority class (no dropout)...
Epoch 1/50
An error occurred: Invalid reduction dimension 2 for input with 2 dimensions. for '{{node Sum}} = Sum[T=DT_FLOAT, Tidx=DT_INT32, keep_dims=false](Mean, Sum/reduction_indices)' with input shapes: [32,30], [2] and with computed input tensors: input[1] = <1 2>.


In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, Model, optimizers, losses
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import warnings
warnings.filterwarnings('ignore')

class VAELSTMDropoutPredictor:
    def __init__(self, sequence_length=30, features_per_day=7, latent_dim=32):
        """
        Initialize the VAE-LSTM Dropout Predictor

        Args:
            sequence_length (int): Number of days in sequence (30)
            features_per_day (int): Number of features per day (7)
            latent_dim (int): Latent dimension for VAE
        """
        self.sequence_length = sequence_length
        self.features_per_day = features_per_day
        self.latent_dim = latent_dim
        self.input_shape = (sequence_length, features_per_day)

        # Models
        self.vae = None
        self.encoder = None
        self.decoder = None
        self.lstm_model = None

        # Data
        self.scaler = StandardScaler()
        self.original_data = None
        self.augmented_data = None
        self.history = {}

    def load_and_preprocess_data(self, csv_file_path):
        """
        Load and preprocess the student activity data

        Args:
            csv_file_path (str): Path to the CSV file
        """
        print("Loading and preprocessing data...")

        # Load data
        df = pd.read_csv(csv_file_path)
        print(f"Loaded {len(df)} samples with {len(df.columns)} columns")

        # Extract feature columns (day_1_access to day_30_video)
        feature_columns = []
        for day in range(1, 31):  # 30 days
            feature_columns.extend([
                f'day_{day}_access',
                f'day_{day}_problem',
                f'day_{day}_wiki',
                f'day_{day}_discussion',
                f'day_{day}_navigate',
                f'day_{day}_page_close',
                f'day_{day}_video'
            ])

        # Extract features and labels
        X = df[feature_columns].values
        y = df['dropout'].values

        # Reshape data for LSTM (samples, timesteps, features)
        X_sequences = X.reshape(-1, self.sequence_length, self.features_per_day)

        # Normalize features
        X_flat = X_sequences.reshape(-1, self.sequence_length * self.features_per_day)
        X_normalized = self.scaler.fit_transform(X_flat)
        X_sequences = X_normalized.reshape(-1, self.sequence_length, self.features_per_day)

        # Analyze class distribution
        unique, counts = np.unique(y, return_counts=True)
        class_distribution = dict(zip(unique, counts))
        print(f"Class distribution: {class_distribution}")

        # Separate by class
        dropout_indices = np.where(y == 1)[0]
        no_dropout_indices = np.where(y == 0)[0]

        self.dropout_sequences = X_sequences[dropout_indices]
        self.no_dropout_sequences = X_sequences[no_dropout_indices]

        print(f"Dropout samples: {len(self.dropout_sequences)}")
        print(f"No dropout samples: {len(self.no_dropout_sequences)}")
        print(f"Imbalance ratio: {len(self.dropout_sequences) / len(self.no_dropout_sequences):.2f}")

        self.original_data = {
            'X': X_sequences,
            'y': y,
            'class_distribution': class_distribution
        }

        return X_sequences, y

    def build_vae(self):
        """
        Build Variational Autoencoder for data augmentation
        """
        print("Building VAE model...")

        # Custom sampling layer
        class Sampling(layers.Layer):
            def call(self, inputs):
                z_mean, z_log_var = inputs
                batch = tf.shape(z_mean)[0]
                dim = tf.shape(z_mean)[1]
                epsilon = tf.random.normal(shape=(batch, dim))
                return z_mean + tf.exp(0.5 * z_log_var) * epsilon

        # Encoder
        encoder_inputs = layers.Input(shape=self.input_shape, name='encoder_input')
        x = layers.Flatten()(encoder_inputs)
        x = layers.Dense(256, activation='relu')(x)
        x = layers.Dense(128, activation='relu')(x)

        # Latent space
        z_mean = layers.Dense(self.latent_dim, name='z_mean')(x)
        z_log_var = layers.Dense(self.latent_dim, name='z_log_var')(x)

        # Sampling
        z = Sampling()([z_mean, z_log_var])

        # Create encoder model
        self.encoder = Model(encoder_inputs, [z_mean, z_log_var, z], name='encoder')

        # Decoder
        decoder_inputs = layers.Input(shape=(self.latent_dim,), name='decoder_input')
        x = layers.Dense(128, activation='relu')(decoder_inputs)
        x = layers.Dense(256, activation='relu')(x)
        x = layers.Dense(self.sequence_length * self.features_per_day, activation='sigmoid')(x)
        decoder_outputs = layers.Reshape(self.input_shape)(x)

        # Create decoder model
        self.decoder = Model(decoder_inputs, decoder_outputs, name='decoder')

        # VAE model
        vae_outputs = self.decoder(self.encoder(encoder_inputs)[2])
        self.vae = Model(encoder_inputs, vae_outputs, name='vae')

        # Custom VAE loss using subclassing
        class VAE(Model):
            def __init__(self, encoder, decoder, **kwargs):
                super(VAE, self).__init__(**kwargs)
                self.encoder = encoder
                self.decoder = decoder
                self.total_loss_tracker = keras.metrics.Mean(name="total_loss")
                self.reconstruction_loss_tracker = keras.metrics.Mean(name="reconstruction_loss")
                self.kl_loss_tracker = keras.metrics.Mean(name="kl_loss")

            @property
            def metrics(self):
                return [
                    self.total_loss_tracker,
                    self.reconstruction_loss_tracker,
                    self.kl_loss_tracker,
                ]

            def train_step(self, data):
                with tf.GradientTape() as tape:
                    z_mean, z_log_var, z = self.encoder(data)
                    reconstruction = self.decoder(z)

                    # Flatten both data and reconstruction for loss calculation
                    data_flat = tf.reshape(data, [tf.shape(data)[0], -1])
                    reconstruction_flat = tf.reshape(reconstruction, [tf.shape(reconstruction)[0], -1])

                    # Reconstruction loss - MSE loss
                    reconstruction_loss = tf.reduce_mean(
                        tf.reduce_sum(
                            tf.square(data_flat - reconstruction_flat), axis=1
                        )
                    )

                    # KL divergence loss
                    kl_loss = -0.5 * (1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var))
                    kl_loss = tf.reduce_mean(tf.reduce_sum(kl_loss, axis=1))

                    # Total loss with weighting
                    total_loss = reconstruction_loss + 0.5 * kl_loss

                grads = tape.gradient(total_loss, self.trainable_weights)
                self.optimizer.apply_gradients(zip(grads, self.trainable_weights))

                self.total_loss_tracker.update_state(total_loss)
                self.reconstruction_loss_tracker.update_state(reconstruction_loss)
                self.kl_loss_tracker.update_state(kl_loss)

                return {
                    "loss": self.total_loss_tracker.result(),
                    "reconstruction_loss": self.reconstruction_loss_tracker.result(),
                    "kl_loss": self.kl_loss_tracker.result(),
                }

            def call(self, inputs):
                z_mean, z_log_var, z = self.encoder(inputs)
                return self.decoder(z)

        # Create the custom VAE model
        self.vae = VAE(self.encoder, self.decoder)
        self.vae.compile(optimizer='adam')

        print("VAE model built successfully!")
        return self.vae

    def train_vae(self, epochs=100, batch_size=32):
        """
        Train the VAE on minority class data

        Args:
            epochs (int): Number of training epochs
            batch_size (int): Batch size for training
        """
        print("Training VAE on minority class (no dropout)...")

        # Use minority class for VAE training
        minority_data = self.no_dropout_sequences

        # Callbacks
        early_stopping = EarlyStopping(
            monitor='loss',
            patience=15,
            restore_best_weights=True,
            verbose=1
        )
        reduce_lr = ReduceLROnPlateau(
            monitor='loss',
            factor=0.5,
            patience=10,
            verbose=1
        )

        # Train VAE
        vae_history = self.vae.fit(
            minority_data,
            epochs=epochs,
            batch_size=batch_size,
            validation_split=0.2,
            callbacks=[early_stopping, reduce_lr],
            verbose=1
        )

        self.history['vae'] = vae_history.history
        print("VAE training completed!")

        return vae_history

    def generate_synthetic_data(self, num_samples):
        """
        Generate synthetic data using trained VAE

        Args:
            num_samples (int): Number of synthetic samples to generate
        """
        print(f"Generating {num_samples} synthetic samples...")

        # Sample from latent space
        random_latent_vectors = np.random.normal(size=(num_samples, self.latent_dim))

        # Add some structure by sampling around the learned distribution
        if len(self.no_dropout_sequences) > 0:
            # Encode some real samples to get latent distribution
            z_mean, z_log_var, _ = self.encoder.predict(self.no_dropout_sequences[:100])

            # Sample around the mean of the latent distribution
            latent_mean = np.mean(z_mean, axis=0)
            latent_std = np.std(z_mean, axis=0)

            # Generate samples around the learned distribution
            random_latent_vectors = np.random.normal(
                loc=latent_mean,
                scale=latent_std * 1.5,  # Add some variability
                size=(num_samples, self.latent_dim)
            )

        # Decode to generate synthetic data
        synthetic_data = self.decoder.predict(random_latent_vectors)

        print(f"Generated {len(synthetic_data)} synthetic samples")
        return synthetic_data

    def create_balanced_dataset(self):
        """
        Create balanced dataset using original + synthetic data
        """
        print("Creating balanced dataset...")

        # Calculate how many synthetic samples needed
        majority_count = len(self.dropout_sequences)
        minority_count = len(self.no_dropout_sequences)
        samples_needed = majority_count - minority_count

        if samples_needed > 0:
            # Generate synthetic data
            synthetic_data = self.generate_synthetic_data(samples_needed)

            # Combine original and synthetic data
            balanced_X = np.vstack([
                self.original_data['X'],  # Original data
                synthetic_data  # Synthetic minority samples
            ])

            # Create corresponding labels
            balanced_y = np.hstack([
                self.original_data['y'],  # Original labels
                np.zeros(samples_needed)  # Synthetic minority labels (0 = no dropout)
            ])

        else:
            print("Dataset is already balanced or minority is larger")
            balanced_X = self.original_data['X']
            balanced_y = self.original_data['y']

        print(f"Balanced dataset size: {len(balanced_X)}")
        print(f"Class distribution after balancing: {np.unique(balanced_y, return_counts=True)}")

        self.augmented_data = {
            'X': balanced_X,
            'y': balanced_y,
            'synthetic_samples': samples_needed if samples_needed > 0 else 0
        }

        return balanced_X, balanced_y

    def build_lstm_model(self):
        """
        Build LSTM model for dropout prediction
        """
        print("Building LSTM model...")

        model = keras.Sequential([
            # First LSTM layer
            layers.LSTM(64,
                       return_sequences=True,
                       input_shape=self.input_shape,
                       dropout=0.2,
                       recurrent_dropout=0.2),

            # Second LSTM layer
            layers.LSTM(32,
                       return_sequences=False,
                       dropout=0.2,
                       recurrent_dropout=0.2),

            # Dense layers
            layers.Dense(16, activation='relu'),
            layers.Dropout(0.3),
            layers.Dense(8, activation='relu'),
            layers.Dropout(0.2),
            layers.Dense(1, activation='sigmoid')
        ])

        model.compile(
            optimizer=optimizers.Adam(learning_rate=0.001),
            loss='binary_crossentropy',
            metrics=['accuracy', 'precision', 'recall']
        )

        self.lstm_model = model
        print("LSTM model built successfully!")

        return model

    def train_lstm(self, epochs=100, batch_size=64):
        """
        Train LSTM model on balanced dataset

        Args:
            epochs (int): Number of training epochs
            batch_size (int): Batch size for training
        """
        print("Training LSTM model...")

        if self.augmented_data is None:
            print("Creating balanced dataset first...")
            self.create_balanced_dataset()

        X = self.augmented_data['X']
        y = self.augmented_data['y']

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )

        # Callbacks
        early_stopping = EarlyStopping(patience=20, restore_best_weights=True)
        reduce_lr = ReduceLROnPlateau(factor=0.5, patience=10)

        # Train model
        lstm_history = self.lstm_model.fit(
            X_train, y_train,
            epochs=epochs,
            batch_size=batch_size,
            validation_split=0.2,
            callbacks=[early_stopping, reduce_lr],
            verbose=1
        )

        self.history['lstm'] = lstm_history.history

        # Evaluate on test set
        test_predictions = self.lstm_model.predict(X_test)
        test_predictions_binary = (test_predictions > 0.5).astype(int).flatten()

        # Calculate metrics
        self.test_metrics = {
            'accuracy': accuracy_score(y_test, test_predictions_binary),
            'precision': precision_score(y_test, test_predictions_binary),
            'recall': recall_score(y_test, test_predictions_binary),
            'f1_score': f1_score(y_test, test_predictions_binary)
        }

        print("\nLSTM training completed!")
        print("\nTest Set Performance:")
        for metric, value in self.test_metrics.items():
            print(f"{metric.capitalize()}: {value:.4f}")

        return lstm_history, X_test, y_test, test_predictions

    def plot_training_history(self):
        """
        Plot training history for both VAE and LSTM
        """
        fig, axes = plt.subplots(2, 2, figsize=(15, 10))

        # VAE Loss
        if 'vae' in self.history:
            axes[0, 0].plot(self.history['vae']['loss'], label='Training Loss')
            if 'val_loss' in self.history['vae']:
                axes[0, 0].plot(self.history['vae']['val_loss'], label='Validation Loss')
            axes[0, 0].set_title('VAE Training Loss')
            axes[0, 0].set_xlabel('Epoch')
            axes[0, 0].set_ylabel('Loss')
            axes[0, 0].legend()
            axes[0, 0].grid(True)

        # LSTM Loss
        if 'lstm' in self.history:
            axes[0, 1].plot(self.history['lstm']['loss'], label='Training Loss')
            axes[0, 1].plot(self.history['lstm']['val_loss'], label='Validation Loss')
            axes[0, 1].set_title('LSTM Training Loss')
            axes[0, 1].set_xlabel('Epoch')
            axes[0, 1].set_ylabel('Loss')
            axes[0, 1].legend()
            axes[0, 1].grid(True)

        # LSTM Accuracy
        if 'lstm' in self.history:
            axes[1, 0].plot(self.history['lstm']['accuracy'], label='Training Accuracy')
            axes[1, 0].plot(self.history['lstm']['val_accuracy'], label='Validation Accuracy')
            axes[1, 0].set_title('LSTM Training Accuracy')
            axes[1, 0].set_xlabel('Epoch')
            axes[1, 0].set_ylabel('Accuracy')
            axes[1, 0].legend()
            axes[1, 0].grid(True)

        # Model Performance Metrics
        if hasattr(self, 'test_metrics'):
            metrics_names = list(self.test_metrics.keys())
            metrics_values = list(self.test_metrics.values())

            axes[1, 1].bar(metrics_names, metrics_values)
            axes[1, 1].set_title('Test Set Performance Metrics')
            axes[1, 1].set_ylabel('Score')
            axes[1, 1].set_ylim(0, 1)

            # Add value labels on bars
            for i, v in enumerate(metrics_values):
                axes[1, 1].text(i, v + 0.01, f'{v:.3f}', ha='center')

        plt.tight_layout()
        plt.show()

    def plot_confusion_matrix(self, y_true, y_pred):
        """
        Plot confusion matrix
        """
        y_pred_binary = (y_pred > 0.5).astype(int).flatten()
        cm = confusion_matrix(y_true, y_pred_binary)

        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                    xticklabels=['No Dropout', 'Dropout'],
                    yticklabels=['No Dropout', 'Dropout'])
        plt.title('Confusion Matrix')
        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')
        plt.show()

        # Print classification report
        print("\nClassification Report:")
        print(classification_report(y_true, y_pred_binary,
                                  target_names=['No Dropout', 'Dropout']))

    def predict_dropout(self, student_sequences):
        """
        Predict dropout probability for new student data

        Args:
            student_sequences: Array of shape (n_students, 30, 7)

        Returns:
            Dropout probabilities
        """
        if self.lstm_model is None:
            raise ValueError("LSTM model not trained yet!")

        # Normalize the input sequences
        sequences_flat = student_sequences.reshape(-1, self.sequence_length * self.features_per_day)
        sequences_normalized = self.scaler.transform(sequences_flat)
        sequences_reshaped = sequences_normalized.reshape(-1, self.sequence_length, self.features_per_day)

        # Predict
        predictions = self.lstm_model.predict(sequences_reshaped)

        return predictions.flatten()

    def run_complete_pipeline(self, csv_file_path, vae_epochs=100, lstm_epochs=100):
        """
        Run the complete VAE-LSTM pipeline

        Args:
            csv_file_path (str): Path to the CSV file
            vae_epochs (int): Epochs for VAE training
            lstm_epochs (int): Epochs for LSTM training
        """
        print("="*50)
        print("STARTING VAE-LSTM DROPOUT PREDICTION PIPELINE")
        print("="*50)

        # Step 1: Load and preprocess data
        X, y = self.load_and_preprocess_data(csv_file_path)

        # Step 2: Build and train VAE
        self.build_vae()
        self.train_vae(epochs=vae_epochs)

        # Step 3: Create balanced dataset
        balanced_X, balanced_y = self.create_balanced_dataset()

        # Step 4: Build and train LSTM
        self.build_lstm_model()
        lstm_history, X_test, y_test, test_predictions = self.train_lstm(epochs=lstm_epochs)

        # Step 5: Visualize results
        print("\n" + "="*50)
        print("TRAINING COMPLETED - GENERATING VISUALIZATIONS")
        print("="*50)

        self.plot_training_history()
        self.plot_confusion_matrix(y_test, test_predictions)

        print("\n" + "="*50)
        print("PIPELINE COMPLETED SUCCESSFULLY!")
        print("="*50)

        return self.test_metrics

# Example usage
if __name__ == "__main__":
    # Initialize the predictor
    predictor = VAELSTMDropoutPredictor(
        sequence_length=30,
        features_per_day=7,
        latent_dim=32
    )

    # Run the complete pipeline
    # Replace 'your_data.csv' with the actual path to your CSV file
    csv_file_path = 'model1_210_features.csv'

    try:
        final_metrics = predictor.run_complete_pipeline(
            csv_file_path=csv_file_path,
            vae_epochs=50,  # Reduce for faster training
            lstm_epochs=50
        )

        print("\nFinal Model Performance:")
        for metric, value in final_metrics.items():
            print(f"{metric}: {value:.4f}")

    except FileNotFoundError:
        print(f"Error: Could not find the file '{csv_file_path}'")
        print("Please make sure the CSV file is in the correct location.")

    except Exception as e:
        print(f"An error occurred: {str(e)}")

    # Example of making predictions on new data
    # Uncomment the following lines if you want to test predictions
    """
    # Create some dummy test data (replace with real data)
    test_sequences = np.random.rand(5, 30, 7)  # 5 students, 30 days, 7 features

    # Make predictions
    dropout_probabilities = predictor.predict_dropout(test_sequences)

    print("\nSample Predictions:")
    for i, prob in enumerate(dropout_probabilities):
        risk_level = "High" if prob > 0.7 else "Medium" if prob > 0.3 else "Low"
        print(f"Student {i+1}: {prob:.3f} probability ({risk_level} risk)")
    """

STARTING VAE-LSTM DROPOUT PREDICTION PIPELINE
Loading and preprocessing data...
Loaded 120542 samples with 214 columns
Class distribution: {np.int64(0): np.int64(24961), np.int64(1): np.int64(95581)}
Dropout samples: 95581
No dropout samples: 24961
Imbalance ratio: 3.83
Building VAE model...
VAE model built successfully!
Training VAE on minority class (no dropout)...
Epoch 1/50
[1m623/624[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 4ms/step - kl_loss: 6885455360.0000 - loss: 3442727936.0000 - reconstruction_loss: 929.9529An error occurred: No loss to compute. Provide a `loss` argument in `compile()`.


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
from sklearn.base import BaseEstimator, ClassifierMixin
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, Model, optimizers
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.utils import to_categorical
from scipy.stats import uniform, randint
import warnings
warnings.filterwarnings('ignore')

class AttentionLayer(layers.Layer):
    """
    Custom Attention Layer for LSTM
    """
    def __init__(self, units=128, **kwargs):
        super(AttentionLayer, self).__init__(**kwargs)
        self.units = units

    def build(self, input_shape):
        # Attention weights
        self.W_a = self.add_weight(
            name='W_a',
            shape=(input_shape[-1], self.units),
            initializer='glorot_uniform',
            trainable=True
        )
        self.U_a = self.add_weight(
            name='U_a',
            shape=(input_shape[-1], self.units),
            initializer='glorot_uniform',
            trainable=True
        )
        self.V_a = self.add_weight(
            name='V_a',
            shape=(self.units, 1),
            initializer='glorot_uniform',
            trainable=True
        )
        super(AttentionLayer, self).build(input_shape)

    def call(self, inputs):
        # inputs shape: (batch_size, time_steps, features)
        seq_len = tf.shape(inputs)[1]

        # Calculate attention scores
        # For each time step, compute attention with all other time steps
        scores = []
        for i in range(seq_len):
            # Current hidden state
            h_i = inputs[:, i:i+1, :]  # (batch_size, 1, features)
            h_i_repeated = tf.repeat(h_i, seq_len, axis=1)  # (batch_size, seq_len, features)

            # Attention mechanism
            score = tf.nn.tanh(
                tf.matmul(inputs, self.W_a) + tf.matmul(h_i_repeated, self.U_a)
            )  # (batch_size, seq_len, units)

            score = tf.matmul(score, self.V_a)  # (batch_size, seq_len, 1)
            scores.append(score)

        # Concatenate all scores
        attention_scores = tf.concat(scores, axis=-1)  # (batch_size, seq_len, seq_len)
        attention_weights = tf.nn.softmax(attention_scores, axis=1)

        # Apply attention weights
        context_vector = tf.matmul(attention_weights, inputs, transpose_a=True)  # (batch_size, seq_len, features)

        # Global attention pooling
        global_attention = tf.reduce_mean(context_vector, axis=1)  # (batch_size, features)

        return global_attention, attention_weights

    def compute_output_shape(self, input_shape):
        return [(input_shape[0], input_shape[2]), (input_shape[0], input_shape[1], input_shape[1])]

    def get_config(self):
        config = super(AttentionLayer, self).get_config()
        config.update({'units': self.units})
        return config

class SelfAttentionLayer(layers.Layer):
    """
    Self-Attention Layer (Simplified Transformer-style)
    """
    def __init__(self, d_model=64, num_heads=8, **kwargs):
        super(SelfAttentionLayer, self).__init__(**kwargs)
        self.d_model = d_model
        self.num_heads = num_heads

    def build(self, input_shape):
        self.wq = layers.Dense(self.d_model)
        self.wk = layers.Dense(self.d_model)
        self.wv = layers.Dense(self.d_model)
        self.dense = layers.Dense(input_shape[-1])
        super(SelfAttentionLayer, self).build(input_shape)

    def scaled_dot_product_attention(self, q, k, v):
        """Calculate the attention weights and apply to values"""
        matmul_qk = tf.matmul(q, k, transpose_b=True)

        # Scale matmul_qk
        dk = tf.cast(tf.shape(k)[-1], tf.float32)
        scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

        # Softmax
        attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)

        # Apply attention to values
        output = tf.matmul(attention_weights, v)
        return output, attention_weights

    def call(self, inputs):
        batch_size = tf.shape(inputs)[0]
        seq_len = tf.shape(inputs)[1]

        q = self.wq(inputs)
        k = self.wk(inputs)
        v = self.wv(inputs)

        # Apply attention
        attention_output, attention_weights = self.scaled_dot_product_attention(q, k, v)

        # Final linear transformation
        output = self.dense(attention_output)

        # Residual connection and layer norm
        output = layers.LayerNormalization()(output + inputs)

        return output

    def get_config(self):
        config = super(SelfAttentionLayer, self).get_config()
        config.update({
            'd_model': self.d_model,
            'num_heads': self.num_heads
        })
        return config

class AttentionLSTMClassifier(BaseEstimator, ClassifierMixin):
    """
    LSTM with Attention Mechanism wrapped as sklearn estimator
    """
    def __init__(self,
                 lstm_units=64,
                 attention_units=128,
                 dropout_rate=0.2,
                 recurrent_dropout=0.2,
                 dense_units=32,
                 learning_rate=0.001,
                 batch_size=32,
                 epochs=50,
                 attention_type='custom',
                 use_bidirectional=True,
                 num_lstm_layers=2):

        self.lstm_units = lstm_units
        self.attention_units = attention_units
        self.dropout_rate = dropout_rate
        self.recurrent_dropout = recurrent_dropout
        self.dense_units = dense_units
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.epochs = epochs
        self.attention_type = attention_type
        self.use_bidirectional = use_bidirectional
        self.num_lstm_layers = num_lstm_layers
        self.model = None
        self.scaler = StandardScaler()
        self.history = None

    def _build_model(self, input_shape):
        """Build the Attention-LSTM model"""
        inputs = layers.Input(shape=input_shape)
        x = inputs

        # LSTM layers
        for i in range(self.num_lstm_layers):
            return_sequences = True if i < self.num_lstm_layers - 1 or self.attention_type != 'none' else False

            lstm_layer = layers.LSTM(
                self.lstm_units,
                return_sequences=return_sequences,
                dropout=self.dropout_rate,
                recurrent_dropout=self.recurrent_dropout,
                name=f'lstm_{i+1}'
            )

            if self.use_bidirectional:
                x = layers.Bidirectional(lstm_layer, name=f'bidirectional_{i+1}')(x)
            else:
                x = lstm_layer(x)

        # Attention mechanism
        if self.attention_type == 'custom':
            attention_output, attention_weights = AttentionLayer(
                units=self.attention_units,
                name='attention_layer'
            )(x)
            x = attention_output

        elif self.attention_type == 'self':
            x = SelfAttentionLayer(
                d_model=self.lstm_units * (2 if self.use_bidirectional else 1),
                name='self_attention'
            )(x)
            x = layers.GlobalAveragePooling1D()(x)

        elif self.attention_type == 'global':
            # Global attention pooling
            attention_weights = layers.Dense(1, activation='softmax')(x)
            x = tf.reduce_sum(x * attention_weights, axis=1)

        elif self.attention_type == 'none':
            # No attention, just use last output
            pass

        # Dense layers
        x = layers.Dense(self.dense_units, activation='relu', name='dense_1')(x)
        x = layers.Dropout(self.dropout_rate)(x)
        x = layers.Dense(self.dense_units // 2, activation='relu', name='dense_2')(x)
        x = layers.Dropout(self.dropout_rate)(x)

        # Output layer
        outputs = layers.Dense(1, activation='sigmoid', name='output')(x)

        model = Model(inputs, outputs, name='attention_lstm_classifier')

        # Compile model
        model.compile(
            optimizer=optimizers.Adam(learning_rate=self.learning_rate),
            loss='binary_crossentropy',
            metrics=['accuracy', 'precision', 'recall']
        )

        return model

    def fit(self, X, y):
        """Fit the model"""
        # Handle input shape
        if len(X.shape) == 2:
            # Reshape to (samples, timesteps, features)
            X = X.reshape(X.shape[0], 30, 7)  # 30 days, 7 features per day

        # Scale features
        original_shape = X.shape
        X_flat = X.reshape(-1, X.shape[-1])
        X_scaled = self.scaler.fit_transform(X_flat)
        X = X_scaled.reshape(original_shape)

        # Build model
        self.model = self._build_model(X.shape[1:])

        # Callbacks
        callbacks = [
            EarlyStopping(
                monitor='val_loss',
                patience=10,
                restore_best_weights=True,
                verbose=0
            ),
            ReduceLROnPlateau(
                monitor='val_loss',
                factor=0.5,
                patience=5,
                verbose=0
            )
        ]

        # Train model
        self.history = self.model.fit(
            X, y,
            batch_size=self.batch_size,
            epochs=self.epochs,
            validation_split=0.2,
            callbacks=callbacks,
            verbose=0
        )

        return self

    def predict(self, X):
        """Make predictions"""
        if len(X.shape) == 2:
            X = X.reshape(X.shape[0], 30, 7)

        # Scale features
        original_shape = X.shape
        X_flat = X.reshape(-1, X.shape[-1])
        X_scaled = self.scaler.transform(X_flat)
        X = X_scaled.reshape(original_shape)

        predictions = self.model.predict(X, verbose=0)
        return (predictions > 0.5).astype(int).flatten()

    def predict_proba(self, X):
        """Predict probabilities"""
        if len(X.shape) == 2:
            X = X.reshape(X.shape[0], 30, 7)

        # Scale features
        original_shape = X.shape
        X_flat = X.reshape(-1, X.shape[-1])
        X_scaled = self.scaler.transform(X_flat)
        X = X_scaled.reshape(original_shape)

        predictions = self.model.predict(X, verbose=0)
        return np.column_stack([1 - predictions.flatten(), predictions.flatten()])

    def score(self, X, y):
        """Score the model (accuracy)"""
        predictions = self.predict(X)
        return accuracy_score(y, predictions)

class StudentDropoutPredictor:
    """
    Main class for student dropout prediction with Attention-LSTM and RandomizedSearchCV
    """
    def __init__(self, sequence_length=30, features_per_day=7):
        self.sequence_length = sequence_length
        self.features_per_day = features_per_day
        self.input_shape = (sequence_length, features_per_day)

        # Models and results
        self.best_model = None
        self.randomized_search = None
        self.results = {}

    def load_and_preprocess_data(self, csv_file_path):
        """Load and preprocess the student activity data"""
        print("Loading and preprocessing data...")

        # Load data
        df = pd.read_csv(csv_file_path)
        print(f"Loaded {len(df)} samples with {len(df.columns)} columns")

        # Extract feature columns
        feature_columns = []
        for day in range(1, 31):
            feature_columns.extend([
                f'day_{day}_access',
                f'day_{day}_problem',
                f'day_{day}_wiki',
                f'day_{day}_discussion',
                f'day_{day}_navigate',
                f'day_{day}_page_close',
                f'day_{day}_video'
            ])

        # Extract features and labels
        X = df[feature_columns].values
        y = df['dropout'].values

        # Reshape for time series
        X_sequences = X.reshape(-1, self.sequence_length, self.features_per_day)

        # Analyze class distribution
        unique, counts = np.unique(y, return_counts=True)
        class_distribution = dict(zip(unique, counts))
        print(f"Class distribution: {class_distribution}")
        print(f"Imbalance ratio: {counts[1] / counts[0]:.2f}")

        self.original_data = {
            'X': X_sequences,
            'y': y,
            'class_distribution': class_distribution
        }

        return X_sequences, y

    def define_hyperparameter_space(self):
        """Define hyperparameter search space"""
        param_distributions = {
            'lstm_units': randint(32, 128),
            'attention_units': randint(64, 256),
            'dropout_rate': uniform(0.1, 0.4),
            'recurrent_dropout': uniform(0.1, 0.3),
            'dense_units': randint(16, 64),
            'learning_rate': uniform(0.0001, 0.01),
            'batch_size': [16, 32, 64, 128],
            'attention_type': ['custom', 'self', 'global', 'none'],
            'use_bidirectional': [True, False],
            'num_lstm_layers': randint(1, 3)
        }

        return param_distributions

    def run_randomized_search(self, X, y, n_iter=50, cv_folds=3, n_jobs=-1):
        """Run RandomizedSearchCV for hyperparameter optimization"""
        print(f"Starting RandomizedSearchCV with {n_iter} iterations...")

        # Create base estimator
        base_estimator = AttentionLSTMClassifier(epochs=30)  # Reduced epochs for faster search

        # Define hyperparameter space
        param_distributions = self.define_hyperparameter_space()

        # Create stratified k-fold
        cv = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)

        # Create RandomizedSearchCV
        self.randomized_search = RandomizedSearchCV(
            estimator=base_estimator,
            param_distributions=param_distributions,
            n_iter=n_iter,
            cv=cv,
            scoring='f1',  # Using F1 score for imbalanced dataset
            n_jobs=n_jobs,
            random_state=42,
            verbose=1
        )

        # Fit the search
        print("Fitting RandomizedSearchCV...")
        self.randomized_search.fit(X, y)

        print("RandomizedSearchCV completed!")
        print(f"Best score: {self.randomized_search.best_score_:.4f}")
        print(f"Best parameters: {self.randomized_search.best_params_}")

        return self.randomized_search

    def train_best_model(self, X, y, test_size=0.2):
        """Train the best model found by RandomizedSearchCV"""
        print("Training best model with full epochs...")

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=42, stratify=y
        )

        # Get best parameters and increase epochs for final training
        best_params = self.randomized_search.best_params_.copy()
        best_params['epochs'] = 100  # Full training

        # Create and train best model
        self.best_model = AttentionLSTMClassifier(**best_params)
        self.best_model.fit(X_train, y_train)

        # Evaluate on test set
        y_pred = self.best_model.predict(X_test)
        y_pred_proba = self.best_model.predict_proba(X_test)[:, 1]

        # Calculate metrics
        self.results = {
            'accuracy': accuracy_score(y_test, y_pred),
            'precision': precision_score(y_test, y_pred),
            'recall': recall_score(y_test, y_pred),
            'f1_score': f1_score(y_test, y_pred),
            'roc_auc': roc_auc_score(y_test, y_pred_proba),
            'best_params': best_params,
            'y_test': y_test,
            'y_pred': y_pred,
            'y_pred_proba': y_pred_proba
        }

        print("\nBest Model Performance:")
        for metric in ['accuracy', 'precision', 'recall', 'f1_score', 'roc_auc']:
            print(f"{metric.capitalize()}: {self.results[metric]:.4f}")

        return self.results

    def plot_hyperparameter_analysis(self):
        """Plot hyperparameter analysis from RandomizedSearchCV"""
        if self.randomized_search is None:
            print("No RandomizedSearchCV results to plot")
            return

        # Convert results to DataFrame
        results_df = pd.DataFrame(self.randomized_search.cv_results_)

        # Create subplots
        fig, axes = plt.subplots(2, 3, figsize=(18, 12))

        # Plot 1: Score distribution
        axes[0, 0].hist(results_df['mean_test_score'], bins=20, alpha=0.7, color='skyblue')
        axes[0, 0].axvline(self.randomized_search.best_score_, color='red', linestyle='--',
                          label=f'Best Score: {self.randomized_search.best_score_:.4f}')
        axes[0, 0].set_title('Cross-Validation Score Distribution')
        axes[0, 0].set_xlabel('F1 Score')
        axes[0, 0].set_ylabel('Frequency')
        axes[0, 0].legend()

        # Plot 2: LSTM units vs Score
        axes[0, 1].scatter(results_df['param_lstm_units'], results_df['mean_test_score'], alpha=0.6)
        axes[0, 1].set_title('LSTM Units vs F1 Score')
        axes[0, 1].set_xlabel('LSTM Units')
        axes[0, 1].set_ylabel('F1 Score')

        # Plot 3: Learning rate vs Score
        axes[0, 2].scatter(results_df['param_learning_rate'], results_df['mean_test_score'], alpha=0.6)
        axes[0, 2].set_title('Learning Rate vs F1 Score')
        axes[0, 2].set_xlabel('Learning Rate')
        axes[0, 2].set_ylabel('F1 Score')
        axes[0, 2].set_xscale('log')

        # Plot 4: Attention type performance
        attention_performance = results_df.groupby('param_attention_type')['mean_test_score'].mean().sort_values()
        axes[1, 0].bar(attention_performance.index, attention_performance.values, color='lightcoral')
        axes[1, 0].set_title('Attention Type Performance')
        axes[1, 0].set_xlabel('Attention Type')
        axes[1, 0].set_ylabel('Mean F1 Score')
        axes[1, 0].tick_params(axis='x', rotation=45)

        # Plot 5: Bidirectional vs Unidirectional
        bidirectional_performance = results_df.groupby('param_use_bidirectional')['mean_test_score'].mean()
        axes[1, 1].bar(['Unidirectional', 'Bidirectional'],
                      [bidirectional_performance[False], bidirectional_performance[True]],
                      color=['lightblue', 'lightgreen'])
        axes[1, 1].set_title('Bidirectional vs Unidirectional LSTM')
        axes[1, 1].set_ylabel('Mean F1 Score')

        # Plot 6: Top 10 parameter combinations
        top_10 = results_df.nlargest(10, 'mean_test_score')
        axes[1, 2].barh(range(10), top_10['mean_test_score'].values[::-1])
        axes[1, 2].set_title('Top 10 Parameter Combinations')
        axes[1, 2].set_xlabel('F1 Score')
        axes[1, 2].set_ylabel('Rank')
        axes[1, 2].set_yticks(range(10))
        axes[1, 2].set_yticklabels([f'#{i+1}' for i in range(10)])

        plt.tight_layout()
        plt.show()

    def plot_model_performance(self):
        """Plot model performance metrics and curves"""
        if not self.results:
            print("No model results to plot")
            return

        fig, axes = plt.subplots(2, 3, figsize=(18, 12))

        # Plot 1: Performance metrics
        metrics = ['accuracy', 'precision', 'recall', 'f1_score', 'roc_auc']
        values = [self.results[metric] for metric in metrics]

        bars = axes[0, 0].bar(metrics, values, color=['skyblue', 'lightgreen', 'lightcoral', 'lightyellow', 'lightpink'])
        axes[0, 0].set_title('Model Performance Metrics')
        axes[0, 0].set_ylabel('Score')
        axes[0, 0].set_ylim(0, 1)

        # Add value labels on bars
        for bar, value in zip(bars, values):
            axes[0, 0].text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
                           f'{value:.3f}', ha='center', va='bottom')

        # Plot 2: Confusion Matrix
        cm = confusion_matrix(self.results['y_test'], self.results['y_pred'])
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes[0, 1],
                   xticklabels=['No Dropout', 'Dropout'],
                   yticklabels=['No Dropout', 'Dropout'])
        axes[0, 1].set_title('Confusion Matrix')
        axes[0, 1].set_ylabel('True Label')
        axes[0, 1].set_xlabel('Predicted Label')

        # Plot 3: ROC Curve
        fpr, tpr, _ = roc_curve(self.results['y_test'], self.results['y_pred_proba'])
        axes[0, 2].plot(fpr, tpr, color='darkorange', lw=2,
                       label=f'ROC curve (AUC = {self.results["roc_auc"]:.3f})')
        axes[0, 2].plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        axes[0, 2].set_xlim([0.0, 1.0])
        axes[0, 2].set_ylim([0.0, 1.05])
        axes[0, 2].set_xlabel('False Positive Rate')
        axes[0, 2].set_ylabel('True Positive Rate')
        axes[0, 2].set_title('ROC Curve')
        axes[0, 2].legend(loc="lower right")

        # Plot 4: Training History (if available)
        if hasattr(self.best_model, 'history') and self.best_model.history:
            history = self.best_model.history.history

            axes[1, 0].plot(history['loss'], label='Training Loss')
            axes[1, 0].plot(history['val_loss'], label='Validation Loss')
            axes[1, 0].set_title('Training History - Loss')
            axes[1, 0].set_xlabel('Epoch')
            axes[1, 0].set_ylabel('Loss')
            axes[1, 0].legend()

            axes[1, 1].plot(history['accuracy'], label='Training Accuracy')
            axes[1, 1].plot(history['val_accuracy'], label='Validation Accuracy')
            axes[1, 1].set_title('Training History - Accuracy')
            axes[1, 1].set_xlabel('Epoch')
            axes[1, 1].set_ylabel('Accuracy')
            axes[1, 1].legend()

        # Plot 5: Prediction Distribution
        axes[1, 2].hist(self.results['y_pred_proba'], bins=30, alpha=0.7, color='lightblue')
        axes[1, 2].set_title('Prediction Probability Distribution')
        axes[1, 2].set_xlabel('Predicted Dropout Probability')
        axes[1, 2].set_ylabel('Frequency')

        plt.tight_layout()
        plt.show()

    def run_complete_pipeline(self, csv_file_path, n_iter=30, cv_folds=3):
        """Run the complete pipeline"""
        print("="*60)
        print("ATTENTION-LSTM WITH RANDOMIZEDSEARCHCV PIPELINE")
        print("="*60)

        # Load and preprocess data
        X, y = self.load_and_preprocess_data(csv_file_path)

        # Run RandomizedSearchCV
        self.run_randomized_search(X, y, n_iter=n_iter, cv_folds=cv_folds)

        # Train best model
        results = self.train_best_model(X, y)

        # Plot results
        print("\n" + "="*60)
        print("GENERATING VISUALIZATIONS")
        print("="*60)

        self.plot_hyperparameter_analysis()
        self.plot_model_performance()

        # Print classification report
        print("\nDetailed Classification Report:")
        print(classification_report(
            self.results['y_test'],
            self.results['y_pred'],
            target_names=['No Dropout', 'Dropout']
        ))

        print("\n" + "="*60)
        print("PIPELINE COMPLETED SUCCESSFULLY!")
        print("="*60)

        return results

# Example usage
if __name__ == "__main__":
    # Initialize the predictor
    predictor = StudentDropoutPredictor(
        sequence_length=30,
        features_per_day=7
    )

    # Run the complete pipeline
    csv_file_path = 'model1_210_features.csv'

    try:
        results = predictor.run_complete_pipeline(
            csv_file_path=csv_file_path,
            n_iter=20,  # Reduced for faster execution, increase for better results
            cv_folds=3
        )

        print("\nBest Model Configuration:")
        for param, value in results['best_params'].items():
            print(f"{param}: {value}")

        print(f"\nFinal Performance:")
        print(f"Accuracy: {results['accuracy']:.4f}")
        print(f"F1-Score: {results['f1_score']:.4f}")
        print(f"ROC-AUC: {results['roc_auc']:.4f}")

    except FileNotFoundError:
        print(f"Error: Could not find the file '{csv_file_path}'")
        print("Please make sure the CSV file is in the correct location.")
    except Exception as e:
        print(f"An error occurred: {str(e)}")

    # Example of making predictions on new data
    """
    if predictor.best_model:
        # Create dummy test data (replace with real data)
        test_sequences = np.random.rand(5, 30, 7)

        # Make predictions
        predictions = predictor.best_model.predict(test_sequences)
        probabilities = predictor.best_model.predict_proba(test_sequences)[:, 1]

        print("\nSample Predictions:")
        for i, (pred, prob) in enumerate(zip(predictions, probabilities)):
            risk_level = "High" if prob > 0.7 else "Medium" if prob > 0.3 else "Low"
            print(f"Student {i+1}: Prediction={pred}, Probability={prob:.3f} ({risk_level} risk)")
    """

ATTENTION-LSTM WITH RANDOMIZEDSEARCHCV PIPELINE
Loading and preprocessing data...
Loaded 120542 samples with 214 columns
Class distribution: {np.int64(0): np.int64(24961), np.int64(1): np.int64(95581)}
Imbalance ratio: 3.83
Starting RandomizedSearchCV with 20 iterations...
Fitting RandomizedSearchCV...
Fitting 3 folds for each of 20 candidates, totalling 60 fits
