In [1]:
import pandas as pd
import numpy as np
import random

def generate_high_accuracy_dataset(num_students=50000, seed=None):
    """
    Generates a large-scale synthetic dataset with strong correlations
    to facilitate higher model accuracy.

    Every record is built from a per-student ``base_performance`` factor
    (a normal draw scaled by a per-university weight and clipped to
    [0.4, 1.2]); engagement metrics, scores and the final grade are all
    derived from that factor plus a small amount of random variation.

    Args:
        num_students (int): The number of student records to generate.
        seed (int | None): Optional seed for reproducible output. When given,
            private ``random.Random`` / ``numpy.random.RandomState`` generators
            are used, so the global RNG state is left untouched. When ``None``
            (the default) the module-level ``random`` and ``np.random``
            generators are used, exactly as before.

    Returns:
        pandas.DataFrame: A DataFrame containing the synthetic student data.
        The columns are always present, even when ``num_students`` is 0.
    """
    universities = [
        'IIT Madras', 'IISc Bangalore', 'NIT Tiruchirappalli',
        'Vellore Institute of Technology', 'Amrita Vishwa Vidyapeetham',
        'University of Hyderabad', 'NIT Surathkal', 'Anna University'
    ]
    university_weights = {
        'IIT Madras': 1.05, 'IISc Bangalore': 1.06, 'NIT Tiruchirappalli': 1.03,
        'Vellore Institute of Technology': 1.0, 'Amrita Vishwa Vidyapeetham': 0.98,
        'University of Hyderabad': 0.99, 'NIT Surathkal': 1.02, 'Anna University': 0.97
    }

    # Explicit column order so an empty result still carries the full schema.
    columns = [
        'student_id',
        'university_name',
        'login_frequency_per_week',
        'time_spent_hours_per_week',
        'discussion_posts_per_semester',
        'resource_access_per_week',
        'avg_quiz_score',
        'assignment_completion_rate',
        'midterm_score',
        'prior_cgpa',
        'final_grade',
    ]

    # Seeded runs draw from private generators; unseeded runs keep using the
    # global ones so callers that seed `random` / `np.random` themselves still work.
    if seed is None:
        py_rng, np_rng = random, np.random
    else:
        py_rng, np_rng = random.Random(seed), np.random.RandomState(seed)

    data = []

    for i in range(num_students):
        student_id = f'SID{i+1:05d}'
        university = py_rng.choice(universities)
        weight = university_weights[university]

        # Base performance factor is now more deterministic
        base_performance = np_rng.normal(0.9, 0.15) * weight
        base_performance = np.clip(base_performance, 0.4, 1.2)

        prior_cgpa = np_rng.uniform(5.0, 9.8) * base_performance
        prior_cgpa = np.clip(prior_cgpa, 5.0, 10.0)

        # Engagement metrics are tightly coupled with base performance
        time_spent = round(np_rng.uniform(1, 15) * base_performance, 1)
        login_frequency = int(time_spent / 1.5 + np_rng.uniform(0, 2))
        discussion_posts = int(time_spent * 2.5 * base_performance)
        resource_access = int(time_spent * 5 * base_performance)

        assignment_completion = int(np_rng.uniform(60, 100) * base_performance)
        avg_quiz_score = int(assignment_completion * np_rng.uniform(0.9, 1.05) * base_performance)
        # NOTE(review): the midterm is derived from the *unclipped* quiz and
        # assignment values (which can exceed 100); confirm this is intended.
        midterm_score = int((avg_quiz_score * 0.5 + assignment_completion * 0.5) * np_rng.uniform(0.95, 1.05))

        # Clipping scores to ensure they are within the valid 0-100 range
        avg_quiz_score = np.clip(avg_quiz_score, 0, 100)
        assignment_completion = np.clip(assignment_completion, 0, 100)
        midterm_score = np.clip(midterm_score, 0, 100)

        # Target Variable: Final Grade with stronger weights and less noise
        final_grade = int(
            prior_cgpa * 2.0 +
            time_spent * 0.5 +
            avg_quiz_score * 0.3 +
            assignment_completion * 0.2 +
            midterm_score * 0.4 +
            np_rng.normal(0, 1.0) # Reduced noise from 2.5 to 1.0
        )
        final_grade = np.clip(final_grade, 38, 100)

        data.append({
            'student_id': student_id,
            'university_name': university,
            'login_frequency_per_week': login_frequency,
            'time_spent_hours_per_week': time_spent,
            'discussion_posts_per_semester': discussion_posts,
            'resource_access_per_week': resource_access,
            'avg_quiz_score': avg_quiz_score,
            'assignment_completion_rate': assignment_completion,
            'midterm_score': midterm_score,
            'prior_cgpa': round(prior_cgpa, 2),
            'final_grade': final_grade
        })

    return pd.DataFrame(data, columns=columns)

# Seed both global RNGs before generating: the generator draws from `random`
# (university choice) and `np.random` (all numeric draws), so without this the
# CSV -- and every metric computed from it -- changes on each run.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

number_of_records = 50000 

high_acc_df = generate_high_accuracy_dataset(num_students=number_of_records)

# Written to the current working directory without the DataFrame index.
output_filename = 'DL(Term_Paper).csv'
high_acc_df.to_csv(output_filename, index=False)

print(f"Successfully generated '{output_filename}' with {len(high_acc_df)} records.")
print("This dataset has stronger correlations and should yield higher model accuracy.")

Successfully generated 'DL(Term_Paper).csv' with 50000 records.
This dataset has stronger correlations and should yield higher model accuracy.


In [2]:
# ==============================================================================
# Term Paper Implementation: Predicting Student Success
# ==============================================================================
# This script performs the following actions:
# 1. Loads the 'DL(Term_Paper).csv' dataset.
# 2. Preprocesses the data as described in the paper.
# 3. Trains and evaluates Random Forest, XGBoost, LSTM, and Transformer models.
# 4. Prints a summary of the results.
# 5. Generates and saves the figures required for the LaTeX paper.
# ==============================================================================

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time

# Scikit-learn for preprocessing and baseline models
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
import xgboost as xgb

# TensorFlow and Keras for Deep Learning models
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# --- Configuration ---
# CSV file that load_and_preprocess_data() checks for and reads; the path is
# relative, so it is resolved against the current working directory.
DATASET_FILENAME = 'DL(Term_Paper).csv'
# Folder that generate_visualizations() creates when missing and saves the
# PNG figures into.
FIGURES_DIR = 'figures'

# ==============================================================================
# SECTION 1: DATA LOADING AND PREPROCESSING
# ==============================================================================

def load_and_preprocess_data():
    """Loads and preprocesses the data according to the paper's methodology.

    Pipeline:
        1. Read ``DATASET_FILENAME`` into a DataFrame.
        2. One-hot encode ``university_name`` (columns prefixed ``uni``).
        3. Drop ``student_id`` and use ``final_grade`` as the target.
        4. Split 80/20 into train/test with ``random_state=42``.
        5. Min-max scale the features; the scaler is fitted on the training
           split only and then applied to the test split.

    Returns:
        tuple: ``(X_train_scaled_df, X_test_scaled_df, y_train, y_test)``.
        The scaled feature frames keep the column names and the row index of
        their unscaled counterparts, so they stay aligned with ``y_train`` /
        ``y_test``.

    Raises:
        FileNotFoundError: If ``DATASET_FILENAME`` does not exist.
    """
    print("🚀 Starting: Loading and preprocessing data...")

    # Raise instead of calling exit(): exit() is an interactive-shell helper
    # and, inside a notebook, asks the kernel to shut down.
    if not os.path.exists(DATASET_FILENAME):
        raise FileNotFoundError(
            f"Error: Dataset file '{DATASET_FILENAME}' not found! "
            "Please make sure the dataset is in the same directory as this script."
        )

    df = pd.read_csv(DATASET_FILENAME)

    # One-Hot Encoding for the 'university_name' column
    df = pd.get_dummies(df, columns=['university_name'], prefix='uni')

    # Separate features (X) and target (y)
    X = df.drop(columns=['student_id', 'final_grade'])
    y = df['final_grade']

    # Split data into training and testing sets (80% train, 20% test)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Scale numerical features using MinMaxScaler (fit on train only)
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Convert scaled arrays back to DataFrames to keep column names. The
    # original index is carried over too; otherwise the features would get a
    # fresh 0..n-1 index while y keeps the shuffled one, and any index-based
    # operation combining them would silently misalign rows.
    X_train_scaled_df = pd.DataFrame(
        X_train_scaled, columns=X_train.columns, index=X_train.index
    )
    X_test_scaled_df = pd.DataFrame(
        X_test_scaled, columns=X_test.columns, index=X_test.index
    )

    print("✅ Preprocessing complete.")
    return X_train_scaled_df, X_test_scaled_df, y_train, y_test

# ==============================================================================
# SECTION 2: BASELINE MODELS (RANDOM FOREST & XGBOOST)
# ==============================================================================

def _fit_and_score_baseline(model, X_train, y_train, X_test, y_test):
    """Fits `model` on the training split and returns (R2, RMSE) on the test split."""
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    r2 = r2_score(y_test, predictions)
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    return r2, rmse


def train_baseline_models(X_train, y_train, X_test, y_test):
    """Trains and evaluates the Random Forest and XGBoost models.

    Args:
        X_train: Training features; must expose ``.columns`` (used to label
            the feature importances).
        y_train: Training target values.
        X_test: Test features.
        y_test: Test target values.

    Returns:
        tuple: ``(results, feature_importances)`` where ``results`` maps
        ``'Random Forest'`` and ``'XGBoost'`` to ``{'R2': ..., 'RMSE': ...}``
        and ``feature_importances`` is a ``pandas.Series`` of the XGBoost
        model's importances indexed by feature name.
    """
    print("\n🤖 Training Baseline Models...")
    results = {}

    # --- Random Forest ---
    print("   - Training Random Forest...")
    rf_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
    r2_rf, rmse_rf = _fit_and_score_baseline(rf_model, X_train, y_train, X_test, y_test)
    results['Random Forest'] = {'R2': r2_rf, 'RMSE': rmse_rf}
    print(f"   -> Random Forest finished. R2: {r2_rf:.4f}, RMSE: {rmse_rf:.4f}")

    # --- XGBoost ---
    print("   - Training XGBoost...")
    xgb_model = xgb.XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42, n_jobs=-1)
    r2_xgb, rmse_xgb = _fit_and_score_baseline(xgb_model, X_train, y_train, X_test, y_test)
    results['XGBoost'] = {'R2': r2_xgb, 'RMSE': rmse_xgb}
    print(f"   -> XGBoost finished. R2: {r2_xgb:.4f}, RMSE: {rmse_xgb:.4f}")

    # Extract feature importances for later plotting
    feature_importances = pd.Series(xgb_model.feature_importances_, index=X_train.columns)

    return results, feature_importances
    
# ==============================================================================
# SECTION 3: DEEP LEARNING MODELS (LSTM & TRANSFORMER)
# ==============================================================================

def train_deep_learning_models(X_train, y_train, X_test, y_test, epochs=50, batch_size=64):
    """Defines, trains, and evaluates the LSTM and Transformer models.

    Both networks receive each row as a sequence of length 1, i.e. the
    features are reshaped to ``(samples, 1, features)``. Both are compiled
    with the Adam optimizer and mean-squared-error loss, and trained with
    20% of the training data held out via ``validation_split``.

    Args:
        X_train: Training features; must expose ``.values`` and ``.shape``.
        y_train: Training target values.
        X_test: Test features; must expose ``.values`` and ``.shape``.
        y_test: Test target values.
        epochs (int): Number of training epochs for each model. Defaults to
            50, the value previously hard-coded.
        batch_size (int): Mini-batch size for each model. Defaults to 64, the
            value previously hard-coded.

    Returns:
        tuple: ``(results, history_lstm)`` where ``results`` maps ``'LSTM'``
        and ``'Transformer'`` to ``{'R2': ..., 'RMSE': ...}`` and
        ``history_lstm`` is the object returned by the LSTM's ``fit`` call.
        The Transformer's training history is not returned.
    """
    print("\n🧠 Training Deep Learning Models...")
    results = {}

    # Reshape data for DL models: (samples, timesteps, features)
    # For tabular data, we have 1 timestep.
    X_train_reshaped = np.reshape(X_train.values, (X_train.shape[0], 1, X_train.shape[1]))
    X_test_reshaped = np.reshape(X_test.values, (X_test.shape[0], 1, X_test.shape[1]))

    # --- LSTM Model ---
    print("   - Building and training LSTM model...")
    lstm_model = keras.Sequential([
        layers.Input(shape=(1, X_train.shape[1])),
        layers.LSTM(64, return_sequences=True),
        layers.Dropout(0.2),
        layers.LSTM(64),
        layers.Dense(32, activation='relu'),
        layers.Dense(1)
    ])
    lstm_model.compile(optimizer='adam', loss='mean_squared_error')

    history_lstm = lstm_model.fit(
        X_train_reshaped, y_train,
        epochs=epochs,
        batch_size=batch_size,
        validation_split=0.2,
        verbose=0 # Set to 1 to see live training progress
    )

    print("   -> LSTM training complete.")
    y_pred_lstm = lstm_model.predict(X_test_reshaped).flatten()
    r2_lstm = r2_score(y_test, y_pred_lstm)
    rmse_lstm = np.sqrt(mean_squared_error(y_test, y_pred_lstm))
    results['LSTM'] = {'R2': r2_lstm, 'RMSE': rmse_lstm}
    print(f"   -> LSTM finished. R2: {r2_lstm:.4f}, RMSE: {rmse_lstm:.4f}")

    # --- Transformer Model ---
    def transformer_encoder(inputs):
        """One encoder block: self-attention and a feed-forward net, each
        followed by a residual connection and layer normalization."""
        x = layers.MultiHeadAttention(key_dim=64, num_heads=4, dropout=0.1)(inputs, inputs)
        x = layers.LayerNormalization(epsilon=1e-6)(inputs + x)
        # The last Dense layer projects back to the input width so the
        # residual addition below has matching shapes.
        ff_net = keras.Sequential([layers.Dense(32, activation="relu"), layers.Dense(inputs.shape[-1]),])
        x_ff = ff_net(x)
        return layers.LayerNormalization(epsilon=1e-6)(x + x_ff)

    print("   - Building and training Transformer model...")
    inputs = layers.Input(shape=(1, X_train.shape[1]))
    x = transformer_encoder(inputs)
    x = layers.GlobalAveragePooling1D(data_format="channels_last")(x)
    x = layers.Dense(64, activation="relu")(x)
    x = layers.Dropout(0.2)(x)
    outputs = layers.Dense(1)(x)
    transformer_model = keras.Model(inputs=inputs, outputs=outputs)

    transformer_model.compile(optimizer='adam', loss='mean_squared_error')

    transformer_model.fit(
        X_train_reshaped, y_train,
        epochs=epochs,
        batch_size=batch_size,
        validation_split=0.2,
        verbose=0
    )

    print("   -> Transformer training complete.")
    y_pred_transformer = transformer_model.predict(X_test_reshaped).flatten()
    r2_transformer = r2_score(y_test, y_pred_transformer)
    rmse_transformer = np.sqrt(mean_squared_error(y_test, y_pred_transformer))
    results['Transformer'] = {'R2': r2_transformer, 'RMSE': rmse_transformer}
    print(f"   -> Transformer finished. R2: {r2_transformer:.4f}, RMSE: {rmse_transformer:.4f}")

    return results, history_lstm

# ==============================================================================
# SECTION 4: RESULTS AGGREGATION AND VISUALIZATION
# ==============================================================================

def _save_and_close_figure(fig, filename):
    """Saves `fig` into FIGURES_DIR as `filename`, reports it, and closes the figure."""
    fig.tight_layout()
    fig.savefig(os.path.join(FIGURES_DIR, filename))
    print(f"✅ Saved '{filename}' to '{FIGURES_DIR}/' folder.")
    # Close this specific figure rather than whatever happens to be "current".
    plt.close(fig)


def generate_visualizations(all_results, feature_importances, history_lstm):
    """Generates and saves all the figures for the paper.

    Prints a performance table and writes three PNG files into
    ``FIGURES_DIR``: ``rmse_chart.png``, ``feature_importance.png`` and
    ``loss_curves.png``.

    Args:
        all_results (dict): Maps model name to a dict with at least the keys
            ``'R2'`` and ``'RMSE'``.
        feature_importances (pandas.Series): Importance score per feature,
            indexed by feature name.
        history_lstm: Object whose ``.history`` dict holds the ``'loss'`` and
            ``'val_loss'`` sequences to plot.

    Returns:
        None
    """
    print("\n📊 Generating and Saving Figures...")

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(FIGURES_DIR, exist_ok=True)

    # --- 1. Performance Table (Printed to Console) ---
    results_df = pd.DataFrame(all_results).T
    print("\n--- Final Model Performance Summary ---")
    print(results_df[['R2', 'RMSE']].round(4))
    print("---------------------------------------")

    # --- 2. RMSE Bar Chart ---
    plt.style.use('seaborn-v0_8-whitegrid')
    fig, ax = plt.subplots(figsize=(10, 6))
    models = list(all_results.keys())
    rmse_values = [res['RMSE'] for res in all_results.values()]

    colors = ['#4c72b0', '#55a868', '#c44e52', '#8172b2']
    bars = ax.bar(models, rmse_values, color=colors)
    ax.set_ylabel('Root Mean Squared Error (RMSE)')
    ax.set_xlabel('Model')
    ax.set_title('Model Performance Comparison (RMSE)')
    ax.bar_label(bars, fmt='%.4f')

    _save_and_close_figure(fig, 'rmse_chart.png')

    # --- 3. Feature Importance Plot ---
    fig, ax = plt.subplots(figsize=(12, 8))
    feature_importances.sort_values(ascending=True).plot(kind='barh', ax=ax, color='#55a868')
    ax.set_xlabel('Feature Importance Score (XGBoost)')
    ax.set_title('Feature Importance Analysis')

    _save_and_close_figure(fig, 'feature_importance.png')

    # --- 4. Deep Learning Loss Curves ---
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(history_lstm.history['loss'], label='Training Loss', color='#c44e52')
    ax.plot(history_lstm.history['val_loss'], label='Validation Loss', color='#4c72b0')
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Mean Squared Error (Loss)')
    ax.set_title('LSTM Model Training and Validation Loss')
    ax.legend()

    _save_and_close_figure(fig, 'loss_curves.png')

# ==============================================================================
# MAIN EXECUTION BLOCK
# ==============================================================================

if __name__ == '__main__':
    # Runs the whole pipeline end to end: preprocess, train the baseline and
    # deep learning models, then report and plot the combined results.

    # Step 1: Load and preprocess data
    X_train, X_test, y_train, y_test = load_and_preprocess_data()

    # Step 2: Train and evaluate baseline models
    baseline_results, feature_importances = train_baseline_models(X_train, y_train, X_test, y_test)

    # Step 3: Train and evaluate deep learning models
    dl_results, history_lstm = train_deep_learning_models(X_train, y_train, X_test, y_test)

    # Step 4: Combine results and generate all plots
    # (baseline entries first, so they come first in the table and bar chart)
    all_results = {**baseline_results, **dl_results}
    generate_visualizations(all_results, feature_importances, history_lstm)

    print("\n🎉 Script finished successfully! You're all set.")


🚀 Starting: Loading and preprocessing data...
✅ Preprocessing complete.

🤖 Training Baseline Models...
   - Training Random Forest...
   -> Random Forest finished. R2: 0.9962, RMSE: 1.0562
   - Training XGBoost...
   -> XGBoost finished. R2: 0.9964, RMSE: 1.0244

🧠 Training Deep Learning Models...
   - Building and training LSTM model...



   -> LSTM training complete.
   -> LSTM finished. R2: 0.9966, RMSE: 1.0050
   - Building and training Transformer model...
   -> Transformer training complete.
   -> Transformer finished. R2: 0.9954, RMSE: 1.1588

📊 Generating and Saving Figures...

--- Final Model Performance Summary ---
                   R2    RMSE
Random Forest  0.9962  1.0562
XGBoost        0.9964  1.0244
LSTM           0.9966  1.0050
Transformer    0.9954  1.1588
---------------------------------------
✅ Saved 'rmse_chart.png' to 'figures/' folder.
✅ Saved 'feature_importance.png' to 'figures/' folder.
✅ Saved 'loss_curves.png' to 'figures/' folder.

ðŸŽ‰