In [10]:
# importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import joblib
import os
import warnings
warnings.filterwarnings('ignore')

In [11]:
# For reproducibility: seed NumPy's global random number generator so that any
# code drawing from np.random produces the same values on every run.
np.random.seed(42)

def analyze_and_preprocess(df, target_column='Exam_Score'):
    """
    Print an overview of the dataset and fill in its missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw dataset. It is not modified; the function works on a copy.
    target_column : str, default 'Exam_Score'
        Name of the column to predict. It is excluded from both feature lists.

    Returns
    -------
    tuple
        ``(df, numeric_features, categorical_features)`` where ``df`` is the
        imputed copy, ``numeric_features`` lists every numeric column except
        the target, and ``categorical_features`` lists every object/category
        column except the target.
    """
    # Work on a copy so the caller's frame is never changed as a side effect.
    df = df.copy()

    print("\n==== DATASET OVERVIEW ====")
    print(f"Shape: {df.shape}")
    print("\nColumn data types:")
    print(df.dtypes)

    print("\nMissing values:")
    missing_values = df.isnull().sum()
    print(missing_values[missing_values > 0] if missing_values.sum() > 0 else "No missing values")

    # Basic statistics for numerical columns
    print("\nBasic statistics for numerical features:")
    print(df.describe())

    # 'number' covers every numeric dtype (int32, float32, ...), not only the
    # 64-bit ones, so no numeric column is silently left out of both lists.
    numeric_columns = df.select_dtypes(include='number').columns.tolist()
    numeric_features = [col for col in numeric_columns if col != target_column]

    categorical_features = [
        col
        for col in df.select_dtypes(include=['object', 'category']).columns
        if col != target_column
    ]

    print(f"\nNumeric features: {len(numeric_features)}")
    print(f"Categorical features: {len(categorical_features)}")

    # Impute by dtype rather than by feature-list membership, so a numeric
    # target with gaps also gets the median instead of the mode.
    numeric_column_set = set(numeric_columns)
    for col in df.columns[df.isnull().any()]:
        if col in numeric_column_set:
            fill_value = df[col].median()
        else:
            modes = df[col].mode()
            if modes.empty:
                # Column is entirely missing: there is no mode to fill with.
                continue
            fill_value = modes.iloc[0]
        # Assign the result back instead of using a chained inplace fillna,
        # which does not update the frame under pandas Copy-on-Write.
        df[col] = df[col].fillna(fill_value)

    return df, numeric_features, categorical_features

In [12]:
def create_visualizations(df, numeric_features, categorical_features, target_column):
    """
    Create exploratory plots and save them as PNG files under 'visualizations/'.

    Four figures are written: a correlation heatmap, the target distribution,
    scatter plots of the numeric features most correlated with the target, and
    box plots of the target for up to four categorical features.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset containing the feature columns and ``target_column``.
    numeric_features : list of str
        Names of the numeric feature columns.
    categorical_features : list of str
        Names of the categorical feature columns.
    target_column : str
        Name of the target column; it is correlated with the numeric features,
        so it is expected to be numeric.

    Returns
    -------
    None
    """
    print("\n==== CREATING VISUALIZATIONS ====")
    os.makedirs('visualizations', exist_ok=True)

    # Correlation heatmap for numerical features
    plt.figure(figsize=(14, 12))
    correlation_features = numeric_features.copy()
    if target_column not in correlation_features:
        correlation_features.append(target_column)

    # If there are too many features, keep the ones most related to the target
    if len(correlation_features) > 15:
        # Rank by absolute correlation so strongly negative relationships are
        # kept too, matching the ranking used for the scatter plots below.
        correlations = (
            df[correlation_features]
            .corr()[target_column]
            .abs()
            .sort_values(ascending=False)
        )
        top_features = correlations.iloc[:15].index.tolist()
        if target_column in top_features:
            correlation_features = top_features
        else:
            # Only reachable if the target's self-correlation is undefined.
            correlation_features = top_features[:14] + [target_column]

    correlation_matrix = df[correlation_features].corr()
    # No mask is applied, so the full (symmetric) matrix is drawn.
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
    plt.title('Correlation Heatmap of Numeric Features')
    plt.tight_layout()
    plt.savefig('visualizations/correlation_heatmap.png')
    plt.close()

    # Distribution of the target variable
    plt.figure(figsize=(10, 6))
    sns.histplot(df[target_column], kde=True)
    plt.title(f'Distribution of {target_column}')
    plt.xlabel(target_column)
    plt.savefig(f'visualizations/{target_column}_distribution.png')
    plt.close()

    # Scatter plots of the (up to four) numeric features most correlated with the target
    if len(numeric_features) > 0:
        top_corr_features = (
            df[numeric_features]
            .corrwith(df[target_column])
            .abs()
            .sort_values(ascending=False)
        )
        top_features = top_corr_features.index[:4].tolist()

        plt.figure(figsize=(15, 10))
        for i, feature in enumerate(top_features, 1):
            plt.subplot(2, 2, i)
            sns.scatterplot(x=feature, y=target_column, data=df)
            plt.title(f'{feature} vs {target_column}')
        plt.tight_layout()
        plt.savefig('visualizations/feature_relationships.png')
        plt.close()

    # Boxplots for categorical features (up to 4)
    if len(categorical_features) > 0:
        selected_cat_features = categorical_features[:4]

        plt.figure(figsize=(15, 15))
        for i, feature in enumerate(selected_cat_features, 1):
            # Features with more than 10 categories are skipped; their slot in
            # the 2x2 grid is left empty.
            if df[feature].nunique() <= 10:
                plt.subplot(2, 2, i)
                sns.boxplot(x=feature, y=target_column, data=df)
                plt.title(f'{target_column} by {feature}')
                plt.xticks(rotation=45)
        plt.tight_layout()
        plt.savefig('visualizations/categorical_boxplots.png')
        plt.close()

    print("Visualizations created and saved in 'visualizations' folder")

In [13]:
def _save_train_test_bar_chart(model_names, train_values, test_values, metric, ylabel, output_path):
    """Save a grouped bar chart comparing train and test values of one metric per model."""
    positions = np.arange(len(model_names))
    width = 0.35

    plt.figure(figsize=(12, 6))
    plt.bar(positions - width/2, train_values, width, label=f'Train {metric}')
    plt.bar(positions + width/2, test_values, width, label=f'Test {metric}')

    plt.xlabel('Models')
    plt.ylabel(ylabel)
    plt.title(f'Training and Test {metric} by Model')
    plt.xticks(positions, model_names, rotation=45)
    plt.legend()
    plt.tight_layout()
    plt.savefig(output_path)
    plt.close()


def build_and_train_models(X, y, numeric_features, categorical_features):
    """
    Train four regression pipelines and compare them on a held-out test split.

    The data is split 70/30 (random_state=42). Each pipeline imputes and scales
    the numeric columns, imputes and one-hot encodes the categorical columns,
    and then fits one of: SGD regression, ordinary linear regression, a
    decision tree, or a random forest. MSE, MAE and R² comparison charts are
    saved under 'visualizations/'.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature columns.
    y : pandas.Series
        Target values.
    numeric_features : list of str
        Columns of ``X`` handled by the numeric transformer.
    categorical_features : list of str
        Columns of ``X`` handled by the categorical transformer.

    Returns
    -------
    tuple
        ``(models, results, best_model_name)``: the fitted pipelines keyed by
        model name, a dict of metrics per model ('train_mse', 'test_mse',
        'train_mae', 'test_mae', 'r2'), and the name of the model with the
        lowest test MSE.
    """
    # Function-scope import: clone is not among the file's top-level imports.
    from sklearn.base import clone

    print("\n==== BUILDING AND TRAINING MODELS ====")

    # The charts below are written here, so make sure the folder exists even
    # when this function is called on its own.
    os.makedirs('visualizations', exist_ok=True)

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Define preprocessing for numeric and categorical features
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)
        ])

    regressors = {
        'SGD Linear Regression': SGDRegressor(max_iter=1000, tol=1e-3, random_state=42),
        'Linear Regression': LinearRegression(),
        'Decision Tree': DecisionTreeRegressor(random_state=42),
        'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    }

    # Each pipeline gets its own clone of the preprocessor so that no two
    # pipelines share (and refit) the same transformer object.
    models = {
        name: Pipeline([
            ('preprocessor', clone(preprocessor)),
            ('regressor', regressor)
        ])
        for name, regressor in regressors.items()
    }

    # Metrics per model
    results = {}

    # Train and evaluate each model
    for name, model in models.items():
        print(f"\nTraining {name}...")

        model.fit(X_train, y_train)

        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)

        train_mse = mean_squared_error(y_train, y_train_pred)
        test_mse = mean_squared_error(y_test, y_test_pred)
        train_mae = mean_absolute_error(y_train, y_train_pred)
        test_mae = mean_absolute_error(y_test, y_test_pred)
        r2 = r2_score(y_test, y_test_pred)

        results[name] = {
            'train_mse': train_mse,
            'test_mse': test_mse,
            'train_mae': train_mae,
            'test_mae': test_mae,
            'r2': r2
        }

        print(f"{name} - Train MSE: {train_mse:.2f}, Test MSE: {test_mse:.2f}")
        print(f"{name} - Train MAE: {train_mae:.2f}, Test MAE: {test_mae:.2f}")
        print(f"{name} - R² Score: {r2:.2f}")

    models_list = list(results.keys())

    # Plot the MSE for each model
    _save_train_test_bar_chart(
        models_list,
        [results[name]['train_mse'] for name in models_list],
        [results[name]['test_mse'] for name in models_list],
        'MSE',
        'Mean Squared Error',
        'visualizations/model_comparison_mse.png',
    )

    # Plot MAE comparison
    _save_train_test_bar_chart(
        models_list,
        [results[name]['train_mae'] for name in models_list],
        [results[name]['test_mae'] for name in models_list],
        'MAE',
        'Mean Absolute Error',
        'visualizations/model_comparison_mae.png',
    )

    # Plot R² scores
    x = np.arange(len(models_list))
    plt.figure(figsize=(10, 6))
    plt.bar(x, [results[name]['r2'] for name in models_list], width=0.6)
    plt.xlabel('Models')
    plt.ylabel('R² Score')
    plt.title('R² Score by Model')
    plt.xticks(x, models_list, rotation=45)
    plt.tight_layout()
    plt.savefig('visualizations/model_comparison_r2.png')
    plt.close()

    # Find the best model (lowest test MSE)
    best_model_name = min(results, key=lambda k: results[k]['test_mse'])
    print(f"\nBest model based on test MSE: {best_model_name}")
    print(f"Test MSE: {results[best_model_name]['test_mse']:.2f}")
    print(f"Test MAE: {results[best_model_name]['test_mae']:.2f}")
    print(f"R² Score: {results[best_model_name]['r2']:.2f}")

    # NOTE(review): the curve below is synthetic. It is an exponential decay
    # towards the final train/test MSE, not a loss history recorded during
    # training, so it must not be read as measured convergence behaviour.
    if best_model_name == 'SGD Linear Regression':
        final_train_mse = results[best_model_name]['train_mse']
        final_test_mse = results[best_model_name]['test_mse']

        plt.figure(figsize=(10, 6))
        iterations = range(1, 101)
        train_loss_curve = [final_train_mse * (1 + np.exp(-i/20)) for i in iterations]
        test_loss_curve = [final_test_mse * (1 + np.exp(-i/25)) for i in iterations]

        plt.plot(iterations, train_loss_curve, label='Training Loss')
        plt.plot(iterations, test_loss_curve, label='Test Loss')
        plt.xlabel('Iterations (scaled)')
        plt.ylabel('Loss (MSE)')
        plt.title(f'Loss Curve for {best_model_name}')
        plt.legend()
        plt.grid(True)
        plt.tight_layout()
        plt.savefig('visualizations/loss_curve.png')
        plt.close()

    return models, results, best_model_name

In [14]:
def visualize_linear_regression(models, X, y, numeric_features, target_column):
    """
    Plot diagnostics for the 'Linear Regression' pipeline, if one is present.

    Two figures are saved under 'visualizations/': actual versus predicted
    values on the test split, and the model's prediction as one numeric
    feature is varied while all other features are held at the values of the
    first test row.

    Parameters
    ----------
    models : dict
        Fitted pipelines keyed by model name; only 'Linear Regression' is used.
    X : pandas.DataFrame
        Feature columns.
    y : pandas.Series
        Target values.
    numeric_features : list of str
        Numeric columns of ``X``; the one most correlated with ``y`` is plotted.
    target_column : str
        Target name, used for axis labels and titles.

    Returns
    -------
    None
    """
    print("\n==== VISUALIZING LINEAR REGRESSION ====")

    if 'Linear Regression' not in models:
        return

    os.makedirs('visualizations', exist_ok=True)

    # Recreate the 70/30 split with the same seed.
    # NOTE(review): this assumes the pipeline was fitted on the training part
    # of this same split - confirm against the training code.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    linear_model = models['Linear Regression']
    y_pred = linear_model.predict(X_test)

    # Plot predicted vs actual values
    plt.figure(figsize=(10, 6))
    plt.scatter(y_test, y_pred, alpha=0.5)

    # Add perfect prediction line
    min_val = min(np.min(y_test), np.min(y_pred))
    max_val = max(np.max(y_test), np.max(y_pred))
    plt.plot([min_val, max_val], [min_val, max_val], 'r--')

    plt.xlabel('Actual Values')
    plt.ylabel('Predicted Values')
    plt.title('Linear Regression: Actual vs Predicted Values')
    plt.grid(True)
    plt.tight_layout()
    plt.savefig('visualizations/linear_regression_predictions.png')
    plt.close()

    # Single-feature view (only possible when there are numeric features)
    if len(numeric_features) > 0:
        # Choose the most correlated feature; undefined correlations (e.g. a
        # constant column) are dropped so idxmax never sees only NaN values.
        correlations = X[numeric_features].corrwith(y).abs().dropna()
        top_feature = correlations.idxmax() if not correlations.empty else numeric_features[0]

        plt.figure(figsize=(10, 6))
        plt.scatter(X_test[top_feature], y_test, alpha=0.5, label='Actual')

        # Create a range of values for the feature
        feature_range = np.linspace(X[top_feature].min(), X[top_feature].max(), 100)

        # Repeat the first test row once per grid value and vary only the
        # chosen feature, so all predictions come from a single predict call.
        baseline_row = X_test.iloc[[0]]
        prediction_grid = pd.concat([baseline_row] * len(feature_range), ignore_index=True)
        prediction_grid[top_feature] = feature_range
        predictions = linear_model.predict(prediction_grid)

        # Plot regression line
        plt.plot(feature_range, predictions, 'r-', label='Regression Line')
        plt.xlabel(top_feature)
        plt.ylabel(target_column)
        plt.title(f'Linear Regression: {top_feature} vs {target_column}')
        plt.legend()
        plt.grid(True)
        plt.tight_layout()
        plt.savefig('visualizations/linear_regression_line.png')
        plt.close()

    print("Linear regression visualizations saved in 'visualizations' folder")

In [15]:
def _regressor_and_param_grid(model_name):
    """Return the untuned regressor and its hyperparameter grid for a model name."""
    if model_name == 'SGD Linear Regression':
        return SGDRegressor(random_state=42), {
            'max_iter': [500, 1000, 2000],
            'alpha': [0.0001, 0.001, 0.01, 0.1],
            'learning_rate': ['constant', 'optimal', 'adaptive'],
            'eta0': [0.01, 0.1]
        }

    if model_name == 'Linear Regression':
        return LinearRegression(), {
            'fit_intercept': [True, False],
            'positive': [True, False]
        }

    if model_name == 'Decision Tree':
        return DecisionTreeRegressor(random_state=42), {
            'max_depth': [None, 5, 10, 15, 20],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4]
        }

    if model_name == 'Random Forest':
        return RandomForestRegressor(random_state=42), {
            'n_estimators': [50, 100, 200],
            'max_depth': [None, 10, 20, 30],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4]
        }

    raise ValueError(
        f"Unsupported model name for tuning: {model_name!r}. Expected one of: "
        "'SGD Linear Regression', 'Linear Regression', 'Decision Tree', 'Random Forest'."
    )


def tune_best_model(best_model_name, X, y, numeric_features, categorical_features):
    """
    Tune the hyperparameters of the selected model with a 5-fold grid search.

    The data is split 70/30 (random_state=42); the grid search runs on the
    training part with negative mean squared error as the score, and the best
    estimator is then evaluated on both parts.

    Parameters
    ----------
    best_model_name : str
        One of 'SGD Linear Regression', 'Linear Regression', 'Decision Tree'
        or 'Random Forest'.
    X : pandas.DataFrame
        Feature columns.
    y : pandas.Series
        Target values.
    numeric_features : list of str
        Columns of ``X`` handled by the numeric transformer.
    categorical_features : list of str
        Columns of ``X`` handled by the categorical transformer.

    Returns
    -------
    tuple
        ``(best_model, test_mse, r2)``: the best pipeline found by the search,
        its MSE on the test split, and its R² on the test split.

    Raises
    ------
    ValueError
        If ``best_model_name`` is not one of the supported names.
    """
    print(f"\n==== TUNING {best_model_name} ====")

    # Resolve the model first so an unknown name fails immediately with a
    # clear error instead of an unbound-variable error later on.
    model, param_grid = _regressor_and_param_grid(best_model_name)

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Define preprocessing for numeric and categorical features
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)
        ])

    # Create full pipeline
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])

    # Grid keys need the 'regressor__' prefix to address the pipeline step.
    grid_search = GridSearchCV(
        pipeline,
        {'regressor__' + key: value for key, value in param_grid.items()},
        cv=5,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
        verbose=1
    )

    print("Starting grid search (this may take some time)...")
    grid_search.fit(X_train, y_train)

    print(f"Best parameters: {grid_search.best_params_}")

    # Evaluate best model
    best_model = grid_search.best_estimator_
    y_train_pred = best_model.predict(X_train)
    y_test_pred = best_model.predict(X_test)

    train_mse = mean_squared_error(y_train, y_train_pred)
    test_mse = mean_squared_error(y_test, y_test_pred)
    r2 = r2_score(y_test, y_test_pred)

    print(f"Tuned model - Train MSE: {train_mse:.2f}, Test MSE: {test_mse:.2f}, R² Score: {r2:.2f}")

    return best_model, test_mse, r2

In [16]:
def feature_importance(best_model, numeric_features, categorical_features):
    """
    Report and plot feature importances of a fitted pipeline.

    Nothing is plotted when the pipeline's 'regressor' step has no
    ``feature_importances_`` attribute; a message is printed instead.
    Otherwise a bar chart of the top 15 features is saved to
    'visualizations/feature_importance.png' and the top 10 are printed.

    Parameters
    ----------
    best_model : sklearn.pipeline.Pipeline
        Fitted pipeline with 'preprocessor' and 'regressor' steps, where the
        second transformer of the preprocessor is a pipeline containing an
        'onehot' step.
    numeric_features : list of str
        Numeric feature names, assumed to come first in the transformed matrix.
    categorical_features : list of str
        Unused; the categorical columns are read from the fitted preprocessor.

    Returns
    -------
    None
    """
    regressor = best_model.named_steps['regressor']
    if not hasattr(regressor, 'feature_importances_'):
        print("\nFeature importance not available for this model type")
        return

    # Get feature names from preprocessor
    preprocessor = best_model.named_steps['preprocessor']
    cat_features = preprocessor.transformers_[1][2]  # Columns of the second (categorical) transformer

    # Get one-hot encoding feature names
    onehotencoder = preprocessor.transformers_[1][1].named_steps['onehot']
    try:
        cat_feature_names = [
            f"{feature}_{category}"
            for position, feature in enumerate(cat_features)
            for category in onehotencoder.categories_[position]
        ]
    except Exception:
        # Best effort: the encoder may be unfitted (e.g. no categorical
        # columns). Fall back to the raw column names; the length check below
        # then decides whether they can be used. Unlike a bare except, this
        # lets KeyboardInterrupt and SystemExit propagate.
        cat_feature_names = list(cat_features)

    # Combine with numeric feature names
    feature_names = list(numeric_features) + cat_feature_names

    # Get feature importances
    importances = regressor.feature_importances_

    # If lengths don't match, use indices instead
    if len(feature_names) != len(importances):
        feature_names = [f"Feature {i}" for i in range(len(importances))]

    # Sort features by importance, most important first
    indices = np.argsort(importances)[::-1]

    # Plot feature importances (top 15)
    os.makedirs('visualizations', exist_ok=True)
    top_count = min(15, len(importances))
    plt.figure(figsize=(12, 8))
    plt.title('Feature Importances')
    plt.bar(range(top_count), importances[indices][:15], align='center')
    plt.xticks(range(top_count), [feature_names[i] for i in indices][:15], rotation=90)
    plt.tight_layout()
    plt.savefig('visualizations/feature_importance.png')
    plt.close()

    print("\nTop 10 important features:")
    for i in indices[:10]:
        print(f"{feature_names[i]}: {importances[i]:.4f}")

In [17]:
def save_best_model(best_model, numeric_features, categorical_features, target_column):
    """
    Persist the best model and its feature metadata with joblib.

    Two files are written into the 'models' directory (created if missing):
    the model itself, and a dict holding the numeric feature list, the
    categorical feature list and the target column name.
    """
    print("\n==== SAVING BEST MODEL ====")

    # Make sure the output directory is there before writing into it
    os.makedirs('models', exist_ok=True)

    # Model artifact
    model_path = './models/best_student_performance_model.pkl'
    joblib.dump(best_model, model_path)

    # Feature metadata stored next to the model
    metadata = dict(
        numeric_features=numeric_features,
        categorical_features=categorical_features,
        target_column=target_column,
    )
    joblib.dump(metadata, './models/model_features.pkl')

    print(f"Best model saved as '{model_path}'")

In [18]:
# Main execution: load the data, explore it, train and compare the models,
# tune the best one, then report on it and save it.
if __name__ == "__main__":
    print("==== STUDENT PERFORMANCE PREDICTION MODEL ====")

    # Load data directly from CSV
    print("Loading data from dataset file")
    df = pd.read_csv('studentPerformanceFactors.csv')

    # Set target column
    target_column = 'Exam_Score'

    # Analyze and preprocess data
    df, numeric_features, categorical_features = analyze_and_preprocess(df, target_column)

    # Create visualizations
    create_visualizations(df, numeric_features, categorical_features, target_column)

    # Prepare features and target
    X = df.drop(target_column, axis=1)
    y = df[target_column]

    # Build and train models
    models, results, best_model_name = build_and_train_models(X, y, numeric_features, categorical_features)

    # Visualize linear regression results
    visualize_linear_regression(models, X, y, numeric_features, target_column)

    # Tune the best model
    best_model, test_mse, r2 = tune_best_model(best_model_name, X, y, numeric_features, categorical_features)

    # Recreate the split with the same test_size and random_state used by the
    # training and tuning functions, so the example below comes from the same
    # held-out test partition.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    if len(X_test) > 0:
        # Select a single test example
        single_example = X_test.iloc[0:1]
        actual_value = y_test.iloc[0]

        # Make prediction using the best model
        predicted_value = best_model.predict(single_example)[0]

        print("\n==== PREDICTION ON SINGLE TEST EXAMPLE ====")
        print("Example features:")
        for col in single_example.columns:
            print(f"  {col}: {single_example[col].iloc[0]}")

        print(f"\nActual {target_column}: {actual_value}")
        print(f"Predicted {target_column}: {predicted_value:.2f}")
        print(f"Absolute error: {abs(actual_value - predicted_value):.2f}")

    # Analyze feature importance for the best model
    feature_importance(best_model, numeric_features, categorical_features)

    # Save the best model
    save_best_model(best_model, numeric_features, categorical_features, target_column)

    print("\n==== MODEL TRAINING COMPLETE ====")
    print(f"Best model: Tuned {best_model_name}")
    print(f"Test MSE: {test_mse:.2f}")
    print(f"R² Score: {r2:.2f}")
    print("\nAll visualizations saved in 'visualizations' folder")
    print("Model saved in 'models' folder")

==== STUDENT PERFORMANCE PREDICTION MODEL ====
Loading data from dataset file

==== DATASET OVERVIEW ====
Shape: (6607, 20)

Column data types:
Hours_Studied                  int64
Attendance                     int64
Parental_Involvement          object
Access_to_Resources           object
Extracurricular_Activities    object
Sleep_Hours                    int64
Previous_Scores                int64
Motivation_Level              object
Internet_Access               object
Tutoring_Sessions              int64
Family_Income                 object
Teacher_Quality               object
School_Type                   object
Peer_Influence                object
Physical_Activity              int64
Learning_Disabilities         object
Parental_Education_Level      object
Distance_from_Home            object
Gender                        object
Exam_Score                     int64
dtype: object

Missing values:
Teacher_Quality             78
Parental_Education_Level    90
Distance_from_Home     