# Student Performance Prediction - EDA & Model Training

This notebook explores student performance data and trains predictive models to identify at-risk students.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
sys.path.append('..')
from src.preprocess import preprocess_student_data
from src.train import train_and_evaluate

# Default plot styling for the notebook (individual figures may override the size)
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

# Read the raw CSV into df_raw, which the exploration cells below work from
df_raw = pd.read_csv('../data/student_performance_updated_1000.csv')
print("Dataset shape: " + str(df_raw.shape))
print("\nFirst few records:")
# Bare last expression so the preview is rendered as a rich table
df_raw.head()

## Data Exploration & Quality Checks

In [None]:
# Schema and data-quality overview: column names, dtypes, per-column null counts
column_names = df_raw.columns.tolist()
column_dtypes = df_raw.dtypes
null_counts = df_raw.isna().sum()

print("Data Info:")
print(f"Columns: {column_names}")
print(f"\nData types:\n{column_dtypes}")
print(f"\nMissing values:\n{null_counts}")
print("\nBasic statistics:")
# Bare last expression so the summary table is rendered as a rich table
df_raw.describe()

## Feature Distribution Analysis

In [None]:
# Histograms of the target and three numeric features on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# (column, fill colour, x-axis label, panel title), listed in row-major panel order
hist_panels = [
    ('FinalGrade', 'skyblue', 'Final Grade', 'Distribution of Final Grades'),
    ('AttendanceRate', 'lightgreen', 'Attendance Rate (%)', 'Distribution of Attendance Rates'),
    ('StudyHoursPerWeek', 'lightcoral', 'Study Hours Per Week', 'Distribution of Study Hours'),
    ('PreviousGrade', 'lightyellow', 'Previous Grade', 'Distribution of Previous Grades'),
]

# axes.flat walks the grid row by row, matching the order of hist_panels
for ax, (column, color, xlabel, title) in zip(axes.flat, hist_panels):
    ax.hist(df_raw[column], bins=30, color=color, edgecolor='black')
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Count')
    ax.set_title(title)

# Only the final-grade panel carries the intervention cut-off line and a legend
grade_ax = axes[0, 0]
grade_ax.axvline(70, color='red', linestyle='--', label='Intervention Threshold')
grade_ax.legend()

plt.tight_layout()
plt.show()

# Count of students whose final grade falls strictly below the cut-off
below_threshold = (df_raw['FinalGrade'] < 70).sum()
print(f"\nGrades below intervention threshold (70): {below_threshold} students")

## Correlation Analysis

In [None]:
# Correlation of every numeric column with FinalGrade, plus the full matrix.
# include='number' selects all numeric dtypes (e.g. int32 / float32 as well),
# instead of whitelisting only int64 and float64.
numeric_cols = df_raw.select_dtypes(include='number').columns

# Compute the matrix once and reuse it for both the ranking and the heatmap
corr_matrix = df_raw[numeric_cols].corr()
correlations = corr_matrix['FinalGrade'].sort_values(ascending=False)

print("Correlation with Final Grade:")
print(correlations)

# Visualize correlation matrix.
# vmin/vmax pin the colour scale to the full correlation range so that the
# neutral midpoint of the diverging 'coolwarm' map corresponds to zero.
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm',
            vmin=-1, vmax=1,
            square=True, cbar_kws={'label': 'Correlation'})
plt.title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()

## Categorical Features Analysis

In [None]:
# Mean final grade per category, one bar-chart panel per categorical feature
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

gender_means = df_raw.groupby('Gender')['FinalGrade'].mean()
# Parental-support levels are ordered from highest to lowest average grade
support_means = (
    df_raw.groupby('ParentalSupport')['FinalGrade']
    .mean()
    .sort_values(ascending=False)
)

# (axis, series, bar colour, x-axis label, title, tick-label rotation in degrees)
bar_panels = [
    (axes[0], gender_means, 'skyblue', 'Gender',
     'Average Final Grade by Gender', 0),
    (axes[1], support_means, 'lightgreen', 'Parental Support',
     'Average Final Grade by Parental Support Level', 45),
]

for ax, means, color, xlabel, title, rotation in bar_panels:
    means.plot(kind='bar', ax=ax, color=color)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Average Final Grade')
    ax.set_title(title)
    # Re-apply the existing tick labels with the desired rotation
    ax.set_xticklabels(ax.get_xticklabels(), rotation=rotation)

plt.tight_layout()
plt.show()

# Number of rows per parental-support level
print("\nParental Support Distribution:")
print(df_raw['ParentalSupport'].value_counts())

## Data Preprocessing

In [None]:
# Run the project preprocessing helper on the CSV; it returns the feature
# frame X, the target y, and the preprocessor object it used
X, y, preprocessor = preprocess_student_data('../data/student_performance_updated_1000.csv')

print("\nProcessed data shape:")
print(f"Features X: {X.shape}")
print(f"Target y: {y.shape}")
print(f"\nFeature names: {X.columns.tolist()}")

# Summary statistics of the target, each printed to two decimals
print("\nFinal Grade statistics (target):")
target_stats = {'Mean': y.mean(), 'Std': y.std(), 'Min': y.min(), 'Max': y.max()}
for stat_name, stat_value in target_stats.items():
    print(f"{stat_name}: {stat_value:.2f}")

## Model Training & Evaluation

In [None]:
# Train one model per entry in model_types via the project helper.
# NOTE(review): the returned names suggest the helper also hands back the
# train/test split it used — confirm against src/train.py.
model_types = ['linear', 'tree', 'random_forest']
results, X_train, X_test, y_train, y_test = train_and_evaluate(X, y, model_types=model_types)

## Feature Importance Analysis

In [None]:
# Compare feature importance across models.
# Only models that expose importances get a panel, so the grid is sized from
# `results` instead of assuming exactly three models (which left blank axes for
# models without importances and raised IndexError for more than three).
models_with_importance = []
for model_type, result in results.items():
    importance = result['model'].feature_importance
    if importance is not None:
        models_with_importance.append((model_type, importance))

if models_with_importance:
    n_panels = len(models_with_importance)
    # squeeze=False keeps `axes` two-dimensional even when there is one panel;
    # 6 inches per panel reproduces the original 18x6 figure for three models
    fig, axes = plt.subplots(1, n_panels, figsize=(6 * n_panels, 6), squeeze=False)

    for ax, (model_type, importance) in zip(axes[0], models_with_importance):
        # NOTE(review): head(10) assumes feature_importance is already sorted by
        # descending importance — confirm in the model class.
        top_features = importance.head(10)
        positions = range(len(top_features))
        ax.barh(positions, top_features['importance'].values)
        ax.set_yticks(positions)
        ax.set_yticklabels(top_features['feature'].values)
        ax.set_xlabel('Importance Score')
        ax.set_title(f'{model_type.upper()} - Top 10 Features')
        # Put the first (top) row at the top of the horizontal bar chart
        ax.invert_yaxis()

    plt.tight_layout()
    plt.show()
else:
    print("No trained model exposes feature importances; nothing to plot.")

## At-Risk Student Identification

In [None]:
# Inspect the random-forest predictions, treated here as the best model.
# NOTE(review): the model key is hard-coded rather than chosen from `results`.
best_model_type = 'random_forest'
at_risk_df = results[best_model_type]['at_risk']

# Keep only the rows whose boolean 'at_risk' flag is set.
# NOTE(review): head(10) shows the first ten flagged rows in their existing
# order; "Top 10" implies a ranking — confirm at_risk_df is pre-sorted.
flagged_students = at_risk_df[at_risk_df['at_risk']]
print("\nAt-Risk Student Details (Top 10):")
print(flagged_students.head(10))

# Two diagnostics side by side: actual-vs-predicted scatter and error histogram
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
scatter_ax, error_ax = axes

# Left panel: predictions against actual grades, with a y = x reference line
# spanning the observed range of actual grades
actual_grades = at_risk_df['actual_grade']
diagonal = [actual_grades.min(), actual_grades.max()]
scatter_ax.scatter(actual_grades, at_risk_df['predicted_grade'], alpha=0.6)
scatter_ax.plot(diagonal, diagonal, 'r--', label='Perfect Prediction')
scatter_ax.set_xlabel('Actual Grade')
scatter_ax.set_ylabel('Predicted Grade')
scatter_ax.set_title('Actual vs Predicted Grades')
scatter_ax.legend()
scatter_ax.grid(True, alpha=0.3)

# Right panel: histogram of the 'difference' column with a zero-error marker
error_ax.hist(at_risk_df['difference'], bins=30, color='lightblue', edgecolor='black')
error_ax.axvline(0, color='red', linestyle='--', label='Zero Error')
error_ax.set_xlabel('Prediction Error (Actual - Predicted)')
error_ax.set_ylabel('Count')
error_ax.set_title('Distribution of Prediction Errors')
error_ax.legend()

plt.tight_layout()
plt.show()

## Save Best Model

In [None]:
# Save the best performing model.
# Reuse best_model_type (set in the at-risk analysis cell) instead of repeating
# the 'random_forest' literal, so the persisted model is always the one that
# was analysed.
best_model = results[best_model_type]['model']
# save_model is a project method; its return value is reported as the saved path
model_path = best_model.save_model('../models/student_performance_model.pkl')
print("\nModel saved successfully!")
print(f"Path: {model_path}")