In [None]:
import pandas as pd
# Load the student-performance records from the CSV in the working directory.
df = pd.read_csv(filepath_or_buffer="StudentPerformanceFactors.csv")


In [None]:
# Add a sequential, 1-based Student_ID column to the original dataframe (df).

# Only create the column when it is missing, so re-running this cell never
# renumbers existing IDs.
if 'Student_ID' not in df.columns:
    df['Student_ID'] = range(1, len(df) + 1)
    print("✓ Student_ID added to original data")
else:
    print("✓ Student_ID already exists")

# Preview the ID next to a few key columns.
print("\nOriginal Data with Student_ID:")
print(df[['Student_ID', 'Hours_Studied', 'Attendance', 'Exam_Score']].head())

In [None]:
# List every column name, one per line.
column_names = df.columns.tolist()
for name in column_names:
    print(name)


In [None]:
# Preview the first five rows.
df.head(n=5)

In [None]:
# Show column dtypes, non-null counts and memory usage for the loaded frame.
df.info()

In [None]:
# Count missing values per column before imputation.
df.isna().sum()

In [None]:
# Impute missing categorical values with each column's most frequent value.
# The result is assigned back to df[col]: calling fillna(inplace=True) on the
# df[col] selection is chained assignment, which emits a FutureWarning in
# pandas 2.x and leaves df untouched once Copy-on-Write is enabled.
for col in ['Teacher_Quality', 'Parental_Education_Level', 'Distance_from_Home']:
    df[col] = df[col].fillna(df[col].mode()[0])

In [None]:
# Re-count missing values per column after imputation.
df.isna().sum()

In [None]:
# Count rows that exactly repeat an earlier row.
df.duplicated(keep='first').sum()

In [None]:
import pandas as pd
# Numeric columns profiled in the exploratory cells below.
num_cols = (
    'Hours_Studied Attendance Sleep_Hours '
    'Previous_Scores Tutoring_Sessions '
    'Physical_Activity Exam_Score'
).split()

In [None]:
# Print summary statistics for each numeric column.
for column_name in num_cols:
    header = f"--- Column: {column_name} ---"
    summary = df[column_name].describe()
    print(header)
    print(summary, "\n")

In [None]:
# Drop rows with an impossible exam score. The previous filter removed only the
# exact value 101; any score above 100 is treated as invalid here
# (assumes a 0-100 scale -- TODO confirm). Rows with a missing score are kept,
# as before.
# .copy() makes the result an independent frame, so the column assignments in
# later cells do not raise SettingWithCopyWarning.
df = df[~(df['Exam_Score'] > 100)].copy()



In [None]:

# Profile every numeric column: summary statistics, extreme values,
# cardinality, and how many points the 1.5 * IQR rule flags as outliers.
for col in num_cols:
    series = df[col]
    print(f"--- Column: {col} ---")
    print(series.describe(), "\n")

    print("Top 10 values:")
    print(series.sort_values(ascending=False).head(10).values)
    print("Lowest 10 values:")
    print(series.sort_values().head(10).values)
    print(f"Unique values: {series.nunique()}")

    # Fences at 1.5 * IQR beyond the quartiles.
    q1, q3 = series.quantile(0.25), series.quantile(0.75)
    iqr = q3 - q1
    low_fence, high_fence = q1 - 1.5 * iqr, q3 + 1.5 * iqr
    outside = series[(series < low_fence) | (series > high_fence)]
    print(f"Number of outliers (IQR method): {len(outside)}\n")

    print("############################\n")



In [None]:

# Categorical columns inspected (and later encoded) below.
cat_cols = """
    Parental_Involvement
    Access_to_Resources
    Extracurricular_Activities
    Motivation_Level
    Internet_Access
    Family_Income
    Teacher_Quality
    School_Type
    Peer_Influence
    Learning_Disabilities
    Parental_Education_Level
    Distance_from_Home
    Gender
""".split()


In [None]:
# Inspect each categorical column: distinct labels, their frequencies, and gaps.
for name in cat_cols:
    column = df[name]
    labels = column.unique()
    print(f"--- Column: {name} ---")
    print(f"Unique values ({len(labels)}): {labels}")
    print("Value counts:\n", column.value_counts())
    print(f"Missing values: {column.isna().sum()}")
    print("############################\n")


In [None]:
# Numeric columns (same list as defined earlier in the notebook).
num_cols = (
    'Hours_Studied Attendance Sleep_Hours '
    'Previous_Scores Tutoring_Sessions '
    'Physical_Activity Exam_Score'
).split()


In [None]:
# Ordered text levels -> integer ranks (Low < Medium < High).
ordinal_mapping = {
    'Low': 1,
    'Medium': 2,
    'High': 3
}

ordinal_cols = [
    'Parental_Involvement', 'Access_to_Resources',
    'Motivation_Level', 'Family_Income', 'Teacher_Quality'
]

# Encode only columns that still hold text labels. Series.map() turns every
# value missing from the mapping into NaN, so re-running this cell on
# already-encoded integers would silently blank the whole column.
for col in ordinal_cols:
    if not pd.api.types.is_numeric_dtype(df[col]):
        df[col] = df[col].map(ordinal_mapping)



In [None]:
# Yes/No columns to be encoded as 1/0.
binary_cols = [
    'Extracurricular_Activities',
    'Internet_Access',
    'Learning_Disabilities'
]

binary_mapping = {'Yes': 1, 'No': 0}

# Skip columns that are already numeric: mapping 0/1 through a Yes/No dict
# would replace every value with NaN if this cell is run a second time.
for col in binary_cols:
    if not pd.api.types.is_numeric_dtype(df[col]):
        df[col] = df[col].map(binary_mapping)

In [None]:
# Unordered categorical columns to be one-hot encoded.
nominal_cols = [
    'School_Type',
    'Peer_Influence',
    'Parental_Education_Level',
    'Distance_from_Home',
    'Gender'
]

# get_dummies replaces the source columns, so a second run of this cell would
# raise KeyError. Encode only the columns still present and report any that
# were skipped instead of failing.
pending_nominal = [name for name in nominal_cols if name in df.columns]
skipped_nominal = [name for name in nominal_cols if name not in df.columns]
if skipped_nominal:
    print(f"Skipping columns not found (already encoded?): {skipped_nominal}")
if pending_nominal:
    # drop_first=True drops one dummy per column to avoid redundant indicators.
    df = pd.get_dummies(df, columns=pending_nominal, drop_first=True)


Separate features and target

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Target is the exam score; every other column of df is used as a feature.
# NOTE(review): X still contains the Student_ID column added near the top of
# the notebook, which is only a row number -- confirm it should be a feature.
target_col = 'Exam_Score'
y = df[target_col]
X = df.drop(columns=[target_col])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Quick check
n_train_rows, n_features = X_train.shape
print(f"Training samples: {n_train_rows}")
print(f"Test samples: {X_test.shape[0]}")
print(f"Number of features: {n_features}")


LinearRegression Model


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

sns.set(style="whitegrid")

# ------------------------
# 1. Train the model
# ------------------------
# Plain least-squares regression on the unscaled training features.
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# ------------------------
# 2. Predictions
# ------------------------
# Predict on both splits so training and test error can be compared.
y_pred_train = lr_model.predict(X_train)
y_pred_test = lr_model.predict(X_test)

# ------------------------
# 3. Calculate Metrics
# ------------------------
train_r2 = r2_score(y_train, y_pred_train)
test_r2 = r2_score(y_test, y_pred_test)

# RMSE is the square root of the mean squared error, in exam-score units.
train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))

test_mae = mean_absolute_error(y_test, y_pred_test)

# ------------------------
# 4. Print Metrics
# ------------------------
print("=" * 50)
print("LINEAR REGRESSION RESULTS")
print("=" * 50)
print(f"Training R²: {train_r2:.4f}")
print(f"Test R²: {test_r2:.4f}")
print(f"Training RMSE: {train_rmse:.2f}")
print(f"Test RMSE: {test_rmse:.2f}")
print(f"Test MAE: {test_mae:.2f}")
print("=" * 50)

# ------------------------
# 5. Visualization: Actual vs Predicted
# ------------------------
# Points on the dashed diagonal are predicted exactly right.
label_font = dict(fontsize=14, fontweight='bold')
title_font = dict(fontsize=16, fontweight='bold')
diagonal = [y_test.min(), y_test.max()]

fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred_test, alpha=0.7, c='dodgerblue', edgecolors='w', s=80, label='Predicted vs Actual')
ax.plot(diagonal, diagonal, 'r--', lw=3, label='Ideal Fit')
ax.set_xlabel('Actual Exam Score', **label_font)
ax.set_ylabel('Predicted Exam Score', **label_font)
ax.set_title('Linear Regression: Actual vs Predicted', **title_font)
ax.legend()
ax.grid(alpha=0.4)
fig.tight_layout()
plt.show()

# ------------------------
# 6. Residuals Plot
# ------------------------
# Residual = actual - predicted, plotted against the prediction.
residuals = y_test - y_pred_test
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_pred_test, residuals, alpha=0.7, c='orange', edgecolors='w', s=80)
ax.axhline(y=0, linestyle='--', color='red', lw=3)
ax.set_xlabel('Predicted Exam Score', **label_font)
ax.set_ylabel('Residuals', **label_font)
ax.set_title('Residuals Plot', **title_font)
ax.grid(alpha=0.4)
fig.tight_layout()
plt.show()


The Linear Regression model achieved a Test R² score of 0.825, which means it explains about 82% of the variance in exam scores.
The Test RMSE was 1.52, so the typical (root-mean-square) prediction error is around 1.5 marks.
Additionally, the Test MAE was only 0.41, which shows that the model predictions are very close to the actual values.
Since the test performance is better than the training performance, the model is not overfitting and generalizes well to unseen data.

Random Forest Model


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Create and train Random Forest.
# Depth and leaf-size limits constrain each tree; random_state fixes the
# bootstrap/feature sampling so results are repeatable; n_jobs=-1 uses all cores.
rf_model = RandomForestRegressor(
    n_estimators=100,
    max_depth=15,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1
)

rf_model.fit(X_train, y_train)

# Predictions on both splits, so training and test error can be compared.
y_pred_train_rf = rf_model.predict(X_train)
y_pred_test_rf = rf_model.predict(X_test)

# Metrics
train_r2_rf = r2_score(y_train, y_pred_train_rf)
test_r2_rf = r2_score(y_test, y_pred_test_rf)

train_rmse_rf = np.sqrt(mean_squared_error(y_train, y_pred_train_rf))
test_rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_test_rf))

test_mae_rf = mean_absolute_error(y_test, y_pred_test_rf)

print("=" * 50)
print("RANDOM FOREST RESULTS")
print("=" * 50)
print(f"Training R²: {train_r2_rf:.4f}")
print(f"Test R²: {test_r2_rf:.4f}")
print(f"Training RMSE: {train_rmse_rf:.2f}")
print(f"Test RMSE: {test_rmse_rf:.2f}")
print(f"Test MAE: {test_mae_rf:.2f}")
print("=" * 50)

# Feature Importance: the forest's feature_importances_, sorted high to low.
feature_importance = pd.DataFrame({
    "Feature": X_train.columns,
    "Importance": rf_model.feature_importances_
}).sort_values("Importance", ascending=False)

print("\nTop 10 Most Important Features:")
print(feature_importance.head(10).to_string(index=False))

# Visualization 1: Actual vs Predicted
# The dashed diagonal marks perfect predictions.
score_span = [y_test.min(), y_test.max()]
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred_test_rf, alpha=0.6)
ax.plot(score_span, score_span, linestyle='--', linewidth=2)
ax.set_xlabel("Actual Exam Score")
ax.set_ylabel("Predicted Exam Score")
ax.set_title("Random Forest: Actual vs Predicted")
ax.grid(True)
fig.tight_layout()
plt.show()

# Visualization 2: Feature Importance
# Horizontal bars, most important feature on top.
top_features = feature_importance.head(10)
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(top_features["Feature"], top_features["Importance"])
ax.set_xlabel("Importance")
ax.set_ylabel("Feature")
ax.set_title("Top 10 Feature Importance")
ax.invert_yaxis()
fig.tight_layout()
plt.show()

# Visualization 3: Comparison with Linear Regression
# models = ["Linear Regression", "Random Forest"]
# r2_scores = [test_r2, test_r2_rf]
# rmse_scores = [test_rmse, test_rmse_rf]

# x = np.arange(len(models))
# width = 0.35

# fig, ax1 = plt.subplots(figsize=(10, 6))

# ax1.bar(x - width/2, r2_scores, width, alpha=0.7)
# ax1.set_ylabel("RÂ² Score")
# ax1.set_ylim(0, 1)

# ax2 = ax1.twinx()
# ax2.bar(x + width/2, rmse_scores, width, alpha=0.7)
# ax2.set_ylabel("RMSE")

# ax1.set_xticks(x)
# ax1.set_xticklabels(models)
# ax1.set_xlabel("Model")
# ax1.set_title("Model Comparison: Linear Regression vs Random Forest")

# plt.tight_layout()
# plt.show()


Neural Network

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from tensorflow import keras
from tensorflow.keras import layers
import pandas as pd

# Seed Python, NumPy and the Keras backend so weight initialisation, dropout
# masks and batch shuffling are repeatable between runs.
keras.utils.set_random_seed(42)

# Scale the features: the scaler is fitted on the training split only and the
# same transform is then applied to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Build ANN: three ReLU hidden layers (64 -> 32 -> 16) with dropout after the
# first two, and a single linear output unit for the regression target.
ann_model = keras.Sequential([
    layers.Input(shape=(X_train_scaled.shape[1],)),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(32, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(16, activation='relu'),
    layers.Dense(1)
])

# Compile: optimise mean squared error, also track mean absolute error.
ann_model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss='mse',
    metrics=['mae']
)

# Train; validation_split holds out 20% of the training rows for the
# validation-loss curve.
print("Training Neural Network...")
history = ann_model.fit(
    X_train_scaled, y_train,
    validation_split=0.2,
    epochs=100,
    batch_size=32,
    verbose=0
)

# Predictions: predict() returns a column vector, flatten() makes it 1-D so it
# lines up with y_train / y_test.
y_pred_train_ann = ann_model.predict(X_train_scaled, verbose=0).flatten()
y_pred_test_ann = ann_model.predict(X_test_scaled, verbose=0).flatten()

# Metrics
train_r2_ann = r2_score(y_train, y_pred_train_ann)
test_r2_ann = r2_score(y_test, y_pred_test_ann)

train_rmse_ann = np.sqrt(mean_squared_error(y_train, y_pred_train_ann))
test_rmse_ann = np.sqrt(mean_squared_error(y_test, y_pred_test_ann))

test_mae_ann = mean_absolute_error(y_test, y_pred_test_ann)

print("=" * 50)
print("NEURAL NETWORK RESULTS")
print("=" * 50)
print(f"Training R²: {train_r2_ann:.4f}")
print(f"Test R²: {test_r2_ann:.4f}")
print(f"Training RMSE: {train_rmse_ann:.2f}")
print(f"Test RMSE: {test_rmse_ann:.2f}")
print(f"Test MAE: {test_mae_ann:.2f}")
print("=" * 50)

# Visualization 1: Training History
# Per-epoch loss on the training rows and on the held-out validation rows.
fig, ax = plt.subplots(figsize=(12, 5))
for history_key, curve_label in (('loss', 'Training Loss'), ('val_loss', 'Validation Loss')):
    ax.plot(history.history[history_key], label=curve_label)
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss (MSE)')
ax.set_title('Training vs Validation Loss')
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

# Visualization 2: Actual vs Predicted
# The dashed diagonal marks perfect predictions.
score_span = [y_test.min(), y_test.max()]
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred_test_ann, alpha=0.6)
ax.plot(score_span, score_span, linestyle='--', linewidth=2)
ax.set_xlabel('Actual Exam Score')
ax.set_ylabel('Predicted Exam Score')
ax.set_title('Neural Network: Actual vs Predicted')
ax.grid(True)
fig.tight_layout()
plt.show()

# =======================
# Final Comparison
# =======================
# Collect the test-set metrics of the three models; the lists share one order
# so a position in r2_all / rmse_all maps back to a name in models_all.

models_all = ['Linear Regression', 'Random Forest', 'Neural Network']
r2_all = [test_r2, test_r2_rf, test_r2_ann]
rmse_all = [test_rmse, test_rmse_rf, test_rmse_ann]
mae_all = [test_mae, test_mae_rf, test_mae_ann]

summary_final = pd.DataFrame({
    'Model': models_all,
    'Test R²': r2_all,
    'Test RMSE': rmse_all,
    'Test MAE': mae_all
})

print("\n" + "=" * 60)
print("FINAL SUMMARY: ALL MODELS")
print("=" * 60)
print(summary_final.to_string(index=False))
print("=" * 60)
# Higher R² is better, lower RMSE is better.
print(f"\nBest Model (by R²): {models_all[np.argmax(r2_all)]}")
print(f"Best Model (by RMSE): {models_all[np.argmin(rmse_all)]}")


In [None]:
import shap
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

print("Calculating SHAP values for Linear Regression...")

# Create SHAP explainer for the fitted linear model, with the training
# features as background data, then explain every test-set row.
explainer = shap.Explainer(lr_model, X_train)
shap_values = explainer(X_test)

print("✓ SHAP values calculated successfully!")

# Visualization 1: Feature Importance Bar Chart
# Importance of a feature = mean absolute SHAP value over the test rows.
mean_shap = np.abs(shap_values.values).mean(axis=0)
feature_importance_df = (
    pd.DataFrame({'Feature': X_train.columns, 'SHAP_Importance': mean_shap})
    .sort_values('SHAP_Importance', ascending=False)
)

top_10 = feature_importance_df.head(10)
axis_font = dict(fontsize=12, fontweight='bold')

fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(top_10['Feature'], top_10['SHAP_Importance'], color='skyblue', edgecolor='black')
ax.set_xlabel('Mean |SHAP Value|', **axis_font)
ax.set_ylabel('Feature', **axis_font)
ax.set_title('Top 10 Features by SHAP Importance', fontsize=14, fontweight='bold')
ax.invert_yaxis()
ax.grid(axis='x', alpha=0.3)
fig.tight_layout()
plt.show()

rule = "="*60
print("\n" + rule)
print("TOP 10 FEATURES BY SHAP IMPORTANCE")
print(rule)
print(top_10.to_string(index=False))
print(rule)

# Visualization 2: SHAP values for top 3 features
# One panel per feature: feature value on x, its SHAP contribution on y.
top_3_features = feature_importance_df.head(3)['Feature'].values

fig, axes = plt.subplots(1, 3, figsize=(15, 5))

for panel, feature in zip(axes, top_3_features):
    column_pos = X_train.columns.get_loc(feature)
    panel.scatter(
        X_test[feature].values,
        shap_values.values[:, column_pos],
        alpha=0.5,
        edgecolors='k',
    )
    panel.axhline(y=0, color='r', linestyle='--', linewidth=2)
    panel.set_xlabel(f'{feature} Value', fontsize=11, fontweight='bold')
    panel.set_ylabel('SHAP Value', fontsize=11, fontweight='bold')
    panel.set_title(f'Impact of {feature}', fontsize=12, fontweight='bold')
    panel.grid(alpha=0.3)

fig.tight_layout()
plt.show()

# Example Analysis: At-Risk Students
print("\n" + "="*60)
print("AT-RISK STUDENT ANALYSIS")
print("="*60)

# Positional indices (into y_test / X_test / shap_values) of the first three
# test students whose actual score is below 60.
at_risk_indices = np.where(y_test < 60)[0][:3]

if len(at_risk_indices) > 0:
    for idx in at_risk_indices:
        actual_score = y_test.iloc[idx]
        # Double brackets keep a one-row DataFrame, as predict() expects 2-D input.
        predicted_score = lr_model.predict(X_test.iloc[[idx]])[0]

        print(f"\nStudent #{idx}:")
        print(f"  Actual Score: {actual_score:.1f}")
        print(f"  Predicted Score: {predicted_score:.1f}")

        # Get SHAP contributions, largest absolute impact first.
        student_shap = shap_values[idx].values
        contributions = pd.DataFrame({
            'Feature': X_train.columns,
            'SHAP_Value': student_shap,
            'Feature_Value': X_test.iloc[idx].values
        }).sort_values('SHAP_Value', key=abs, ascending=False)

        print("\n  Top Contributing Factors:")
        for i in range(min(5, len(contributions))):
            row = contributions.iloc[i]
            # Arrow shows whether the feature pushed the prediction up or down.
            direction = "↑" if row['SHAP_Value'] > 0 else "↓"
            print(f"    {direction} {row['Feature']}: {row['SHAP_Value']:+.2f} (value: {row['Feature_Value']:.2f})")

    print("="*60)
else:
    print("No at-risk students found in test set")
    print("="*60)

# Visualization 3: Example Student Breakdown
# Bar chart of the ten largest SHAP contributions for the first at-risk student.
if len(at_risk_indices) > 0:
    student_idx = at_risk_indices[0]
    student_shap = shap_values[student_idx].values

    # Positions of the ten largest |SHAP| values, biggest first.
    sorted_indices = np.argsort(np.abs(student_shap))[-10:][::-1]
    top_contributions = student_shap[sorted_indices]
    bar_positions = range(len(sorted_indices))

    # Green = pushes the prediction up, red = pushes it down (or zero).
    bar_colors = ['green' if val > 0 else 'red' for val in top_contributions]

    actual_value = y_test.iloc[student_idx]
    predicted_value = lr_model.predict(X_test.iloc[[student_idx]])[0]

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.barh(bar_positions, top_contributions, color=bar_colors, alpha=0.7, edgecolor='black')
    ax.set_yticks(bar_positions)
    ax.set_yticklabels(X_train.columns[sorted_indices])
    ax.set_xlabel('SHAP Value (Impact on Prediction)', fontsize=12, fontweight='bold')
    ax.set_ylabel('Feature', fontsize=12, fontweight='bold')
    ax.set_title(f'SHAP Explanation for At-Risk Student #{student_idx}\nActual: {actual_value:.1f}, Predicted: {predicted_value:.1f}',
                 fontsize=13, fontweight='bold')
    ax.axvline(x=0, color='black', linestyle='-', linewidth=1)
    ax.grid(axis='x', alpha=0.3)
    fig.tight_layout()
    plt.show()

# Comparison: SHAP vs Random Forest Feature Importance
# Puts the linear model's mean |SHAP| next to the forest's feature_importances_
# for the ten features ranked highest by SHAP.
print("\n" + "="*60)
print("COMPARISON: SHAP vs Random Forest Feature Importance")
print("="*60)

rf_importance = pd.DataFrame({
    'Feature': X_train.columns,
    'RF_Importance': rf_model.feature_importances_
}).sort_values('RF_Importance', ascending=False)

comparison = feature_importance_df.merge(rf_importance, on='Feature')
comparison = comparison.sort_values('SHAP_Importance', ascending=False).head(10)

print(comparison.to_string(index=False))
print("="*60)

print("\n✓ SHAP Analysis Complete!")


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns

print("Building Risk Classification System...")

# Use Linear Regression predictions for the entire dataset.
# Select exactly the columns the model was trained on (X.columns) rather than
# "everything except Exam_Score": once this cell (or a later one) has added
# columns such as Predicted_Score or Risk_Level to df, the old selection passed
# them to predict() and a re-run failed with a feature mismatch.
# Note: df includes the rows the model was fitted on, so agreement measured on
# these predictions is not a held-out estimate.
df['Predicted_Score'] = lr_model.predict(df[X.columns])

# Define Risk Levels based on exam scores (predicted or actual)
def classify_risk(score, high_cutoff=75, medium_cutoff=60):
    """Map an exam score to a risk label.

    Parameters
    ----------
    score : float
        Exam score to classify.
    high_cutoff : float, default 75
        Scores at or above this value are labelled "High Performer".
    medium_cutoff : float, default 60
        Scores at or above this value (and below ``high_cutoff``) are labelled
        "Medium Risk".

    Returns
    -------
    str
        "High Performer", "Medium Risk" or "At Risk". A NaN score fails both
        comparisons and is therefore labelled "At Risk".
    """
    if score >= high_cutoff:
        return "High Performer"
    if score >= medium_cutoff:
        return "Medium Risk"
    return "At Risk"

# Label every student twice with the same rule: once from the predicted score
# and once from the actual exam score.
df['Risk_Level'] = df['Predicted_Score'].apply(classify_risk)
df['Actual_Risk_Level'] = df['Exam_Score'].apply(classify_risk)

# Statistics
rule = "="*60
print("\n" + rule)
print("RISK CLASSIFICATION RESULTS")
print(rule)

total_students = len(df)
report_order = ["High Performer", "Medium Risk", "At Risk"]

risk_counts = df['Risk_Level'].value_counts()
print("\nPredicted Risk Distribution:")
for level in report_order:
    n_students = risk_counts.get(level, 0)
    print(f"  {level}: {n_students} students ({n_students / total_students * 100:.1f}%)")

print("\nActual Risk Distribution:")
actual_counts = df['Actual_Risk_Level'].value_counts()
for level in report_order:
    n_students = actual_counts.get(level, 0)
    print(f"  {level}: {n_students} students ({n_students / total_students * 100:.1f}%)")

# Accuracy: share of students whose predicted label equals the actual label.
n_matching = (df['Risk_Level'] == df['Actual_Risk_Level']).sum()
accuracy = n_matching / total_students * 100
print(f"\nRisk Classification Accuracy: {accuracy:.2f}%")
print(rule)

# Visualization 1: Risk Distribution Comparison
# Two bar panels (predicted vs actual), each bar annotated with count and share.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

colors = ['red', 'orange', 'green']
risk_order = ["At Risk", "Medium Risk", "High Performer"]

predicted_data = [risk_counts.get(risk, 0) for risk in risk_order]
actual_data = [actual_counts.get(risk, 0) for risk in risk_order]

panels = (
    (ax1, predicted_data, 'Predicted Risk Distribution'),
    (ax2, actual_data, 'Actual Risk Distribution'),
)
for panel, heights, panel_title in panels:
    bars = panel.bar(risk_order, heights, color=colors, alpha=0.7, edgecolor='black', linewidth=2)
    panel.set_ylabel('Number of Students', fontsize=12, fontweight='bold')
    panel.set_title(panel_title, fontsize=14, fontweight='bold')
    panel.grid(axis='y', alpha=0.3)
    # Write "<count>\n(<share>%)" just above each bar.
    for bar, count in zip(bars, heights):
        panel.text(bar.get_x() + bar.get_width()/2., bar.get_height(),
                   f'{int(count)}\n({count/len(df)*100:.1f}%)',
                   ha='center', va='bottom', fontsize=10, fontweight='bold')

fig.tight_layout()
plt.show()

# Visualization 2: Confusion Matrix
# Rows are actual levels, columns are predicted levels, both in risk_order.
cm = confusion_matrix(df['Actual_Risk_Level'], df['Risk_Level'], labels=risk_order)

heatmap_style = dict(
    annot=True, fmt='d', cmap='RdYlGn', cbar=True,
    xticklabels=risk_order, yticklabels=risk_order,
    linewidths=2, linecolor='black', annot_kws={'size': 14, 'weight': 'bold'},
)

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cm, ax=ax, **heatmap_style)
ax.set_xlabel('Predicted Risk Level', fontsize=13, fontweight='bold')
ax.set_ylabel('Actual Risk Level', fontsize=13, fontweight='bold')
ax.set_title('Risk Classification Confusion Matrix', fontsize=15, fontweight='bold', pad=15)
fig.tight_layout()
plt.show()

# At-Risk Students Analysis
# Split the frame by predicted risk level and print a few averages per group.
risk_groups = {
    label: df[df['Risk_Level'] == label]
    for label in ("At Risk", "Medium Risk", "High Performer")
}
at_risk = risk_groups["At Risk"]

print("\n" + "="*60)
print("DETAILED ANALYSIS BY RISK LEVEL")
print("="*60)

for label, group in risk_groups.items():
    group_share = len(group)/len(df)*100
    print(f"\n{label}: {len(group)} students ({group_share:.1f}%)")
    print(f"  Mean Exam Score: {group['Exam_Score'].mean():.2f}")
    print(f"  Mean Attendance: {group['Attendance'].mean():.1f}%")
    print(f"  Mean Hours Studied: {group['Hours_Studied'].mean():.1f}")
    print(f"  Mean Access to Resources: {group['Access_to_Resources'].mean():.2f}/3")

print("="*60)

# Visualization 3: Feature Comparison Box Plots
# One panel per feature, one box per predicted risk level (in risk_order).
features = ['Exam_Score', 'Attendance', 'Hours_Studied', 'Access_to_Resources']
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

for idx, feature in enumerate(features):
    ax = axes[idx // 2, idx % 2]

    data_list = [df[df['Risk_Level'] == risk][feature] for risk in risk_order]

    # Name the boxes through the tick labels: boxplot's `labels=` keyword is
    # deprecated since matplotlib 3.9, while set_xticklabels works on every version.
    bp = ax.boxplot(data_list, patch_artist=True)
    ax.set_xticklabels(risk_order)

    # Colour each box with its risk level's colour.
    for patch, color in zip(bp['boxes'], colors):
        patch.set_facecolor(color)
        patch.set_alpha(0.6)
        patch.set_edgecolor('black')
        patch.set_linewidth(2)

    ax.set_ylabel(feature, fontsize=11, fontweight='bold')
    ax.set_title(f'{feature} Distribution by Risk Level', fontsize=12, fontweight='bold')
    ax.grid(alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

# Sample At-Risk Students with Recommendations
# For the first five students predicted "At Risk", print their key figures and
# rule-based suggestions driven by simple thresholds on four columns.
print("\n" + "="*60)
print("SAMPLE AT-RISK STUDENTS - INTERVENTION RECOMMENDATIONS")
print("="*60)

at_risk_sample = at_risk.head(5)

for idx, row in at_risk_sample.iterrows():
    # idx is the dataframe index label of the student, not Student_ID.
    print(f"\nStudent #{idx}:")
    print(f"  Predicted Score: {row['Predicted_Score']:.1f}")
    print(f"  Actual Score: {row['Exam_Score']:.1f}")
    print(f"  Attendance: {row['Attendance']:.0f}%")
    print(f"  Hours Studied: {row['Hours_Studied']:.0f}/week")
    print("  Recommendations:")

    if row['Attendance'] < 80:
        print(f"    📌 Improve attendance (current: {row['Attendance']:.0f}%, target: 85%+)")
    if row['Hours_Studied'] < 20:
        print(f"    📚 Increase study hours (current: {row['Hours_Studied']:.0f}h, target: 25h+)")
    # Access_to_Resources / Parental_Involvement are ordinal codes; < 2 means
    # the lowest level.
    if row['Access_to_Resources'] < 2:
        print("    💻 Provide additional learning resources")
    if row['Parental_Involvement'] < 2:
        print("    👪 Encourage parental engagement")

print("="*60)

print("\n✓ Risk Classification Complete!")
print(f"🚨 Total At-Risk Students: {len(at_risk)} ({len(at_risk)/len(df)*100:.1f}%)")

# ============================================================================
# REAL STUDENT DATA INTEGRATION
# ============================================================================
# Purpose: Create database with real student information (names, emails, photos)
# for the interactive system
# ============================================================================

In [None]:
# import pandas as pd

# # Create real students data
# # Replace with actual names, emails, and photo paths

# real_students = pd.DataFrame({
#     'Student_ID': [1, 2, 3, 4, 5],  # IDs that exist in your original data

#     # PUT REAL NAMES HERE
#     'Real_Name': [
#         'Shimaa Mousaa',
#         'Sara Mohamed',
#         'Nour Ali',
#         'Omar Khaled',
#         'Layla Ibrahim'
#     ],

#     # PUT REAL EMAILS HERE
#     'Real_Email': [
#         'shimaamousaa77@gmail.com',
#         'sara@example.com',
#         'nour@example.com',
#         'omar@example.com',
#         'layla@example.com'
#      ]
#     #

#     # # Photo paths (we'll upload photos later)
#     # 'Photo_Path': [
#     #     'photos/ahmed.jpg',
#     #     'photos/sara.jpg',
#     #     'photos/nour.jpg',
#     #     'photos/omar.jpg',
#     #     'photos/layla.jpg'
#     # ]
# })

# # Show the data
# print("Real Students Data:")
# print(real_students)

# # Save to CSV
# real_students.to_csv('real_students.csv', index=False)
# print("\nâœ“ File saved: real_students.csv")

In [None]:
# # Load real students data
# real_students = pd.read_csv('real_students.csv')

# print("="*60)
# print("MERGING DATA")
# print("="*60)

# # Merge the two dataframes
# # This will keep ONLY students who have real data
# merged_data = df.merge(real_students, on='Student_ID', how='inner')

# print(f"\nOriginal data: {len(df)} students")
# print(f"Real students data: {len(real_students)} students")
# print(f"Merged data: {len(merged_data)} students")

# # Show sample
# print("\nSample of merged data:")
# print(merged_data[['Student_ID', 'Real_Name', 'Real_Email',
#                    'Attendance', 'Hours_Studied', 'Exam_Score']].head())

# # Save merged data
# merged_data.to_csv('students_with_real_info.csv', index=False)
# print("\nâœ“ Merged data saved: students_with_real_info.csv")


<!-- visualization -->

In [None]:
# visualization

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Risk levels ordered from worst to best, each paired with its display colour.
risk_palette = {"At Risk": 'red', "Medium Risk": 'orange', "High Performer": 'green'}
risk_order = list(risk_palette)
colors = list(risk_palette.values())

In [None]:
# Bar plots comparing the predicted and the actual risk distribution.
fig, axes = plt.subplots(1, 2, figsize=(14,6))

pred_counts = df['Risk_Level'].value_counts()
act_counts = df['Actual_Risk_Level'].value_counts()

panel_specs = (
    (axes[0], pred_counts, "Predicted Risk Distribution"),
    (axes[1], act_counts, "Actual Risk Distribution"),
)
for panel, level_counts, panel_title in panel_specs:
    heights = [level_counts.get(r,0) for r in risk_order]
    panel.bar(risk_order, heights, color=colors, alpha=0.7, edgecolor='black')
    panel.set_title(panel_title, fontsize=14, fontweight='bold')
    panel.set_ylabel("Number of Students")
    # Print the count slightly above each bar.
    for position, count in enumerate(heights):
        panel.text(position, count + 5, str(count), ha='center', fontweight='bold')

fig.tight_layout()
plt.show()


In [None]:
# Confusion Matrix: actual (rows) vs predicted (columns) risk level.
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(df['Actual_Risk_Level'], df['Risk_Level'], labels=risk_order)

fig, ax = plt.subplots(figsize=(8,6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='RdYlGn',
    xticklabels=risk_order,
    yticklabels=risk_order,
    ax=ax,
)
ax.set_xlabel("Predicted Risk Level", fontsize=12)
ax.set_ylabel("Actual Risk Level", fontsize=12)
ax.set_title("Confusion Matrix of Risk Classification", fontsize=14, fontweight='bold')
plt.show()

In [None]:
# Boxplots of key features by predicted risk level
features = ['Exam_Score', 'Attendance', 'Hours_Studied', 'Previous_Scores']
fig, axes = plt.subplots(2, 2, figsize=(14,10))

for ax, feature in zip(axes.flat, features):
    # One series per risk level, in risk_order, so boxes and colours line up.
    data_list = [df[df['Risk_Level']==risk][feature] for risk in risk_order]
    bp = ax.boxplot(data_list, patch_artist=True)
    # Without explicit tick labels the boxes are only numbered 1, 2, 3.
    ax.set_xticklabels(risk_order)
    for patch, color in zip(bp['boxes'], colors):
        patch.set_facecolor(color)
        patch.set_alpha(0.6)
        patch.set_edgecolor('black')
        patch.set_linewidth(2)
    ax.set_title(f"{feature} by Risk Level", fontsize=12, fontweight='bold')
    ax.set_ylabel(feature)
plt.tight_layout()
plt.show()

In [None]:
# Scatter plot of two key features, coloured by predicted risk level
plt.figure(figsize=(8,6))
# Tie each colour to its level explicitly. With a bare colour list seaborn
# assigns colours in the order levels first appear in the data, so red could
# end up on "High Performer". hue_order also fixes the legend order.
sns.scatterplot(
    data=df,
    x='Attendance',
    y='Hours_Studied',
    hue='Risk_Level',
    hue_order=risk_order,
    palette=dict(zip(risk_order, colors)),
)
plt.title("Attendance vs Hours Studied by Risk Level", fontsize=14, fontweight='bold')
plt.xlabel("Attendance (%)")
plt.ylabel("Hours Studied per week")
plt.grid(alpha=0.3)
plt.show()

In [None]:
# This scatter plot shows the relationship between students' Attendance and Hours Studied per week, colored by their Risk Level.
# It helps to identify patterns such as students who study a lot but have low attendance, or students with high attendance but fewer study hours.
# Clusters indicate different student profiles and potential academic risks.

In [None]:
# Correlation Heatmap
# Pairwise correlations between all numeric-dtype columns of df.
numeric_corr = df.select_dtypes(include='number').corr()

fig, ax = plt.subplots(figsize=(12,10))
sns.heatmap(numeric_corr, annot=True, cmap='coolwarm', fmt=".2f", linewidths=1, ax=ax)
ax.set_title("Correlation Heatmap", fontsize=16, fontweight='bold')
plt.show()

In [None]:
# This heatmap shows the correlations between numeric student features like Hours_Studied, Attendance, Motivation_Level, Previous_Scores, and Sleep_Hours.
# Dark red or dark blue cells highlight strong positive or negative relationships.
# For example, higher Attendance and Hours_Studied tend to correlate with better Exam_Score, while low Sleep_Hours might negatively relate to performance.
# This visualization helps us understand which factors most influence student outcomes and can guide the student clustering and risk-level analysis

In [None]:
# Feature Importance Plot
# Shows the ten features the Random Forest ranks as most important.

imp_df = (
    pd.DataFrame({'Feature': X.columns, 'Importance': rf_model.feature_importances_})
    .sort_values(by='Importance', ascending=False)
    .head(10)
)

fig, ax = plt.subplots(figsize=(10,6))
ax.barh(imp_df['Feature'], imp_df['Importance'])
ax.invert_yaxis()  # most important feature on top
ax.set_title("Top 10 Feature Importance", fontweight='bold')
ax.set_xlabel("Importance Score")
plt.show()


In [None]:
# Distribution of Exam Scores (Histogram)
# Shows how student exam scores are distributed.

fig, ax = plt.subplots(figsize=(8,6))
ax.hist(df['Exam_Score'], bins=30)
ax.set_title("Distribution of Exam Scores", fontweight='bold')
ax.set_xlabel("Exam Score")
ax.set_ylabel("Number of Students")
plt.show()


In [None]:
# This plot shows how students scored on exams.
# Most students got average scores, while fewer students scored very high or very low.
# It helps us see overall performance and identify students who may need extra support.

In [None]:
# Risk Level Percentage Pie Chart
# Shows the share of students in each predicted risk level.

import matplotlib.pyplot as plt
import numpy as np

risk_counts = df['Risk_Level'].value_counts()
total_students = risk_counts.sum()

# Enlarge the wedges of the two outer groups so small slices stay readable.
# A per-level factor is used instead of assigning to scaled_counts['At Risk'],
# which raised KeyError whenever a level had no students.
enlarged_levels = ("At Risk", "High Performer")
scale_factors = [4 if level in enlarged_levels else 1 for level in risk_counts.index]
scaled_counts = risk_counts * scale_factors

# Wedge areas are deliberately distorted, so each label carries the TRUE count
# and share. The previous autopct derived its numbers from the scaled
# percentages, which matched neither the real nor the scaled counts.
wedge_labels = [
    f"{level}\n{count} ({count / total_students:.1%})"
    for level, count in risk_counts.items()
]

plt.figure(figsize=(7,7))
plt.pie(
    scaled_counts,
    labels=wedge_labels,
    startangle=140,
    wedgeprops={'edgecolor':'black'}
)
plt.title("Risk Level Percentage (Enhanced Small Pieces)", fontweight='bold')
plt.show()



In [None]:
# This pie chart shows the distribution of students by risk level. Small groups,
# like "At Risk" and "High Performer," are scaled up to make them more visible.
# It helps highlight all categories clearly, even the ones with few students.

In [None]:
# Average Score by Category (Bar Chart)
# Grouped by Parental_Involvement (Teacher_Quality would work the same way).

avg_scores = df.groupby('Parental_Involvement')['Exam_Score'].mean()

# Parental_Involvement was ordinal-encoded earlier via ordinal_mapping, so the
# group keys are the codes 1/2/3. Translate them back to Low/Medium/High for
# readable tick labels; keys missing from the mapping are left unchanged.
code_to_label = {code: label for label, code in ordinal_mapping.items()}
avg_scores = avg_scores.rename(index=code_to_label)

avg_scores.plot(kind='bar', figsize=(8,6))
plt.title("Average Exam Score by Parental Involvement", fontweight='bold')
plt.ylabel("Average Score")
plt.show()


In [None]:
# This bar chart shows the average exam scores for students based on their level of parental involvement.
# Higher parental involvement tends to be associated with higher exam scores,
# indicating the positive impact of parents supporting their children's learning.

In [None]:
# Pairplot of key variables
# Shows their pairwise relationships, coloured by predicted risk level.

pairplot_columns = ['Attendance','Hours_Studied','Previous_Scores','Exam_Score','Risk_Level']
pairplot_data = df[pairplot_columns]
sns.pairplot(pairplot_data, hue='Risk_Level')
plt.show()


In [None]:
# This pairplot visualizes the relationships between key academic features: Attendance, Hours Studied, Previous Scores, and Exam Score, colored by Risk Level.
# It helps identify patterns such as students with higher attendance and study hours generally achieving higher scores,
# while Risk Level highlights which students might be underperforming or at risk.

In [None]:
# Other features

In [None]:
# Feature: Lifestyle Balance
# Weighted sum of sleep (40%), physical activity (30%) and study hours (30%).

# Study hours as a fraction of the largest value in the data, kept in a local
# series instead of a temporary dataframe column.
hours_fraction = df['Hours_Studied'] / df['Hours_Studied'].max()

# The study-hours fraction is multiplied by 10 before weighting.
sleep_part = df['Sleep_Hours'] * 0.4
activity_part = df['Physical_Activity'] * 0.3
study_part = hours_fraction * 10 * 0.3

df['Lifestyle_Balance'] = sleep_part + activity_part + study_part


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Box plot of the Lifestyle_Balance score for each predicted risk level.
fig, ax = plt.subplots(figsize=(8,6))
sns.boxplot(
    x='Risk_Level',
    y='Lifestyle_Balance',
    data=df,
    order=["At Risk","Medium Risk","High Performer"],
    ax=ax,
)

ax.set_title("Lifestyle Balance by Risk Level", fontweight='bold')
ax.set_xlabel("Risk Level")
ax.set_ylabel("Lifestyle Balance Score")
plt.show()
# Shows whether students with a more balanced lifestyle tend to fall into lower-risk groups.


In [None]:
# The box plot compares lifestyle balance indicators across different student groups, highlighting differences in median values,
# variability, and potential outliers. It provides a clear summary of how lifestyle habits vary between performance or risk categories.