In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the raw student dataset from its CSV export into a DataFrame.
df = pd.read_csv(
    r"C:/Users/DELL/Downloads/ethiopian_students_dataset.csv"
)

# Preview the first five records.
print(df.head(5))

# List every column label in the dataset.
print(df.columns)

In [None]:
# 1. Overall size of the dataset
print("="*60, "DATA STRUCTURE ANALYSIS", "="*60, sep="\n")

print(f"Dataset Shape: {df.shape}")
print(f"Number of students: {df.shape[0]}")
print(f"Number of columns: {df.shape[1]}")
print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

# 2. Bucket the columns by the metric keyword embedded in their names
print("\n" + "="*60, "COLUMN CATEGORIES", "="*60, sep="\n")

test_score_cols = list(filter(lambda name: 'Test_Score' in name, df.columns))
attendance_cols = list(filter(lambda name: 'Attendance' in name, df.columns))
homework_cols = list(filter(lambda name: 'Homework' in name, df.columns))
participation_cols = list(filter(lambda name: 'Participation' in name, df.columns))
textbook_cols = list(filter(lambda name: 'Textbook' in name, df.columns))

print(f"Test Score columns: {len(test_score_cols)}")
print(f"Attendance columns: {len(attendance_cols)}")
print(f"Homework columns: {len(homework_cols)}")
print(f"Participation columns: {len(participation_cols)}")
print(f"Textbook columns: {len(textbook_cols)}")

# 3. For every grade, list the subjects that have a test-score column.
#    Column names look like 'Grade_<n>_<Subject>_Test...', so the subject is
#    whatever sits between the grade marker and '_Test'.
print("\n" + "="*60, "SUBJECT COVERAGE BY GRADE", "="*60, sep="\n")

for grade in range(1, 13):
    marker = f'Grade_{grade}_'
    grade_test_cols = [col for col in test_score_cols if marker in col]
    if not grade_test_cols:
        continue
    subjects = list({col.split(marker)[1].split('_Test')[0]
                     for col in grade_test_cols})
    print(f"Grade {grade}: {len(grade_test_cols)} subjects - {subjects}")

# Everything that did not fall into one of the five metric buckets above
all_categorized_cols = set().union(
    test_score_cols, attendance_cols, homework_cols, participation_cols, textbook_cols
)
remaining_cols = list(filter(lambda name: name not in all_categorized_cols, df.columns))

print("\n" + "="*60, "REMAINING COLUMNS", "="*60, sep="\n")
print(f"Number of remaining columns: {len(remaining_cols)}")
print("Remaining columns (first 20):\n", remaining_cols[:20])
print("Remaining columns (last 20):\n", remaining_cols[-20:])

In [None]:
# Absolute number of missing values in each column
print(
    "==============================",
    "CHECKING FOR MISSING DATA",
    "==============================",
    sep="\n",
)
null_counts = df.isnull().sum()
print(null_counts)

# Share of missing values in each column, expressed in percent
print(
    "================================",
    "Percentage of missing data:",
    "--------------------------------",
    sep="\n",
)
null_percent = df.isnull().mean() * 100
print(null_percent)

In [None]:
# 1. Count how many columns share each dtype and tidy the result into a
#    two-column frame that seaborn can consume.
dtype_counts = df.dtypes.value_counts().reset_index()
dtype_counts.columns = ['Data Type', 'Count']

# 2. Bar chart of the counts. The x variable is also passed as hue (with the
#    legend switched off) so that a palette can be applied per bar.
plt.figure(figsize=(10, 6))

barplot_options = dict(
    x='Data Type',
    y='Count',
    hue='Data Type',
    palette='viridis',
    legend=False,
)
sns.barplot(data=dtype_counts, **barplot_options)

plt.title('Distribution of Data Types in Student Dataset', fontsize=14)
plt.ylabel('Number of Columns')
plt.xlabel('Data Type')

# 3. Write each bar's count slightly above the bar
for bar_position, n_columns in enumerate(dtype_counts['Count']):
    plt.text(bar_position, n_columns + 5, str(n_columns),
             ha='center', fontweight='bold')

plt.show()

In [None]:
# ================================
# 1. INITIAL CLEANING & ENCODING
# ================================
# Drop Student_ID (never used in ML); errors='ignore' keeps the statement
# harmless when the column is already gone.
df = df.drop(columns=['Student_ID'], errors='ignore')

# Encode Field_Choice (Social=0, Natural=1).
# Only map while the column still holds text labels: mapping a column that
# is already numeric would turn every value into NaN (e.g. when this cell is
# executed a second time), and the track assignment further down would then
# treat every student as a non-Social student.
# Labels other than 'Social' / 'Natural' become NaN.
if not pd.api.types.is_numeric_dtype(df['Field_Choice']):
    df['Field_Choice'] = df['Field_Choice'].map({'Social': 0, 'Natural': 1})

# Fill missing Career_Interest with "Unknown"
df['Career_Interest'] = df['Career_Interest'].fillna('Unknown')


# ================================
# 2. DEFINE EDUCATION STAGES
# ================================
# Each stage is described by the column-name prefixes of the grades it covers.
lower_primary = ['Grade_1', 'Grade_2', 'Grade_3', 'Grade_4']
upper_primary = ['Grade_5', 'Grade_6', 'Grade_7', 'Grade_8']
secondary     = ['Grade_9', 'Grade_10']
preparatory   = ['Grade_11', 'Grade_12']

# Stage label (used in the aggregated column names) -> grade prefixes
stages = {
    'Lower_Primary': lower_primary,
    'Upper_Primary': upper_primary,
    'Secondary': secondary,
    'Preparatory': preparatory
}


# ================================
# 3. HELPER FUNCTION TO AGGREGATE GRADES
# ================================
def stage_average(df, grades, metric_keywords):
    """
    Row-wise mean over every column that belongs to the given grades and metrics.

    A column is selected when its name starts with one of the grade prefixes
    followed by an underscore and contains at least one metric keyword
    (keywords are compared case-insensitively). Anchoring the prefix on the
    underscore matters: a plain ``startswith('Grade_1')`` would also select
    the 'Grade_10_...', 'Grade_11_...' and 'Grade_12_...' columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column names follow the 'Grade_<n>_...' pattern.
    grades : iterable of str
        Grade prefixes such as ``['Grade_1', 'Grade_2']``. A trailing
        underscore is accepted and not duplicated.
    metric_keywords : iterable of str
        Substrings identifying the metric, e.g. ``['Test_Score']``.

    Returns
    -------
    tuple (pandas.Series, list of str)
        The per-row mean of the selected columns and the list of selected
        column names (each name once, in the frame's column order). When no
        column matches, the list is empty and the mean is taken over an
        empty selection.
    """
    # str.startswith accepts a tuple, so all grade prefixes are tested at once.
    prefixes = tuple(g if g.endswith('_') else g + '_' for g in grades)
    keywords = [keyword.lower() for keyword in metric_keywords]

    matched = [
        c for c in df.columns
        if c.startswith(prefixes) and any(k in c.lower() for k in keywords)
    ]
    # Deduplicate while keeping a deterministic order.
    cols = list(dict.fromkeys(matched))
    return df[cols].mean(axis=1), cols


# ================================
# 4. AGGREGATE TEST SCORE, ATTENDANCE, HW, PARTICIPATION
# ================================
# Label used in the new column name -> keywords that identify the raw columns
metrics_dict = dict(
    Test_Score=['Test_Score'],
    Attendance=['Attendance'],
    HW_Completion=['Homework_Completion'],
    Participation=['Participation'],
)

cols_to_drop = []

# One 'Avg_<metric>_<stage>' column per (metric, stage) pair. The raw columns
# are only collected here and removed after the loop, so every pair is
# averaged from the complete set of grade-level columns.
for metric_name in metrics_dict:
    keywords = metrics_dict[metric_name]
    for stage_name in stages:
        grades = stages[stage_name]
        col_name = f'Avg_{metric_name}_{stage_name}'
        stage_mean, original_cols = stage_average(df, grades, keywords)
        df[col_name] = stage_mean
        cols_to_drop.extend(original_cols)

# Remove the grade-level columns that were folded into the averages
df.drop(columns=list(set(cols_to_drop)), inplace=True)

# Names of the freshly created columns, in creation order, for display
aggregated_cols = [
    f'Avg_{m}_{s}'
    for m in metrics_dict
    for s in stages
]
print("Aggregated averages per Education Stage (head):")
print(df[aggregated_cols].head())


# ================================
# 5. AGGREGATE TEXTBOOK ACCESS
# ================================
# Recode the Yes/No textbook flags as 1/0 so they can be averaged;
# infer_objects lets pandas pick a proper dtype for the recoded column.
textbook_cols = [name for name in df.columns if 'Textbook' in name]
for col in textbook_cols:
    recoded = df[col].replace({'Yes': 1, 'No': 0})
    df[col] = recoded.infer_objects(copy=False)

# Helper function for textbook access per stage
def textbook_access(df, grade_prefixes):
    """
    Row-wise mean of the textbook columns that belong to the given grades.

    A column is used when its name starts with one of the grade prefixes
    followed by an underscore and contains 'Textbook'. Anchoring on the
    underscore prevents 'Grade_1' from also selecting 'Grade_10_...',
    'Grade_11_...' and 'Grade_12_...' columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the textbook columns.
    grade_prefixes : iterable of str
        Grade prefixes such as ``['Grade_1', 'Grade_2']``. A trailing
        underscore is accepted and not duplicated.

    Returns
    -------
    pandas.Series
        Mean of the matching columns per row, or a Series of zeros aligned
        to ``df.index`` when no column matches.
    """
    # str.startswith accepts a tuple, so all grade prefixes are tested at once.
    prefixes = tuple(g if g.endswith('_') else g + '_' for g in grade_prefixes)
    cols = [c for c in df.columns if c.startswith(prefixes) and 'Textbook' in c]
    if not cols:
        return pd.Series(0, index=df.index)
    return df[cols].mean(axis=1)

# One textbook-access score per education stage
new_cols_df = pd.DataFrame({
    label: textbook_access(df, grade_list)
    for label, grade_list in [
        ('Textbook_Access_1_4', lower_primary),
        ('Textbook_Access_5_8', upper_primary),
        ('Textbook_Access_9_10', secondary),
        ('Textbook_Access_11_12', preparatory),
    ]
})

# Append the new columns; if a column of the same name already exists,
# only its first occurrence is kept.
df = pd.concat([df, new_cols_df], axis=1)
is_repeated = df.columns.duplicated()
df = df.loc[:, ~is_repeated]

# Show a sample and the distribution of the stage-level scores
textbook_summary_cols = [c for c in new_cols_df if c in df.columns]
print(df[textbook_summary_cols].head())

plt.figure(figsize=(10, 6))
sns.boxplot(data=df[textbook_summary_cols])
plt.title('Textbook Access Distribution by Education Level',
          fontsize=14, fontweight='bold')
plt.ylabel('Access Score (0 to 1)')
plt.xticks(rotation=15)
plt.grid(alpha=0.3)
plt.show()


# ================================
# 6. TRACK-BASED NATIONAL EXAMS
# ================================
# National-exam subjects that belong to each track
social_subjects = ['National_Exam_History', 'National_Exam_Geography',
                   'National_Exam_Economics', 'National_Exam_Math_Social']

natural_subjects = ['National_Exam_Biology', 'National_Exam_Chemistry',
                    'National_Exam_Physics', 'National_Exam_Math_Natural']

# Per-student mean over each track's subjects (computed for everyone)
df['Social_Track_Subject_Avg']  = df[social_subjects].mean(axis=1)
df['Natural_Track_Subject_Avg'] = df[natural_subjects].mean(axis=1)

# Pick the average that matches the student's own track
# (Field_Choice: 0 = Social, 1 = Natural). Both tracks are matched
# explicitly so that a missing or unexpected Field_Choice yields NaN
# instead of silently receiving the Natural-track average.
df['Track_Subject_Average'] = np.select(
    [df['Field_Choice'] == 0, df['Field_Choice'] == 1],
    [df['Social_Track_Subject_Avg'], df['Natural_Track_Subject_Avg']],
    default=np.nan
)

# Subjects taken by every student regardless of track
common_subjects = ['National_Exam_Aptitude', 'National_Exam_English',
                   'National_Exam_Civics_and_Ethical_Education']
df['Common_Exam_Average'] = df[common_subjects].mean(axis=1)

# Overall exam average: unweighted mean of the common-subject average and
# the track-subject average (not a mean over the individual subjects).
df['Track_Exam_Average'] = (df['Common_Exam_Average'] + df['Track_Subject_Average']) / 2

# Display new exam columns
exam_cols = [
    'Social_Track_Subject_Avg',
    'Natural_Track_Subject_Avg',
    'Track_Subject_Average',
    'Common_Exam_Average',
    'Track_Exam_Average'
]
print("New Aggregated National Exam Features:")
print(df[exam_cols].head())


# ================================
# 7. VISUALIZATION: Exam Scores
# ================================
sns.set_theme(style="whitegrid")
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
box_ax, kde_ax = axes

# Left panel: spread of the common, track-specific and overall averages
aggregate_score_cols = ['Common_Exam_Average', 'Track_Subject_Average', 'Track_Exam_Average']
sns.boxplot(data=df[aggregate_score_cols], ax=box_ax, palette="Set2")
box_ax.set_title('Distribution of Aggregate Exam Scores')
box_ax.set_ylabel('Score (0-100)')

# Right panel: density of the overall average, one curve per track
track_labels = {0: 'Social Science', 1: 'Natural Science'}
for choice, label in track_labels.items():
    subset = df[df['Field_Choice'] == choice]
    sns.kdeplot(subset['Track_Exam_Average'], ax=kde_ax, label=label, fill=True)

kde_ax.set_title('Track Exam Average: Social vs. Natural')
kde_ax.set_xlabel('Score')
kde_ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Remove the remaining per-grade and per-subject national-exam columns
grade_level_cols = [c for c in df.columns if c.startswith('Grade_')]
national_exam_cols = [c for c in df.columns if c.startswith('National_Exam_')]
drop_cols = grade_level_cols + national_exam_cols

df = df.drop(columns=drop_cols)

# -------------------------------
# 0. Drop leaking exam average columns
# -------------------------------
# Score-derived aggregates plus School_ID; names that are not present in the
# frame are skipped.
leak_cols = [
    'Total_National_Exam_Score',
    'Social_Track_Subject_Avg',
    'Natural_Track_Subject_Avg',
    'Track_Exam_Average',
    'Track_Subject_Average',
    'Common_Exam_Average',
    'Avg_Score_Secondary',
    'Avg_Score_Preparatory',
    'Avg_Score_Lower_Primary',
    'Avg_Score_Upper_Primary',
    'Avg_Test_Score_Secondary',
    'Avg_Test_Score_Preparatory',
    'Avg_Test_Score_Lower_Primary',
    'Avg_Test_Score_Upper_Primary',
    'School_ID',
    'Total_Test_Score',
]

df = df.drop(columns=leak_cols, errors='ignore')

# Replace missing categorical values with an explicit placeholder label
placeholder_by_column = {
    'Health_Issue': 'No Issue',
    'Father_Education': 'Unknown',
    'Mother_Education': 'Unknown',
}
for column_name, placeholder in placeholder_by_column.items():
    df[column_name] = df[column_name].fillna(placeholder)

# Final look at the reduced frame
print(df.shape)
print(df.head())
print("all columns:", df.columns)

In [None]:
# Column that the models in this notebook predict
TARGET = 'Overall_Average'

# Select numeric columns only (int64 / float64)
num_cols = df.select_dtypes(include=['int64', 'float64']).columns

# Pearson correlation of every numeric feature with the target, strongest
# (by absolute value) first; the target's correlation with itself is removed.
corr_numeric = (
    df[num_cols]
    .corr()[TARGET]
    .drop(TARGET)
    .sort_values(key=abs, ascending=False)
)

print("üìä Numeric Feature Correlation with Target:")
print(corr_numeric)

plt.figure(figsize=(10, 8))

# The feature name doubles as hue (legend off) so the palette is applied per bar
sns.barplot(
    x=corr_numeric.values,
    y=corr_numeric.index,
    hue=corr_numeric.index,
    palette='coolwarm',
    legend=False
)

plt.axvline(0, color='black', linewidth=1)
# Build the title from TARGET so the heading always names the column that
# was actually correlated.
plt.title(f'Correlation with {TARGET}')
plt.xlabel('Pearson Correlation')
plt.tight_layout()
plt.show()

In [None]:
# Text-valued columns (the target itself is excluded if it happens to be one)
cat_cols = df.select_dtypes(include='object').columns.drop(TARGET, errors='ignore')

# For each categorical column: replace every category by the mean target of
# that category, then correlate the resulting numeric column with the target.
cat_corr = pd.Series({
    col: df[col].map(df.groupby(col)[TARGET].mean()).corr(df[TARGET])
    for col in cat_cols
})

# Strongest association (by absolute value) first
cat_corr = cat_corr.sort_values(key=abs, ascending=False)

print("üìä Categorical Feature Association with Target:")
print(cat_corr)

In [None]:
# ================================
# ALL-IN-ONE CATEGORICAL ENCODING
# ================================

# -------------------------------
# 1. Fill missing / fix NaNs
# -------------------------------
# Column -> placeholder label used for its missing values
fill_values = {
    'Health_Issue': 'No Issue',
    'Father_Education': 'Unknown',
    'Mother_Education': 'Unknown',
}
for col, placeholder in fill_values.items():
    if col in df.columns:
        df[col] = df[col].fillna(placeholder)

# -------------------------------
# 2. Binary encoding
# -------------------------------
# Two-level categoricals mapped to 0/1; any value missing from a mapping
# becomes NaN.
binary_maps = {
    'Gender': {'Male': 0, 'Female': 1},
    'Home_Internet_Access': {'No': 0, 'Yes': 1},
    'Electricity_Access': {'No': 0, 'Yes': 1},
    'School_Location': {'Rural': 0, 'Urban': 1}
}

for col, mapping in binary_maps.items():
    if col not in df.columns:
        continue
    df[col] = df[col].map(mapping)

# -------------------------------
# 3. Ordinal encoding (Parents Education)
# -------------------------------
# Ordered education levels; the raw text column is replaced by '<col>_Encoded'.
edu_map = {'Unknown': 0, 'Primary': 1, 'High School': 2, 'College': 3, 'University': 4}
for col in ['Father_Education', 'Mother_Education']:
    enc_col = col + '_Encoded'
    if col not in df.columns:
        continue
    df[enc_col] = df[col].map(edu_map)
    df.drop(columns=[col], inplace=True)

# -------------------------------
# 4. One-Hot Encoding (moderate cardinality)
# -------------------------------
# drop_first=True omits one dummy column per encoded feature.
ohe_cols = [name for name in ('Region', 'School_Type', 'Health_Issue') if name in df.columns]
if len(ohe_cols) > 0:
    df = pd.get_dummies(df, columns=ohe_cols, drop_first=True)

# -------------------------------
# 5. Target Encoding (high cardinality)
# -------------------------------
def target_encode_smooth(df, col, target, alpha=10):
    """
    Smoothed target encoding of a categorical column.

    Every category is replaced by a blend of its own target mean and the
    global target mean:

        (count * category_mean + alpha * global_mean) / (count + alpha)

    so categories with few rows are pulled towards the global mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both ``col`` and ``target``.
    col : str
        Name of the categorical column to encode.
    target : str
        Name of the numeric target column.
    alpha : number, default 10
        Smoothing strength.

    Returns
    -------
    pandas.Series
        Encoded values aligned to ``df.index``; rows whose category has no
        encoded value receive the global mean.
    """
    overall_mean = df[target].mean()

    per_category = df.groupby(col)[target]
    n_rows = per_category.count()
    category_mean = per_category.mean()

    blended = (n_rows * category_mean + alpha * overall_mean) / (n_rows + alpha)

    encoded = df[col].map(blended)
    return encoded.fillna(overall_mean)

# Replace each high-cardinality column that is still present by its smoothed
# target encoding.
# NOTE(review): the encoding is computed from every row of df, i.e. before
# the train/test split performed later in this notebook, so the targets of
# the future test rows contribute to this feature. Fit the encoding on the
# training rows only if an unbiased test score is required.
for col in ['School_ID', 'Career_Interest']:
    if col in df.columns:
        df[col + '_Encoded'] = target_encode_smooth(df, col, TARGET, alpha=10)
        df.drop(columns=[col], inplace=True)

# -------------------------------
# Convert Date_of_Birth -> Age (numeric)
# -------------------------------
CURRENT_DATE = pd.Timestamp('2026-01-30')  # fixed date for reproducibility

if 'Date_of_Birth' in df.columns:
    # Unparseable dates become NaT and end up as NaN ages.
    df['Date_of_Birth'] = pd.to_datetime(df['Date_of_Birth'], errors='coerce')
    birth_dates = df['Date_of_Birth']

    # Age in completed years: difference of the calendar years, minus one when
    # the birthday has not yet occurred in CURRENT_DATE's year. Dividing a
    # day count by 365 ignores leap days and overstates the age of anyone
    # within a few days before their birthday.
    birthday_pending = (birth_dates.dt.month > CURRENT_DATE.month) | (
        (birth_dates.dt.month == CURRENT_DATE.month)
        & (birth_dates.dt.day > CURRENT_DATE.day)
    )
    completed_years = CURRENT_DATE.year - birth_dates.dt.year - birthday_pending.astype(int)
    df['Age'] = completed_years.astype(float)
    df.drop(columns=['Date_of_Birth'], inplace=True)

# -------------------------------
# 6. Safety check
# -------------------------------
print("Categorical encoding completed.")
print("Columns now:", df.select_dtypes(include='object').columns.tolist())  # should be empty

In [None]:
# -------------------------------
# 10. Drop Raw Categorical Columns
# -------------------------------
# Raw text/date columns that must not reach the models; names that are no
# longer present in the frame are skipped.
drop_cols = [
    'Father_Education',
    'Mother_Education',
    'Career_Interest',
    'Health_Issue',
    'Region',
    'Date_of_Birth',
    'School_ID',
    'School_Type',
]
df.drop(columns=drop_cols, errors='ignore', inplace=True)

In [None]:
# Print a concise summary of df: column names, non-null counts, dtypes and memory usage.
df.info()

In [None]:
# ======================================
# 1. Imports
# ======================================

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import PowerTransformer, StandardScaler

# Feature matrix / target vector for the regression models.
# 'Risk_NotRisk' is a thresholded copy of the target that the risk
# classification cell further down writes into df; it is excluded here so
# that re-running this cell after that one cannot leak the target into X.
X = df.drop(columns=[TARGET]).drop(columns=['Risk_NotRisk'], errors='ignore')
y = df[TARGET]

In [None]:
# ======================================
# 2. Split Data (Assuming X, y are defined)
# ======================================
# 80/20 hold-out split with a fixed seed
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# ======================================
# 3. Identify Numeric & Categorical Columns
# ======================================
# int64/float64 columns are scaled; every other column is passed through.
numeric_dtypes = ["int64", "float64"]
numeric_features = X.select_dtypes(include=numeric_dtypes).columns
categorical_features = X.select_dtypes(exclude=numeric_dtypes).columns

# ======================================
# 4. Preprocessing
# ======================================
# Output column order follows the transformer order: numeric block first,
# passthrough block second.
numeric_step = ("num", StandardScaler(), numeric_features)
passthrough_step = ("cat", "passthrough", categorical_features)
preprocessor = ColumnTransformer(transformers=[numeric_step, passthrough_step])

# ======================================
# 5. Define Models
# ======================================
# Display name -> unfitted regressor; all three share random_state=42.
models = {}

models["XGBoost"] = XGBRegressor(
    objective="reg:squarederror",
    n_estimators=700,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=1.0,
    reg_lambda=2.0,
    random_state=42,
    n_jobs=-1,
)

models["RandomForest"] = RandomForestRegressor(
    n_estimators=500,
    max_depth=10,
    random_state=42,
    n_jobs=-1,
)

models["GradientBoosting"] = GradientBoostingRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=5,
    random_state=42,
)

# ======================================
# 6. Train, Predict, Evaluate
# ======================================
results = {}

# The ColumnTransformer emits the scaled numeric columns first and the
# passthrough columns second, so importances line up with this order.
feature_names = numeric_features.tolist() + categorical_features.tolist()

for name, model in models.items():
    print(f"\n==== Training {name} ====")
    # Preprocessing is part of the pipeline, so the scaler is fitted on the
    # training partition only.
    pipeline = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("model", model)
    ])
    pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    results[name] = {"MAE": mae, "RMSE": rmse, "R2": r2}

    print(f"{name} Performance:")
    print(f"MAE  : {mae:.2f}")
    print(f"RMSE : {rmse:.2f}")
    print(f"R¬≤   : {r2:.3f}")

    # Feature importance, for any fitted model that exposes it
    fitted_model = pipeline.named_steps["model"]
    if hasattr(fitted_model, "feature_importances_"):
        importances = fitted_model.feature_importances_
        feature_importance = pd.Series(importances, index=feature_names).sort_values(ascending=False)
        top_importance = feature_importance.head(10)

        print("\nTop 10 Important Features:")
        print(top_importance)

        # Plot Feature Importance
        plt.figure(figsize=(8, 5))
        sns.barplot(x=top_importance.values, y=top_importance.index)
        plt.title(f"Top 10 Feature Importance - {name}")
        plt.xlabel("Importance")
        plt.ylabel("Feature")
        plt.show()

    # Actual vs Predicted Plot; the dashed diagonal marks perfect predictions.
    # Axis labels are derived from TARGET so they name the modelled column.
    plt.figure(figsize=(6, 6))
    plt.scatter(y_test, y_pred, alpha=0.4)
    plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], "r--")
    plt.xlabel(f"Actual {TARGET}")
    plt.ylabel(f"Predicted {TARGET}")
    plt.title(f"Actual vs Predicted - {name}")
    plt.grid()
    plt.show()

# ======================================
# 7. Compare All Models
# ======================================
# One row per model, best R2 first
comparison_df = pd.DataFrame(results).T
print("\n=== Model Comparison ===")
print(comparison_df.sort_values("R2", ascending=False))

In [None]:
# ==============================
# STUDENT RISK / NOT-RISK PREDICTION PIPELINE
# (FIXED THRESHOLD = 50%)
# ==============================

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_auc_score,
    roc_curve,
    f1_score
)
from imblearn.over_sampling import SMOTE

# -----------------------------
# 1. DEFINE TARGET COLUMN
# -----------------------------
score_col = 'Overall_Average'   # Change if needed

# -----------------------------
# 2. CREATE RISK / NOT-RISK TARGET
# Risk = 1 (score < 50), NotRisk = 0 (score >= 50)
# -----------------------------
is_at_risk = df[score_col] < 50
df['Risk_NotRisk'] = is_at_risk.astype(int)

print("\nClass distribution:")
print(df['Risk_NotRisk'].value_counts())

# -----------------------------
# 3. PREPARE FEATURES
# -----------------------------
# Everything except the label itself and the score columns it is derived
# from; names that are not present in df are skipped.
excluded_from_features = ['Risk_NotRisk', score_col, 'Total_National_Exam_Score']
X = df.drop(columns=excluded_from_features, errors='ignore')
y = df['Risk_NotRisk']

# One-hot encode any remaining object/category columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns
X_encoded = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

print(f"\nFeature shape: {X_encoded.shape}")
print(f"Target shape: {y.shape}")

# -----------------------------
# 4. TRAIN / TEST SPLIT
# -----------------------------
# Stratified 80/20 split so both partitions keep the class proportions
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, **split_options)

print("\nTrain/Test split:")
print("Train size:", X_train.shape[0])
print("Test size :", X_test.shape[0])

# -----------------------------
# 5. HANDLE CLASS IMBALANCE (SMOTE)
# -----------------------------
# Resampling is applied to the training partition only; the test partition
# is left untouched.
smote = SMOTE(random_state=42)
resampled = smote.fit_resample(X_train, y_train)
X_train_res, y_train_res = resampled

print("\nAfter SMOTE:")
print(pd.Series(y_train_res).value_counts())

# -----------------------------
# 6. TRAIN RANDOM FOREST
# -----------------------------
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=15,
    n_jobs=-1,
    random_state=42,
)
rf.fit(X_train_res, y_train_res)

# -----------------------------
# 7. PREDICT (FIXED THRESHOLD = 0.50)
# -----------------------------
FIXED_THRESHOLD = 0.50

# Second probability column (index 1) is used as the Risk probability
class_probabilities = rf.predict_proba(X_test)
y_probs = class_probabilities[:, 1]
y_pred = (y_probs >= FIXED_THRESHOLD).astype(int)

f1_fixed = f1_score(y_test, y_pred, pos_label=1)

print(f"\nUsing fixed threshold: {FIXED_THRESHOLD}")
print(f"F1-Score (Risk class): {f1_fixed:.3f}")

# -----------------------------
# 8. MODEL EVALUATION
# -----------------------------
# Metrics are computed up front and then printed in a fixed order
cm = confusion_matrix(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_probs)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(cm)

print(f"ROC-AUC: {roc_auc:.3f}")

# -----------------------------
# 9. VISUALIZATIONS
# -----------------------------
# Every label and marker below is derived from FIXED_THRESHOLD so the
# figure cannot disagree with the threshold used for the predictions.
threshold_text = f'{FIXED_THRESHOLD:.2f}'

fig, axes = plt.subplots(2, 2, figsize=(14, 12))

# ---- Confusion Matrix
ax1 = axes[0, 0]
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    ax=ax1,
    xticklabels=['Predicted NotRisk', 'Predicted Risk'],
    yticklabels=['Actual NotRisk', 'Actual Risk']
)
ax1.set_title(f'Confusion Matrix (Threshold = {threshold_text})', fontweight='bold')

# ---- ROC Curve
ax2 = axes[0, 1]
fpr, tpr, thresholds_roc = roc_curve(y_test, y_probs)
ax2.plot(fpr, tpr, lw=2, label=f'AUC = {roc_auc:.3f}')
ax2.plot([0, 1], [0, 1], linestyle='--', alpha=0.5)

# Mark the ROC point whose threshold lies closest to the fixed threshold
idx = np.argmin(np.abs(thresholds_roc - FIXED_THRESHOLD))
ax2.scatter(fpr[idx], tpr[idx], s=100, label=f'Threshold {threshold_text}')

ax2.set_xlabel('False Positive Rate')
ax2.set_ylabel('True Positive Rate')
ax2.set_title('ROC Curve', fontweight='bold')
ax2.legend()
ax2.grid(alpha=0.3)

# ---- Feature Importance
ax3 = axes[1, 0]
feature_importance = pd.Series(
    rf.feature_importances_,
    index=X_encoded.columns
).sort_values(ascending=False)

top_features = feature_importance.head(10)
ax3.barh(top_features.index, top_features.values)
ax3.invert_yaxis()  # most important feature at the top
ax3.set_title('Top 10 Feature Importances', fontweight='bold')
ax3.set_xlabel('Importance')

# ---- Probability Distribution
ax4 = axes[1, 1]
# y_probs is positional while y_test carries the original row labels, so
# the class masks are built from the underlying array.
actual_class = np.asarray(y_test)
ax4.hist(y_probs[actual_class == 0], bins=30, alpha=0.6, label='NotRisk')
ax4.hist(y_probs[actual_class == 1], bins=30, alpha=0.6, label='Risk')
ax4.axvline(FIXED_THRESHOLD, linestyle='--', label=f'Threshold {threshold_text}')
ax4.set_title('Predicted Probability Distribution', fontweight='bold')
ax4.set_xlabel('Predicted Risk Probability')
ax4.legend()

plt.suptitle(f'Student Risk / Not-Risk Prediction (Fixed Threshold = {FIXED_THRESHOLD:.0%})',
             fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

# -----------------------------
# 10. PRINT TOP FEATURES
# -----------------------------
print("\n" + "=" * 60, "TOP 10 FEATURES INFLUENCING RISK / NOT-RISK", "=" * 60, sep="\n")

# Ranked listing: position, left-aligned feature name, importance
ranked_features = zip(top_features.index, top_features.values)
for rank, (feature, importance) in enumerate(ranked_features, start=1):
    print(f"{rank:2d}. {feature:<30} {importance:.4f}")

print("\n" + "=" * 60, "PIPELINE COMPLETE ‚úî", "=" * 60, sep="\n")