In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer

# Read the saved data
print("=" * 70)
print("READING SAVED DATA")
print("=" * 70)

try:
    df = pd.read_csv('students_dataset.csv')
except FileNotFoundError:
    print(" File 'students_dataset.csv' not found.")
    # Re-raise instead of calling exit(): inside a notebook exit() asks the
    # kernel to shut down, whereas an exception stops the run at this cell and
    # leaves a readable traceback.
    raise
else:
    # Only reached when the CSV was read successfully.
    print("‚úÖ Dataset loaded successfully!")
    print(f" Shape: {df.shape}")
    print(f" Columns: {len(df.columns)}")
    print(f" Total records: {len(df)}")



In [None]:
# 1. Check the basic structure
print("="*60)
print("DATA STRUCTURE ANALYSIS")
print("="*60)

print(f"Dataset Shape: {df.shape}")
print(f"Number of students: {df.shape[0]}")
print(f"Number of columns: {df.shape[1]}")
# deep=True also measures the Python objects held by object-dtype columns.
print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

# 2. View column categories
print("\n" + "="*60)
print("COLUMN CATEGORIES")
print("="*60)

# Group columns by the keyword embedded in their names
test_score_cols = [col for col in df.columns if 'Test_Score' in col]
attendance_cols = [col for col in df.columns if 'Attendance' in col]
homework_cols = [col for col in df.columns if 'Homework' in col]
participation_cols = [col for col in df.columns if 'Participation' in col]
textbook_cols = [col for col in df.columns if 'Textbook' in col]

print(f"Test Score columns: {len(test_score_cols)}")
print(f"Attendance columns: {len(attendance_cols)}")
print(f"Homework columns: {len(homework_cols)}")
print(f"Participation columns: {len(participation_cols)}")
print(f"Textbook columns: {len(textbook_cols)}")

# 3. Check subject coverage by grade
print("\n" + "="*60)
print("SUBJECT COVERAGE BY GRADE")
print("="*60)

for grade in range(1, 13):
    # The trailing underscore keeps Grade_1 from also matching Grade_10..12.
    grade_test_cols = [col for col in test_score_cols if f'Grade_{grade}_' in col]
    if grade_test_cols:
        # Subject name = text between the grade prefix and "_Test".
        # Sorted (instead of list(set(...))) so the printed order is the same
        # on every kernel restart rather than following string-hash order.
        subjects = sorted({col.split(f'Grade_{grade}_')[1].split('_Test')[0]
                           for col in grade_test_cols})
        print(f"Grade {grade}: {len(grade_test_cols)} subjects - {subjects}")

# Everything that did not match one of the five keyword groups above
all_categorized_cols = set(test_score_cols + attendance_cols + homework_cols + participation_cols + textbook_cols)
remaining_cols = [col for col in df.columns if col not in all_categorized_cols]

print("\n" + "="*60)
print("REMAINING COLUMNS")
print("="*60)
print(f"Number of remaining columns: {len(remaining_cols)}")
print("Remaining columns (first 20):\n", remaining_cols[:20])
print("Remaining columns (last 20):\n", remaining_cols[-20:])

In [None]:
# Missing-data audit: absolute null counts per column first, then the same
# information expressed as a percentage of rows.
null_mask = df.isnull()

print("==============================")
print("CHECKING FOR MISSING DATA")
print("==============================")
print(null_mask.sum())

print("================================")
print("Percentage of missing data:")
print("--------------------------------")
print(null_mask.mean() * 100)

In [None]:
print("--------------------------------")
print("Check data types of all columns")
print("--------------------------------")
column_dtypes = df.dtypes
print(column_dtypes)

# How many columns share each dtype
print(column_dtypes.value_counts())

# Numeric (int / float) column labels
numeric_cols = df.select_dtypes(include=['number']).columns
print(f"Numeric Columns: {numeric_cols.tolist()}")

# Text / categorical column labels
categorical_cols = df.select_dtypes(include=['object', 'category']).columns
print(f"Categorical Columns: {categorical_cols.tolist()}")

# Every Grade 12 column that belongs to a single subject (Math), to show how
# many columns one subject contributes.
subject_tags = ('Grade_12', 'Math')
one_column = [name for name in df.columns if all(tag in name for tag in subject_tags)]
print("number of column in one subject is:", len(one_column))
print(one_column)

In [None]:
# Tabulate how many columns use each dtype, as a two-column frame for seaborn.
dtype_counts = (
    df.dtypes.value_counts()
    .reset_index()
    .set_axis(['Data Type', 'Count'], axis=1)
)

dtype_fig, dtype_ax = plt.subplots(figsize=(10, 6))

# hue mirrors x (with the legend hidden) so that `palette` can be applied.
sns.barplot(
    data=dtype_counts,
    x='Data Type',
    y='Count',
    hue='Data Type',
    palette='viridis',
    legend=False,
    ax=dtype_ax,
)

dtype_ax.set_title('Distribution of Data Types in Student Dataset', fontsize=14)
dtype_ax.set_ylabel('Number of Columns')
dtype_ax.set_xlabel('Data Type')

# Write each bar's count just above it (5 units of headroom).
for bar_pos, n_columns in enumerate(dtype_counts['Count']):
    dtype_ax.text(bar_pos, n_columns + 5, str(n_columns), ha='center', fontweight='bold')

plt.show()

In [None]:
# --- Scale consistency -------------------------------------------------------
print("\n" + "="*60)
print("SCALE CONSISTENCY CHECK")
print("="*60)

# Only the first 10 test-score columns are inspected; they are expected to
# share one scale (typically 0-100).
sample_test_scores = test_score_cols[:10]

for col in sample_test_scores:
    if col not in df.columns:
        continue
    scores = df[col]
    lowest, highest, average = scores.min(), scores.max(), scores.mean()
    print(f"{col:50} Min: {lowest:6.2f} Max: {highest:6.2f} Mean: {average:6.2f}")

# --- Impossible values -------------------------------------------------------
print("\n" + "="*60)
print("DATA VALIDATION CHECKS")
print("="*60)

# Spot check: first three test-score columns of each upper grade (9-12),
# counting values that fall outside the 0-100 range.
for grade in range(9, 13):
    grade_cols = [name for name in test_score_cols if f'Grade_{grade}_' in name]
    for col in grade_cols[:3]:
        if col not in df.columns:
            continue
        invalid_count = (df[col].lt(0) | df[col].gt(100)).sum()
        if invalid_count > 0:
            print(f"WARNING: {col} has {invalid_count} values outside 0-100 range")

# --- Behavioural data scale --------------------------------------------------
# Attendance / homework / participation are either percentages (0-100) or
# proportions (0-1); the first attendance column is used as a probe.
print("\nChecking behavioral data scales...")
sample_attendance = attendance_cols[0] if attendance_cols else None
if sample_attendance and sample_attendance in df.columns:
    att_min = df[sample_attendance].min()
    att_max = df[sample_attendance].max()
    print(f"Attendance sample ({sample_attendance}): {att_min:.2f} to {att_max:.2f}")

    # A maximum above 1 can only come from a percentage scale; such columns
    # may need normalising to 0-1 for consistency.
    is_percentage = att_max > 1
    print("Likely scale: 0-100 (percentage)" if is_percentage else "Likely scale: 0-1 (proportion)")

In [None]:
# -----------------------------
# Distribution of key variables
# -----------------------------
# 2x2 panel; each plot is skipped when its source column is absent.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Plot 1: Distribution of Total_National_Exam_Score
if 'Total_National_Exam_Score' in df.columns:
    axes[0, 0].hist(
        df['Total_National_Exam_Score'].dropna(),
        bins=30,
        edgecolor='black',
        alpha=0.7
    )
    axes[0, 0].set_title('Distribution of Total_National_Exam_Score')
    axes[0, 0].set_xlabel('Average Score')
    axes[0, 0].set_ylabel('Frequency')

# Plot 2: Parental Involvement distribution
if 'Parental_Involvement' in df.columns:
    axes[0, 1].hist(
        df['Parental_Involvement'].dropna(),
        bins=30,
        edgecolor='black',
        alpha=0.7,
        color='green'
    )
    axes[0, 1].set_title('Parental Involvement Distribution')
    axes[0, 1].set_xlabel('Involvement Score (0‚Äì1)')
    axes[0, 1].set_ylabel('Frequency')

# Plot 3: Overall Average by Academic Track (BOXPLOT)
if 'Field_Choice' in df.columns and 'Overall_Average' in df.columns:
    track_data = df[['Field_Choice', 'Overall_Average']].dropna()

    tracks = sorted(track_data['Field_Choice'].unique())
    # Field_Choice is only converted to 0/1 in a later cell, so on a fresh run
    # it still holds the text labels here. The previous `t == 0` test was then
    # never true and every box was labelled 'Natural'. Translate the numeric
    # codes when present and otherwise show the value itself.
    track_names = {0: 'Social', 1: 'Natural'}
    labels = [
        str(t) if isinstance(t, str) else track_names.get(t, str(t))
        for t in tracks
    ]

    box_data = [
        track_data[track_data['Field_Choice'] == t]['Overall_Average']
        for t in tracks
    ]

    axes[1, 0].boxplot(
        box_data,
        tick_labels=labels   # `tick_labels` is the newer name of `labels`
    )
    axes[1, 0].set_title('Overall Average by Academic Track')
    axes[1, 0].set_ylabel('Average Score')

# Plot 4: Region performance
if 'Region' in df.columns and 'Overall_Average' in df.columns:
    # Mean score per region, ascending, so the best region ends up on top.
    region_avg = (
        df.groupby('Region')['Overall_Average']
        .mean()
        .sort_values()
    )

    axes[1, 1].barh(range(len(region_avg)), region_avg.values)
    axes[1, 1].set_yticks(range(len(region_avg)))
    axes[1, 1].set_yticklabels(region_avg.index)
    axes[1, 1].set_title('Average Performance by Region')
    axes[1, 1].set_xlabel('Average Score')

plt.tight_layout()
plt.show()

In [None]:
# Drop the row identifier; errors='ignore' keeps this safe when the column
# has already been removed.
df = df.drop(columns=['Student_ID'], errors='ignore')

# Encode Field_Choice as Social -> 0, Natural -> 1.
# Series.map() turns every value that is not a key of the mapping into NaN,
# so running this cell a second time (when the column already holds 0/1)
# used to wipe the whole column. Only map while the column is still text.
field_choice_codes = {'Social': 0, 'Natural': 1}
if not pd.api.types.is_numeric_dtype(df['Field_Choice']):
    df['Field_Choice'] = df['Field_Choice'].map(field_choice_codes)

# Career_Interest: treat a missing answer as its own "Unknown" category
df['Career_Interest'] = df['Career_Interest'].fillna('Unknown')

In [None]:
# AGGREGATE GRADES INTO EDUCATION STAGES
# Each stage is the list of 'Grade_<n>' column prefixes it covers.
lower_primary = [f'Grade_{n}' for n in range(1, 5)]    # grades 1-4
upper_primary = [f'Grade_{n}' for n in range(5, 9)]    # grades 5-8
secondary = [f'Grade_{n}' for n in (9, 10)]
preparatory = [f'Grade_{n}' for n in (11, 12)]

# Helper function to compute average test score
def stage_average(df, grades):
    """Return each row's mean over the test-score columns of the given grades.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose test-score columns are named ``<grade>_..._Test_Score``.
    grades : iterable of str
        Grade prefixes such as ``'Grade_1'`` (a trailing underscore is
        accepted as well).

    Returns
    -------
    pandas.Series
        Row-wise mean (NaNs skipped) of the matching columns; all-NaN when no
        column matches.
    """
    # Match on the prefix *including* its separator: a bare 'Grade_1' prefix
    # would also pick up Grade_10, Grade_11 and Grade_12 columns.
    prefixes = [g if g.endswith('_') else g + '_' for g in grades]
    cols = [
        c
        for prefix in prefixes
        for c in df.columns
        if c.startswith(prefix) and c.endswith('_Test_Score')
    ]
    return df[cols].mean(axis=1)

# Create aggregated academic scores: one averaged test-score feature per
# education stage.
stage_score_features = {
    'Avg_Score_Lower_Primary': lower_primary,
    'Avg_Score_Upper_Primary': upper_primary,
    'Avg_Score_Secondary': secondary,
    'Avg_Score_Preparatory': preparatory,
}
for feature_name, stage_grades in stage_score_features.items():
    df[feature_name] = stage_average(df, stage_grades)

In [None]:
# CREATE ENGAGEMENT SCORES
# Helper function for engagement
def engagement_score(df, grades):
    """Return each row's mean over attendance, homework and participation columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose per-grade columns start with ``<grade>_`` and contain one
        of the keywords 'Attendance', 'Homework' or 'Participation'.
    grades : iterable of str
        Grade prefixes such as ``'Grade_1'`` (a trailing underscore is
        accepted as well).

    Returns
    -------
    pandas.Series
        Row-wise mean (NaNs skipped) over all matching columns of all three
        kinds; all-NaN when no column matches.
    """
    # Include the separator in the prefix so 'Grade_1' does not also match
    # Grade_10 / Grade_11 / Grade_12 columns.
    prefixes = [g if g.endswith('_') else g + '_' for g in grades]

    def matching(keyword):
        # Columns of the requested grades whose name contains `keyword`.
        return [
            c
            for prefix in prefixes
            for c in df.columns
            if c.startswith(prefix) and keyword in c
        ]

    attendance = matching('Attendance')
    homework = matching('Homework')
    participation = matching('Participation')

    return df[attendance + homework + participation].mean(axis=1)

# Create engagement features: one score per education stage
df['Engagement_1_4'] = engagement_score(df, lower_primary)
df['Engagement_5_8'] = engagement_score(df, upper_primary)
df['Engagement_9_10'] = engagement_score(df, secondary)
df['Engagement_11_12'] = engagement_score(df, preparatory)

# The four stage-level engagement columns, reused for display and plotting.
engagement_cols = [
    'Engagement_1_4',
    'Engagement_5_8',
    'Engagement_9_10',
    'Engagement_11_12'
]
# A disabled experiment used to sit here inside a bare triple-quoted string
# (a no-op expression). It has been removed; should a single overall score be
# wanted, it is df[engagement_cols].mean(axis=1).

print("Engagement(attendance,homework,participation) Scores per Education Level (Head):")
print(df[engagement_cols].head())

# Visualizing the distribution of the four new columns
plt.figure(figsize=(10, 6))
sns.boxplot(data=df[engagement_cols])
plt.title('Engagement(attendance,homework,participation) Distribution by Education Level')
plt.ylabel('Average engagement')  # label was misspelled ('enggegment')
plt.xticks(rotation=15)
plt.show()

In [None]:
# Opt in to pandas' future replace() behaviour up front so the recode below
# does not emit the silent-downcasting FutureWarning.
pd.set_option('future.no_silent_downcasting', True)

# Recode the textbook flags from 'Yes' / 'No' text to 1 / 0.
yes_no_codes = {'Yes': 1, 'No': 0}
textbook_cols = [name for name in df.columns if 'Textbook' in name]
recoded_flags = df[textbook_cols].replace(yes_no_codes)
# infer_objects() converts the resulting object columns to a proper dtype.
df[textbook_cols] = recoded_flags.infer_objects(copy=False)

# 3. Textbook access calculation
def textbook_access(df, grades):
    """Return each row's mean over the textbook columns of the given grades.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose textbook columns start with ``<grade>_`` and contain
        'Textbook'; values are expected to be numeric already.
    grades : iterable of str
        Grade prefixes such as ``'Grade_1'`` (a trailing underscore is
        accepted as well).

    Returns
    -------
    pandas.Series
        Row-wise mean (NaNs skipped) of the matching columns; all-NaN when no
        column matches.
    """
    # Include the separator in the prefix so 'Grade_1' does not also match
    # Grade_10 / Grade_11 / Grade_12 columns.
    prefixes = [g if g.endswith('_') else g + '_' for g in grades]
    cols = [
        c
        for prefix in prefixes
        for c in df.columns
        if c.startswith(prefix) and 'Textbook' in c
    ]
    return df[cols].mean(axis=1)

# 4. Build all stage-level textbook features first, then attach them in one
#    go (adding many columns one by one fragments the frame).
new_data = {
    'Textbook_Access_1_4': textbook_access(df, lower_primary),
    'Textbook_Access_5_8': textbook_access(df, upper_primary),
    'Textbook_Access_9_10': textbook_access(df, secondary),
    'Textbook_Access_11_12': textbook_access(df, preparatory)
}

# 5. Add all columns at once using pd.concat to prevent PerformanceWarning.
#    Any copies left over from an earlier run of this cell are dropped first,
#    otherwise re-running would leave duplicate column labels.
new_cols_df = pd.DataFrame(new_data)
df = pd.concat(
    [df.drop(columns=list(new_data), errors='ignore'), new_cols_df],
    axis=1
)

# The summary columns are exactly the features created above. (This name was
# previously read before ever being assigned, which raised a NameError.)
textbook_summary_cols = list(new_data)

# Skip columns that are entirely NaN - there is nothing to draw for them
textbook_summary_cols = [c for c in textbook_summary_cols if c in df.columns and not df[c].isna().all()]

# Visualize the distribution
plt.figure(figsize=(10, 6))
sns.boxplot(data=df[textbook_summary_cols])
plt.title('Textbook Access Distribution by Education Level')
plt.ylabel('Access Score (0 to 1)')
plt.xticks(rotation=15)
plt.show()

In [None]:
# HANDLE TRACK-BASED NATIONAL EXAMS
# Subjects sat only by Social Science students
social_subjects = [
    'National_Exam_History',
    'National_Exam_Geography',
    'National_Exam_Economics',
    'National_Exam_Math_Social'
]

# Subjects sat only by Natural Science students
natural_subjects = [
    'National_Exam_Biology',
    'National_Exam_Chemistry',
    'National_Exam_Physics',
    'National_Exam_Math_Natural'
]

# Row-wise means; NaNs are skipped by DataFrame.mean
df['Social_Track_Subject_Avg'] = df[social_subjects].mean(axis=1)
df['Natural_Track_Subject_Avg'] = df[natural_subjects].mean(axis=1)

# Pick the average that belongs to the student's own track
# (Field_Choice: 0 = Social, 1 = Natural). A plain np.where(choice == 0, ...)
# sends every row that is not 0 - including missing or unrecognised choices -
# to the Natural branch; np.select leaves those rows as NaN instead.
is_social = df['Field_Choice'] == 0
is_natural = df['Field_Choice'] == 1
df['Track_Subject_Average'] = np.select(
    [is_social, is_natural],
    [df['Social_Track_Subject_Avg'], df['Natural_Track_Subject_Avg']],
    default=np.nan
)

# Subjects sat by both tracks
common_subjects = [
    'National_Exam_Aptitude',
    'National_Exam_English',
    'National_Exam_Civics_and_Ethical_Education'
]

df['Common_Exam_Average'] = df[common_subjects].mean(axis=1)

# Equal-weight blend of the common block and the track-specific block
df['Track_Exam_Average'] = (
    df['Common_Exam_Average'] + df['Track_Subject_Average']
) / 2

# --- VISUALIZATION ---

# Two side-by-side panels on a white-grid theme
sns.set_theme(style="whitegrid")
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
left_ax, right_ax = axes

# Left panel: spread of the three aggregate exam scores
aggregate_cols = ['Common_Exam_Average', 'Track_Subject_Average', 'Track_Exam_Average']
sns.boxplot(data=df[aggregate_cols], ax=left_ax, palette="Set2")
left_ax.set_title('Distribution of Aggregate Exam Scores')
left_ax.set_ylabel('Score (0-100)')

# Right panel: density of the final track average, one curve per track
track_titles = {0: 'Social Science', 1: 'Natural Science'}
for choice, label in track_titles.items():
    track_scores = df.loc[df['Field_Choice'] == choice, 'Track_Exam_Average']
    sns.kdeplot(track_scores, ax=right_ax, label=label, fill=True)

right_ax.set_title('Track Exam Average: Social vs. Natural')
right_ax.set_xlabel('Score')
right_ax.legend()

plt.tight_layout()
plt.show()

# Preview of the derived exam features
exam_cols = [
    'Social_Track_Subject_Avg',
    'Natural_Track_Subject_Avg',
    'Track_Subject_Average',
    'Common_Exam_Average',
    'Track_Exam_Average'
]

print("New Aggregated National Exam Features:")
print(df[exam_cols].head())

In [None]:
# DROP ORIGINAL HIGH-DIMENSION COLUMNS
# The per-grade and per-subject exam columns have been summarised above.
raw_prefixes = ('Grade_', 'National_Exam_')
drop_cols = [name for name in df.columns if name.startswith(raw_prefixes)]
df = df.drop(columns=drop_cols)

# Exam-derived averages (and a few other columns) that must not reach the
# model; only those actually present are dropped.
leak_cols = [
    'Social_Track_Subject_Avg',
    'Natural_Track_Subject_Avg',
    'Track_Exam_Average',
    'Track_Subject_Average',
    'Common_Exam_Average',
    'School_ID', 'Total_Test_Score', 'Overall_Average',
]
present_leak_cols = [name for name in leak_cols if name in df.columns]
df = df.drop(columns=present_leak_cols)

# Replace missing categorical answers with an explicit placeholder
placeholders = {
    'Health_Issue': 'No Issue',
    'Father_Education': 'Unknown',
    'Mother_Education': 'Unknown',
}
for column, placeholder in placeholders.items():
    df[column] = df[column].fillna(placeholder)

# FINAL CHECK
print(df.shape)
print(df.head())
print("all columns:", df.columns)

In [None]:
# Print a concise schema summary of the current frame: one line per column
# with its non-null count and dtype.
df.info()

In [None]:
TARGET = 'Total_National_Exam_Score'

# Restrict to int64 / float64 columns
num_cols = df.select_dtypes(include=['int64', 'float64']).columns

# Pearson correlation of every numeric feature with the target, strongest
# (by absolute value) first; the target's own entry is removed.
target_column = df[num_cols].corr()[TARGET]
corr_numeric = target_column.drop(TARGET).sort_values(key=abs, ascending=False)

print("üìä Numeric Feature Correlation with Target:")
print(corr_numeric)

import matplotlib.pyplot as plt
import seaborn as sns

corr_fig, corr_ax = plt.subplots(figsize=(10, 8))

# hue mirrors y (with the legend hidden) so that `palette` can be applied.
sns.barplot(
    x=corr_numeric.values,
    y=corr_numeric.index,
    hue=corr_numeric.index,
    palette='coolwarm',
    legend=False,
    ax=corr_ax,
)

# Zero line separates positive from negative correlations
corr_ax.axvline(0, color='black', linewidth=1)
corr_ax.set_title('Correlation with Total_National_Exam_Score')
corr_ax.set_xlabel('Pearson Correlation')
plt.tight_layout()
plt.show()

In [None]:
# Text columns only (the target is excluded should it ever be object-typed)
cat_cols = df.select_dtypes(include='object').columns.drop(TARGET, errors='ignore')

# Association measure: replace each category by the mean target of that
# category, then correlate the encoded column with the target itself.
cat_corr = pd.Series({
    col: df[col].map(df.groupby(col)[TARGET].mean()).corr(df[TARGET])
    for col in cat_cols
}).sort_values(key=abs, ascending=False)

print("üìä Categorical Feature Association with Target:")
print(cat_corr)

In [None]:
# Categorical (object / category) columns
cat_cols = df.select_dtypes(include=['object', 'category']).columns

# Per column: the distinct values and how many there are
distinct_values = {name: df[name].unique() for name in cat_cols}
cat_summary = {
    name: {'Unique_Count': len(values), 'Unique_Values': values}
    for name, values in distinct_values.items()
}

# Human-readable listing
print("Unique count and value of catagorical features:")
for name, details in cat_summary.items():
    print(f"Column: {name}")
    print(f"  Unique Count : {details['Unique_Count']}")
    print(f"  Unique Values: {details['Unique_Values']}\n")

In [None]:
# ================================
# ALL-IN-ONE CATEGORICAL ENCODING
# ================================

# -------------------------------
# 1. Fill missing values with explicit placeholder categories
# -------------------------------
if 'Health_Issue' in df.columns:
    df['Health_Issue'] = df['Health_Issue'].fillna('No Issue')

for col in ['Father_Education', 'Mother_Education']:
    if col in df.columns:
        df[col] = df[col].fillna('Unknown')

# -------------------------------
# 2. Binary encoding
# -------------------------------
binary_maps = {
    'Gender': {'Male': 0, 'Female': 1},
    'Home_Internet_Access': {'No': 0, 'Yes': 1},
    'Electricity_Access': {'No': 0, 'Yes': 1},
    'School_Location': {'Rural': 0, 'Urban': 1}
}

for col, mapping in binary_maps.items():
    # Series.map() turns anything that is not a key of `mapping` into NaN, so
    # mapping an already-encoded 0/1 column (e.g. when this cell is re-run)
    # would erase it. Columns that are already numeric are left untouched.
    if col in df.columns and not pd.api.types.is_numeric_dtype(df[col]):
        df[col] = df[col].map(mapping)

# -------------------------------
# 3. Ordinal encoding (parents' education)
# -------------------------------
# Levels not listed in edu_map become NaN.
edu_map = {'Unknown': 0, 'Primary': 1, 'High School': 2, 'College': 3, 'University': 4}
for col in ['Father_Education', 'Mother_Education']:
    enc_col = col + '_Encoded'
    if col in df.columns:
        df[enc_col] = df[col].map(edu_map)
        # The raw text column is replaced by its encoded counterpart.
        df.drop(columns=[col], inplace=True)

# -------------------------------
# 4. One-hot encoding (moderate cardinality)
# -------------------------------
# drop_first=True drops one level per feature to avoid a redundant column.
ohe_cols = [c for c in ['Region', 'School_Type', 'Health_Issue'] if c in df.columns]
if ohe_cols:
    df = pd.get_dummies(df, columns=ohe_cols, drop_first=True)

# -------------------------------
# 5. Target encoding (high cardinality)
# -------------------------------
def target_encode_smooth(df, col, target, alpha=10):
    """Encode a categorical column by its smoothed per-category target mean.

    Each category's mean of `target` is shrunk towards the overall mean:
    ``(count * mean + alpha * overall) / (count + alpha)``. Rows whose
    category has no encoding (for example a missing value) receive the
    overall mean.

    Parameters
    ----------
    df : pandas.DataFrame
    col : str
        Name of the categorical column to encode.
    target : str
        Name of the numeric target column.
    alpha : number, default 10
        Smoothing strength; larger values pull categories harder towards the
        overall mean.

    Returns
    -------
    pandas.Series
        Encoded values aligned with ``df``.
    """
    overall_mean = df[target].mean()
    by_category = df.groupby(col)[target]
    category_count = by_category.count()
    category_mean = by_category.mean()
    shrunk_mean = (category_count * category_mean + alpha * overall_mean) / (category_count + alpha)
    encoded = df[col].map(shrunk_mean)
    return encoded.fillna(overall_mean)

# Replace each high-cardinality column that is present by its smoothed
# target encoding.
# NOTE(review): the encoding statistics are computed on the full frame, before
# the train/test split performed in a later cell.
high_cardinality_cols = ('School_ID', 'Career_Interest')
for source_col in high_cardinality_cols:
    if source_col not in df.columns:
        continue
    df[f'{source_col}_Encoded'] = target_encode_smooth(df, source_col, TARGET, alpha=10)
    df.drop(columns=[source_col], inplace=True)

# -------------------------------
# Convert Date_of_Birth -> Age (numeric)
# -------------------------------
CURRENT_DATE = pd.Timestamp('2026-01-30')  # fixed date for reproducibility

if 'Date_of_Birth' in df.columns:
    # Unparseable dates become NaT and therefore a NaN age.
    birth_dates = pd.to_datetime(df['Date_of_Birth'], errors='coerce')
    elapsed_days = (CURRENT_DATE - birth_dates).dt.days
    # Whole years, counting every year as 365 days
    df['Age'] = (elapsed_days // 365).astype(float)
    df.drop(columns=['Date_of_Birth'], inplace=True)

# -------------------------------
# 6. Safety check
# -------------------------------
remaining_text_cols = df.select_dtypes(include='object').columns.tolist()
print("Categorical encoding completed.")
print("Columns now:", remaining_text_cols)  # should be empty

In [None]:
# -------------------------------
# Drop raw categorical columns
# -------------------------------
# Source columns of the encodings above; whichever of them still exist are
# removed.
drop_cols = [
    'Father_Education', 'Mother_Education', 'Career_Interest',
    'Health_Issue', 'Region', 'Date_of_Birth',
    'School_ID', 'School_Type'
]
still_present = [name for name in drop_cols if name in df.columns]
df.drop(columns=still_present, inplace=True)

# Preview after removing the categorical features
df.head()

In [None]:

# =====================================
#  Summary statistics, skewness check and outlier detection
# =====================================
# ------------------------------
# 1. Summary statistics
# ------------------------------
print("Summary statistics of numeric features and target:\n")
print(df.describe().T)

# ------------------------------
# 2. Check skewness
# ------------------------------
print("\nSkewness of numeric features and target:\n")
# numeric_only=True: recent pandas no longer drops non-numeric columns
# silently, so a single leftover text column would make df.skew() raise.
print(df.skew(numeric_only=True))

# Features inspected visually below
selected_numeric_cols = [
    "Total_National_Exam_Score",
    "Avg_Score_Preparatory",
    "Textbook_Access_11_12",
    "Parental_Involvement",
    "Engagement_11_12",
    "Engagement_1_4",
    "School_Academic_Score",
    "Teacher_Student_Ratio",
    "School_Resources_Score"
]

# ------------------------------
# 3. Histograms for the selected numeric features
# ------------------------------
n_cols = 3  # number of columns in plot grid
n_rows = int(np.ceil(len(selected_numeric_cols)/n_cols))  # enough rows for all

plt.figure(figsize=(n_cols*5, n_rows*4))

# subplot positions are 1-based, hence enumerate(..., 1)
for i, col in enumerate(selected_numeric_cols, 1):
    plt.subplot(n_rows, n_cols, i)
    sns.histplot(df[col], kde=True, bins=30, color='skyblue')
    plt.title(f'Distribution of {col}')
plt.tight_layout()
plt.show()

# ------------------------------
# 4. Boxplots to detect outliers for the selected features
# ------------------------------
plt.figure(figsize=(n_cols*5, n_rows*4))
for i, col in enumerate(selected_numeric_cols, 1):
    plt.subplot(n_rows, n_cols, i)
    sns.boxplot(x=df[col], color='lightgreen')
    plt.title(f'Boxplot of {col}')
plt.tight_layout()
plt.show()

In [None]:
# ==============================
# Improved Robust Modeling Pipeline with Feature Engineering, Skew Handling, and Hyperparameter Tuning
# ==============================
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV, RidgeCV, HuberRegressor, RANSACRegressor, LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from scipy.stats.mstats import winsorize

# ------------------------------
# Settings
# ------------------------------
TARGET = 'Total_National_Exam_Score'
TEST_SIZE = 0.2        # share of rows held out for testing
RANDOM_STATE = 42
WINSOR_LIMIT = 0.01    # fraction clipped from each tail of a feature

# `df_clean` was read here without ever being assigned anywhere in this
# notebook, so a fresh "Restart & Run All" stopped with a NameError. Take the
# snapshot of the fully encoded frame explicitly; `df` itself stays untouched.
df_clean = df.copy()

# ------------------------------
# 1. Prepare features and target
# ------------------------------
X = df_clean.drop(columns=[TARGET])
y = np.log1p(df_clean[TARGET])  # log-transform target to reduce skew

# ------------------------------
# 2. Winsorize numeric features
# ------------------------------
# NOTE(review): the clipping limits (and the skew test below) are derived from
# all rows, i.e. before the train/test split.
numeric_cols = X.select_dtypes(include=['float64', 'int64']).columns.tolist()
X_winsor = X.copy()
for col in numeric_cols:
    X_winsor[col] = winsorize(X[col], limits=(WINSOR_LIMIT, WINSOR_LIMIT))

# ------------------------------
# 3. Transform skewed features (log1p)
# ------------------------------
# Only features whose absolute skewness is still above 1 after winsorizing
skewed_cols = [col for col in numeric_cols if abs(X_winsor[col].skew()) > 1]
for col in skewed_cols:
    X_winsor[col] = np.log1p(X_winsor[col])

# ------------------------------
# 4. Train/test split
# ------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X_winsor, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

In [None]:
# ===============================================================
# COMPLETE REGRESSION PIPELINE: LINEAR + NON-LINEAR + COMPARISON
# ===============================================================

# PowerTransformer is used below but was never imported anywhere in the
# notebook (NameError on a fresh kernel).
from sklearn.preprocessing import PowerTransformer

RANDOM_STATE = 42
sns.set(style="whitegrid")

# ===============================================================
# 1. LINEAR PREPROCESSING
# ===============================================================

# Work on explicit copies so the column assignments below modify standalone
# frames rather than the objects returned by train_test_split.
X_train = X_train.copy()
X_test = X_test.copy()

# Yeo-Johnson handles zero and negative values; fitted on the training rows
# only and then applied unchanged to the test rows.
pt = PowerTransformer(method='yeo-johnson', standardize=True)
X_train[numeric_cols] = pt.fit_transform(X_train[numeric_cols])
X_test[numeric_cols]  = pt.transform(X_test[numeric_cols])

# Standardise with training statistics as well
scaler = StandardScaler()
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols]  = scaler.transform(X_test[numeric_cols])

# ===============================================================
# 2. LINEARITY & INDEPENDENCE CHECK
# ===============================================================

def check_linearity_independence(X, y):
    """Fit an ordinary least-squares model and inspect its residuals.

    Draws a residuals-vs-predicted scatter plot and prints the Durbin-Watson
    statistic of the residuals.

    Parameters
    ----------
    X : array-like or DataFrame
        Feature matrix accepted by ``LinearRegression.fit``.
    y : array-like or Series
        Target values aligned with ``X``.

    Returns
    -------
    Residuals ``y - y_pred``, in the same container type as ``y``.
    """
    lr = LinearRegression()
    lr.fit(X, y)
    y_pred = lr.predict(X)
    residuals = y - y_pred

    plt.figure(figsize=(7,5))
    sns.scatterplot(x=y_pred, y=residuals, alpha=0.5)
    plt.axhline(0, color='red', linestyle='--')
    plt.xlabel("Predicted Values")
    plt.ylabel("Residuals")
    plt.title("Residuals vs Predicted (Linearity & Homoscedasticity)")
    plt.show()

    # Durbin-Watson = sum of squared successive differences / sum of squares.
    # Computed with numpy directly because `durbin_watson` was never imported
    # in this notebook (NameError on a fresh kernel).
    # NOTE(review): the statistic depends on row order, and the rows passed in
    # below come from a shuffled split - interpret with care.
    resid = np.asarray(residuals, dtype=float)
    dw = np.sum(np.diff(resid) ** 2) / np.sum(resid ** 2)
    print(f"Durbin-Watson statistic: {dw:.2f} (~2-> residuals are independant)")

    return residuals

_ = check_linearity_independence(X_train, y_train)

# ===============================================================
# 3. TRAIN & COMPARE ALL MODELS
# ===============================================================

# GradientBoostingRegressor is used below but was never imported anywhere in
# the notebook (NameError on a fresh kernel).
from sklearn.ensemble import GradientBoostingRegressor

# Candidate regressors; the CV variants choose their own regularisation
# strength from the given alpha grid.
models = {
    "Linear Regression": LinearRegression(),
    "Lasso Regression": LassoCV(alphas=np.logspace(-3, 1, 20), cv=5, max_iter=5000, random_state=RANDOM_STATE),
    "Ridge Regression": RidgeCV(alphas=np.logspace(-3, 3, 20), cv=5),
    "Random Forest": RandomForestRegressor(n_estimators=500, max_depth=10, min_samples_leaf=5,
                                           random_state=RANDOM_STATE, n_jobs=-1),
    "XGBoost": xgb.XGBRegressor(n_estimators=500, learning_rate=0.05, max_depth=5,
                               subsample=0.8, colsample_bytree=0.8,
                               random_state=RANDOM_STATE, n_jobs=-1),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=300, learning_rate=0.05,
                                                   max_depth=3, random_state=RANDOM_STATE)
}

results = []

for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    # Metrics are computed on whatever scale y_train / y_test are in.
    results.append({
        "Model": name,
        "R2_Score": r2_score(y_test, preds),
        "MAE": mean_absolute_error(y_test, preds),
        "RMSE": np.sqrt(mean_squared_error(y_test, preds)),
        "Predictions": preds,
        "Model_Object": model
    })

# Best model (highest R2) first
comparison_df = pd.DataFrame(results).sort_values("R2_Score", ascending=False)

print("\nüìä MODEL PERFORMANCE COMPARISON")
print(comparison_df[["Model", "R2_Score", "MAE", "RMSE"]])

# ===============================================================
# 4. BEST MODEL
# ===============================================================

# comparison_df is sorted by R2 (descending), so its first row is the winner.
best_row = comparison_df.iloc[0]
best_model_name, best_predictions = best_row["Model"], best_row["Predictions"]
best_r2, best_mae, best_rmse = best_row["R2_Score"], best_row["MAE"], best_row["RMSE"]

print(f"\nüèÜ BEST MODEL: {best_model_name}")
print(f"R¬≤   : {best_r2:.4f}")
print(f"MAE  : {best_mae:.4f}")
print(f"RMSE : {best_rmse:.4f}")

# ===============================================================
# 5. VISUALIZATION
# ===============================================================

fig = plt.figure(figsize=(20, 14))

# Top row: one bar chart per metric. The two error metrics share a fixed
# 0-0.15 y-range; the R2 panel keeps its automatic range.
metric_panels = [
    (1, "R2_Score", "R¬≤ Score", None),
    (2, "MAE", "MAE", (0, 0.15)),
    (3, "RMSE", "RMSE", (0, 0.15)),
]
for slot, metric, panel_title, y_range in metric_panels:
    panel = plt.subplot(2, 3, slot)
    panel.bar(comparison_df["Model"], comparison_df[metric])
    if y_range is not None:
        panel.set_ylim(*y_range)
    panel.set_title(panel_title)
    panel.tick_params(axis='x', rotation=45)
    panel.grid(alpha=0.3)

# Bottom centre: actual vs predicted for the best model, coloured by the
# absolute error of each point.
ax4 = plt.subplot(2, 3, 5)
abs_error = np.abs(y_test - best_predictions)
points = ax4.scatter(y_test, best_predictions, alpha=0.6, c=abs_error, cmap="viridis")

# Diagonal = perfect prediction, spanning the joint range of both axes
low = min(y_test.min(), best_predictions.min())
high = max(y_test.max(), best_predictions.max())
ax4.plot([low, high], [low, high], 'r--')
ax4.set_title(f"Actual vs Predicted ({best_model_name})")
ax4.set_xlabel("Actual")
ax4.set_ylabel("Predicted")
plt.colorbar(points, ax=ax4, label="Absolute Error")

plt.suptitle(f"Model Comparison | Best Model: {best_model_name}", fontsize=18)
plt.tight_layout()
plt.show()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_curve,
    auc,
    RocCurveDisplay
)
from imblearn.over_sampling import SMOTE

# -----------------------------
# 1. Define Pass/Fail target
# -----------------------------
PASS_MARK = 350  # a total national exam score of at least this counts as a pass

# The label is kept as a standalone Series instead of being written into `df`.
# `df` is shared with the regression cells, which use every non-target column
# as a feature: a 'Pass_Fail' column left in `df` would hand them a thresholded
# copy of the target as soon as they are re-run.
y = (df['Total_National_Exam_Score'] >= PASS_MARK).astype(int).rename('Pass_Fail')
print("Class distribution:\n", y.value_counts())

# -----------------------------
# 2. Define features
# -----------------------------
# errors='ignore' also removes a 'Pass_Fail' column left in `df` by an earlier
# version of this cell, without failing when it is absent.
X = df.drop(columns=['Total_National_Exam_Score', 'Pass_Fail'], errors='ignore')

# -----------------------------
# 3. Train/test split (stratified)
# -----------------------------
# stratify=y keeps the pass/fail ratio equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# -----------------------------
# 4. Handle class imbalance (SMOTE)
# -----------------------------
# Oversampling is applied to the training part only; the test part keeps its
# original class ratio.
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

print("\nAfter SMOTE, training class distribution:")
print(pd.Series(y_train_res).value_counts())

# -----------------------------
# 5. Train Random Forest classifier
# -----------------------------
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=15,
    random_state=42
)
rf.fit(X_train_res, y_train_res)

# -----------------------------
# 6. Predict with an explicit decision threshold
# -----------------------------
# Column 1 of predict_proba is the probability of the positive class (Pass).
y_probs = rf.predict_proba(X_test)[:, 1]

threshold = 0.50
y_pred = (y_probs >= threshold).astype(int)

# -----------------------------
# 7. Evaluate model
# -----------------------------
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))

cm = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:\n", cm)

# -----------------------------
# 8. Visualize confusion matrix
# -----------------------------
class_names = ['Fail', 'Pass']
cm_fig, cm_ax = plt.subplots(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names, ax=cm_ax)
cm_ax.set_xlabel('Predicted')
cm_ax.set_ylabel('Actual')
cm_ax.set_title('Confusion Matrix')
plt.show()

# -----------------------------
# 9. ROC curve
# -----------------------------
fpr, tpr, thresholds_roc = roc_curve(y_test, y_probs)
roc_auc = auc(fpr, tpr)

roc_fig, roc_ax = plt.subplots(figsize=(6, 5))
roc_ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.3f})')
# Diagonal = performance of a random classifier
roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curve')
roc_ax.legend(loc="lower right")
roc_ax.grid(True)
plt.show()

# -----------------------------
# 10. Feature importance
# -----------------------------
# Importances from the fitted forest, labelled by feature and sorted so the
# most influential feature comes first.
feature_importance = (
    pd.Series(rf.feature_importances_, index=X.columns)
    .sort_values(ascending=False)
)
top_features = feature_importance.head(10)
print("\nTop 10 Features by Importance:\n", top_features)

# -----------------------------
# 11. Visualize top 10 features
# -----------------------------
imp_fig, imp_ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=top_features.values, y=top_features.index, color='skyblue', ax=imp_ax)
imp_ax.set_title('Top 10 Features Influencing Pass/Fail')
imp_ax.set_xlabel('Importance Score')
imp_ax.set_ylabel('Feature')
plt.tight_layout()
plt.show()

# -----------------------------
# 12. Probability distribution by class
# -----------------------------
# One histogram of predicted pass-probabilities per true class; the dashed
# line marks the decision threshold.
prob_fig, prob_ax = plt.subplots(figsize=(8, 5))
for true_class, colour, class_label in [(0, 'red', 'Fail'), (1, 'green', 'Pass')]:
    sns.histplot(y_probs[y_test == true_class], color=colour, label=class_label,
                 kde=True, stat="density", bins=25, alpha=0.6, ax=prob_ax)
prob_ax.axvline(threshold, color='black', linestyle='--', label=f'Threshold = {threshold}')
prob_ax.set_title('Predicted Probabilities Distribution by Class')
prob_ax.set_xlabel('Predicted Probability for Pass')
prob_ax.set_ylabel('Density')
prob_ax.legend()
plt.show()