# In [None]:
# Cell 1: Imports and Setup
import pandas as pd
import numpy as np
import json
import os
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_regression
from scipy.stats import pearsonr
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
# Silence every warning category for the rest of the session so library
# deprecation notices do not clutter the cell output.
warnings.simplefilter('ignore')

# Plot defaults: white grid background and a wide default figure size.
sns.set_style(style="whitegrid")
plt.rc('figure', figsize=(12, 6))

print("✅ All libraries imported successfully")

# Cell 2: Load Data
# The CSV path is relative to the current working directory.
df = pd.read_csv(filepath_or_buffer='../social_work_exam_dataset.csv')

# Report the dimensions and the column names of the loaded frame.
print(
    f"📊 Dataset Shape: {df.shape}",
    f"\n📋 Columns: {list(df.columns)}",
    sep="\n",
)

# Bare expression: rendered as a table preview when run as a notebook cell.
df.head()

# Cell 3: Data Exploration
# Prints a missing-value report and summary statistics for the target column,
# then draws a histogram and a boxplot of the target side by side.
banner = "=" * 60
print(banner)
print("DATA EXPLORATION")
print(banner)

# Missing values: per-column null counts; only offending columns are listed.
print("\n🔍 Missing Values:")
missing = df.isnull().sum()
if missing.sum() != 0:
    print(missing[missing > 0])
else:
    print("   ✅ No missing values!")

# Target distribution
print(f"\n🎯 Target Distribution:")
print(df['ExamResultPercent'].describe())

# Visualize target distribution: one row, two panels.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
hist_ax, box_ax = axes

# Left panel: histogram of the target.
hist_ax.hist(df['ExamResultPercent'], bins=30, edgecolor='black', alpha=0.7)
hist_ax.set(
    xlabel='Exam Result Percentage',
    ylabel='Frequency',
    title='Distribution of Exam Result Percentages',
)
hist_ax.grid(alpha=0.3)

# Right panel: boxplot of the same column.
box_ax.boxplot(df['ExamResultPercent'])
box_ax.set(ylabel='Exam Score', title='Exam Score Boxplot')
box_ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

# Cell 4: Correlation Analysis
# Computes the pairwise correlation matrix of the numeric columns plus the
# target, plots it as a heatmap, and ranks features by the absolute value of
# their correlation with the target.
numerical_cols = [
    'Age', 'StudyHours', 'SleepHours', 'Confidence',
    'MockExamScore', 'GPA', 'Scholarship', 'InternshipGrade', 'ReviewCenter',
]

# Correlation matrix over the numeric features and the target column.
corr_input_columns = [*numerical_cols, 'ExamResultPercent']
correlation_matrix = df[corr_input_columns].corr()

# Heatmap of the full matrix, colour scale centred on zero.
plt.figure(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.3f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
)
plt.title('Feature Correlation Matrix', fontsize=16, fontweight='bold', pad=20)
plt.tight_layout()
plt.show()

# Target column of the matrix, minus the target's self-correlation, ranked by
# magnitude (the sign of each correlation is discarded by abs()).
target_corr = (
    correlation_matrix['ExamResultPercent']
    .drop('ExamResultPercent')
    .abs()
    .sort_values(ascending=False)
)

print("\n🏆 Top Features Correlated with Exam Result Percentage:")
top_correlations = target_corr.iloc[:10]
for feature, corr in top_correlations.items():
    print(f"   {feature}: {corr:.4f}")

# Cell 5: Feature Importance Analysis
# Scores every column of the dataset against the target with a univariate
# F-test, then plots and prints the ranking.
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

# Features are every column except the target; the target becomes a NumPy array.
X = df.drop(columns='ExamResultPercent').copy()
y = df['ExamResultPercent'].values

# Mean-impute 'MockExamScore' (the only column imputed in this cell).
if 'MockExamScore' in X.columns:
    imputer = SimpleImputer(strategy='mean')
    mock_scores_filled = imputer.fit_transform(X[['MockExamScore']])
    X['MockExamScore'] = mock_scores_filled

# Integer-encode the categorical columns that are actually present.
# One encoder instance is refitted for each column in turn.
categorical_cols = ['Gender', 'IncomeLevel', 'EmploymentStatus']
le = LabelEncoder()
for col in categorical_cols:
    if col not in X.columns:
        continue
    X[col] = le.fit_transform(X[col].astype(str))

# Univariate F-test of each feature against the target; k='all' keeps every
# feature, so the selector is used only to obtain the scores.
selector = SelectKBest(score_func=f_regression, k='all').fit(X, y)
score_table = pd.DataFrame({'Feature': X.columns, 'F-Score': selector.scores_})
f_scores = score_table.sort_values('F-Score', ascending=False)

# Horizontal bar chart of the ten highest-scoring features, best on top.
top_ten = f_scores.head(10)
plt.figure(figsize=(12, 6))
plt.barh(top_ten['Feature'], top_ten['F-Score'], color='skyblue', edgecolor='black')
plt.xlabel('F-Score', fontsize=12, fontweight='bold')
plt.ylabel('Features', fontsize=12, fontweight='bold')
plt.title('Top 10 Feature Importance (ANOVA F-test)', fontsize=14, fontweight='bold')
plt.gca().invert_yaxis()
plt.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.show()

# Full ranking as plain text.
print("\n📊 Feature Importance Scores:")
print(f_scores.to_string(index=False))

# Cell 6: Data Preprocessing
# Builds the model-ready arrays X_train / X_test / y_train / y_test.
#
# The imputer and the scaler are fitted on the TRAINING rows only and then
# applied to every row. Fitting them on the full dataset before splitting
# would let test-set statistics (column means / standard deviations) leak
# into the training data.
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.preprocessing import LabelEncoder as LE_Sklearn  # alias kept from the original cell

categorical_columns = ['Gender', 'IncomeLevel', 'EmploymentStatus']
numerical_columns = ['Age', 'StudyHours', 'SleepHours', 'Confidence',
                     'MockExamScore', 'GPA', 'Scholarship', 'InternshipGrade']
binary_columns = ['ReviewCenter']

# Prepare features (column order here is the feature order of the output arrays)
X = df[categorical_columns + numerical_columns + binary_columns].copy()
y = df['ExamResultPercent'].values

# Decide train/test membership of every row BEFORE fitting anything.
# Splitting row positions with the same test_size / random_state selects the
# same rows as splitting the processed matrix would.
row_positions = np.arange(len(X))
train_pos, test_pos, y_train, y_test = train_test_split(
    row_positions, y, test_size=0.2, random_state=42
)

# Handle missing values
print("\n🔧 Handling Missing Values...")
print(f"Missing values before imputation:\n{X.isnull().sum()}")

# Learn the column means from the training rows only, then fill every row.
imputer = SimpleImputer(strategy='mean')
imputer.fit(X.iloc[train_pos][numerical_columns])
X[numerical_columns] = imputer.transform(X[numerical_columns])

print(f"\nMissing values after imputation:\n{X.isnull().sum()}")
# Explicit raise instead of assert: asserts are stripped under `python -O`.
if X.isnull().sum().sum() != 0:
    raise ValueError("❌ Still have missing values!")

# Label encoding: one encoder per categorical column, kept for later reuse.
# Encoders are fitted on all rows so that every category present in the
# dataset receives a code (LabelEncoder rejects labels it has not seen).
X_encoded = X.copy()
label_encoders = {}

for col in categorical_columns:
    le = LabelEncoder()
    X_encoded[col] = le.fit_transform(X_encoded[col].astype(str))
    label_encoders[col] = le

# NOTE(review): this ColumnTransformer is constructed but never fitted or
# applied in the visible part of the notebook; the plain StandardScaler below
# is what actually transforms the data. Kept so the name stays defined.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_columns + binary_columns),
        ('cat', 'passthrough', categorical_columns)
    ],
    remainder='drop'
)

# Scale features: statistics come from the training rows only. As in the
# original cell, all columns (including the integer-coded categoricals) are
# scaled, so the fitted scaler keeps the same input width.
scaler = StandardScaler()
scaler.fit(X_encoded.iloc[train_pos])
X_scaled = scaler.transform(X_encoded)

# Verify no NaN
if np.isnan(X_scaled).any():
    raise ValueError("❌ NaN after scaling!")

# Train-test split: apply the row membership chosen above.
X_train = X_scaled[train_pos]
X_test = X_scaled[test_pos]

print("\n✅ Data Preprocessing Complete!")
print(f"   Training: {len(X_train)}, Test: {len(X_test)}")
print(f"   Features: {X_scaled.shape[1]}")
print(f"   Target range: [{y_train.min():.2f}, {y_train.max():.2f}]")

# Cell 7: Save Processed Data
# Writes the split arrays, the fitted preprocessing objects and the feature
# name list to disk, but only when neither feature matrix contains NaN.
import joblib

output_dir = '../processed_data'
os.makedirs(output_dir, exist_ok=True)

# Report NaN counts before deciding whether to save.
print("\n🔍 Final Verification:")
print(f"   X_train NaN count: {np.isnan(X_train).sum()}")
print(f"   X_test NaN count: {np.isnan(X_test).sum()}")

data_is_clean = not (np.isnan(X_train).any() or np.isnan(X_test).any())

if data_is_clean:
    # Split arrays, one .npy file each.
    array_targets = (
        (f'{output_dir}/X_train.npy', X_train),
        (f'{output_dir}/X_test.npy', X_test),
        (f'{output_dir}/y_train.npy', y_train),
        (f'{output_dir}/y_test.npy', y_test),
    )
    for array_path, array in array_targets:
        np.save(array_path, array)

    # Fitted preprocessing objects, needed to transform new data the same way.
    pickle_targets = (
        (scaler, f'{output_dir}/scaler.pkl'),
        (label_encoders, f'{output_dir}/label_encoders.pkl'),
        (imputer, f'{output_dir}/imputer.pkl'),
    )
    for fitted_object, pickle_path in pickle_targets:
        joblib.dump(fitted_object, pickle_path)

    # Feature names, in the column order used to build the feature matrix.
    feature_names = [*categorical_columns, *numerical_columns, *binary_columns]
    with open(f'{output_dir}/feature_names.json', 'w') as names_file:
        names_file.write(json.dumps(feature_names))

    print(f"\n💾 Data saved to {output_dir}/")
    print("✅ PREPROCESSING COMPLETE!")
else:
    # Nothing is written when NaN values are present.
    print("❌ ERROR: Data contains NaN!")