In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import joblib
import os

# Read the placement dataset (path is relative to the notebook's working
# directory) and print a quick overview of what was loaded.
df = pd.read_csv("../data/placementdata.csv")
print("✅ Dataset loaded")
print("Shape:", df.shape)
print("Columns:", list(df.columns))
# Preview the first five rows, followed by a blank line for readability.
print(df.head(n=5), "\n")

# Handle missing values
# Columns are split by dtype so each group gets its own imputation strategy:
# numeric columns use the median, object (string) columns use the mode.
numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = df.select_dtypes(include=["object"]).columns.tolist()
print("Numerical columns:", numerical_cols)
print("Categorical columns:", categorical_cols, "\n")

# Fill missing numerical values with median.
# The filled column is assigned back to the frame rather than calling
# fillna(inplace=True) on df[col]: the chained in-place form operates on an
# intermediate object, raises a FutureWarning in pandas 2.x and does not
# update df at all once Copy-on-Write is active.
for col in numerical_cols:
    if df[col].isnull().any():
        df[col] = df[col].fillna(df[col].median())
        print(f"Filled missing values in numerical column '{col}' with median")

# Fill missing categorical values with mode (the first mode when there are ties).
# NOTE(review): categorical_cols also contains the target column when it has
# object dtype, so missing labels would be imputed here too - confirm that
# this is intended.
for col in categorical_cols:
    if df[col].isnull().any():
        df[col] = df[col].fillna(df[col].mode()[0])
        print(f"Filled missing values in categorical column '{col}' with mode")

print("✅ Missing values handled\n")

# Integer-encode every categorical feature column. Each fitted encoder is
# kept in `label_encoders` (keyed by the original column name) so the same
# mapping can be reused later. The target column is encoded separately below.
label_encoders = {}
categorical_features = [
    name for name in categorical_cols if name != "PlacementStatus"
]
for col in categorical_features:
    encoded_col = col + "_encoded"
    le = LabelEncoder()
    df[encoded_col] = le.fit_transform(df[col])
    label_encoders[col] = le
    print(f"Encoded '{col}' to '{col}_encoded'")
    # Show the original values next to their integer codes.
    print(df[[col, encoded_col]].head(), "\n")

# The target gets its own encoder so predictions can be mapped back to the
# original PlacementStatus labels.
target_encoder = LabelEncoder()
df["status_encoded"] = target_encoder.fit_transform(df["PlacementStatus"])
print("Encoded target 'PlacementStatus' to 'status_encoded'")
print(df[["PlacementStatus", "status_encoded"]].head(), "\n")

# Assemble the model inputs: every "*_encoded" column except the encoded
# target, followed by the numeric columns with the StudentID column left out.
encoded_features = [
    name
    for name in df.columns
    if name.endswith("_encoded") and name != "status_encoded"
]
numerical_features = [name for name in numerical_cols if name != "StudentID"]
feature_columns = [*encoded_features, *numerical_features]
print("Selected feature columns:", feature_columns, "\n")

# Feature matrix and encoded target vector.
X = df[feature_columns]
y = df["status_encoded"]
print("Features (X) shape:", X.shape)
print("Target (y) shape:", y.shape, "\n")

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)
print("✅ Train-test split done")
print("X_train shape:", X_train.shape, "X_test shape:", X_test.shape)
# Class counts per partition, to confirm the stratification.
print("y_train distribution:\n", y_train.value_counts())
print("y_test distribution:\n", y_test.value_counts(), "\n")

# Standardise the features. The scaler is fitted on the training partition
# only and then applied to both partitions, so no test-set statistics are
# used when fitting the scaler.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
print("✅ Features scaled")
# First five scaled rows of each partition as a sanity check.
print("X_train_scaled sample:\n", X_train_scaled[:5])
print("X_test_scaled sample:\n", X_test_scaled[:5], "\n")

# Persist the fitted preprocessing objects so the same transformations can be
# reloaded later. The output directory is created if it does not exist.
os.makedirs("../models", exist_ok=True)
for artifact, path in (
    (scaler, "../models/scaler.pkl"),
    (target_encoder, "../models/target_encoder.pkl"),
    (label_encoders, "../models/label_encoders.pkl"),
    (feature_columns, "../models/feature_names.pkl"),
):
    joblib.dump(artifact, path)

# One print call with a newline separator emits the same lines as separate
# print calls would.
print(
    "✅ Preprocessing objects saved:",
    "- Scaler: '../models/scaler.pkl'",
    "- Target encoder: '../models/target_encoder.pkl'",
    "- Label encoders: '../models/label_encoders.pkl'",
    "- Feature names: '../models/feature_names.pkl'\n",
    sep="\n",
)

print("🎉 Preprocessing complete. Ready for model training!")


✅ Dataset loaded
Shape: (10000, 12)
Columns: ['StudentID', 'CGPA', 'Internships', 'Projects', 'Workshops/Certifications', 'AptitudeTestScore', 'SoftSkillsRating', 'ExtracurricularActivities', 'PlacementTraining', 'SSC_Marks', 'HSC_Marks', 'PlacementStatus']
   StudentID  CGPA  Internships  Projects  Workshops/Certifications  \
0          1   7.5            1         1                         1   
1          2   8.9            0         3                         2   
2          3   7.3            1         2                         2   
3          4   7.5            1         1                         2   
4          5   8.3            1         2                         2   

   AptitudeTestScore  SoftSkillsRating ExtracurricularActivities  \
0                 65               4.4                        No   
1                 90               4.0                       Yes   
2                 82               4.8                       Yes   
3                 85               4.4     