In [2]:
import pandas as pd
from pathlib import Path

In [12]:
# Input and output locations, all relative to the current working directory.
RAW_PATH = Path("data") / "student_performance.csv"

PROCESSED_DIR = Path("processed")
PROCESSED_PATH = PROCESSED_DIR / "student_performance_clean.csv"

# Create the output folder up front; exist_ok makes re-running this cell harmless.
PROCESSED_DIR.mkdir(exist_ok=True)


In [13]:
def load_data(path=None) -> pd.DataFrame:
    """Load the raw CSV and turn coded categorical columns into text labels.

    Parameters
    ----------
    path : str, path-like or file-like, optional
        Source handed straight to ``pd.read_csv``. When omitted, the
        module-level ``RAW_PATH`` is used.

    Returns
    -------
    pd.DataFrame
        The raw data with:

        * column names stripped of surrounding whitespace;
        * numeric ``Gender`` codes 0/1 replaced by "Female"/"Male" (any other
          code is kept as its string form);
        * numeric ``LearningStyle`` codes 0-3 replaced by their names (other
          codes become missing), or text values stripped of whitespace;
        * the yes/no columns normalised to "Yes" / "No" / "Unknown".

    Missing ``Gender`` and ``LearningStyle`` values stay missing (NaN) rather
    than becoming the literal string "nan".
    """
    source = RAW_PATH if path is None else path
    df = pd.read_csv(source)

    # Clean column names (remove surrounding spaces).
    df.columns = [str(c).strip() for c in df.columns]

    # ---- Gender mapping (only when encoded as numbers) ----
    if "Gender" in df.columns and pd.api.types.is_numeric_dtype(df["Gender"]):
        # Adjust the mapping if your dataset uses the opposite encoding.
        gender_map = {0: "Female", 1: "Male"}
        gender = df["Gender"]
        mapped = gender.map(gender_map)
        # Unknown codes fall back to their string form; genuinely missing
        # entries are masked back to NaN so they do not turn into "nan".
        fallback = gender.astype(str).where(gender.notna())
        df["Gender"] = fallback.where(mapped.isna(), mapped)

    # ---- Learning Style mapping (0/1/2/3 -> names) ----
    if "LearningStyle" in df.columns:
        learning_style_map = {
            0: "Visual",
            1: "Auditory",
            2: "Kinesthetic",
            3: "Reading/Writing",
        }
        style = df["LearningStyle"]

        if pd.api.types.is_numeric_dtype(style):
            df["LearningStyle"] = style.map(learning_style_map)
        else:
            # Already text: strip spaces, but keep missing values missing.
            df["LearningStyle"] = style.astype(str).str.strip().where(style.notna())

    # ---- Yes/No style mappings for binary columns (keep as text, not 0/1) ----
    # Text, boolean and 0/1 encodings are all accepted; anything else
    # (including missing values) becomes "Unknown".
    yes_no_map = {
        "yes": "Yes", "y": "Yes", "true": "Yes", "1": "Yes", "1.0": "Yes",
        "no": "No", "n": "No", "false": "No", "0": "No", "0.0": "No",
    }
    binary_cols = ["Extracurricular", "Internet", "OnlineCourses", "EduTech"]
    for col in binary_cols:
        if col in df.columns:
            df[col] = (
                df[col]
                .astype(str)
                .str.strip()
                .str.lower()
                .map(yes_no_map)
                .fillna("Unknown")
            )

    return df



In [14]:
def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """Convert dtypes, impute missing values and add bucketed features.

    Works on a copy, so the frame passed in is left untouched.

    Steps:

    1. Strip whitespace from column names and warn (via ``print``) about any
       expected column that is absent.
    2. Coerce the known numeric columns with ``pd.to_numeric``; unparseable
       entries become NaN.
    3. Fill missing values: median for numeric columns, most frequent value
       for everything else. Columns with no usable fill value (for example
       entirely empty ones) are left as they are.
    4. Add ``PerformanceCategory``, ``AttendanceBucket``, ``StudyHoursBucket``
       and ``StressBucket`` whenever their source column exists.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to clean, e.g. the output of ``load_data``.

    Returns
    -------
    pd.DataFrame
        The cleaned copy with the derived bucket columns appended.
    """
    df = df.copy()

    # Strip column names
    df.columns = [str(c).strip() for c in df.columns]

    # Expected columns (informational only: a missing one triggers a warning,
    # not an error).
    expected_cols = [
        "StudyHours", "Attendance", "Resources", "Extracurricular",
        "Motivation", "Internet", "Gender", "Age", "LearningStyle",
        "OnlineCourses", "Discussions", "AssignmentCompletion",
        "ExamScore", "EduTech", "StressLevel", "FinalGrade"
    ]

    missing = [c for c in expected_cols if c not in df.columns]
    if missing:
        print("⚠ Warning: Missing columns in CSV:", missing)

    # Convert numeric columns safely (bad values -> NaN, imputed below).
    num_cols = [
        "StudyHours", "Attendance", "Motivation",
        "Discussions", "AssignmentCompletion",
        "ExamScore", "StressLevel", "Age"
    ]
    for col in num_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce")

    # Numeric -> median, categorical -> mode
    df = _fill_missing(df)

    # ---------- Derived features ----------
    # Rules are listed strictest first; a value that matches none of them
    # (including NaN) receives the default label.

    # 1. PerformanceCategory based on ExamScore
    if "ExamScore" in df.columns:
        score = df["ExamScore"]
        df["PerformanceCategory"] = _bucketize(
            score,
            [(score >= 80, "High"), (score >= 50, "Medium")],
            default="Low",
        )

    # 2. AttendanceBucket
    if "Attendance" in df.columns:
        attendance = df["Attendance"]
        df["AttendanceBucket"] = _bucketize(
            attendance,
            [
                (attendance >= 90, "Excellent (>=90%)"),
                (attendance >= 75, "Good (75–89%)"),
                (attendance >= 60, "Moderate (60–74%)"),
            ],
            default="Low (<60%)",
        )

    # 3. StudyHoursBucket
    if "StudyHours" in df.columns:
        hours = df["StudyHours"]
        df["StudyHoursBucket"] = _bucketize(
            hours,
            [
                (hours >= 20, "Intensive (>=20 hrs)"),
                (hours >= 10, "Moderate (10–19 hrs)"),
                # Strictly positive: any non-zero amount of study counts as "Low".
                (hours > 0, "Low (1–9 hrs)"),
            ],
            default="None",
        )

    # 4. StressBucket
    if "StressLevel" in df.columns:
        stress = df["StressLevel"]
        df["StressBucket"] = _bucketize(
            stress,
            [(stress >= 8, "High"), (stress >= 4, "Medium")],
            default="Low",
        )

    print("✅ Cleaning & feature engineering done.")
    return df


def _fill_missing(df: pd.DataFrame) -> pd.DataFrame:
    """Fill NaNs column by column: median for numeric dtypes, mode otherwise."""
    for col in df.columns:
        series = df[col]
        if not series.isna().any():
            continue

        if pd.api.types.is_numeric_dtype(series):
            fill_value = series.median()
        else:
            modes = series.mode()
            # mode() ignores NaN, so an all-missing column yields no candidates.
            if modes.empty:
                continue
            fill_value = modes.iloc[0]

        # An all-NaN numeric column has a NaN median; nothing useful to fill with.
        if pd.isna(fill_value):
            continue
        df[col] = series.fillna(fill_value)
    return df


def _bucketize(values: pd.Series, rules, default: str) -> pd.Series:
    """Label each row with the first matching rule in ``rules``, else ``default``.

    ``rules`` is a list of ``(boolean mask, label)`` pairs ordered from highest
    to lowest priority, with masks aligned to ``values``.
    """
    labels = pd.Series(default, index=values.index, dtype=object)
    # Apply the lowest-priority rule first so higher-priority ones overwrite it.
    for mask, label in reversed(rules):
        labels.loc[mask] = label
    return labels

In [15]:
def main():
    """Run the pipeline end to end: load, clean, then write the cleaned CSV.

    The result is written to ``PROCESSED_PATH`` without the index column, and
    the output location and final frame shape are printed.
    """
    cleaned = clean_data(load_data())
    cleaned.to_csv(PROCESSED_PATH, index=False)

    print(f"✅ Saved cleaned data to: {PROCESSED_PATH}")
    print("Final shape:", cleaned.shape)


# Executes the pipeline when this code runs as the main module.
if __name__ == "__main__":
    main()


✅ Cleaning & feature engineering done.
✅ Saved cleaned data to: processed\student_performance_clean.csv
Final shape: (14003, 20)
