In [1]:
from pathlib import Path
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# NOTE(review): this silences every warning for the rest of the session, including
# any raised by the libraries imported above; consider narrowing the filter.
warnings.filterwarnings('ignore')

# Plot defaults for any figures drawn later in the notebook.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')

# Choose helper-script and data locations based on whether a /kaggle directory exists.
if Path("/kaggle").exists():
  # Put the uploaded utility folder on the import path.
  sys.path.append("/kaggle/input/kaggle-utils")

  # %run executes each script so the names it defines are available in this notebook.
  %run /kaggle/input/kaggle-utils/stat_funcs.py
  %run /kaggle/input/kaggle-utils/preproc.py
  train_df = pd.read_csv("/kaggle/input/playground-series-s6e1/train.csv")
  test_df = pd.read_csv("/kaggle/input/playground-series-s6e1/test.csv")
else:
  # Local layout: paths are relative to the notebook's working directory.
  %run ../../utils/stat_funcs.py
  %run ../../utils/preproc.py
  train_df = pd.read_csv("../../data-raw/train.csv")
  test_df = pd.read_csv("../../data-raw/test.csv")

# Printed unconditionally once the cell reaches this point.
print("custom functions are now available in the notebook namespace!")
print("Libraries loaded successfully!")

custom functions are now available in the notebook namespace!
Libraries loaded successfully!


# Overall EDA Summary

## Dataset overview 

The training data contains **630,000 rows** and **13 columns**, with the target variable **`exam_score`** (continuous, 0-100 scale). An **`id`** column is a unique identifier (sequential from 0 to 629,999) and is not inherently predictive. The `id` column would need to be removed during feature engineering.

- **Data quality:**
  - **No missing values** across any features or the target.
  - Data types are clean and appropriate after casting:
    - **Numeric:** `age`, `study_hours`, `class_attendance`, `sleep_hours`, `exam_score`
    - **Categorical:** `gender`, `course`, `internet_access`, `sleep_quality`, `study_method`, `facility_rating`, `exam_difficulty`
- **Typical student profile (central tendency):**
  - **Age:** ~**20.55** years (median **21**, range **17-24**)
  - **Study hours:** mean ~**4.00** (median **4.00**, range **0.08-7.91**)
  - **Class attendance (%):** mean ~**71.99** (median **72.6**, range **40.6-99.4**)
  - **Sleep hours:** mean ~**7.07** (median **7.1**, range **4.1-9.9**)
  - **Exam score:** mean ~**62.51** (median **62.6**, range **19.6-100**)
- **Spread / variability:**
  - `exam_score` has **substantial variability** (std ~**18.92**), suggesting meaningful separation between low- and high-performing students.
  - `class_attendance` also varies widely (std ~**17.43**), while `age` is comparatively tight (std ~**2.26**).
- **Notable extremes & sanity checks:**
  - Very low study time values exist (down to **0.08 hours**), and attendance can be as low as **~40%**, which may represent legitimately low-engagement students rather than data issues (since missingness is zero and ranges look plausible).
  - Scores span almost the entire possible scale (**~20 to 100**), indicating no obvious clipping problems.
- **Model-readiness implications:**
  - This is a **mixed-type regression problem** with several categorical predictors that will require **encoding** (one-hot or target/ordinal encoding depending on the feature).

## Feature Engineering

### 1) Split + leakage control

- [x] Create train/validation split before fitting any preprocessing steps
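- [x] Drop `id` column (handled later by `one_hot_encode(..., drop_id_col="id")`; `X`, `X_train`, and `X_val` still contain `id` until then)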

In [2]:
from sklearn.model_selection import train_test_split

# Name of the target column that is split off from train_df below.
TARGET = "exam_score"

In [3]:
# Separate the target from the predictors. Only the target column is removed,
# so `id` is still present in X at this point.
y = train_df[TARGET]
X = train_df.drop(TARGET, axis="columns")

In [4]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### 2) Basic cleaning

- [x] Verify numeric ranges (attendance 0–100, score 0–100, sleep hours plausible)
- [x] Outlier handling (only if needed): clip numeric features to sensible bounds or train-quantiles

In [5]:
# Summary statistics of the numeric columns, used to sanity-check value ranges.
X.describe()

Unnamed: 0,id,age,study_hours,class_attendance,sleep_hours
count,630000.0,630000.0,630000.0,630000.0,630000.0
mean,314999.5,20.545821,4.002337,71.987261,7.072758
std,181865.479132,2.260238,2.35988,17.430098,1.744811
min,0.0,17.0,0.08,40.6,4.1
25%,157499.75,19.0,1.97,57.0,5.6
50%,314999.5,21.0,4.0,72.6,7.1
75%,472499.25,23.0,6.05,87.2,8.6
max,629999.0,24.0,7.91,99.4,9.9


### 3) Encoding (keep it simple)

- [x] One-hot encode nominal categoricals: `gender`, `course`, `internet_access`, `study_method`
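- [x] Ordinal encode only if truly ordered: `sleep_quality`, `facility_rating`, `exam_difficulty` (note: in the cells below these three are currently one-hot encoded together with the nominal columns)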

In [6]:
# Preview the raw predictors to see the categorical levels before encoding.
X.head()

Unnamed: 0,id,age,gender,course,study_hours,class_attendance,internet_access,sleep_hours,sleep_quality,study_method,facility_rating,exam_difficulty
0,0,21,female,b.sc,7.91,98.8,no,4.9,average,online videos,low,easy
1,1,18,other,diploma,4.95,94.8,yes,4.7,poor,self-study,medium,moderate
2,2,20,female,b.sc,4.68,92.6,yes,5.8,poor,coaching,high,moderate
3,3,19,male,b.sc,2.0,49.5,yes,8.3,average,group study,high,moderate
4,4,23,male,bca,7.65,86.9,yes,9.6,good,self-study,high,easy


In [7]:
# Categorical columns that are expanded into indicator columns.
encode_cols = [
    "gender",
    "course",
    "internet_access",
    "study_method",
    "sleep_quality",
    "facility_rating",
    "exam_difficulty",
]

In [8]:
# One-hot encode every column in encode_cols on the full (unsplit) frame.
# No reference level is dropped here, so each categorical keeps all of its levels.
X_enc = pd.get_dummies(data=X, columns=encode_cols, dtype=int)

In [9]:
# Preview the encoded frame.
X_enc.head()

Unnamed: 0,id,age,study_hours,class_attendance,sleep_hours,gender_female,gender_male,gender_other,course_b.com,course_b.sc,...,study_method_self-study,sleep_quality_average,sleep_quality_good,sleep_quality_poor,facility_rating_high,facility_rating_low,facility_rating_medium,exam_difficulty_easy,exam_difficulty_hard,exam_difficulty_moderate
0,0,21,7.91,98.8,4.9,1,0,0,0,1,...,0,1,0,0,0,1,0,1,0,0
1,1,18,4.95,94.8,4.7,0,0,1,0,0,...,1,0,0,1,0,0,1,0,0,1
2,2,20,4.68,92.6,5.8,1,0,0,0,1,...,0,0,0,1,1,0,0,0,0,1
3,3,19,2.0,49.5,8.3,0,1,0,0,1,...,0,1,0,0,1,0,0,0,0,1
4,4,23,7.65,86.9,9.6,0,1,0,0,0,...,1,0,1,0,1,0,0,1,0,0


In [10]:
# Full list of generated indicator columns (the head() preview truncates them).
X_enc.columns

Index(['id', 'age', 'study_hours', 'class_attendance', 'sleep_hours',
       'gender_female', 'gender_male', 'gender_other', 'course_b.com',
       'course_b.sc', 'course_b.tech', 'course_ba', 'course_bba', 'course_bca',
       'course_diploma', 'internet_access_no', 'internet_access_yes',
       'study_method_coaching', 'study_method_group study',
       'study_method_mixed', 'study_method_online videos',
       'study_method_self-study', 'sleep_quality_average',
       'sleep_quality_good', 'sleep_quality_poor', 'facility_rating_high',
       'facility_rating_low', 'facility_rating_medium', 'exam_difficulty_easy',
       'exam_difficulty_hard', 'exam_difficulty_moderate'],
      dtype='object')

In [11]:
# Confirm the indicator columns were created with an integer dtype.
X_enc.dtypes

id                              int64
age                             int64
study_hours                   float64
class_attendance              float64
sleep_hours                   float64
gender_female                   int64
gender_male                     int64
gender_other                    int64
course_b.com                    int64
course_b.sc                     int64
course_b.tech                   int64
course_ba                       int64
course_bba                      int64
course_bca                      int64
course_diploma                  int64
internet_access_no              int64
internet_access_yes             int64
study_method_coaching           int64
study_method_group study        int64
study_method_mixed              int64
study_method_online videos      int64
study_method_self-study         int64
sleep_quality_average           int64
sleep_quality_good              int64
sleep_quality_poor              int64
facility_rating_high            int64
facility_rat

In [12]:
# Drop the redundant "no" indicator so `internet_access_yes` alone encodes the
# binary feature. Reassigning (rather than inplace=True) and ignoring an
# already-missing column keeps this cell safe to re-run; the in-place version
# raised KeyError on a second execution.
# Only this one categorical gets a reference level here; the others keep all levels.
X_enc = X_enc.drop(columns=["internet_access_no"], errors="ignore")

In [13]:
# Re-check the columns after removing `internet_access_no`.
X_enc.dtypes

id                              int64
age                             int64
study_hours                   float64
class_attendance              float64
sleep_hours                   float64
gender_female                   int64
gender_male                     int64
gender_other                    int64
course_b.com                    int64
course_b.sc                     int64
course_b.tech                   int64
course_ba                       int64
course_bba                      int64
course_bca                      int64
course_diploma                  int64
internet_access_yes             int64
study_method_coaching           int64
study_method_group study        int64
study_method_mixed              int64
study_method_online videos      int64
study_method_self-study         int64
sleep_quality_average           int64
sleep_quality_good              int64
sleep_quality_poor              int64
facility_rating_high            int64
facility_rating_low             int64
facility_rat

### 5) Scaling (for stability)

- [x] Standardize numeric features: `age`, `study_hours`, `class_attendance`, `sleep_hours`

In [14]:
from sklearn.preprocessing import StandardScaler

In [15]:
# Continuous columns to standardize; the 0/1 indicator columns are left untouched.
num_scale_cols = [
    "age",
    "study_hours",
    "class_attendance",
    "sleep_hours",
]
scaler = StandardScaler()

In [16]:
# Standardize the numeric columns in place.
# NOTE(review): the scaler is fit on X_enc, which is built from the full unsplit X,
# so validation rows contribute to the mean/std here. A scaler fit on the training
# split only is created further down.
X_enc[num_scale_cols] = scaler.fit_transform(X_enc[num_scale_cols])

In [17]:
# Preview the frame after scaling; `id` is not in num_scale_cols and stays unscaled.
X_enc.head()

Unnamed: 0,id,age,study_hours,class_attendance,sleep_hours,gender_female,gender_male,gender_other,course_b.com,course_b.sc,...,study_method_self-study,sleep_quality_average,sleep_quality_good,sleep_quality_poor,facility_rating_high,facility_rating_low,facility_rating_medium,exam_difficulty_easy,exam_difficulty_hard,exam_difficulty_moderate
0,0,0.200943,1.655875,1.538302,-1.245269,1,0,0,0,1,...,0,1,0,0,0,1,0,1,0,0
1,1,-1.126352,0.401573,1.308814,-1.359895,0,0,1,0,0,...,1,0,0,1,0,0,1,0,0,1
2,2,-0.241488,0.28716,1.182595,-0.729454,1,0,0,0,1,...,0,0,0,1,1,0,0,0,0,1
3,3,-0.68392,-0.848492,-1.290141,0.703367,0,1,0,0,1,...,0,1,0,0,1,0,0,0,0,1
4,4,1.085807,1.545699,0.855575,1.448434,0,1,0,0,0,...,1,0,1,0,1,0,0,1,0,0


### 6) Minimal “linear-friendly” feature creation (optional, small set)

- [x] Add 1–2 interaction terms. Starting points:
  - `study_hours * class_attendance`
  - `study_hours * exam_difficulty` (after encoding)
- [x] Add 1 curvature term if residuals suggest nonlinearity:
  - `study_hours^2` (or `sleep_hours^2`)

In [18]:
# Interaction term: product of study hours and class attendance, computed on the
# columns as they currently stand in X_enc (standardized by the scaling cell above).
X_enc = X_enc.assign(
    study_hours_attendance=X_enc["study_hours"].mul(X_enc["class_attendance"])
)

In [19]:
# Curvature term: square of the current `study_hours` column.
X_enc = X_enc.assign(study_hours_curve=X_enc["study_hours"].pow(2))

In [20]:
# Preview the frame with the two engineered columns appended at the end.
X_enc.head()

Unnamed: 0,id,age,study_hours,class_attendance,sleep_hours,gender_female,gender_male,gender_other,course_b.com,course_b.sc,...,sleep_quality_good,sleep_quality_poor,facility_rating_high,facility_rating_low,facility_rating_medium,exam_difficulty_easy,exam_difficulty_hard,exam_difficulty_moderate,study_hours_attendance,study_hours_curve
0,0,0.200943,1.655875,1.538302,-1.245269,1,0,0,0,1,...,0,0,0,1,0,1,0,0,2.547236,2.741921
1,1,-1.126352,0.401573,1.308814,-1.359895,0,0,1,0,0,...,0,1,0,0,1,0,0,1,0.525584,0.161261
2,2,-0.241488,0.28716,1.182595,-0.729454,1,0,0,0,1,...,0,1,1,0,0,0,0,1,0.339594,0.082461
3,3,-0.68392,-0.848492,-1.290141,0.703367,0,1,0,0,1,...,0,0,1,0,0,0,0,1,1.094674,0.719938
4,4,1.085807,1.545699,0.855575,1.448434,0,1,0,0,0,...,1,0,1,0,0,1,0,0,1.322461,2.389187


### 7) Multicollinearity + regularization

- [x] Check multicollinearity (VIF or condition number) after encoding

In [21]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [22]:
# Work on a copy so the collinearity checks do not modify X_enc.
X_for_vif = X_enc.copy()

In [23]:
# Exclude the row identifier from the collinearity check; do nothing if it is already gone.
X_for_vif = X_for_vif.drop(columns="id", errors="ignore")

In [24]:
# Treat infinities as missing, then discard every row that has a missing value.
X_for_vif = X_for_vif.replace([np.inf, -np.inf], np.nan)
X_for_vif = X_for_vif.dropna(axis="index")

In [25]:
# Float matrix of the design columns, consumed by np.linalg.cond below.
X_mat = X_for_vif.to_numpy(dtype=float)

In [26]:
# vif_df = pd.DataFrame({
#     "feature": X_for_vif.columns,
#     "vif": [variance_inflation_factor(X_mat, i) for i in range(X_mat.shape[1])]
# }).sort_values("vif", ascending=False)

In [27]:
# Condition number of the design matrix (np.linalg.cond defaults to the 2-norm).
# Very large values indicate (near-)linear dependence between columns.
cond_number = np.linalg.cond(X_mat)
print(f"cond_number: {cond_number}")

cond_number: 1128456225783163.5


### quick notes

High condition number, which means I have near-linear dependence in my features (very strong multicollinearity or near-duplicates). 
Not quite sure what to do next. 

At this level, though:
- OLS coefficients will be numerically unstable
- predictions will still look fine, but interpretability is limited
- I may need to fix the matrix or just move to Ridge/ElasticNet and not worry about OLS coefficients altogether. That kinda defeats the purpose of this project being a simple multilinear regression model.

After further reading, a simple multilinear regression is still doable even if we use Ridge. The simplicity is about the model class (linear in the features), not about using the closed-form OLS solution at all costs.

### 7a) Don't accept OLS coefficients... yet

In [28]:
# Re-encode each split separately, this time dropping the first level of every
# categorical so each group of indicators has a reference level.
# NOTE(review): X_train / X_val still carry the raw `id` column at this point.
X_train_enc = pd.get_dummies(
    data=X_train, columns=encode_cols, drop_first=True, dtype=int
)
X_val_enc = pd.get_dummies(
    data=X_val, columns=encode_cols, drop_first=True, dtype=int
)

In [29]:
# Give validation exactly the training columns, in the same order: indicators
# missing from validation are added as all-zero, and extras are discarded.
X_val_enc = X_val_enc.reindex(columns=X_train_enc.columns, fill_value=0)

In [30]:
# Remove columns with zero standard deviation in the training split, and drop
# the same columns from validation when they are present there.
std0_cols = X_train_enc.columns[X_train_enc.std(axis=0).eq(0)]
X_train_enc = X_train_enc.drop(columns=std0_cols)
X_val_enc = X_val_enc.drop(columns=std0_cols, errors="ignore")

In [31]:
# Condition number of the drop-first encoded training matrix.
# NOTE(review): the matrix still contains the unscaled `id` column (0..629,999),
# which likely dominates this value - confirm by excluding it.
cond = np.linalg.cond(X_train_enc.to_numpy(float))
print(f"cond={cond}")

cond=2338417.486892914


#### Quick notes

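`cond=2338417.486892914` is still high, but it looks like a multicollinearity/scaling issue rather than a broken (rank-deficient) matrix. Note that `X_train_enc` still contains the raw `id` column (0–629,999) at this point, and that unscaled column is a likely driver of the large value.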

- OLS will still produce a fit, but the coefficients will be unstable. especially for correlated features
- predictions will be fine but be wary of over-interpreting individual coefficients.

### 7b) Recompute `cond` on the matrix

In [32]:
# Scale copies so the unscaled encoded splits remain available for comparison.
X_train_c, X_val_c = X_train_enc.copy(), X_val_enc.copy()

In [33]:
# Fit a fresh scaler on the training split only, then apply the same transformation
# to validation so validation statistics do not influence the fit.
# This rebinds `scaler`, replacing the instance that was fit on the full frame earlier.
scaler = StandardScaler()
X_train_c[num_scale_cols] = scaler.fit_transform(X_train_c[num_scale_cols])
X_val_c[num_scale_cols] = scaler.transform(X_val_c[num_scale_cols])

In [34]:
# Condition number after standardizing the columns in num_scale_cols.
# `id` is not among them, so it is still unscaled in this matrix.
cond_scaled = np.linalg.cond(X_train_c.to_numpy(float))
print(f"cond_scaled={cond_scaled}")

cond_scaled=2259718.464071761


In [35]:
# List the remaining columns to see which indicator levels were dropped.
X_train_c.columns

Index(['id', 'age', 'study_hours', 'class_attendance', 'sleep_hours',
       'gender_male', 'gender_other', 'course_b.sc', 'course_b.tech',
       'course_ba', 'course_bba', 'course_bca', 'course_diploma',
       'internet_access_yes', 'study_method_group study', 'study_method_mixed',
       'study_method_online videos', 'study_method_self-study',
       'sleep_quality_good', 'sleep_quality_poor', 'facility_rating_low',
       'facility_rating_medium', 'exam_difficulty_hard',
       'exam_difficulty_moderate'],
      dtype='object')

### quick notes

Still strong multicollinearity/near-redundancy, but I think that is fine for this dataset; correlated predictors are expected here.

BUTTTT re-run the one-hot encoding, because I have multiple one-hot columns per categorical and there is no clearly documented dropped reference level for each. The column listing above also shows that the unscaled `id` column is still part of the matrix; the next step removes it via `drop_id_col="id"`.

In [36]:
# Rebuild the encoded train/validation/test matrices with the shared helper
# (presumably defined in preproc.py, loaded via %run in the first cell - TODO confirm).
# The numeric column list is passed as a copy of num_scale_cols so it stays in
# sync with the earlier scaling cells.
X_train_enc, X_val_enc, X_test_enc, scaler, dropped_const_cols = one_hot_encode(
    X_train=X_train,
    X_val=X_val,
    X_test=test_df,
    drop_id_col="id",
    encode_cols=encode_cols,
    num_scale_cols=list(num_scale_cols),
)

In [37]:
# Condition number of the training matrix returned by one_hot_encode, plus the
# number of entries in dropped_const_cols (presumably the constant columns the
# helper removed - TODO confirm against preproc.py).
cond = np.linalg.cond(X_train_enc.to_numpy(float))
print("cond:", cond)
print("dropped constant cols:", len(dropped_const_cols))

cond: 9.295644505017439
dropped constant cols: 0


### quick notes

`cond: 9.295644505017439` is the best one so far.

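- the earliest huge condition number (~1e15) was almost certainly caused by the dummy-variable trap/redundant one-hot columns; the intermediate ~2.3e6 values were computed with `drop_first=True` already applied, so they most likely came from the unscaled `id` column, which is now dropped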
- OLS coefficients are now stable enough to interpret

### 8) ~~Diagnostics + iteration loop~~

- [ ] ~~Check residual plots (nonlinearity, heteroscedasticity)~~
- [ ] ~~Evaluate metrics (MAE/RMSE/R²) and compare against a naive baseline (predict mean)~~
- [ ] ~~Iterate: only add interactions/polynomials if diagnostics show systematic error~~

### 8) Save data for fitting/modeling

In [38]:
from pathlib import Path
import joblib

# Choose the artifact directory once. Everything below is shared by both
# environments, so the Kaggle and local outputs cannot drift apart.
if Path("/kaggle").exists():
    OUT_DIR = Path("/kaggle/working/artifacts/playground-series-s6e1")
else:
    OUT_DIR = Path("../../../src/data")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Raw (un-encoded) feature splits and their targets.
X_train.to_csv(OUT_DIR / "X_train_raw.csv", index=False)
X_val.to_csv(OUT_DIR / "X_val_raw.csv", index=False)
y_train.to_frame(TARGET).to_csv(OUT_DIR / "y_train.csv", index=False)
y_val.to_frame(TARGET).to_csv(OUT_DIR / "y_val.csv", index=False)

# Encoded matrices returned by one_hot_encode; the test matrix is written only
# when one was produced.
X_train_enc.to_csv(OUT_DIR / "X_train_enc.csv", index=False)
X_val_enc.to_csv(OUT_DIR / "X_val_enc.csv", index=False)
if X_test_enc is not None:
    X_test_enc.to_csv(OUT_DIR / "X_test_enc.csv", index=False)

# Column order of the training matrix, stored in a single `feature` column.
X_train_enc.columns.to_series(name="feature").to_csv(
    OUT_DIR / "feature_columns.csv", index=False
)

# Fitted scaler plus the settings used to build the splits. The split parameters
# must match the train_test_split call earlier in the notebook.
joblib.dump(scaler, OUT_DIR / "standard_scaler.joblib")
joblib.dump(
    {
        "target": TARGET,
        "encode_cols": encode_cols,
        "num_scale_cols": list(num_scale_cols),
        "random_state": 42,
        "test_size": 0.2,
    },
    OUT_DIR / "preprocess_config.joblib",
)
