In [1]:
from pathlib import Path
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

plt.style.use("seaborn-v0_8-whitegrid")
sns.set_palette("husl")

ON_KAGGLE = Path("/kaggle").exists()

if ON_KAGGLE:
    sys.path.append("/kaggle/input/kaggle-utils")
    %run /kaggle/input/kaggle-utils/stat_funcs.py
    %run /kaggle/input/kaggle-utils/preproc.py
    BASE = Path("/kaggle/working/artifacts/playground-series-s6e1")
else:
    %run ../../utils/stat_funcs.py
    %run ../../utils/preproc.py
    BASE = Path("../../../src/data")

X_train_enc = pd.read_csv(BASE / "X_train_enc.csv")
X_val_enc = pd.read_csv(BASE / "X_val_enc.csv")
y_train = pd.read_csv(BASE / "y_train.csv")["exam_score"]
y_val = pd.read_csv(BASE / "y_val.csv")["exam_score"]

X_test_enc = None
test_path = BASE / "X_test_enc.csv"
if test_path.exists():
    X_test_enc = pd.read_csv(test_path)

X_train_raw = None
X_val_raw = None
raw_train_path = BASE / "X_train_raw.csv"
raw_val_path = BASE / "X_val_raw.csv"
if raw_train_path.exists() and raw_val_path.exists():
    X_train_raw = pd.read_csv(raw_train_path)
    X_val_raw   = pd.read_csv(raw_val_path)

feature_columns = pd.read_csv(BASE / "feature_columns.csv")["feature"].tolist()

print("custom functions are now available in the notebook namespace!")
print("Libraries loaded successfully!")
print("Datasets loaded successfully!")
print("X_train_enc:", X_train_enc.shape, "| X_val_enc:", X_val_enc.shape)
print("y_train:", y_train.shape, "| y_val:", y_val.shape)
print("Has X_test_enc?", X_test_enc is not None)

custom functions are now available in the notebook namespace!
Libraries loaded successfully!
Datasets loaded successfully!
X_train_enc: (504000, 23) | X_val_enc: (126000, 23)
y_train: (504000,) | y_val: (126000,)
Has X_test_enc? True


## Fit multilinear regression

In [2]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [3]:
lr = LinearRegression()
lr.fit(X_train_enc, y_train)

val_pred = lr.predict(X_val_enc)

In [4]:
rmse = mean_squared_error(y_val, val_pred)
mae = mean_absolute_error(y_val, val_pred)
r2 = r2_score(y_val, val_pred)

print(f"Validation RMSE: {rmse:.4f}")
print(f"Validation MAE : {mae:.4f}")
print(f"Validation R^2 : {r2:.4f}")

Validation RMSE: 78.9697
Validation MAE : 7.0933
Validation R^2 : 0.7780


## Kaggle submissiokns

In [5]:
if X_test_enc is not None:
    if Path("/kaggle").exists():
        test_df = pd.read_csv("/kaggle/input/playground-series-s6e1/test.csv")
        submission_path = Path("/kaggle/working/artifacts/playground-series-s6e1/submission.csv")
    else:
        test_df = pd.read_csv("../../data-raw/test.csv")
        submission_path = Path("../../data/submission.csv")

    if "feature_columns" in globals() and feature_columns is not None:
        missing = set(feature_columns) - set(X_test_enc.columns)
        extra = set(X_test_enc.columns) - set(feature_columns)
        if missing:
            raise ValueError(f"X_test_enc is missing {len(missing)} training features. Example: {list(missing)[:5]}")
        # Reorder to match training feature order
        X_test_enc = X_test_enc[feature_columns]

    test_pred = lr.predict(X_test_enc)

    submission = pd.DataFrame({ "id": test_df["id"], "exam_score": test_pred })

    submission.to_csv(submission_path, index=False)
    print(f"Saved: {submission_path.resolve()}")
else:
    print("X_test_enc not found; skipping submission.csv creation.")

Saved: /home/carlitos/Documents/Projects/kaggle/predicting-student-test-scores/src/data/submission.csv
