In [22]:
# --- Imports ---
import json
import os
import numpy as np
import pandas as pd
import joblib
from scipy import sparse
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

# --- Paths ---
# Every file this cell reads or writes lives under a single directory.
ARTIFACT_DIR = "artifacts"


def _artifact(filename):
    """Return the path of *filename* inside ARTIFACT_DIR."""
    return os.path.join(ARTIFACT_DIR, filename)


# Inputs read by this cell.
XTR_PATH = _artifact("X_train.npz")
YTR_PATH = _artifact("y_train.csv")

# Outputs written by this cell.
MODEL_PATH = _artifact("linear_regression_classifier.joblib")
TRAIN_METRICS_PATH = _artifact("train_metrics.json")

# --- Load data ---
# Features come from a SciPy sparse matrix saved as .npz. Targets come from a
# CSV read with header=None (first row is treated as data); only its first
# column is used.
X_train = sparse.load_npz(XTR_PATH)
y_train = pd.read_csv(YTR_PATH, header=None).iloc[:, 0].to_numpy()

# --- Fit Linear Regression (regression used for classification) ---
reg = LinearRegression()
reg.fit(X_train, y_train)

# --- Basic regression metrics on train (for sanity) ---
# These are in-sample scores: the model is evaluated on the data it was fit on.
# NOTE(review): the recorded run of this cell printed RMSE 0.0000 / R^2 1.0000.
# A perfect in-sample fit may indicate target leakage or more features than
# samples -- confirm against held-out data before trusting the model.
y_pred_cont = reg.predict(X_train)

# Older sklearn versions don't support squared=False, so compute RMSE manually.
mse = mean_squared_error(y_train, y_pred_cont)

# Cast to built-in floats so the values can be formatted and JSON-serialised
# without depending on NumPy scalar types.
rmse = float(np.sqrt(mse))
r2 = float(r2_score(y_train, y_pred_cont))


# --- Persist model and train metrics ---
# Serialise the fitted estimator so later steps can reload it from MODEL_PATH.
joblib.dump(reg, MODEL_PATH)

# rmse and r2 were already cast to built-in floats where they were computed,
# so they are JSON-serialisable as-is. The encoding is pinned so the file does
# not depend on the platform's default locale.
with open(TRAIN_METRICS_PATH, "w", encoding="utf-8") as f:
    json.dump({"rmse_train": rmse, "r2_train": r2}, f, indent=2)

# --- Report ---
print("Model trained & saved.")
print(f"Train RMSE: {rmse:.4f} | R^2: {r2:.4f}")
print(f"Saved: {MODEL_PATH}")


Model trained & saved.
Train RMSE: 0.0000 | R^2: 1.0000
Saved: artifacts/linear_regression_classifier.joblib
