In [3]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np
import pandas as pd


In [4]:
import pandas as pd
# Define features and target
df = pd.read_csv(r"E:\git\Student_Performance_ML_Pipeline\data\raw\student_performance.csv")

X = df.drop(["TestScore", "PassFail"], axis=1)
y = df["TestScore"]

print("X shape:", X.shape)
print("y shape:", y.shape)


X shape: (500, 5)
y shape: (500,)


In [5]:

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42
)


In [6]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

num_imputer = SimpleImputer(strategy="median")
num_scaler = StandardScaler()
cat_imputer = SimpleImputer(strategy="most_frequent")
cat_encoder = OneHotEncoder(handle_unknown="ignore")

preprocessor = Pipeline([
    ("num_imputer", num_imputer),
    ("num_scaler", num_scaler),
    ("cat_imputer", cat_imputer),
    ("cat_encoder", cat_encoder)
])

linear_pipeline = Pipeline([
    ("preprocessing", preprocessor),
    ("model",LinearRegression())
])

In [7]:
linear_pipeline.fit(X_train, y_train)

y_pred_linear = linear_pipeline.predict(X_test)

r2_linear = r2_score(y_test, y_pred_linear)
rmse_linear = np.sqrt(mean_squared_error(y_test, y_pred_linear))

print("Linear Regression")
print("R2:", r2_linear)
print("RMSE:", rmse_linear)


Linear Regression
R2: 0.8273122998250443
RMSE: 7.144380066849644


In [8]:
ridge_pipeline = Pipeline([
    ("preprocessing", preprocessor),
    ("model", Ridge(alpha=1.0))
])

ridge_pipeline.fit(X_train, y_train)
y_pred_ridge = ridge_pipeline.predict(X_test)

print("\nRidge Regression")
print("R2:", r2_score(y_test, y_pred_ridge))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_ridge)))



Ridge Regression
R2: 0.8235456201633149
RMSE: 7.221876686223968


In [9]:
lasso_pipeline = Pipeline([
    ("preprocessing", preprocessor),
    ("model", Lasso(alpha=0.1))
])

lasso_pipeline.fit(X_train, y_train)
y_pred_lasso = lasso_pipeline.predict(X_test)

print("\nLasso Regression")
print("R2:", r2_score(y_test, y_pred_lasso))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_lasso)))



Lasso Regression
R2: 0.8025441479371905
RMSE: 7.63956889953865


In [10]:
print("\n FINAL MODEL COMPARISON")

print(f"Linear  → R2={r2_linear:.3f}, RMSE={rmse_linear:.2f}")
print(f"Ridge   → R2={r2_score(y_test,y_pred_ridge):.3f}, RMSE={np.sqrt(mean_squared_error(y_test,y_pred_ridge)):.2f}")
print(f"Lasso   → R2={r2_score(y_test,y_pred_lasso):.3f}, RMSE={np.sqrt(mean_squared_error(y_test,y_pred_lasso)):.2f}")



 FINAL MODEL COMPARISON
Linear  → R2=0.827, RMSE=7.14
Ridge   → R2=0.824, RMSE=7.22
Lasso   → R2=0.803, RMSE=7.64


In [11]:
print(f"Ridge   → R2={r2_score(y_test,y_pred_ridge):.3f}, RMSE={np.sqrt(mean_squared_error(y_test,y_pred_ridge)):.2f}")


Ridge   → R2=0.824, RMSE=7.22


In [12]:
import joblib
import os

# Save Ridge model as final model
model_to_save = ridge_pipeline

# Create src folder path
save_path = r"E:\git\Student_Performance_ML_Pipeline\src\final_model.pkl"

joblib.dump(model_to_save, save_path)

print("Model saved successfully!")


Model saved successfully!
