In [1]:
import os

import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
# Location of the dataset. Defaults to the original author's local path; set the
# STUDENT_PERFORMANCE_CSV environment variable to run this notebook on another
# machine without editing the cell.
DATA_PATH = os.environ.get(
    "STUDENT_PERFORMANCE_CSV",
    r"E:\git\Student_Performance_ML_Pipeline\data\student_performance.csv",
)
df = pd.read_csv(DATA_PATH)

# Regression target. "PassFail" is removed from the features as well
# (presumably because it is derived from the score and would leak the
# target -- TODO confirm against the dataset description).
TARGET = "TestScore"
X = df.drop(columns=[TARGET, "PassFail"])
y = df[TARGET]


In [3]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across kernel restarts.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts


In [4]:
# Baseline: ordinary least squares fitted on the raw, unscaled features.
lr = LinearRegression().fit(X_train, y_train)  # fit() returns the estimator
pred = lr.predict(X_test)

# Score the held-out predictions: R^2 and RMSE (square root of the MSE).
baseline_r2 = r2_score(y_test, pred)
baseline_rmse = np.sqrt(mean_squared_error(y_test, pred))

print("Baseline R2:", baseline_r2)
print("Baseline RMSE:", baseline_rmse)


Baseline R2: 0.9063861680692175
Baseline RMSE: 5.260219399856505


In [5]:
# Same linear model, now preceded by feature standardisation inside a Pipeline
# so the scaler is fitted on the training data only.
lr_steps = [
    ("scaler", StandardScaler()),
    ("model", LinearRegression()),
]
pipe_lr = Pipeline(steps=lr_steps)

# Pipeline.fit returns the fitted pipeline, so fit and predict can be chained.
pred_pipe = pipe_lr.fit(X_train, y_train).predict(X_test)

pipeline_r2 = r2_score(y_test, pred_pipe)
print("Pipeline R2:", pipeline_r2)


Pipeline R2: 0.906386168069218


In [6]:
# 5-fold cross-validated R^2 of the scaled linear pipeline. Note this runs over
# the full dataset (X, y), not just the training split.
scores = cross_val_score(estimator=pipe_lr, X=X, y=y, cv=5, scoring="r2")

mean_cv_score = scores.mean()
print("CV Scores:", scores)
print("Mean CV Score:", mean_cv_score)


CV Scores: [0.89695913 0.89290544 0.91406904 0.84471899 0.91340904]
Mean CV Score: 0.8924123277871707


In [7]:
# Ridge regression (alpha=1.0) on standardised features.
ridge_steps = [
    ("scaler", StandardScaler()),
    ("model", Ridge(alpha=1.0)),
]
pipe_ridge = Pipeline(steps=ridge_steps)

# Fit on the training split, then predict the held-out rows.
pred_ridge = pipe_ridge.fit(X_train, y_train).predict(X_test)

ridge_r2 = r2_score(y_test, pred_ridge)
print("Ridge R2:", ridge_r2)


Ridge R2: 0.9061821973921493


In [8]:
# Lasso regression (alpha=0.1) on standardised features.
lasso_steps = [
    ("scaler", StandardScaler()),
    ("model", Lasso(alpha=0.1)),
]
pipe_lasso = Pipeline(steps=lasso_steps)

# Fit on the training split, then predict the held-out rows.
pred_lasso = pipe_lasso.fit(X_train, y_train).predict(X_test)

lasso_r2 = r2_score(y_test, pred_lasso)
print("Lasso R2:", lasso_r2)


Lasso R2: 0.9057466924473325
