## Item-11

### Apply and Evaluate Model to Test Set

#### Objective:
Run predictions for your test set and evaluate logistic regression model accuracy
#### Steps:
* Switch feature matrix to test set
* Predict
* Compute percentage of agreement

In [33]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix

In [26]:
df=pd.read_csv(r"C:\Users\ayber\OneDrive\Masaüstü\ML\Data\external\Fatigue_data.csv").copy()
target='Fatigue'

X = df.drop(columns=['Sl. No.', target]).copy()
y_reg = df[target].to_numpy()
y = (y_reg > 500).astype(int)     # 1=good, 0=bad

pd.Series(y).value_counts()

1    233
0    204
Name: count, dtype: int64

In [28]:
# 60/20/20 with stratification (important for class balance)
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.20, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=42, stratify=y_temp)

In [35]:
# --Helpers for future use--
def fit_and_print_clf(p, model_name, thr=0.5, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test):
    p.fit(X_train, y_train)

    # feature count before the estimator
    Z_train = p[:-1].transform(X_train)
    n_features = Z_train.shape[1]

    # probabilities -> labels at threshold
    proba_tr = p.predict_proba(X_train)[:, 1]
    proba_te = p.predict_proba(X_test)[:, 1]
    yhat_tr  = (proba_tr >= thr).astype(int)
    yhat_te  = (proba_te >= thr).astype(int)

    # metrics
    acc_tr = accuracy_score(y_train, yhat_tr) * 100.0
    acc_te = accuracy_score(y_test,  yhat_te) * 100.0
    auc_tr = roc_auc_score(y_train, proba_tr)
    auc_te = roc_auc_score(y_test,  proba_te)

    print("Model Name ->", model_name)
    print("Feature count:", n_features)
    print(f"Agreement (train): {acc_tr:.2f}% | AUC: {auc_tr:.3f}")
    print(f"Agreement (test):  {acc_te:.2f}% | AUC: {auc_te:.3f}")
    print("Confusion matrix (test):\n", confusion_matrix(y_test, yhat_te))

In [75]:
p1= Pipeline([
    ("poly",  PolynomialFeatures(degree=2, include_bias=False)),
    ("scale", StandardScaler()),
    ("clf",   LogisticRegressionCV(
        Cs=np.logspace(-2, 3, 13),
        cv=5,
        max_iter=10000,
        scoring="roc_auc",
        n_jobs=-1,
        refit=True
    ))
])

fit_and_print_clf(p1, model_name="Logistic Regression")
print("Best C:", p1.named_steps["clf"].C_[0])

Model Name -> Logistic Regression
Feature count: 350
Agreement (train): 100.00% | AUC: 1.000
Agreement (test):  89.77% | AUC: 0.979
Confusion matrix (test):
 [[36  5]
 [ 4 43]]
Best C: 1.2115276586285888


## Item-12

### Improve Model Accuracy

#### Objective:
Try to improve test set agreement percentage
#### Steps:
 * Create/select/exclude features
 * Train + Perform logistic regression
 * Compute percentage of agreement and find best one you can

In [None]:
#Feature exclusion and parameter tuning done on the previous model
p2= Pipeline([
    ("scale", StandardScaler()),
    ("clf",   LogisticRegressionCV(
        Cs=np.logspace(-2, 3, 26),
        cv=5,
        penalty="l1",
        solver="saga",
        max_iter=10000,
        scoring="roc_auc",
        n_jobs=-1,
        refit=True
    ))
])

fit_and_print_clf(p2, model_name="Logistic Regression with ")
print("Best C:", p2.named_steps["clf"].C_[0])

Model Name -> Logistic Regression
Feature count: 25
Agreement (train): 96.93% | AUC: 0.996
Agreement (test):  93.18% | AUC: 0.972
Confusion matrix (test):
 [[38  3]
 [ 3 44]]
Best C: 2.5118864315095824


## Item-13

### Use RR on your dataset

#### Objective:
Compare Ridge Regression result to your previous models
#### Steps:
 * Create/Select features similar to before
 * Perform ridge regression on these using scikit-learn
 * Compare to the weights you get in your regular regression

In [94]:
from sklearn.linear_model import RidgeCV, LinearRegression

In [79]:
X = df.drop(columns=['Sl. No.', target]).copy()
y = df['Fatigue'].to_numpy()

from sklearn.model_selection import train_test_split
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
X_train, X_val,  y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=42)

In [90]:
def rmse(y_true, y_pred):
    from sklearn.metrics import mean_squared_error
    return mean_squared_error(y_true, y_pred) ** 0.5

def fit_and_print(p, model_name, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test, X_val=X_val, y_val=y_val):
    p.fit(X_train,y_train)
    train_preds= p.predict(X_train)
    validation_preds = p.predict(X_val)
    test_preds= p.predict(X_test)
    
    
    Z_train = p[:-1].transform(X_train)
    n_features = Z_train.shape[1]
    print("Model Name -> " + str(model_name))
    print("Feature count " + str(n_features))
    print("Training error: " + str(rmse(train_preds, y_train)))
    print("Validation error: " + str(rmse(validation_preds, y_val)))
    print("Testing error: " + str(rmse(test_preds, y_test)) + "\n")

In [91]:
p3= Pipeline([
    ("poly",  PolynomialFeatures(degree=2, include_bias=False)), 
    ("scale", StandardScaler()),
    ("ridge", RidgeCV(alphas=np.logspace(-4, 4, 33), cv=5))
])

fit_and_print(p3, model_name="Ridge Regression")

Model Name -> Ridge Regression
Feature count 350
Training error: 17.612567878994412
Validation error: 26.777701024099567
Testing error: 30.5408097784356



In [96]:
p4= Pipeline([
    ("poly",  PolynomialFeatures(degree=2, include_bias=False)), 
    ("scale", StandardScaler()),
    ("ols",   LinearRegression())
])

fit_and_print(p4, model_name="Regular Linear Regression")

Model Name -> Regular Linear Regression
Feature count 350
Training error: 6.354231662215995
Validation error: 213.37353878450864
Testing error: 389.6138200719838



## Item-14

### Use KRR on your dataset

#### Objective:
* Compare Kernel Ridge Regression result to your previous models.

#### Steps:
* Create/Select features similar to before
* Perform kernel ridge regression on these using sciki-learn
* Test if your model is overfit or not

In [97]:
from sklearn.kernel_ridge import KernelRidge
from itertools import product

In [108]:
def fit_krr_rbf_by_val(X_train, y_train, X_val, y_val,
                       alphas=(1e-3, 1e-2, 1e-1, 1, 10),
                       gammas=(1e-3, 1e-2, 1e-1, 1.0)):
    best_pipe, best_rmse, best_params = None, float("inf"), None
    for a, g in product(alphas, gammas):
        pipe = Pipeline([
            ("scale", StandardScaler()),
            ("krr", KernelRidge(kernel="rbf", alpha=a, gamma=g))
        ])
        pipe.fit(X_train, y_train)
        val_rmse = rmse(y_val, pipe.predict(X_val))
        if val_rmse < best_rmse:
            best_rmse = val_rmse
            best_pipe = pipe
            best_params = {"alpha": a, "gamma": g}
    return best_pipe, best_params, best_rmse

best_krr, best_params, val_rmse = fit_krr_rbf_by_val(X_train, y_train, X_val, y_val)
print("KRR-RBF best (VAL):", best_params, "| val RMSE:", f"{val_rmse:.4f}")

KRR-RBF best (VAL): {'alpha': 0.01, 'gamma': 0.01} | val RMSE: 22.3610


In [109]:
X_tv = pd.concat([X_train, X_val], axis=0)
y_tv = np.concatenate([y_train, y_val], axis=0)

# rebuild with the same hyperparameters and fit on train+val
p5 = Pipeline([
    ("scale", StandardScaler()),
    ("krr", KernelRidge(kernel="rbf", **best_params))
]).fit(X_tv, y_tv)

print("KRR-RBF RMSE  train+val=", f"{rmse(y_tv,  p5.predict(X_tv)):.4f}",
      " test=", f"{rmse(y_test, p5.predict(X_test)):.4f}")

KRR-RBF RMSE  train+val= 14.4204  test= 21.6197


In [111]:
from sklearn.metrics import accuracy_score, roc_auc_score

def overfit_report_regressor(model, X_train, y_train, X_val, y_val, X_test, y_test):
    ytr = model.predict(X_train)
    yva = model.predict(X_val)
    yte = model.predict(X_test)
    tr, va, te = rmse(y_train, ytr), rmse(y_val, yva), rmse(y_test, yte)
    gap_va = va - tr
    gap_te = te - tr
    print(f"RMSE  train={tr:.4f}  val={va:.4f}  test={te:.4f}")
    print(f"Generalization gap:  val-train={gap_va:.4f} | test-train={gap_te:.4f}")
    if gap_va > 0 and gap_te > 0 and (gap_va > 0.1*abs(va) or gap_te > 0.1*abs(te)):
        print("⚠️ Likely overfitting: train error is much lower than val/test.")
    else:
        print("✅ No strong overfitting signal based on gaps.")

In [None]:
#I will test the funciton on my OLR model which I know overfitting
overfit_report_regressor(p4, X_train, y_train, X_val, y_val, X_test, y_test)

RMSE  train=6.3542  val=213.3735  test=389.6138
Generalization gap:  val-train=207.0193 | test-train=383.2596
⚠️ Likely overfitting: train error is much lower than val/test.


In [None]:
#Now I will test if my KNN model overfitting strongly
overfit_report_regressor(p5, X_train, y_train, X_val, y_val, X_test, y_test)

RMSE  train=14.5836  val=13.9252  test=21.6197
Generalization gap:  val-train=-0.6584 | test-train=7.0361
✅ No strong overfitting signal based on gaps.
