# Machine Learning Boilerplate Workflow (Linear Regression Example)

## 1. Define Problem

We want to predict disease progression one year after baseline for diabetes patients.

1. Target (y): a continuous value (disease progression measure).
2. Features (X): 10 baseline medical variables (age, sex, BMI, blood pressure, and 6 blood serum measurements).
3. Goal: Build an interpretable regression model to understand which features influence disease progression, and evaluate predictive performance.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from sklearn.datasets import load_diabetes

# Load the diabetes data as pandas objects (as_frame=True) and unpack the
# (features, target) pair requested via return_X_y=True.
dataset = load_diabetes(as_frame=True, return_X_y=True)
X, y = dataset

## 2. Load and Inspect data 

In [None]:
# Report the dimensions of the feature table and of the target.
for label, data in (("Shape of X:", X), ("Shape of y:", y)):
    print(label, data.shape)

# Last expression of the cell, so the notebook renders the first rows of X.
X.head()

In [None]:
# Check data types and missing values.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
X.info()
# Per-column count of missing entries in the feature table.
print("Missing values:\n", X.isna().sum())

🔹 Other methods to consider:

- .nunique() to check unique values (categorical detection).
- .duplicated().sum() to detect duplicates.

## 3. Exploratory Data Analysis (EDA)

Why? To understand patterns, correlations, and distributions.

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Summary statistics (count, mean, std, quartiles) for every feature column.
summary_stats = X.describe()
display(summary_stats)

# Target distribution: 20-bin histogram with a KDE curve overlaid.
sns.histplot(y, bins=20, kde=True)
plt.title("Distribution of Target (Disease Progression)")
plt.show()

# Correlation heatmap of the pairwise feature correlations.
feature_corr = X.corr()
plt.figure(figsize=(10, 6))
sns.heatmap(feature_corr, annot=False, cmap="coolwarm")
plt.title("Feature Correlation Heatmap")
plt.show()

## 4. Data Preparation

1. Scaling is not strictly required for linear regression, but it helps with coefficient interpretation and with regularization.
2. Always split data into train/test sets.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

# Learn the scaling parameters from the training rows only, then apply the
# same fitted scaler to the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## 5. Train Model

In [None]:
from sklearn.linear_model import LinearRegression

# Fit the linear model on the standardized training features.
linreg = LinearRegression().fit(X_train_scaled, y_train)

# Predictions for the held-out rows, which were scaled with the scaler
# fitted on the training set.
y_pred = linreg.predict(X_test_scaled)

## 6. Gradient Descent (Conceptual Demo)

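Scikit-learn's `LinearRegression` solves ordinary least squares analytically with a direct least-squares solver, which gives the same solution as the Normal Equation without any iterative updates. Let’s demo gradient descent with 1 feature to illustrate the iterative alternative.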

In [None]:
import numpy as np

# Use BMI only for demo: take column 2 of the scaled training matrix with a
# slice so the result stays 2-D, shape (n_samples, 1).
X_simple = X_train_scaled[:, 2:3]
# Target as a column vector so it lines up with X_simple.dot(theta).
y_simple = y_train.values[:, np.newaxis]

def gradient_descent(X, y, lr=0.1, epochs=100):
    """Fit linear weights by batch gradient descent on the mean squared error.

    The model is ``X @ theta`` with no separate intercept term; to fit a bias,
    pass a column of ones in ``X`` or center ``y`` beforehand.

    Args:
        X: 2-D array-like of shape (m, n) holding m samples and n features.
        y: Array-like with m target values. Both a flat vector of length m and
            a column of shape (m, 1) are accepted; it is reshaped to (m, 1) so
            the residual never broadcasts to an (m, m) matrix.
        lr: Learning rate applied to each gradient step.
        epochs: Number of full-batch updates to perform.

    Returns:
        A tuple ``(theta, losses)`` where ``theta`` is an (n, 1) array of
        weights and ``losses`` is a list with the MSE measured after each
        update (one entry per epoch).

    Raises:
        ValueError: If ``y`` does not contain exactly one value per row of X.
    """
    X = np.asarray(X, dtype=float)
    # Force a column vector: a 1-D y minus the (m, 1) prediction would
    # otherwise broadcast instead of subtracting element-wise.
    y = np.asarray(y, dtype=float).reshape(-1, 1)

    m, n = X.shape
    if y.shape[0] != m:
        raise ValueError(
            f"y has {y.shape[0]} values but X has {m} rows"
        )

    theta = np.zeros((n, 1))
    losses = []
    for _ in range(epochs):
        # Gradient of mean((y - X theta)^2) with respect to theta.
        gradients = -(2 / m) * X.T.dot(y - X.dot(theta))
        theta -= lr * gradients
        # Loss is recorded after the step, so losses[0] already reflects
        # one update.
        losses.append(np.mean((y - X.dot(theta)) ** 2))
    return theta, losses

# gradient_descent fits y ≈ X·theta with no intercept term. X_simple is a
# standardized column (zero mean on the training rows), but the target is not
# zero-mean, so fitting it directly leaves the loss stuck near mean(y**2).
# Centering the target is equivalent to fitting an intercept equal to
# y_simple.mean(); theta is then the BMI slope and the curve is the real
# regression MSE.
y_centered = y_simple - y_simple.mean()
theta, losses = gradient_descent(X_simple, y_centered, lr=0.1, epochs=100)

# Loss recorded after each epoch; it should decrease and flatten out.
plt.plot(losses)
plt.title("Gradient Descent Convergence (MSE)")
plt.xlabel("Epoch")
plt.ylabel("MSE")
plt.show()

## 7. Evaluate Model (Regression Metrics)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Compute each test-set metric once, then report it.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # same units as the target
r2 = r2_score(y_test, y_pred)

print("MSE:", mse)
print("RMSE:", rmse)
print("R²:", r2)

In [None]:
# Get coefficients, p-values, log-likelihood using statsmodels:
import statsmodels.api as sm

# statsmodels does not add an intercept on its own, so a constant column is
# added to the scaled training features explicitly.
X_train_sm = sm.add_constant(X_train_scaled)

# Build the OLS problem, fit it, and print the full summary table.
ols_spec = sm.OLS(y_train, X_train_sm)
model_sm = ols_spec.fit()
ols_summary = model_sm.summary()
print(ols_summary)

## 8. Cross-Validation

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the linear model on the full (unscaled) feature
# frame, scored with R² on each fold.
cv_scores = cross_val_score(linreg, X, y, scoring="r2", cv=5)

# Report the average R² across the folds.
mean_cv_r2 = cv_scores.mean()
print("Cross-validated R²:", mean_cv_r2)

## 9. Regularization & Hyperparameter Tuning

In [None]:
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import GridSearchCV

# Candidate regularization strengths shared by both searches.
param_grid = {"alpha": [0.01, 0.1, 1, 10, 100]}

# Ridge: 5-fold grid search over alpha on the scaled training data,
# keeping the alpha with the best mean R².
ridge = Ridge()
ridge_grid = GridSearchCV(ridge, param_grid, cv=5, scoring="r2").fit(
    X_train_scaled, y_train
)
print("Best Ridge alpha:", ridge_grid.best_params_)

# Lasso: same grid, same folds, same scoring.
lasso = Lasso()
lasso_grid = GridSearchCV(lasso, param_grid, cv=5, scoring="r2").fit(
    X_train_scaled, y_train
)
print("Best Lasso alpha:", lasso_grid.best_params_)


## 10. (Optional) Classification-style Metrics

Convert regression output to classification (high vs low progression).

In [None]:
from sklearn.metrics import confusion_matrix, roc_curve, auc, precision_score, recall_score, f1_score, accuracy_score

# Binarize target: 1 when progression is above the median, otherwise 0.
median_val = np.median(y)
y_class = (y > median_val).astype(int)

# 80/20 split of the raw features against the binary labels.
split_c = train_test_split(X, y_class, test_size=0.2, random_state=42)
X_train_c, X_test_c, y_train_c, y_test_c = split_c

# Fit a linear regression on the 0/1 labels. Its outputs are continuous
# scores (not bounded to [0, 1]) that are thresholded at 0.5 below.
linreg_c = LinearRegression().fit(X_train_c, y_train_c)
y_pred_prob = linreg_c.predict(X_test_c)
y_pred_class = (y_pred_prob >= 0.5).astype(int)

# Confusion matrix of true labels versus thresholded predictions.
cm = confusion_matrix(y_test_c, y_pred_class)
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
plt.title("Confusion Matrix")
plt.show()

# Threshold-dependent metrics, printed in a fixed order.
for label, metric in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall (Sensitivity):", recall_score),
    ("F1:", f1_score),
):
    print(label, metric(y_test_c, y_pred_class))

# ROC curve built from the continuous scores, with the chance diagonal.
fpr, tpr, thresholds = roc_curve(y_test_c, y_pred_prob)
roc_auc = auc(fpr, tpr)
plt.plot(fpr, tpr, label=f"AUC={roc_auc:.2f}")
plt.plot([0,1],[0,1],'k--')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend()
plt.show()

## 11. Bias-Variance Tradeoff

In [None]:
# Compare train and test MSE of polynomial fits on the single BMI feature.
#
# np.vander(x, N=d + 1, increasing=True) yields the columns x^0 ... x^d. The
# x^0 column is dropped because LinearRegression fits its own intercept, so
# the model plotted at "degree d" contains exactly the powers 1..d. (Using
# N=d would stop at x^(d-1) and make "degree 1" a constant-only model.)
degrees = range(1, 11)
x_train_bmi = X_simple.flatten()
x_test_bmi = X_test_scaled[:, 2]

train_errors, test_errors = [], []
for d in degrees:
    poly = np.vander(x_train_bmi, N=d + 1, increasing=True)[:, 1:]
    model = LinearRegression().fit(poly, y_simple)
    train_errors.append(mean_squared_error(y_simple, model.predict(poly)))

    # Expand the test BMI values with the same powers before scoring.
    test_poly = np.vander(x_test_bmi, N=d + 1, increasing=True)[:, 1:]
    test_errors.append(mean_squared_error(y_test, model.predict(test_poly)))

plt.plot(degrees, train_errors, label="Train Error")
plt.plot(degrees, test_errors, label="Test Error")
plt.xlabel("Polynomial Degree")
plt.ylabel("MSE")
plt.title("Bias-Variance Tradeoff")
plt.legend()
plt.show()


## 12. Interpret Coefficients

In [None]:
import pandas as pd

# Pair each feature name with its fitted coefficient. linreg was trained on
# standardized features, so coefficients are per standard deviation.
coef_table = {"Feature": X.columns, "Coefficient": linreg.coef_}
coef_df = pd.DataFrame(coef_table)

# Largest (most positive) coefficient first.
coef_df = coef_df.sort_values(by="Coefficient", ascending=False)

print(coef_df)

## 13. Save Model

In [None]:
import joblib

# Persist the fitted regression model.
joblib.dump(linreg, "linear_model.pkl")
print("Model saved as linear_model.pkl")

# linreg was trained on StandardScaler output, so the fitted scaler is saved
# alongside it; new raw inputs must go through scaler.transform() before
# linreg.predict().
joblib.dump(scaler, "scaler.pkl")
print("Scaler saved as scaler.pkl")


# NOTE:

In the code I shared for the Diabetes dataset, here’s what happened:

- The basic Linear Regression model in sklearn (LinearRegression) does not require explicit standardization, because it estimates coefficients using Ordinary Least Squares (OLS). The scale of features doesn’t affect predictions, but it does affect coefficient interpretation (each coefficient’s magnitude depends on its feature’s units, so unscaled coefficients cannot be compared directly).

- When I showed Ridge regression / Lasso for regularization & hyperparameter tuning, I did standardize the data using StandardScaler(). This is necessary because regularization penalizes coefficient size, and without standardization the penalty depends on each feature’s units: features measured on small numeric scales need large coefficients and are penalized disproportionately.

- For the gradient descent demonstration, I used the BMI column of the standardized training data (X_train_scaled) — the same scaled data the main regression was fitted on — because gradient descent converges more reliably on standardized features. That part was just illustrative.

# NOTE:

👉 Best Practice (checklist correction):

Always standardize features when:

- Using algorithms that are scale-sensitive (regularization, gradient descent, SVM, kNN, PCA, etc.)

- You need interpretable coefficients (so you can compare their relative magnitudes).

For plain OLS (unregularized LinearRegression), scaling isn’t strictly required, but it’s often done anyway for consistency.