In [1]:
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [3]:
# Synthetic dataset
np.random.seed(1)
X = np.sort(5 * np.random.rand(100, 1), axis=0)
y = np.sin(X) + 0.4 * np.random.randn(100, 1)

# Model
degree = 5
model = make_pipeline(PolynomialFeatures(degree), LinearRegression())

# ---- Single Train/Test Split ----
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

split_mse = mean_squared_error(y_test, y_pred)

print("Single Split MSE:", split_mse)

# ---- K-Fold Cross Validation ----
kf = KFold(n_splits=5, shuffle=True, random_state=42)

cv_scores = cross_val_score(
    model, X, y, scoring="neg_mean_squared_error", cv=kf
)

cv_mse_scores = -cv_scores

print("\nCross-Validation MSE per fold:")
for i, score in enumerate(cv_mse_scores):
    print(f"Fold {i+1}: {score:.3f}")

print("\nMean CV MSE:", cv_mse_scores.mean())
print("Std CV MSE:", cv_mse_scores.std())

Single Split MSE: 0.1288889683584376

Cross-Validation MSE per fold:
Fold 1: 0.129
Fold 2: 0.067
Fold 3: 0.153
Fold 4: 0.323
Fold 5: 0.124

Mean CV MSE: 0.15912148668733417
Std CV MSE: 0.08681737353548076


**Single split MSE gives a nice value but Mean CV MSE value gives a more robust estimate which is more trustworthy and shows that the initial single split was optimistic**

In [12]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# WRONG way (leakage)
scaler = StandardScaler()
X_scaled_wrong = scaler.fit_transform(X)  # before split!

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled_wrong, y, test_size=0.2, random_state=42
)

model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("MSE with DATA LEAKAGE:",
      mean_squared_error(y_test, y_pred))


# CORRECT way
pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("model", LinearRegression())
])

scores = cross_val_score(
    pipeline, X, y, scoring="neg_mean_squared_error", cv=5
)

print("MSE without leakage:", (-scores).mean())


MSE with DATA LEAKAGE: 0.3199802379223909
MSE without leakage: 0.7113968577495271


**Data leakage occurs when preprocessing steps are fit on the full dataset before splitting, allowing test data information to influence training. This leads to overly optimistic performance estimates. The correct approach is to encapsulate preprocessing within a pipeline so transformations are learned only from training folds**