<h3>
    Data Leakage
</h3>
<div style="width: 80%">
Data leakage occurs when information from outside the training dataset is used to create the model. This can lead to overly optimistic performance estimates and, ultimately, poor generalization to new data. One common mistake is fitting feature engineering or preprocessing steps, like polynomial features, on the full dataset before splitting it into training and testing sets; such steps should be fitted on the training set only and then applied to the test set.
</div>

In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Fetch the scikit-learn diabetes dataset and wrap it in a DataFrame:
# one column per feature, plus a "target" column for the regression target.
diabetes = load_diabetes()
df = pd.DataFrame(data=diabetes.data, columns=diabetes.feature_names)
df["target"] = diabetes.target

# Single-feature setup: predict the target from 'bmi' alone.
# Double brackets keep X two-dimensional (a DataFrame, not a Series).
y = df["target"]
X = df[["bmi"]]

# Hold out 20% of the rows for testing; the seed is fixed so the split
# is reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Degree-2 polynomial expansion. The transformer is fitted on the training
# split only and then applied, unchanged, to both splits.
poly = PolynomialFeatures(degree=2).fit(X_train)
X_train_poly = poly.transform(X_train)
X_test_poly = poly.transform(X_test)

# Fit a linear regression on the expanded training features
model = LinearRegression().fit(X_train_poly, y_train)

# Predict on both splits so training and held-out error can be compared
y_pred_train = model.predict(X_train_poly)
y_pred_test = model.predict(X_test_poly)

# Root mean squared error for each split
train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))

print("Train RMSE:", train_rmse)
print("Test RMSE:", test_rmse)