# Demostración: Código Seguro contra Data Leakage
Este cuaderno replica los pasos del script para destacar la regla de oro: **Divide antes, transforma después**.

In [3]:
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Location of the raw dataset, relative to the notebook's working directory.
DATA_PATH = Path('./data/bike_sharing_demand.csv')

# Read the CSV, parsing the timestamp column into datetime values so the
# `.dt` accessor can be used on it in later cells.
df = pd.read_csv(
    DATA_PATH,
    parse_dates=['datetime'],
)

# Preview the first rows as the cell output.
df.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,0.81,0.0,3,13,16
1,2011-01-01 03:00:00,1,0,0,1,9.84,13.635,0.8,0.0,1,8,9
2,2011-01-01 06:00:00,1,0,0,1,9.02,10.69,0.65,6.0032,1,5,6
3,2011-01-01 09:00:00,1,0,1,1,9.02,12.365,0.51,11.0014,8,32,40
4,2011-01-01 12:00:00,1,0,1,1,13.12,14.395,0.52,16.9979,12,40,52


In [4]:
# Derive calendar features from the parsed timestamp column.
df['hour'] = df['datetime'].dt.hour
# dayofweek is 0 for Monday through 6 for Sunday, so values >= 5 are Sat/Sun.
df['is_weekend'] = df['datetime'].dt.dayofweek.ge(5)

# Name of the column to predict and the predictor columns fed to the model.
target = 'count'
features = [
    'temp',
    'humidity',
    'windspeed',
    'hour',
    'is_weekend',
]

# Separate the target vector from the feature matrix.
y = df[target]
X = df[features]

# Preview the feature matrix as the cell output.
X.head()

Unnamed: 0,temp,humidity,windspeed,hour,is_weekend
0,9.84,0.81,0.0,0,True
1,9.84,0.8,0.0,3,True
2,9.02,0.65,6.0032,6,True
3,9.02,0.51,11.0014,9,True
4,13.12,0.52,16.9979,12,True


In [5]:
# Split BEFORE any fitted transformation: hold out 20% of the rows as the test
# set, with a fixed seed so the partition is reproducible across runs.
# NOTE(review): the rows are timestamped and train_test_split shuffles by
# default, so train and test interleave in time - confirm that is acceptable
# for this demonstration.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Row counts of each partition, shown as the cell output.
X_train.shape[0], X_test.shape[0]

(8, 2)

In [6]:
# Fit the scaler on the training split only, then reuse those fitted
# statistics to transform the test split. Fitting exclusively on X_train keeps
# test-set statistics out of the preprocessing step and prevents data leakage.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Show the first two scaled training rows as the cell output.
X_train_scaled[:2]

array([[ 1.33333333, -1.32528457,  0.84845231,  0.65465367,  0.        ],
       [-1.        ,  1.82674359, -1.98011689, -1.52752523,  0.        ]])