# Demostración: Código Seguro contra Data Leakage
Este cuaderno replica los pasos del script para destacar la regla de oro: **Divide antes, transforma después**.

In [18]:
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
DATA_PATH = Path('./data/bike_sharing_demand.csv')
df = pd.read_csv(DATA_PATH, parse_dates=['timestamp'], dayfirst=True)
df.head()

Unnamed: 0,id,timestamp,season,holiday,workingday,weather,temp,temp_feel,humidity,windspeed,demand
0,1,01-01-2017 00:00,spring,No,No,Clear or partly cloudy,9.84,14.395,81.0,0.0,2.7726
1,2,01-01-2017 01:00,spring,No,No,Clear or partly cloudy,9.02,13.635,80.0,0.0,3.6889
2,3,01-01-2017 02:00,spring,No,No,Clear or partly cloudy,9.02,13.635,80.0,0.0,3.4657
3,4,01-01-2017 03:00,spring,No,No,Clear or partly cloudy,9.84,14.395,75.0,0.0,2.5649
4,5,01-01-2017 04:00,spring,No,No,Clear or partly cloudy,9.84,14.395,75.0,0.0,0.0


In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8708 entries, 0 to 8707
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   id          8708 non-null   int64  
 1   timestamp   8708 non-null   object 
 2   season      8708 non-null   object 
 3   holiday     8708 non-null   object 
 4   workingday  8708 non-null   object 
 5   weather     8708 non-null   object 
 6   temp        7506 non-null   float64
 7   temp_feel   8606 non-null   float64
 8   humidity    8669 non-null   float64
 9   windspeed   8508 non-null   float64
 10  demand      8708 non-null   float64
dtypes: float64(5), int64(1), object(5)
memory usage: 748.5+ KB


In [20]:
# Manejar valores faltantes
df = df.dropna(subset=['temp', 'humidity', 'windspeed', 'demand'])

In [None]:
# Convertir timestamp a datetime si no lo es
df['timestamp'] = pd.to_datetime(df['timestamp'], dayfirst=True, errors='coerce')


Unnamed: 0,temp,humidity,windspeed,hour,is_weekend
0,9.84,81.0,0.0,0.0,True
1,9.02,80.0,0.0,1.0,True
2,9.02,80.0,0.0,2.0,True
3,9.84,75.0,0.0,3.0,True
4,9.84,75.0,0.0,4.0,True


In [25]:
# Perfilamiento de todas las columnas
df_profile =  df.describe(include='all')
df_profile

Unnamed: 0,id,timestamp,season,holiday,workingday,weather,temp,temp_feel,humidity,windspeed,demand,hour,is_weekend
count,7275.0,7266,7275,7275,7275,7275,7275.0,7185.0,7275.0,7275.0,7275.0,7266.0,7275
unique,,,4,2,2,4,,,,,,,2
top,,,summer,No,Yes,Clear or partly cloudy,,,,,,,False
freq,,,2270,7078,4993,4914,,,,,,,5175
mean,4360.234639,2017-10-15 07:46:54.698596096,,,,,20.11181,23.559089,60.925361,13.022549,4.450148,11.537435,
min,1.0,2017-01-01 00:00:00,,,,,0.82,0.76,0.0,0.0,0.0,0.0,
25%,2177.5,2017-05-17 13:15:00,,,,,13.94,15.91,46.0,7.0015,3.6376,6.0,
50%,4365.0,2017-10-13 18:30:00,,,,,20.5,24.24,60.0,12.998,4.8675,12.0,
75%,6541.5,2018-03-09 18:45:00,,,,,26.24,31.06,77.0,19.0012,5.5568,18.0,
max,8708.0,2018-08-05 04:00:00,,,,,41.0,45.455,100.0,56.9969,6.7923,23.0,


In [27]:
# Hay valores faltantes en la columna 'timestamp'
missing_timestamps = df['timestamp'].isnull().sum()
print(f"Número de valores faltantes en 'timestamp': {missing_timestamps}")
df = df.dropna(subset=['timestamp'])

Número de valores faltantes en 'timestamp': 9


In [29]:
df['hour'] = df['timestamp'].dt.hour
df['is_weekend'] = df['timestamp'].dt.dayofweek >= 5
features = ['temp', 'humidity', 'windspeed', 'hour', 'is_weekend']
target = 'demand'
X = df[features]
y = df[target]
X.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['hour'] = df['timestamp'].dt.hour
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['is_weekend'] = df['timestamp'].dt.dayofweek >= 5


Unnamed: 0,temp,humidity,windspeed,hour,is_weekend
0,9.84,81.0,0.0,0,True
1,9.02,80.0,0.0,1,True
2,9.02,80.0,0.0,2,True
3,9.84,75.0,0.0,3,True
4,9.84,75.0,0.0,4,True


In [30]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42
)
len(X_train), len(X_test)

(5812, 1454)

In [31]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
"""
El ajuste del scaler se hace exclusivamente con X_train.
Esto evita filtrar estadísticas del set de prueba y previene fugas de información.
"""
X_train_scaled[:2]

array([[-1.18494803, -0.2534722 , -0.72543711,  1.0868001 , -0.64030863],
       [-1.38913493, -1.01635647, -0.00807384,  0.94180694,  1.56174686]])