In [None]:
import pandas as pd
import numpy as np

# Load the raw sample dataset
df = pd.read_csv('../data/nyc_taxi_sample.csv')

# Preview the first rows
print(df.head())

# Drop every row that contains at least one missing value
df.dropna(inplace=True)

# Keep only trips strictly shorter than 2 hours (trip_duration < 7200 seconds).
# When the column is absent the filter is skipped and a note is printed.
if 'trip_duration' in df.columns:
    # .copy() makes df an independent frame: without it, the column
    # assignments performed later on this filtered selection raise
    # SettingWithCopyWarning on pandas versions without copy-on-write.
    df = df[df['trip_duration'] < 7200].copy()
else:
    print("Nota: 'trip_duration' no encontrada — se omite el filtro de duración")

# Parse the trip timestamps into datetimes stored under the canonical names
# 'pickup_datetime' / 'dropoff_datetime'. Candidate source-column pairs are
# tried in order and the first pair fully present in df is used.
date_column_pairs = (
    ('pickup_datetime', 'dropoff_datetime'),
    ('tpep_pickup_datetime', 'tpep_dropoff_datetime'),
)
for pickup_src, dropoff_src in date_column_pairs:
    if pickup_src not in df.columns or dropoff_src not in df.columns:
        continue
    df['pickup_datetime'] = pd.to_datetime(df[pickup_src])
    df['dropoff_datetime'] = pd.to_datetime(df[dropoff_src])
    break
else:
    # No known pair of date columns exists, so the derived features cannot be built
    raise KeyError('No se encontraron columnas de fecha esperadas (pickup_datetime/dropoff_datetime)')

# Derived features: trip length in minutes and the pickup hour of day
trip_elapsed = df['dropoff_datetime'] - df['pickup_datetime']
df['duration_min'] = trip_elapsed.dt.total_seconds() / 60
df['hour'] = df['pickup_datetime'].dt.hour

# Write the cleaned dataset to disk (this copy is NOT anonymized)
clean_path = '../data/nyc_taxi_clean.csv'
df.to_csv(clean_path, index=False)
print(f"✅ Datos limpios guardados en {clean_path}")

# Additionally write an anonymized copy intended for sharing.
# This step is best-effort: any failure inside the try (import, anonymization
# or file write) is reported on stdout and execution continues.
try:
    from scripts.privacy import anonymize_dataframe

    available = df.columns

    # Identifier column: passed as None when df has no 'id' column
    id_col = None
    if 'id' in available:
        id_col = 'id'

    # Pickup coordinates: prefer the long column names, otherwise fall back to
    # the short ones. NOTE(review): the fallback names are not checked for
    # existence — confirm how anonymize_dataframe handles a missing column.
    lat_col = 'pickup_lat'
    if 'pickup_latitude' in available:
        lat_col = 'pickup_latitude'
    lon_col = 'pickup_lon'
    if 'pickup_longitude' in available:
        lon_col = 'pickup_longitude'

    # NOTE(review): the salt is hard-coded here; presumably it is used to hash
    # identifiers — verify against scripts.privacy.
    anonymize_options = {
        'id_col': id_col,
        'lat_col': lat_col,
        'lon_col': lon_col,
        'salt': 'project_salt',
        'round_decimals': 3,
        'add_noise': False,
        'in_place': False,
    }
    anon_df = anonymize_dataframe(df, **anonymize_options)

    anon_path = '../data/nyc_taxi_clean_anonymized.csv'
    anon_df.to_csv(anon_path, index=False)
    print(f"✅ Datos anónimos guardados en {anon_path}")
except Exception as err:
    print('No se pudo aplicar anonimización automáticamente:', err)


          id  vendor_id      pickup_datetime     dropoff_datetime  \
0  id2875421          2  2016-03-14 17:24:55  2016-03-14 17:32:30   
1  id2377394          1  2016-06-12 00:43:35  2016-06-12 00:54:38   
2  id3858529          2  2016-01-19 11:35:24  2016-01-19 12:10:48   
3  id3504673          2  2016-04-06 19:32:31  2016-04-06 19:39:40   
4  id2181028          2  2016-03-26 13:30:55  2016-03-26 13:38:10   

   passenger_count  pickup_longitude  pickup_latitude  dropoff_longitude  \
0                1        -73.982155        40.767937         -73.964630   
1                1        -73.980415        40.738564         -73.999481   
2                1        -73.979027        40.763939         -74.005333   
3                1        -74.010040        40.719971         -74.012268   
4                1        -73.973053        40.793209         -73.972923   

   dropoff_latitude store_and_fwd_flag  trip_duration  
0         40.765602                  N            455  
1         40.731