# 02. Preprocesamiento y Feature Engineering - AeroSafe Risk Predictor

**Objetivo:** Limpiar el dataset de Bird Strikes, manejar valores nulos, codificar variables categóricas y preparar el dataframe final para el entrenamiento de modelos.

In [1]:
import pandas as pd
import numpy as np
import os

# Load the raw data.
# The CSV location can be overridden with the AEROSAFE_RAW_DATA_PATH environment
# variable so the notebook is not tied to one machine; when the variable is unset
# (or empty) the original absolute path is used unchanged.
data_path = os.environ.get("AEROSAFE_RAW_DATA_PATH") or (
    r"D:\UTP\Empresa Aeronáutica\aerosafe-risk-predictor\data\raw\Bird_strikes.csv"
)
df = pd.read_csv(data_path, encoding="latin1")

# Columns kept for the risk model.
cols_to_keep = [
    'AircraftType', 'AirportName', 'AltitudeBin', 'MakeModel',
    'WildlifeSize', 'ConditionsSky', 'ConditionsPrecipitation',
    'Damage'  # target variable
]

# Work on an explicit copy so the raw frame `df` stays untouched.
df_clean = df[cols_to_keep].copy()

# Initial null handling: report the per-column null counts before filling.
print("Nulos por columna antes de limpiar:")
print(df_clean.isna().sum())

# Every missing value, in every kept column, becomes the literal category 'Unknown'.
# NOTE(review): pandas' default NA markers include the literal string "None". If the
# raw file uses "None" to mean "no precipitation", read_csv turns those cells into
# NaN and this line relabels them as 'Unknown' -- confirm against the raw CSV.
df_clean = df_clean.fillna('Unknown')

print("\nForma del dataset limpio:", df_clean.shape)
df_clean.head()

Nulos por columna antes de limpiar:
AircraftType                   0
AirportName                    0
AltitudeBin                    0
MakeModel                      0
WildlifeSize                   0
ConditionsSky                  0
ConditionsPrecipitation    23414
Damage                         0
dtype: int64

Forma del dataset limpio: (25429, 8)


Unnamed: 0,AircraftType,AirportName,AltitudeBin,MakeModel,WildlifeSize,ConditionsSky,ConditionsPrecipitation,Damage
0,Airplane,LAGUARDIA NY,"(1000, 2000]",B-737-400,Medium,No Cloud,Unknown,Caused damage
1,Airplane,DALLAS/FORT WORTH INTL ARPT,"(-1, 0]",MD-80,Small,Some Cloud,Unknown,Caused damage
2,Airplane,LAKEFRONT AIRPORT,"(30, 50]",C-500,Small,No Cloud,Unknown,No damage
3,Airplane,SEATTLE-TACOMA INTL,"(30, 50]",B-737-400,Small,Some Cloud,Unknown,No damage
4,Airplane,NORFOLK INTL,"(30, 50]",CL-RJ100/200,Small,No Cloud,Unknown,No damage


In [2]:
# Binary target: 1 where 'Damage' equals "Caused damage", 0 for any other label.
# `assign` returns a new frame, so `df_clean` itself is not modified.
df_model = df_clean.assign(
    target_damage=df_clean["Damage"].eq("Caused damage").astype(int)
)

# Class counts of the new target, then a side-by-side look at label vs. encoding.
print(df_model["target_damage"].value_counts())
df_model.loc[:, ["Damage", "target_damage"]].head()

target_damage
0    22975
1     2454
Name: count, dtype: int64


Unnamed: 0,Damage,target_damage
0,Caused damage,1
1,Caused damage,1
2,No damage,0
3,No damage,0
4,No damage,0


In [3]:
# Target vector first, then the predictor matrix: every column except the
# original 'Damage' label and its binary encoding.
y = df_model["target_damage"]
X = df_model.drop(["Damage", "target_damage"], axis="columns")

X.head()

Unnamed: 0,AircraftType,AirportName,AltitudeBin,MakeModel,WildlifeSize,ConditionsSky,ConditionsPrecipitation
0,Airplane,LAGUARDIA NY,"(1000, 2000]",B-737-400,Medium,No Cloud,Unknown
1,Airplane,DALLAS/FORT WORTH INTL ARPT,"(-1, 0]",MD-80,Small,Some Cloud,Unknown
2,Airplane,LAKEFRONT AIRPORT,"(30, 50]",C-500,Small,No Cloud,Unknown
3,Airplane,SEATTLE-TACOMA INTL,"(30, 50]",B-737-400,Small,Some Cloud,Unknown
4,Airplane,NORFOLK INTL,"(30, 50]",CL-RJ100/200,Small,No Cloud,Unknown


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Hold out 20% of the rows for testing; the split is stratified on the target
# and uses a fixed seed so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# All predictor columns are treated as categorical.
cat_cols = list(X.columns)

# One-hot encode those columns; with handle_unknown="ignore", categories that were
# not seen during fit do not raise an error at transform time.
preprocessor = ColumnTransformer(
    [("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)]
)

# Fit on the training split only, then apply the fitted encoder to both splits.
X_train_enc = preprocessor.fit_transform(X_train)
X_test_enc = preprocessor.transform(X_test)

(X_train_enc.shape, X_test_enc.shape)

((20343, 1360), (5086, 1360))

In [6]:
import joblib
import os

# Output directory for the processed artefacts.
# It can be overridden with the AEROSAFE_PROCESSED_DIR environment variable so the
# notebook is not tied to one machine; when the variable is unset (or empty) the
# original absolute path is used unchanged.
processed_dir = os.environ.get("AEROSAFE_PROCESSED_DIR") or (
    r"D:\UTP\Empresa Aeronáutica\aerosafe-risk-predictor\data\processed"
)
os.makedirs(processed_dir, exist_ok=True)

# Persist the fitted preprocessor so the same encoding can be applied to new data.
joblib.dump(preprocessor, os.path.join(processed_dir, "preprocessor_ohe.pkl"))

# Persist the encoded train/test matrices and their targets as one tuple, in the
# order (X_train_enc, X_test_enc, y_train, y_test).
joblib.dump((X_train_enc, X_test_enc, y_train, y_test),
            os.path.join(processed_dir, "model_data.pkl"))

# The message text is fixed; the actual location is `processed_dir`.
print("¡Éxito! Datos guardados en data/processed/")

¡Éxito! Datos guardados en data/processed/
