# **Machine Learning - Caso de estudio - Predicción de Retrasos de Vuelos en la Industria Aérea.**

# Librerías

## Librerías generales

In [1]:
import pandas as pd
import numpy as numpy
import time
import joblib
import os

## Librerías de Machine Learning

In [5]:
# import sys
# print(sys.executable)


In [4]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_percentage_error
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

# Cargar datos limpios

In [10]:
# Location of the cleaned flights CSV. The default resolves to
# <parent of the working directory>/data/processed/flights_clean.csv rather than
# a machine-specific absolute path; set FLIGHTS_CLEAN_CSV to read it from elsewhere.
# NOTE(review): assumes the notebook runs from a folder directly under the project root - confirm.
ruta = os.environ.get(
    "FLIGHTS_CLEAN_CSV",
    os.path.abspath(os.path.join(os.getcwd(), "..", "data", "processed", "flights_clean.csv")),
)

In [11]:
# Read the cleaned flights CSV located at `ruta` into a DataFrame.
vuelos_limpios = pd.read_csv(ruta)

In [12]:
# Print the DataFrame summary: index range, column names and dtypes.
vuelos_limpios.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5231130 entries, 0 to 5231129
Data columns (total 41 columns):
 #   Column               Dtype  
---  ------               -----  
 0   MONTH                int64  
 1   DAY                  int64  
 2   DAY_OF_WEEK          int64  
 3   AIRLINE              object 
 4   ORIGIN_AIRPORT       object 
 5   DESTINATION_AIRPORT  object 
 6   SCHEDULED_DEPARTURE  int64  
 7   DEPARTURE_TIME       float64
 8   DEPARTURE_DELAY      float64
 9   SCHEDULED_TIME       float64
 10  DISTANCE             int64  
 11  SCHEDULED_ARRIVAL    int64  
 12  ARRIVAL_TIME         float64
 13  ARRIVAL_DELAY        float64
 14  AIRLINE_NAME         object 
 15  ORIGEN_AEROPUERTO    object 
 16  ORIGEN_CIUDAD        object 
 17  ORIGEN_ESTADO        object 
 18  ORIGEN_LAT           float64
 19  ORIGEN_LON           float64
 20  DEST_AEROPUERTO      object 
 21  DEST_CIUDAD          object 
 22  DEST_ESTADO          object 
 23  DEST_LAT             float64
 24

### **Revisar porcentaje de vuelos retrasados vs. no retrasados**

In [16]:
# plotly.express provides `px`, which this cell needs to build the chart.
import plotly.express as px

# Count flights per value of the RETRASADO_LLEGADA flag and relabel 0/1 for display.
conteo = vuelos_limpios["RETRASADO_LLEGADA"].value_counts().rename({0: "A tiempo", 1: "Retrasado"})
# Share of each class in percent, rounded to two decimals.
porcentaje = (conteo / conteo.sum() * 100).round(2)

# Tidy frame consumed by the chart: one row per class.
df_plot = pd.DataFrame({
    "Estado": conteo.index,
    "Cantidad": conteo.values,
    "Porcentaje": porcentaje.values
})

# Donut chart (pie with a hole) sized by the number of flights in each class.
fig = px.pie(
    df_plot,
    values="Cantidad",
    names="Estado",
    color="Estado",
    color_discrete_map={"A tiempo": "green", "Retrasado": "red"},
    title="Distribución de Vuelos según Retraso (>15 min)",
    hole=0.3
)

# Show the class label and its percentage on every slice.
fig.update_traces(textinfo="label+percent", textfont_size=14)
fig.show()


# Random Forest

## Preparar los datos para el modelo

In [None]:
# Re-print the column names and dtypes before choosing the model features.
vuelos_limpios.info()

In [None]:
# Feature columns must exist in the flights schema printed by vuelos_limpios.info().
# State-level origin/destination columns are used rather than airport codes because
# the one-hot encoder below returns a dense matrix for ~5.2M rows.
# NOTE(review): this feature selection is a proposal - confirm with the team.
col_categoricas = ["AIRLINE", "ORIGEN_ESTADO", "DEST_ESTADO"]

# Only values scheduled before departure; DEPARTURE_TIME, DEPARTURE_DELAY and
# ARRIVAL_TIME are left out so the outcome does not leak into the features.
col_numericas = [
    "MONTH",
    "DAY",
    "DAY_OF_WEEK",
    "SCHEDULED_DEPARTURE",
    "SCHEDULED_TIME",
    "DISTANCE",
    "SCHEDULED_ARRIVAL",
]

# Regression target: the arrival delay (presumably in minutes - confirm).
target = "ARRIVAL_DELAY"

In [None]:
# Split the frame into categorical predictors, numeric predictors and the target.
X_categoricas = vuelos_limpios.loc[:, col_categoricas]
X_numericas = vuelos_limpios.loc[:, col_numericas]
y = vuelos_limpios.loc[:, target]

In [None]:
# Preview the first rows of the categorical predictors.
X_categoricas.head()

In [None]:
# Preview the first rows of the numeric predictors.
X_numericas.head()

In [None]:
# Display the target column.
y

## Aplicación de One-Hot Encoding

In [None]:
# One-hot encoder that returns a dense array and ignores categories
# it did not see while fitting instead of raising.
encoder = OneHotEncoder(
    sparse_output=False,
    handle_unknown="ignore",
)
encoder

In [None]:
# Learn the categories of each categorical column and encode them in one step.
X_categoricas_encoded = encoder.fit_transform(X_categoricas)

In [None]:
# Display the encoded array returned by the encoder.
X_categoricas_encoded

In [None]:
# Names of the generated one-hot columns, derived from the original column names.
nuevas_columnas = encoder.get_feature_names_out(col_categoricas)
nuevas_columnas

In [None]:
# Wrap the encoded array in a DataFrame labelled with the one-hot column names.
# The variable name is kept because later cells reference it.
games_encoded = pd.DataFrame(data=X_categoricas_encoded, columns=nuevas_columnas)

In [None]:
# Report the size of the encoded frame and preview its first rows.
print(f"Número de filas x columnas: {games_encoded.shape}")
display(games_encoded.head())

In [None]:
# Preview the numeric predictors again before joining them with the encoded columns.
X_numericas.head()

In [None]:
# Final feature matrix: numeric predictors (index reset so rows line up by
# position with the encoded frame) placed side by side with the one-hot columns.
X = pd.concat(
    [X_numericas.reset_index(drop=True), games_encoded],
    axis="columns",
)
X.head()

In [None]:
# Preview the first values of the target.
y.head()

## Dividir los datos

In [None]:
# Number of rows available before splitting into train and test sets.
len(vuelos_limpios)

In [None]:
# Seed reused by the split and the models, and the fraction of rows held out for testing.
RANDOM_STATE, TEST_SIZE = 50, 0.25

In [None]:
# Hold out TEST_SIZE of the rows for evaluation; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=RANDOM_STATE,
    test_size=TEST_SIZE,
)

In [None]:
# Report the shape of every piece of the split, one per line.
print(
    f"Tamaño X_train: {X_train.shape}",
    f"Tamaño X_test: {X_test.shape}",
    f"Tamaño y_train: {y_train.shape}",
    f"Tamaño y_test: {y_test.shape}",
    sep="\n",
)

In [None]:
# Preview the first rows of the training features.
X_train.head()

## Entrenar el modelo

In [None]:
# Random forest configuration: 100 trees, all CPU cores, seeded for repeatability,
# and out-of-bag scoring enabled so `oob_score_` is available after fitting.
modelo = RandomForestRegressor(
    n_estimators=100,
    oob_score=True,
    n_jobs=-1,
    random_state=RANDOM_STATE,
)

In [None]:
# fit trains the model on the training split
modelo.fit(X_train, y_train)

In [None]:
# Report the out-of-bag score stored on the fitted model (requested with oob_score=True).
print(f"OOB Score (R2 estimado): {modelo.oob_score_}")

## Evaluar el modelo

### RMSE

In [None]:
# predict produces the model's predictions for the test features
predicciones = modelo.predict(X_test)

In [None]:
# Root mean squared error of the random forest on the test split.
rmse = root_mean_squared_error(y_test, predicciones)
rmse

In [None]:
# Side-by-side table of observed targets and random forest predictions,
# re-indexed from zero.
df_comparacion = pd.DataFrame(
    {"Datos_Reales": y_test, "Predicción": predicciones}
).reset_index(drop=True)

In [None]:
# Show the first 20 real-vs-predicted pairs.
df_comparacion.head(20)

1. Seleccionamos los datos numéricos y categóricos.
2. Los categóricos los pasamos a una matriz de 1 y 0 (OneHotEncoder).
3. Dividimos los datos: datos de entrenamiento (75%) y datos de prueba (25%).
4. Entreno mi modelo (cualquier modelo de ML) con los datos de entrenamiento: X (variables independientes) e y (variable dependiente).
5. Predigo sobre los datos de prueba usando solo X (variables independientes).
6. Saco métricas de qué tan bueno es mi modelo prediciendo los datos.
6.1 Comparamos los datos reales vs. la predicción.

### MAE

In [None]:
# Mean absolute error of the random forest on the test split.
mae_random_forest = mean_absolute_error(y_true=y_test, y_pred=predicciones)
print(f"MAE (Random Forest): {mae_random_forest:.6f}")

### R-Cuadrado - Coeficiente de determinación

In [None]:
# Coefficient of determination of the random forest on the test split.
r2_random_forest = r2_score(y_true=y_test, y_pred=predicciones)
print(f"R2 (Random Forest): {r2_random_forest:.6f}")

### MAPE - Error Porcentual Absoluto Medio

In [None]:
# Mean absolute percentage error of the random forest; stored as a fraction
# and multiplied by 100 only when printed.
mape_random_forest = mean_absolute_percentage_error(y_true=y_test, y_pred=predicciones)
print(f"MAPE (Random Forest): {mape_random_forest*100:.6f}")

#### Comparación métricas

In [None]:
# Collect the four random forest metrics into one labelled Series.
index_metricas = ["RMSE", "MAE", "R-cuadrado", "MAPE"]
data_rf = [rmse, mae_random_forest, r2_random_forest, mape_random_forest]
metricas_rf = pd.Series(data_rf, index=index_metricas, name="Random Forest")

print("Métricas Random Forest")
print(metricas_rf)

# LightGBM

In [None]:
# LightGBM regressor with library defaults, seeded and using all CPU cores.
modelo_lgbm = LGBMRegressor(random_state=RANDOM_STATE, n_jobs=-1)

In [None]:
# Display the configured estimator.
modelo_lgbm

In [None]:
# Train the LightGBM model on the training split.
modelo_lgbm.fit(X_train, y_train)

In [None]:
# Predict the target for the test features with the LightGBM model.
predicciones_lgbm = modelo_lgbm.predict(X_test)

In [None]:
# Root mean squared error of the LightGBM model on the test split.
rmse_lgbm = root_mean_squared_error(y_test, predicciones_lgbm)
rmse_lgbm

In [None]:
# Mean absolute error of the LightGBM model on the test split.
mae_lgbm = mean_absolute_error(y_test, predicciones_lgbm)
# The label names the model this metric actually belongs to.
print(f"MAE (LightGBM): {mae_lgbm:.6f}")

In [None]:
# Coefficient of determination of the LightGBM model on the test split.
r2_lgbm = r2_score(y_test, predicciones_lgbm)

# The label names the model this metric actually belongs to.
print(f"R2 (LightGBM): {r2_lgbm:.6f}")

In [None]:
# Mean absolute percentage error of the LightGBM model; stored as a fraction
# and multiplied by 100 only when printed.
mape_lgbm = mean_absolute_percentage_error(y_test, predicciones_lgbm)

# The label names the model this metric actually belongs to.
print(f"MAPE (LightGBM): {mape_lgbm*100:.6f}")

In [None]:
# Collect the four LightGBM metrics into one labelled Series.
index_metricas = ["RMSE", "MAE", "R-cuadrado", "MAPE"]
data_lgbm = [rmse_lgbm, mae_lgbm, r2_lgbm, mape_lgbm]
metricas_lgbm = pd.Series(data_lgbm, index=index_metricas, name="LGBM")

print("Métricas LGBM")
print(metricas_lgbm)

# XGBoost

In [None]:
# XGBoost regressor with library defaults, seeded and using all CPU cores.
modelo_xgb = XGBRegressor(random_state = RANDOM_STATE, n_jobs=-1)

In [None]:
# Train the XGBoost model on the training split.
modelo_xgb.fit(X_train, y_train)

In [None]:
# Predict the target for the test features with the XGBoost model.
predicciones_xgb = modelo_xgb.predict(X_test)

In [None]:
# Root mean squared error of the XGBoost model on the test split.
rmse_xgb = root_mean_squared_error(y_test, predicciones_xgb)
rmse_xgb

In [None]:
# Mean absolute error of the XGBoost model on the test split.
mae_xgb = mean_absolute_error(y_true=y_test, y_pred=predicciones_xgb)
print(f"MAE (xgb): {mae_xgb:.6f}")

In [None]:
# Coefficient of determination of the XGBoost model on the test split.
r2_xgb = r2_score(y_true=y_test, y_pred=predicciones_xgb)
print(f"R2 (xgb): {r2_xgb:.6f}")

In [None]:
# Mean absolute percentage error of the XGBoost model; stored as a fraction
# and multiplied by 100 only when printed.
mape_xgb = mean_absolute_percentage_error(y_true=y_test, y_pred=predicciones_xgb)
print(f"MAPE (xgb): {mape_xgb*100:.6f}")

In [None]:
# Collect the four XGBoost metrics into one labelled Series.
index_metricas = ["RMSE", "MAE", "R-cuadrado", "MAPE"]
data_xgb = [rmse_xgb, mae_xgb, r2_xgb, mape_xgb]
metricas_xgb = pd.Series(data_xgb, index=index_metricas, name="XGB")

print("Métricas XGB")
print(metricas_xgb)

# Comparación de modelos

In [None]:
# One column per model, one row per metric.
# NOTE(review): this rebinds `df_comparacion`, which earlier held the
# real-vs-predicted table for the random forest.
df_comparacion = pd.concat([metricas_rf, metricas_lgbm, metricas_xgb], axis="columns")

display(df_comparacion)

# GridSearchCV

In [None]:
# Hyper-parameter candidates explored by the grid search: 2 values for each of
# 4 parameters, i.e. 16 combinations.
param_grid = dict(
    n_estimators=[100, 200],
    learning_rate=[0.1, 0.05],
    max_depth=[10, -1],
    num_leaves=[31, 50],
)

In [None]:
# Base LightGBM estimator that the grid search will clone and tune.
lgbm = LGBMRegressor(random_state=RANDOM_STATE, n_jobs=-1)

In [None]:
# Exhaustive search over `param_grid` with 6-fold cross-validation, ranking
# candidates by negative RMSE.
# NOTE(review): both the search and the estimator request n_jobs=-1, which may
# oversubscribe the CPU - confirm this is intended.
grid_search = GridSearchCV(
    lgbm,
    param_grid,
    scoring="neg_root_mean_squared_error",
    cv=6,
    verbose=2,
    n_jobs=-1,
)

# Time the whole search with wall-clock timestamps taken around fit.
print("Iniciando GridSeachCV...")
start_time = time.time()
grid_search.fit(X_train, y_train)
end_time = time.time()
print(f"GridSearchCV completo en {end_time - start_time:.2f} segundos")

In [None]:
# Show the parameter combination selected by the grid search.
print("Mejores parámetros encontrados:")
print(grid_search.best_params_)

In [None]:
# Keep the best estimator found by the search and display it.
best_lgbm_model = grid_search.best_estimator_
best_lgbm_model

In [None]:
# Predict the target for the test features with the tuned LightGBM model.
predicciones_best_lgbm = best_lgbm_model.predict(X_test)

In [None]:
# Root mean squared error of the tuned LightGBM model on the test split.
rmse_best_lgbm = root_mean_squared_error(y_test, predicciones_best_lgbm)
rmse_best_lgbm

In [None]:
# Mean absolute error of the tuned LightGBM model on the test split.
mae_best_lgbm = mean_absolute_error(y_true=y_test, y_pred=predicciones_best_lgbm)
print(f"MAE (lgbm): {mae_best_lgbm:.6f}")

In [None]:
# Coefficient of determination of the tuned LightGBM model on the test split.
r2_best_lgbm = r2_score(y_true=y_test, y_pred=predicciones_best_lgbm)
print(f"R2 (lgbm): {r2_best_lgbm:.6f}")

In [None]:
# Mean absolute percentage error of the tuned LightGBM model; stored as a
# fraction and multiplied by 100 only when printed.
mape_best_lgbm = mean_absolute_percentage_error(y_test, predicciones_best_lgbm)

# Label matches the other tuned-LightGBM metrics, "(lgbm)".
print(f"MAPE (lgbm): {mape_best_lgbm*100:.6f}")

In [None]:
# Collect the four metrics of the tuned LightGBM model into one labelled Series.
data_lgbm_optimizado = [
    rmse_best_lgbm,
    mae_best_lgbm,
    r2_best_lgbm,
    mape_best_lgbm
]

# Defined in this cell so the Series index does not depend on an earlier cell.
index_metricas = ["RMSE", "MAE", "R-cuadrado", "MAPE"]

metricas_lgbm_optimizado = pd.Series(
    data=data_lgbm_optimizado,
    index=index_metricas,
    name="LGBM (Optimizado)"
)

In [None]:
# Default vs. tuned LightGBM metrics, one column per model.
df_comparacion_lgbm = pd.concat([metricas_lgbm, metricas_lgbm_optimizado], axis="columns")

In [None]:
# Display the default-vs-tuned comparison table.
df_comparacion_lgbm

# Guardar archivos del mejor modelo

In [None]:
# Current working directory of the kernel, used as the anchor for project paths.
NOTEBOOK_DIR = os.getcwd()
NOTEBOOK_DIR

In [None]:
# Project root is the parent of the notebook directory; artifacts go in <root>/models.
PROJECT_ROOT = os.path.abspath(os.path.join(NOTEBOOK_DIR, os.pardir))
MODEL_DIR = os.path.join(PROJECT_ROOT, "models")

# Destination files for the fitted encoder and the LightGBM model.
ENCODER_PATH, MODEL_PATH = (
    os.path.join(MODEL_DIR, nombre_archivo)
    for nombre_archivo in ("onehot_encoder.joblib", "lgbm_regressor_default.joblib")
)

In [None]:
# Create the models folder if it is missing, then persist the encoder and the model.
os.makedirs(MODEL_DIR, exist_ok=True)

joblib.dump(encoder, ENCODER_PATH)
# NOTE(review): this saves `modelo_lgbm` (LightGBM fitted with default settings),
# not `best_lgbm_model` from the grid search - confirm which one is the intended "best" model.
joblib.dump(modelo_lgbm, MODEL_PATH)

# Verificación de información del encoder

In [None]:
import joblib
import os

In [None]:
# Reload the encoder from the location it was saved to (ENCODER_PATH) rather than
# a machine-specific absolute path that points at a different project.
encoder_path = ENCODER_PATH

In [None]:
# Load the persisted encoder back from disk (rebinds `encoder`).
# joblib files are pickle-based, so only load artifacts from a trusted source.
encoder = joblib.load(encoder_path)

In [None]:
# Confirm the class of the object that was loaded.
print(type(encoder))

In [None]:
# Categories the encoder learned, one array per encoded column.
encoder.categories_

In [None]:
# Categories learned for the first encoded column.
print(list(encoder.categories_[0]))

In [None]:
# Categories learned for the second encoded column.
print(list(encoder.categories_[1]))

In [None]:
# Categories learned for the third encoded column.
print(list(encoder.categories_[2]))