In [1]:
{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "XIaqrqSlVriD"
      },
      "outputs": [],
      "source": [
        "# ============================================\n",
        "# Modelo CatBoost optimizado – versión final\n",
        "# ============================================\n",
        "\n",
        "from catboost import CatBoostClassifier\n",
        "from sklearn.model_selection import train_test_split\n",
        "from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix\n",
        "import pandas as pd\n",
        "import numpy as np\n",
        "\n",
        "# ---------------------------\n",
        "# 1. Target: severe delay (summed delay causes >= 30 min)\n",
        "# ---------------------------\n",
        "# NOTE(review): `datos` is not created anywhere in this cell; it has to exist\n",
        "# in the kernel already, otherwise the next lines raise NameError.\n",
        "# Delay-cause columns that are added up, row by row, into the total delay.\n",
        "causas = [\n",
        "    'RETRASO_SISTEMA_AEREO',\n",
        "    'RETRASO_SEGURIDAD',\n",
        "    'RETRASO_AEROLINEA',\n",
        "    'RETRASO_AVION_TARDIO',\n",
        "    'RETRASO_CLIMA',\n",
        "]\n",
        "datos[\"RETRASO_TOTAL\"] = datos.loc[:, causas].sum(axis=\"columns\")\n",
        "# Binary label: 1 when the total reaches 30 or more, otherwise 0.\n",
        "datos[\"RETRASO_GRAVE\"] = datos[\"RETRASO_TOTAL\"].ge(30).astype(int)\n",
        "\n",
        "# ---------------------------\n",
        "# 2. Departure-date feature engineering\n",
        "# ---------------------------\n",
        "# Assemble one datetime per row from the separate year / month / day columns.\n",
        "datos[\"FECHA_COMPLETA\"] = pd.to_datetime(\n",
        "    {\"year\": datos[\"ANO\"], \"month\": datos[\"MES\"], \"day\": datos[\"DIA\"]}\n",
        ")\n",
        "\n",
        "fechas = datos[\"FECHA_COMPLETA\"]\n",
        "datos[\"DIA_SEMANA\"] = fechas.dt.weekday  # Monday=0 ... Sunday=6\n",
        "datos[\"MES_PARTIDA\"] = fechas.dt.month\n",
        "# Saturday (5) and Sunday (6) are flagged as weekend.\n",
        "datos[\"ES_FIN_DE_SEMANA\"] = datos[\"DIA_SEMANA\"].isin((5, 6)).astype(int)\n",
        "\n",
        "# Season label per block of three months: 1-3, 4-6, 7-9, 10-12 (right edge included).\n",
        "# NOTE(review): these are calendar quarters rather than meteorological seasons,\n",
        "# and the label order presumes the southern hemisphere - confirm this is intended.\n",
        "limites_mes = [0, 3, 6, 9, 12]\n",
        "nombres_temporada = [\"Verano\", \"Otoño\", \"Invierno\", \"Primavera\"]\n",
        "datos[\"TEMPORADA\"] = pd.cut(\n",
        "    datos[\"MES_PARTIDA\"], bins=limites_mes, labels=nombres_temporada, right=True\n",
        ")\n",
        "\n",
        "# ---------------------------\n",
        "# 3-4. Arrival variables: HORA_LLEGADA and LLEGADA_PROGRAMA\n",
        "# ---------------------------\n",
        "def _agregar_franja_horaria(df, col_hora, col_franja):\n",
        "    \"\"\"Clean an hour column of `df` in place and add its time-of-day band.\n",
        "\n",
        "    Parameters\n",
        "    ----------\n",
        "    df : pandas.DataFrame\n",
        "        Frame that is modified in place.\n",
        "    col_hora : str\n",
        "        Column to clean. Non-numeric entries become NaN, NaN is replaced by\n",
        "        the column mean, and the result is truncated to int.\n",
        "    col_franja : str\n",
        "        Name of the categorical column that receives the band label.\n",
        "\n",
        "    Returns\n",
        "    -------\n",
        "    None\n",
        "    \"\"\"\n",
        "    horas = pd.to_numeric(df[col_hora], errors=\"coerce\")\n",
        "    df[col_hora] = horas.fillna(horas.mean()).astype(int)\n",
        "    # Left-closed bins: [0, 6), [6, 12), [12, 18), [18, 24). Anything outside\n",
        "    # that range gets no label (NaN).\n",
        "    # NOTE(review): this assumes the column holds an hour of day (0-23). If it\n",
        "    # is stored as HHMM (e.g. 1430), every value >= 24 ends up unlabeled - confirm.\n",
        "    df[col_franja] = pd.cut(\n",
        "        df[col_hora],\n",
        "        bins=[0, 6, 12, 18, 24],\n",
        "        labels=[\"Madrugada\", \"Mañana\", \"Tarde\", \"Noche\"],\n",
        "        right=False,\n",
        "    )\n",
        "\n",
        "\n",
        "# The same cleaning and banding is applied to both arrival columns.\n",
        "_agregar_franja_horaria(datos, \"HORA_LLEGADA\", \"FRANJA_HORARIA_LLEGADA\")\n",
        "_agregar_franja_horaria(datos, \"LLEGADA_PROGRAMA\", \"FRANJA_LLEGADA_PROGRAMA\")\n",
        "\n",
        "# ---------------------------\n",
        "# 5. Final features (SEGURIDAD_ESPERADA, HORA_SALIDA and FRANJA_HORARIA_SALIDA are left out)\n",
        "# ---------------------------\n",
        "# NOTE(review): if HORA_LLEGADA is the actual arrival time, combining it with\n",
        "# LLEGADA_PROGRAMA may reveal the delay the model is asked to predict - confirm.\n",
        "feature_cols = [\n",
        "    \"AEROLINEA\",\"AEROPUERTO_ORIGEN\",\"AEROPUERTO_DESTINO\",\n",
        "    \"DISTANCIA\",\"DIA_SEMANA\",\"MES_PARTIDA\",\"ES_FIN_DE_SEMANA\",\n",
        "    \"TEMPORADA\",\n",
        "    \"HORA_LLEGADA\",\"FRANJA_HORARIA_LLEGADA\",\n",
        "    \"LLEGADA_PROGRAMA\",\"FRANJA_LLEGADA_PROGRAMA\"\n",
        "]\n",
        "# .copy() gives X its own data, so the clean-up below never writes into a\n",
        "# slice of `datos` (which triggers pandas' SettingWithCopyWarning).\n",
        "X = datos[feature_cols].copy()\n",
        "y = datos[\"RETRASO_GRAVE\"]\n",
        "\n",
        "categorical_cols = [\n",
        "    \"AEROLINEA\",\"AEROPUERTO_ORIGEN\",\"AEROPUERTO_DESTINO\",\n",
        "    \"FRANJA_HORARIA_LLEGADA\",\"FRANJA_LLEGADA_PROGRAMA\",\"DIA_SEMANA\",\"TEMPORADA\"\n",
        "]\n",
        "\n",
        "# Automatic clean-up\n",
        "# Categorical columns: replace missing values BEFORE casting to str. Casting\n",
        "# first turns NaN into the literal string \"nan\", so a later fillna never fires.\n",
        "# Going through object dtype also lets pd.cut categoricals accept \"missing\".\n",
        "for col in categorical_cols:\n",
        "    as_object = X[col].astype(object)\n",
        "    X[col] = as_object.where(as_object.notna(), \"missing\").astype(str)\n",
        "\n",
        "# Numeric columns. DIA_SEMANA is intentionally not listed: it is declared\n",
        "# categorical above, and coercing it back to numbers here would undo that cast\n",
        "# (and could leave float values in a categorical feature).\n",
        "num_cols = [\"DISTANCIA\",\"MES_PARTIDA\",\"ES_FIN_DE_SEMANA\",\n",
        "            \"HORA_LLEGADA\",\"LLEGADA_PROGRAMA\"]\n",
        "for col in num_cols:\n",
        "    as_number = pd.to_numeric(X[col], errors=\"coerce\")\n",
        "    # Plain column assignment replaces the column and its dtype outright.\n",
        "    X[col] = as_number.fillna(as_number.mean())\n",
        "\n",
        "# ---------------------------\n",
        "# 6. Train/Test split\n",
        "# ---------------------------\n",
        "# 80/20 split stratified on the target; the fixed random_state keeps it reproducible.\n",
        "opciones_split = {\"test_size\": 0.2, \"random_state\": 42, \"stratify\": y}\n",
        "X_train, X_test, y_train, y_test = train_test_split(X, y, **opciones_split)\n",
        "\n",
        "# ---------------------------\n",
        "# 7. CatBoost model\n",
        "# ---------------------------\n",
        "# Early stopping needs an evaluation set. Carve it out of the training data so\n",
        "# the test set stays unseen until the final evaluation; stopping on the test\n",
        "# set itself would make the reported test metrics optimistic.\n",
        "X_fit, X_val, y_fit, y_val = train_test_split(\n",
        "    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train\n",
        ")\n",
        "\n",
        "# Weight the positive class by the negative/positive ratio to offset imbalance.\n",
        "n_neg = int((y_fit == 0).sum())\n",
        "n_pos = int((y_fit == 1).sum())\n",
        "if n_pos == 0:\n",
        "    # Without this guard the ratio below fails with a bare ZeroDivisionError.\n",
        "    raise ValueError(\"No positive samples (RETRASO_GRAVE == 1) in the training data\")\n",
        "\n",
        "at_model = CatBoostClassifier(\n",
        "    iterations=1000,\n",
        "    learning_rate=0.03,\n",
        "    depth=5,\n",
        "    random_seed=42,\n",
        "    verbose=100,  # log every 100 iterations\n",
        "    class_weights=[1, n_neg / n_pos]\n",
        ")\n",
        "\n",
        "at_model.fit(\n",
        "    X_fit, y_fit,\n",
        "    cat_features=categorical_cols,\n",
        "    eval_set=(X_val, y_val),\n",
        "    early_stopping_rounds=50\n",
        ")\n",
        "\n",
        "# ---------------------------\n",
        "# 8. Evaluation with a fixed threshold = 0.7912\n",
        "# ---------------------------\n",
        "# Probability of the positive class (second column of predict_proba).\n",
        "y_proba = at_model.predict_proba(X_test)[:, 1]\n",
        "\n",
        "# NOTE(review): where 0.7912 comes from is not shown in this notebook; if it\n",
        "# was tuned on this same test set, the metrics below are optimistic - confirm.\n",
        "umbral_optimo = 0.7912\n",
        "y_pred_opt = (y_proba >= umbral_optimo).astype(int)\n",
        "\n",
        "print(\"Umbral fijo:\", umbral_optimo)\n",
        "# Threshold-dependent metrics, printed in a fixed order.\n",
        "for etiqueta, metrica in (\n",
        "    (\"Precisión:\", precision_score),\n",
        "    (\"Recall:\", recall_score),\n",
        "    (\"F1:\", f1_score),\n",
        "):\n",
        "    print(etiqueta, metrica(y_test, y_pred_opt))\n",
        "print(\"Matriz de confusión:\\n\", confusion_matrix(y_test, y_pred_opt))\n",
        "# ROC-AUC is computed from the raw probabilities, not the thresholded labels.\n",
        "print(\"ROC-AUC:\", roc_auc_score(y_test, y_proba))"
      ]
    }
  ]
}

NameError: name 'null' is not defined