In [2]:
import pandas as pd

In [3]:
# Path of the input CSV; the whole file is read into a single DataFrame.
data_path = 'X_2025-02.csv'
df = pd.read_csv(filepath_or_buffer=data_path)

In [4]:
# Columns that are kept out of the feature set (named as identifier columns).
ID_COLS = [
    'FL_DATE',
    'OP_UNIQUE_CARRIER',
    'TAIL_NUM',
    'ORIGIN',
    'DEST',
]
# Column holding the label to predict.
TARGET = 'DEP_DELAY_15'

# Every other column, in its original order, is used as a model input.
non_feature_cols = [*ID_COLS, TARGET]
features = [col for col in df.columns if col not in non_feature_cols]

In [9]:
# Feature matrix (all selected feature columns) and target vector.
X = df.loc[:, features]
y = df.loc[:, TARGET]

In [6]:
# Import models to train

# Modeling
from sklearn.model_selection import RandomizedSearchCV, train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier

# Metrics
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report

In [10]:
# Split the dataset: hold out 20% for testing, stratified on the target so both
# partitions keep the same class proportions; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [11]:
# Baseline: an untuned logistic regression scored on the hold-out split.
# max_iter=1000 matches the LogisticRegression entry in the `models` dictionary,
# so the baseline gets the same optimisation budget as the candidate it is
# compared against (the default of 100 iterations would leave it under-trained
# relative to that candidate).
# NOTE(review): the recorded run of this cell ended in a MemoryError while
# allocating a float64 array of shape (447661, 451). This edit does not change
# memory use; reducing rows/columns or feature dtypes may be needed - TODO confirm
# on the target machine.
base_model = LogisticRegression(max_iter=1000)
base_model.fit(X_train, y_train)

# Predicted probability for the class in column index 1 of predict_proba.
y_scores = base_model.predict_proba(X_test)[:, 1]

# Area under the ROC curve on the test split.
auc = roc_auc_score(y_test, y_scores)
print(f"ROC AUC: {auc:.4f}")

MemoryError: Unable to allocate 1.50 GiB for an array with shape (447661, 451) and data type float64

In [None]:
# Models to train, keyed by class name (param_grids below uses the same keys).
random_state = 42
seeded = dict(random_state=random_state)
models = {
    'DecisionTreeClassifier': DecisionTreeClassifier(**seeded),
    'LogisticRegression': LogisticRegression(max_iter=1000, **seeded),
    'KNeighborsClassifier': KNeighborsClassifier(),
    'AdaBoostClassifier': AdaBoostClassifier(**seeded),
    'RandomForestClassifier': RandomForestClassifier(**seeded),
}

# Hyperparameter search spaces, one per model (kept small for speed).
param_grids = {
    'DecisionTreeClassifier': dict(
        criterion=['gini', 'entropy'],
        max_depth=[3, 5, 10, None],
        min_samples_split=[2, 5, 10],
        min_samples_leaf=[1, 2, 4],
        class_weight=['balanced', None],
    ),
    'LogisticRegression': dict(
        penalty=['l2'],
        C=[0.01, 0.1, 1, 10],
        solver=['lbfgs', 'liblinear'],
        class_weight=['balanced', None],
    ),
    'KNeighborsClassifier': dict(
        n_neighbors=[3, 5, 7, 9],
        weights=['uniform', 'distance'],
        metric=['euclidean', 'manhattan'],
    ),
    'AdaBoostClassifier': dict(
        n_estimators=[50, 100],
        learning_rate=[0.5, 1.0],
    ),
    'RandomForestClassifier': dict(
        n_estimators=[50, 100, 200],
        max_depth=[5, 10, None],
        min_samples_split=[2, 5],
        min_samples_leaf=[1, 2],
        class_weight=['balanced', None],
    ),
}

In [None]:
# Evaluation results: one dict per model, filled in by the loop below.
results = []

# Upper bound on the number of parameter settings sampled per model.
max_search_iter = 100

for model_name, model in models.items():
    print(model_name)

    grid = param_grids[model_name]

    # Number of distinct combinations in this grid (every entry is a list).
    # Capping n_iter at this size avoids the warning scikit-learn emits when
    # n_iter exceeds the search space; the candidates evaluated are unchanged.
    grid_size = 1
    for candidates in grid.values():
        grid_size *= len(candidates)

    # random_state (set in the cell that defines `models`) seeds the candidate
    # sampling, so grids larger than n_iter yield the same candidates every run.
    search = RandomizedSearchCV(
        model,
        grid,
        n_iter=min(max_search_iter, grid_size),
        cv=5,
        scoring='roc_auc',
        n_jobs=-1,
        random_state=random_state,
    )
    search.fit(X_train, y_train)

    # Take the best estimator found and predict on the test set.
    best_model = search.best_estimator_
    y_pred = best_model.predict(X_test)
    y_pred_proba = best_model.predict_proba(X_test)[:, 1]

    # Measure performance: per-class report (as a dict) and ROC AUC.
    report = classification_report(y_test, y_pred, output_dict=True)
    auc = roc_auc_score(y_test, y_pred_proba)

    results.append({
        'model': model_name,
        'best_params': search.best_params_,
        'classification_report': report,
        'roc_auc': auc
    })

In [None]:
# One row per tuned model, ordered best-first by test-set ROC AUC.
results_df = (
    pd.DataFrame(results)
    .sort_values(by='roc_auc', ascending=False)
    .reset_index(drop=True)
)
results_df

In [13]:
# Train final model with all data

In [None]:
# Name of the model in the first row of results_df.
results_df['model'].iloc[0]

In [None]:

# Classification report (dict form) stored in the first row of results_df.
results_df['classification_report'].iloc[0]

In [None]:
# Hyperparameters stored in the first row of results_df.
results_df['best_params'].iloc[0]

In [None]:
# Look up the estimator registered in `models` under the first-row model name.
# This is the same object held by the dictionary, not a copy.
best_model_name = results_df.iloc[0]['model']
final_model = models[best_model_name]

In [None]:
# Apply the hyperparameters stored in the first row of results_df.
best_params = results_df.iloc[0]['best_params']
final_model.set_params(**best_params)

In [None]:
# Fit on the full dataset (train and test rows together).
final_model.fit(X=X, y=y)

In [None]:
# Show the final estimator as the cell output.
final_model

In [None]:
# Serialize the final estimator to disk in the working directory.
pd.to_pickle(obj=final_model, filepath_or_buffer='final_model.pkl')