In [1]:
import h2o

from h2o.automl import H2OAutoML
from h2o.frame import H2OFrame

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split as tts
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error as mse
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor as RFR, GradientBoostingRegressor as GBR, AdaBoostRegressor, BaggingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import BayesianRidge, ElasticNet
from sklearn.kernel_ridge import KernelRidge
from catboost import CatBoostRegressor as CBR

from sklearn.model_selection import GridSearchCV  
import xgboost as xgb
from xgboost import XGBRegressor
from scipy.optimize import minimize
from scipy.optimize import differential_evolution
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK
from lightgbm import LGBMRegressor as LGBMR

In [3]:
# Performance metrics
from sklearn.metrics import accuracy_score as acc
from sklearn.metrics import precision_score as prec
from sklearn.metrics import recall_score as rec
from sklearn.metrics import f1_score as f1
from sklearn.metrics import confusion_matrix as cm


In [4]:
# Load the cleaned flight records and derive the binary classification target.
fl = pd.read_csv("../data/ML/clean_data_for_ML.csv")

# Keep only rows with a known duration. The explicit .copy() makes the result an
# independent frame, so adding the 'Delayed' column below does not raise pandas'
# SettingWithCopyWarning for assigning into a filtered slice.
fl = fl[fl.duration.notna()].copy()

# A flight is labelled delayed (1) when it arrives more than 10 minutes late.
# A missing delay compares False and is therefore labelled 0.
fl['Delayed'] = (fl['arr_mins_of_delay'] > 10).astype('int64').astype('category')

# Drop the raw delay (the target is derived from it) and the two IATA code columns.
fl = fl.drop(columns=['arr_mins_of_delay', 'cod_flight_IATA', 'cod_airport_IATA'])

In [5]:
# Stratified 80/20 hold-out split on the target, written to disk as two CSV files.
train, test = tts(
    fl,
    test_size=0.2,
    random_state=42,
    stratify=fl["Delayed"],
)
for split_frame, split_path in (
    (train, "../data/ML/train_autoML.csv"),
    (test, "../data/ML/test_autoML.csv"),
):
    split_frame.to_csv(split_path, index=False)

In [6]:
# Connect to (or launch) the local H2O instance, limited to 8 threads and 8 GB of
# memory, with assertions disabled.
h2o.init(
    nthreads=8,
    max_mem_size="8G",
    enable_assertions=False,
)

Checking whether there is an H2O instance running at http://localhost:54321.

KeyboardInterrupt: 

In [None]:
# Load the two CSV splits into H2O.
# NOTE: this rebinds `train` / `test`, which earlier held the pandas splits.
train, test = (
    h2o.import_file(csv_path)
    for csv_path in ("../data/ML/train_autoML.csv", "../data/ML/test_autoML.csv")
)

In [None]:
# Response column.
y = "Delayed"

# Predictors: every column of the training frame except the response.
X = list(train.columns)
X.remove(y)

In [None]:
# For binary classification the response should be a factor (categorical) column,
# so convert it in both the training and the test frame.
for h2o_frame in (train, test):
    h2o_frame[y] = h2o_frame[y].asfactor()

In [None]:
# test_sol = test['Delayed']
# test = test.drop('Delayed', axis=1)



In [None]:
# h2train = h2o.H2OFrame(train)

# h2test = h2o.H2OFrame(test)

In [None]:
# h2train['Delayed'] = h2train['Delayed'].asfactor()
# # h2test['Delayed'] = h2test['Delayed'].asfactor()

In [None]:
# y = 'Delayed'

# X = [c for c in h2train.columns if c != 'price']

In [None]:
# Configure the AutoML search, bounded by both a model count (20) and a
# wall-clock budget (300 seconds).
automl = H2OAutoML(
    max_models=20,
    seed=42,
    max_runtime_secs=300,
    # Metric used to rank models on the leaderboard. 'recall' is not a value that
    # sort_metric accepts; mean_per_class_error averages the per-class miss rates
    # (1 - recall of each class), making it the supported option closest to
    # ranking by recall.
    sort_metric='mean_per_class_error',
    # Metric used for early stopping during training.
    stopping_metric='logloss',
)



In [None]:
# Train on the H2O training frame held in `train`. The name `h2train` is only
# assigned in commented-out code, so using it raises NameError on a fresh kernel.
automl.train(x=X, y=y, training_frame=train)

In [None]:
# Leaderboard of the trained models; show the top 15 rows.
lider = automl.leaderboard
lider.head(rows=15)

In [None]:
# Score the held-out H2O frame (`test`) with the best model on the leaderboard.
# The name `h2test` is only assigned in commented-out code, so using it raises
# NameError on a fresh kernel.
y_pred = automl.leader.predict(test)

In [None]:
# Convert the H2O prediction frame into a pandas DataFrame.
predictions_df = h2o.as_list(y_pred)

# Compute the confusion matrix with scikit-learn from the true and predicted labels.
from sklearn.metrics import confusion_matrix
true_labels = test['Delayed'].as_data_frame()
predicted_labels = predictions_df['predict'].values
conf_matrix = confusion_matrix(true_labels, predicted_labels)

print("Matriz de Confusión:", conf_matrix, sep="\n")

In [None]:
# Predicted labels as a NumPy array, taken from the pandas copy of the predictions.
# Reading from `predictions_df` instead of indexing `y_pred` keeps this cell
# re-runnable: after the first run `y_pred` is an ndarray, no longer an H2OFrame.
y_pred = np.array(predictions_df['predict'].tolist())


In [None]:
# Preview the first ten predicted labels.
y_pred[:10]

In [None]:
# Extract the true and predicted labels once and reuse them for every metric,
# rather than converting the H2O column to pandas again for each score.
y_true_labels = test['Delayed'].as_data_frame()
y_pred_labels = predictions_df['predict'].values

print("Precision: ", prec(y_true_labels, y_pred_labels))
print("Accuracy: ", acc(y_true_labels, y_pred_labels))
print("Recall: ", rec(y_true_labels, y_pred_labels))
print("F1: ", f1(y_true_labels, y_pred_labels))

In [None]:
# Confusion matrix as a percentage of all test samples. It is computed a single
# time here and reused, so the labels are pulled from H2O only once.
conf_counts = cm(test['Delayed'].as_data_frame(), predictions_df['predict'].values)
conf_pct = conf_counts / conf_counts.sum() * 100

# Annotated heatmap: rows are the true class, columns the predicted class.
fig, ax = plt.subplots(figsize=(15, 8))
sns.heatmap(conf_pct, annot=True, ax=ax)
ax.set_title('Matriz confusion')
ax.set_ylabel('Verdad')
ax.set_xlabel('Prediccion')
plt.show()