In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score

# Cargar los datos
df = pd.read_csv('../src/data/raw/hotel_bookings.csv')

In [2]:
# 🔹 Eliminar columnas no útiles o redundantes
df = df.drop(columns=[
    'company', 'agent', 'reservation_status', 'reservation_status_date'
])

In [3]:
# 🔹 Completar valores nulos (por ejemplo en country, children)
df['children'].fillna(0, inplace=True)
df['country'].fillna('Unknown', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['children'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['country'].fillna('Unknown', inplace=True)


In [4]:
# 🔹 Crear nuevas variables si querés (ej: estancia total)
df['total_nights'] = df['stays_in_weekend_nights'] + df['stays_in_week_nights']

In [5]:
# 🔹 Seleccionar columnas para el modelo
target = 'is_canceled'
X = df.drop(columns=[target])
y = df[target]


In [6]:
# 🔹 Identificar columnas numéricas y categóricas
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X.select_dtypes(include=['object']).columns.tolist()

In [7]:

# Asegurarse de que no haya variables que dependan de la variable target
if 'is_canceled' in numeric_features: numeric_features.remove('is_canceled')

In [8]:
# 🔹 Crear transformadores para el pipeline
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])


In [9]:
# 🔹 Armar el preprocesador combinado
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ]
)

In [10]:
# 🔹 Separar en train y test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 🔹 Pipeline completo con modelo (ej. Random Forest)
model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(random_state=42))
])

# 🔹 Entrenar el modelo
model.fit(X_train, y_train)

# 🔹 Predicción y evaluación
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nMatriz de Confusión:\n", confusion_matrix(y_test, y_pred))
print("\nReporte de Clasificación:\n", classification_report(y_test, y_pred))

Accuracy: 0.8925370634056453

Matriz de Confusión:
 [[14092   815]
 [ 1751  7220]]

Reporte de Clasificación:
               precision    recall  f1-score   support

           0       0.89      0.95      0.92     14907
           1       0.90      0.80      0.85      8971

    accuracy                           0.89     23878
   macro avg       0.89      0.88      0.88     23878
weighted avg       0.89      0.89      0.89     23878



In [12]:
import joblib

# Guardar el modelo entrenado
joblib.dump(model, '../src/model/hotel_cancellation_predictor.pkl')

['../src/model/hotel_cancellation_predictor.pkl']