### Cargando Data procesada

In [1]:
import pandas as pd
import numpy as np
from category_encoders import BinaryEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

# Show every column when a DataFrame is displayed instead of truncating wide frames.
pd.options.display.max_columns = None

In [2]:
# Load the pre-processed training set and split it into features and target.
hotels_train = pd.read_csv("../data/hotels_processed.csv")

x_train = hotels_train.drop(columns="is_canceled")
# Keep the target as a 1-D Series: scikit-learn estimators expect y with shape
# (n_samples,), while a single-column DataFrame is a column vector.
y_train = hotels_train["is_canceled"]

# Load the test set. The ids are kept in their own frame so the predictions
# can be attached to them when the result file is written.
x_test = pd.read_csv("../data/hotels_test.csv")
# Explicit selection raises KeyError if "id" is missing, whereas filter()
# would silently return a frame without columns. copy() makes ids_test
# independent of x_test so adding a column to it later is unambiguous.
ids_test = x_test[["id"]].copy()


Veamos los porcentajes de nulos

In [3]:
# Percentage of missing values per column of the training frame.
filas_totales = len(hotels_train)
nulos_por_columna = hotels_train.isna().sum()
print(nulos_por_columna / filas_totales * 100)

hotel                             0.0
lead_time                         0.0
arrival_date_year                 0.0
arrival_date_month                0.0
arrival_date_week_number          0.0
arrival_date_day_of_month         0.0
stays_in_weekend_nights           0.0
stays_in_week_nights              0.0
adults                            0.0
children                          0.0
babies                            0.0
meal                              0.0
country                           0.0
market_segment                    0.0
distribution_channel              0.0
is_repeated_guest                 0.0
previous_cancellations            0.0
previous_bookings_not_canceled    0.0
reserved_room_type                0.0
assigned_room_type                0.0
booking_changes                   0.0
deposit_type                      0.0
agent                             0.0
days_in_waiting_list              0.0
customer_type                     0.0
adr                               0.0
required_car

### Armando pipeline

In [4]:
# Columns handed to the OneHotEncoder when the column transformer is built.
one_hot_encoder_cols = [
    "hotel",
    "arrival_date_month",
    "meal",
    "market_segment",
    "distribution_channel",
    "reserved_room_type",
    "assigned_room_type",
    "deposit_type",
    "customer_type",
    "is_repeated_guest",
]

# Columns handed to the BinaryEncoder when the column transformer is built.
binary_encoder_cols = ["country", "agent"]

# Columns removed from both feature frames before the pipeline sees them.
ignored_cols = ["reservation_status_date", "id", "company"]

# TODO: move this step into the ColumnTransformer
def drop_cols(df: pd.DataFrame, cols: list[str]) -> pd.DataFrame:
  """Return a new frame equal to ``df`` without the columns named in ``cols``.

  Names in ``cols`` that are not columns of ``df`` are skipped silently, and
  ``df`` itself is left unmodified.
  """
  return df.drop(columns=cols, errors="ignore")

# Strip the ignored columns from both feature frames in one pass
# (training frame first, then the test frame).
x_train, x_test = (drop_cols(frame, ignored_cols) for frame in (x_train, x_test))

# Fixed seed for the decision tree. Without it the tree's tie-breaking between
# equally good splits is random, so scores and predictions change between runs.
RANDOM_STATE = 42

# Encode the two column groups with their respective encoders; every column
# not listed in either group is passed through unchanged.
column_trans = ColumnTransformer(
  transformers=[
    ("binary_encoder", BinaryEncoder(cols=binary_encoder_cols), binary_encoder_cols),
    ("one_hot_encoder", OneHotEncoder(cols=one_hot_encoder_cols), one_hot_encoder_cols),
  ],
  remainder="passthrough",
)

# Full model: column encoding followed by a decision tree classifier. The step
# names ("transformer", "model") are the prefixes used to address their
# parameters, e.g. "model__max_depth".
pipeline = Pipeline(
  steps=[
    ("transformer", column_trans),
    ("model", DecisionTreeClassifier(random_state=RANDOM_STATE)),
  ]
)


### Buscando hiperparámetros

In [5]:
# Hyper-parameter grid for the "model" step of the pipeline (hence the
# "model__" prefix on every key). Alternatives left out of the current search:
#   model__criterion:         ['gini', 'entropy']
#   model__min_samples_leaf:  [10, 15, 20]
#   min_samples_split:        list(range(2, 20))  (would need the "model__" prefix)
#   model__ccp_alpha:         np.linspace(0, 0.05, 20)
max_depth_candidates = list(range(20, 51, 5))
params = {
  "model__criterion": ["entropy"],
  "model__min_samples_leaf": [10],
  "model__max_depth": max_depth_candidates,
}

# Exhaustive search over the grid, scored by F1 with 5-fold cross-validation,
# using all available cores.
grid_search = GridSearchCV(pipeline, params, scoring="f1", cv=5, n_jobs=-1)
grid_search.fit(x_train, y_train)

print(f"Best params: {grid_search.best_params_}")
print(f"F1 score: {grid_search.best_score_}")

Best params: {'model__criterion': 'entropy', 'model__max_depth': 20, 'model__min_samples_leaf': 10}
F1 score: 0.842642343431401


In [6]:
# Apply the best hyper-parameters found by the search to the pipeline and train
# it on the whole training set; the fitted pipeline is the cell's displayed value.
# NOTE(review): GridSearchCV refits the best configuration by default, so
# grid_search.best_estimator_ may already be an equivalent fitted pipeline — confirm.
pipeline.set_params(**grid_search.best_params_).fit(x_train, y_train)

### Predict and save

In [7]:
# Predict on the test features, pair each prediction with its id and write the
# result file (without the DataFrame index).
y_predict = pipeline.predict(x_test)
ids_test = ids_test.assign(is_canceled=y_predict)
ids_test.to_csv("result.002.csv", index=False)