In [None]:
import xgboost 
import optuna
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

In [None]:
# Read the reservations table from the CSV (path is relative to the
# notebook's working directory).
df = pd.read_csv(filepath_or_buffer="Hotel Reservations.csv")

# Preview the first five rows (the default for head()).
df.head(n=5)

In [78]:
# Print df's column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36275 entries, 0 to 36274
Data columns (total 17 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   no_of_adults                          36275 non-null  int64  
 1   no_of_children                        36275 non-null  int64  
 2   no_of_weekend_nights                  36275 non-null  int64  
 3   no_of_week_nights                     36275 non-null  int64  
 4   type_of_meal_plan                     36275 non-null  object 
 5   required_car_parking_space            36275 non-null  int64  
 6   room_type_reserved                    36275 non-null  object 
 7   lead_time                             36275 non-null  int64  
 8   arrival_year                          36275 non-null  object 
 9   arrival_month                         36275 non-null  object 
 10  market_segment_type                   36275 non-null  object 
 11  repeated_guest 

In [100]:
# Largest value observed in the no_of_previous_bookings_not_canceled column.
df["no_of_previous_bookings_not_canceled"].max()

58

In [None]:
# Number of distinct values in type_of_meal_plan.
df["type_of_meal_plan"].nunique()

In [None]:
# Number of distinct values in room_type_reserved.
df["room_type_reserved"].nunique()

In [None]:
# Number of distinct values in market_segment_type.
df["market_segment_type"].nunique()

In [None]:
# Remove the Booking_ID and arrival_date columns and store year/month as
# strings (both are listed in cat_features and one-hot encoded later).
#
# Reassigning instead of using inplace=True, together with errors="ignore",
# keeps this cell idempotent: executing it a second time no longer raises
# KeyError for columns that were already removed.
df = df.drop(columns=["Booking_ID", "arrival_date"], errors="ignore")
df = df.astype({"arrival_year": str, "arrival_month": str})

In [None]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split


# Columns treated as categories; each one is expanded into one indicator
# column per level by the one-hot encoder below.
cat_features = [
    "type_of_meal_plan",
    "room_type_reserved",
    "market_segment_type",
    "arrival_year",
    "arrival_month",
]

# remainder="passthrough" keeps every column that is NOT listed in
# cat_features. ColumnTransformer's default (remainder="drop") silently
# discards them, which would leave the model with nothing but the one-hot
# columns.
# handle_unknown="ignore" encodes a level that never appeared in the training
# split as all zeros instead of raising inside transform().
# NOTE(review): passthrough assumes all remaining feature columns are numeric
# (i.e. Booking_ID / arrival_date were already dropped) - confirm.
cat_pipeline = ColumnTransformer(
    transformers=[
        (
            "onehot",
            OneHotEncoder(sparse_output=False, handle_unknown="ignore"),
            cat_features,
        ),
    ],
    remainder="passthrough",
)

# Encode the target labels as integers. The fitted encoder is kept so the
# integer predictions can be mapped back to the original labels.
# NOTE(review): this overwrites df["booking_status"]; re-running the cell
# fits a fresh encoder on the already-encoded integers, so run it only once
# per kernel session.
label_encoder = LabelEncoder()
df["booking_status"] = label_encoder.fit_transform(df["booking_status"])

target = ["booking_status"]
features = df.drop("booking_status", axis=1).columns.to_list()

X_train, X_valid, y_train, y_valid = train_test_split(
    df[features], df[target], random_state=79
)

# Learn the category levels from the training split only, then apply the
# same fitted transformer to the validation split.
X_train = cat_pipeline.fit_transform(X_train)
X_valid = cat_pipeline.transform(X_valid)

In [None]:
# Dimensions of the transformed training matrix returned by cat_pipeline.
X_train.shape

In [None]:
def objective(trial):
    """Optuna objective: fit one XGBoost model and score it on the validation split.

    Samples a booster type and tree/learning hyperparameters from ``trial``,
    trains on the notebook-level ``X_train``/``y_train`` with
    ``xgboost.train`` (default number of boosting rounds), and evaluates the
    thresholded predictions on ``X_valid``/``y_valid``.

    Args:
        trial: the ``optuna`` trial used to sample hyperparameters.

    Returns:
        The validation F1 score (higher is better). The study that consumes
        this objective is created with ``direction="maximize"``, so the score
        must be returned un-negated; returning ``-f1`` would make the study
        pick the *worst* configuration.
    """
    params = {
        "objective": "binary:logistic",
        "eval_metric": "logloss",
        "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "min_child_weight": trial.suggest_float("min_child_weight", 0.1, 10),
        "subsample": trial.suggest_float("subsample", 0.1, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.1, 1.0),
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 1.0),
    }

    dtrain = xgboost.DMatrix(X_train, label=y_train)
    dtest = xgboost.DMatrix(X_valid, label=y_valid)

    model = xgboost.train(params, dtrain)

    # binary:logistic predicts probabilities; threshold at 0.5 for class labels.
    y_pred = model.predict(dtest)

    f1 = f1_score(y_valid, (y_pred > 0.5).astype(int))
    return f1


In [None]:
# Search for the hyperparameters that maximize the value returned by
# objective(). TPESampler is Optuna's default sampler for single-objective
# studies; creating it explicitly with a fixed seed makes the sequence of
# sampled trials - and therefore best_params - reproducible across
# Restart & Run All instead of changing on every execution.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=79),
)
study.optimize(objective, n_trials=100)

best_params = study.best_params
print("Best Hyperparameters:", best_params)

In [None]:
# Build the final classifier from the fixed settings plus the tuned values.
final_params = {
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    # objective() trains with xgboost.train's default of 10 boosting rounds,
    # whereas XGBClassifier defaults to 100 trees. Pin the count so the final
    # model is the configuration that was actually tuned. Listed before
    # **best_params so a tuned "n_estimators", if one is ever added to the
    # search space, takes precedence.
    "n_estimators": 10,
    **best_params,
}

final_model = xgboost.XGBClassifier(**final_params)
final_model.fit(X_train, y_train)

In [None]:
# Predict class labels for the validation split and report their F1 score.
y_pred = final_model.predict(X_valid)
f1 = f1_score(y_true=y_valid, y_pred=y_pred)
print("Final Model F1 Score:", f1)

In [101]:
import pickle

# with open("model.pkl", "wb") as file:
#     pickle.dump(final_model, file)

# with open("column_preprocessor.pkl", "wb") as file:
#     pickle.dump(cat_pipeline, file)

# Serialize label_encoder to label_decoder.pkl in the working directory.
with open("label_decoder.pkl", mode="wb") as fh:
    pickle.dump(label_encoder, fh)