# Project: Hotel Booking Cancellation Prediction 
> Task: You are provided with a dataset of hotel bookings. The goal is to predict whether a booking will be canceled based on various features such as lead time, number of guests, meal preferences, and others. Your task is to build a classification model to predict the target variable is canceled, which indicates whether a booking was canceled (1) or not (0).

@Yevheniia-Rudenko 


In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier


## Data overview

In [2]:
# Load labeled and unlabeled data
labeled_data = pd.read_csv("/Users/yevrud/redi_school_ML_AI/final_project/data/train.csv")
unlabeled_data = pd.read_csv("/Users/yevrud/redi_school_ML_AI/final_project/data/test.csv")

# Create copies of the datasets
df_train = labeled_data.copy()
df_test = unlabeled_data.copy()


In [3]:
df_train.info()
df_train.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83573 entries, 0 to 83572
Data columns (total 39 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   hotel                           83573 non-null  object 
 1   is_canceled                     83573 non-null  int64  
 2   lead_time                       79290 non-null  float64
 3   arrival_date_year               83573 non-null  int64  
 4   arrival_date_month              83573 non-null  object 
 5   arrival_date_week_number        83573 non-null  int64  
 6   arrival_date_day_of_month       83573 non-null  int64  
 7   stays_in_weekend_nights         83573 non-null  float64
 8   stays_in_week_nights            83573 non-null  float64
 9   adults                          83573 non-null  int64  
 10  children                        83571 non-null  float64
 11  babies                          83573 non-null  int64  
 12  meal                            

Unnamed: 0,is_canceled,lead_time,arrival_date_year,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,...,booking_changes,agent,company,days_in_waiting_list,adr,required_car_parking_spaces,total_of_special_requests,id,estimated_check_in_duration,booking_difficulty_score
count,83573.0,79290.0,83573.0,83573.0,83573.0,83573.0,83573.0,83573.0,83571.0,83573.0,...,83573.0,72224.0,4674.0,83573.0,79290.0,83573.0,83573.0,83573.0,79290.0,79290.0
mean,0.39721,103.379569,27.120326,2016.155912,15.788077,0.928752,2.492898,1.859225,0.102918,0.008137,...,0.220681,86.603774,186.998074,2.321765,101.735184,0.06179,0.572745,59698.368983,51.673768,1.293816
std,0.489323,119.376604,13.619669,0.707381,8.782488,1.118739,2.126703,0.594781,0.397277,0.096017,...,0.654785,110.672677,130.511313,19.326393,57.456359,0.243445,0.794273,34498.851794,60.534659,2.294587
min,0.0,-227.156938,1.0,2015.0,1.0,-1.79131,-3.794357,0.0,0.0,0.0,...,0.0,1.0,9.0,-36.442446,-80.13604,0.0,0.0,1.0,-123.49194,-5.127325
25%,0.0,18.369438,16.0,2016.0,8.0,0.065392,1.105973,2.0,0.0,0.0,...,0.0,9.0,59.0,-5.701373,65.609982,0.0,0.0,29788.0,8.512819,0.103028
50%,0.0,79.56768,27.0,2016.0,16.0,0.795712,2.217654,2.0,0.0,0.0,...,0.0,14.0,174.0,0.39479,96.152208,0.0,0.0,59782.0,40.426761,1.090596
75%,1.0,167.570697,38.0,2017.0,23.0,1.713242,3.551786,2.0,0.0,0.0,...,0.0,229.0,268.0,6.589958,131.647735,0.0,1.0,89525.0,84.541252,2.131485
max,1.0,741.651834,53.0,2017.0,31.0,19.232182,49.354003,55.0,10.0,9.0,...,20.0,535.0,543.0,401.792112,5411.546686,8.0,5.0,119390.0,370.155876,55.593803


In [4]:
df_train.isnull().sum().sort_values(ascending=False)[lambda x : x > 0]

company                        78899
agent                          11349
country                         4540
meal                            4283
lead_time                       4283
adr                             4283
estimated_check_in_duration     4283
booking_difficulty_score        4283
children                           2
dtype: int64

## Data Cleaning and preprocessing

In [5]:
def clean_data(df):
    df = df.copy()

    # lead_time
    median_lead_time = df['lead_time'].median()
    df.loc[df['lead_time'] < 0, 'lead_time'] = median_lead_time
    df['lead_time'] = df['lead_time'].fillna(median_lead_time)

    # adr
    df['adr'] = df['adr'].apply(lambda x: np.nan if x < 0 else x)
    df['adr'] = df['adr'].fillna(df['adr'].median())

    # children
    df['children'] = df['children'].fillna(0)
    df.loc[df['children'] < 0, 'children'] = 0

    # estimated_check_in_duration
    df.loc[df['estimated_check_in_duration'] < 0, 'estimated_check_in_duration'] = np.nan
    df['estimated_check_in_duration'] = df['estimated_check_in_duration'].fillna(
        df['estimated_check_in_duration'].median())

    # booking_difficulty_score
    if 'booking_difficulty_score' in df.columns:
        df['booking_difficulty_score'] = df['booking_difficulty_score'].fillna(
            df['booking_difficulty_score'].median())

    # meal, country
    for col in ['meal', 'country']:
        if col in df.columns:
            df[col] = df[col].fillna(df[col].mode()[0])

    # agent, company
    for col in ['agent', 'company']:
        if col in df.columns:
            df[col] = df[col].fillna(0)

    # Drop confidential info
    confidential_cols = ['name', 'email', 'phone-number', 'credit_card', 'id']
    df = df.drop(columns=[col for col in confidential_cols if col in df.columns])

    return df

# Clean both train and test
df_train = clean_data(df_train)
df_test = clean_data(df_test)

## Encode Categorical Features

In [6]:
# Align categorical columns
categorical_cols = df_train.select_dtypes(include='object').columns.tolist()
df_train = pd.get_dummies(df_train, columns=categorical_cols, drop_first=True)
df_test = pd.get_dummies(df_test, columns=categorical_cols, drop_first=True)

# Align columns between train and test
df_test = df_test.reindex(columns=df_train.columns.drop("is_canceled"), fill_value=0)

## Split the train data set into train/val/test

In [7]:
X = df_train.drop("is_canceled", axis=1)
y = df_train["is_canceled"]

# First split: 60% train, 40% temp
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)

# Second split: 20% val, 20% test
X_val, X_test_eval, y_val, y_test_eval = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [8]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test_eval)
df_test_scaled = scaler.transform(df_test)

## Logistic Regression Model & Random Forest 

In [9]:
# Logistic Regression
lr_model = LogisticRegression(max_iter=2000, solver="saga", random_state=42)
lr_model.fit(X_train_scaled, y_train)
lr_pred = lr_model.predict(X_val_scaled)
lr_f1 = f1_score(y_val, lr_pred, average='weighted')

# Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_scaled, y_train)
rf_pred = rf_model.predict(X_val_scaled)
rf_f1 = f1_score(y_val, rf_pred, average='weighted')


# Compare
model_scores = {
    "Logistic Regression": lr_f1,
    "Random Forest": rf_f1,
}
for name, score in model_scores.items():
    print(f"{name}: F1 Weighted = {score:.4f}")

Logistic Regression: F1 Weighted = 0.8968
Random Forest: F1 Weighted = 0.8969


In [10]:
# Final prediction with Logistic Regression
final_predictions_lr = lr_model.predict(df_test_scaled)
df_submission_lr = pd.DataFrame({"prediction": final_predictions_lr})
df_submission_lr.to_csv("submission_lr.csv", index=False)


# Final prediction with Random Forest
final_predictions_rf = rf_model.predict(df_test_scaled)
df_submission_rf = pd.DataFrame({"prediction": final_predictions_rf})
df_submission_rf.to_csv("submission_rf.csv", index=False)