In [12]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

In [13]:
import joblib

# Model Deployment

### Defining the sample data

In [14]:
# Sample data: five hand-written bookings (one list entry per booking, per column)
# used to exercise the saved preprocessing artifacts and model end to end.
data = {
    'Booking_ID': [1, 2, 3, 4, 5],
    'number of adults': [6, 1, 3, 2, 4],
    'number of children': [0, 0, 6, 0, 1],
    'number of weekend nights': [4, 1, 0, 2, 1],
    'number of week nights': [5, 1, 4, 5, 3],
    'type of meal': ["Meal Plan 1", "Not Selected", "Meal Plan 2", "Meal Plan 3", "Meal Plan 1"],
    'car parking space': [1, 1, 0, 0, 1],
    'room type': ["Room_Type 4", "Room_Type 7", "Room_Type 2", "Room_Type 1", "Room_Type 3"],
    'lead time': [50, 30, 100, 10, 60],
    'market segment type': ["Online", "Complementary", "Offline", "Aviation", "Online"],
    'repeated': [0, 1, 0, 0, 1],
    'P-C': [1, 0, 1, 0, 1],  # NOTE(review): kept as a model input feature (never dropped before scaling); presumably previous-cancellation history rather than the target label - confirm against the training schema
    'P-not-C': [0, 1, 0, 1, 0],  # NOTE(review): also kept as a model input feature - confirm meaning against the training schema
    'average price': [100, 150, 250, 140, 130],
    'special requests': [2, 1, 0, 3, 1],
    # Reservation dates are plain strings here; a later cell parses them with pd.to_datetime
    'date of reservation': ['2023-12-01', '2023-10-29', '2023-02-13', '2023-11-21', '2023-03-07']
}

# Create DataFrame
test_data = pd.DataFrame(data)

### Dates (adding arrival month + removing the date of reservation)

In [15]:
from datetime import datetime, timedelta

# Parse the reservation dates; errors='coerce' turns unparseable entries into NaT instead of raising
reservation_dates = pd.to_datetime(test_data['date of reservation'], errors='coerce')
test_data['date of reservation'] = reservation_dates

# Show the parsed column to confirm the conversion
print('date of reservation: ')
print(test_data['date of reservation'], "\n")

# Function to calculate the arrival date using lead time
def calculate_arrival_date(reservation_date, lead_time):
    """Return the arrival date: the reservation date plus ``lead_time`` days.

    Parameters
    ----------
    reservation_date : datetime-like, or a missing value (NaT / None / NaN)
        Date on which the booking was made.
    lead_time : int or float (Python or NumPy scalar), or a missing value
        Number of days between the reservation and the arrival.

    Returns
    -------
    The value of ``reservation_date + lead_time days``, or ``pd.NaT`` when
    either input is missing.
    """
    # A missing date or a missing lead time cannot produce an arrival date
    if pd.isna(reservation_date) or pd.isna(lead_time):
        return pd.NaT

    # datetime.timedelta rejects NumPy integer scalars (e.g. numpy.int64 taken
    # from an integer column), so coerce to a built-in float first.
    return reservation_date + timedelta(days=float(lead_time))

# Derive each booking's arrival date row by row (reservation date + lead time)
arrival_dates = test_data.apply(
    lambda row: calculate_arrival_date(row['date of reservation'], row['lead time']),
    axis=1,
)

# Only the calendar month of the arrival date is stored, in 'arrival_month'
test_data['arrival_month'] = arrival_dates.dt.month

# Show the inputs next to the derived 'arrival_month' column
print(test_data[['date of reservation', 'lead time', 'arrival_month']], "\n")


date of reservation: 
0   2023-12-01
1   2023-10-29
2   2023-02-13
3   2023-11-21
4   2023-03-07
Name: date of reservation, dtype: datetime64[ns] 

  date of reservation  lead time  arrival_month
0          2023-12-01         50              1
1          2023-10-29         30             11
2          2023-02-13        100              5
3          2023-11-21         10             12
4          2023-03-07         60              5 



#### Drop the 'date of reservation' column

In [16]:
# errors='ignore' keeps this cell re-runnable: once the column is gone, a second
# run is a no-op instead of raising KeyError.
test_data = test_data.drop(columns=['date of reservation'], errors='ignore')

print(test_data.columns)    # Verify if the column is removed

Index(['Booking_ID', 'number of adults', 'number of children',
       'number of weekend nights', 'number of week nights', 'type of meal',
       'car parking space', 'room type', 'lead time', 'market segment type',
       'repeated', 'P-C', 'P-not-C', 'average price', 'special requests',
       'arrival_month'],
      dtype='object')


### Feature Engineering

In [17]:
# Party size: adults plus children on the booking
test_data["total_individuals"] = test_data["number of adults"].add(test_data["number of children"])

# Length of stay: week nights plus weekend nights
test_data["total_nights"] = test_data["number of week nights"].add(test_data["number of weekend nights"])

#### Customer Type

In [18]:
# Define customer type based on the number of adults and children
def define_customer_type(row):
    """Classify a booking as 'solo', 'family', 'group' or 'couple'.

    ``row`` is any mapping-like object exposing 'number of adults' and
    'number of children' (e.g. a DataFrame row).

    Rules:
      * at least one adult with children      -> 'family'
      * exactly one adult, no children        -> 'solo'
      * more than two adults, no children     -> 'group'
      * everything else                       -> 'couple'
    """
    adults = row['number of adults']

    # NOTE(review): bookings with fewer than one adult fall through to 'couple',
    # whatever the number of children - confirm this is intended.
    if not adults >= 1:
        return 'couple'

    children = row['number of children']

    if children > 0:
        return 'family'  # Single parent or several adults travelling with children

    if children == 0:
        if adults == 1:
            return 'solo'
        if adults > 2:
            return 'group'

    return 'couple'

# Apply the function once per row to create the 'customer_type' column
# (the original cell ran the identical apply twice; the second pass was redundant)
test_data['customer_type'] = test_data.apply(define_customer_type, axis=1)
print(test_data['customer_type'])

0     group
1      solo
2    family
3    couple
4    family
Name: customer_type, dtype: object


In [19]:
# Preview the engineered frame; .head() keeps the output bounded if more rows are added later
test_data.head()

Unnamed: 0,Booking_ID,number of adults,number of children,number of weekend nights,number of week nights,type of meal,car parking space,room type,lead time,market segment type,repeated,P-C,P-not-C,average price,special requests,arrival_month,total_individuals,total_nights,customer_type
0,1,6,0,4,5,Meal Plan 1,1,Room_Type 4,50,Online,0,1,0,100,2,1,6,9,group
1,2,1,0,1,1,Not Selected,1,Room_Type 7,30,Complementary,1,0,1,150,1,11,1,2,solo
2,3,3,6,0,4,Meal Plan 2,0,Room_Type 2,100,Offline,0,1,0,250,0,5,9,4,family
3,4,2,0,2,5,Meal Plan 3,0,Room_Type 1,10,Aviation,0,0,1,140,3,12,2,7,couple
4,5,4,1,1,3,Meal Plan 1,1,Room_Type 3,60,Online,1,1,0,130,1,5,5,4,family


In [20]:
# Save the updated data to CSV
test_data.to_csv('test_data.csv', index=False)
print("CSV file 'test_data.csv' created.")

CSV file 'test_data.csv' created.


### Loading the saved label encoders

In [21]:
# Categorical columns that each have a saved encoder file named '<column>_label_encoder.pkl'
categorical_features = ["type of meal", "room type", "market segment type", "customer_type"]

# Maps column name -> loaded encoder object.
# NOTE: joblib.load unpickles the file, so only load artifacts from a trusted source.
label_encoders = {}

for col in categorical_features:
    encoder = joblib.load(f'{col}_label_encoder.pkl')
    label_encoders[col] = encoder
    print(f"Label Encoder for '{col}' loaded.")

Label Encoder for 'type of meal' loaded.
Label Encoder for 'room type' loaded.
Label Encoder for 'market segment type' loaded.
Label Encoder for 'customer_type' loaded.


### Applying the loaded encoders to the categorical columns in your test data

In [22]:
# Replace each categorical column with the output of its loaded encoder's transform().
# NOTE(review): the columns are overwritten in place, so re-running this cell would
# feed already-encoded values back into transform() - presumably an error; restart
# from the data-creation cell before re-running.
for col in categorical_features:
    encoder = label_encoders[col]
    test_data[col] = encoder.transform(test_data[col])
    print(f"Encoded '{col}' in the test data.")

Encoded 'type of meal' in the test data.
Encoded 'room type' in the test data.
Encoded 'market segment type' in the test data.
Encoded 'customer_type' in the test data.


### Defining X

In [23]:
# Model inputs: every column of the encoded frame except 'Booking_ID'
X_test_data = test_data.drop(columns=['Booking_ID'])

X_test_data

Unnamed: 0,number of adults,number of children,number of weekend nights,number of week nights,type of meal,car parking space,room type,lead time,market segment type,repeated,P-C,P-not-C,average price,special requests,arrival_month,total_individuals,total_nights,customer_type
0,6,0,4,5,0,1,3,50,4,0,1,0,100,2,1,6,9,2
1,1,0,1,1,3,1,6,30,1,1,0,1,150,1,11,1,2,3
2,3,6,0,4,1,0,1,100,3,0,1,0,250,0,5,9,4,1
3,2,0,2,5,2,0,0,10,0,0,0,1,140,3,12,2,7,0
4,4,1,1,3,0,1,2,60,4,1,1,0,130,1,5,5,4,1


### Scaling the test data

In [24]:
# Load the saved scaler (plain string literal: the f-prefix had no placeholders).
# NOTE: joblib.load unpickles the file, so only load artifacts from a trusted source.
scaler = joblib.load('scaler.pkl')
scaled_X_test_data = scaler.transform(X_test_data)    # Apply the saved scaler to scale the test data

### Make predictions using the loaded model

In [25]:
# Load the saved model (plain string literal: the f-prefix had no placeholders).
# NOTE: joblib.load unpickles the file, so only load artifacts from a trusted source.
random_forest = joblib.load('best_rf_grid_model.pkl')

In [26]:
# Predict one label per row of the scaled test features
predictions = random_forest.predict(scaled_X_test_data)
print("Predictions:", predictions)

# Pair every Booking_ID with its prediction and write the result to CSV (index omitted)
output = test_data[['Booking_ID']].assign(Prediction=predictions)
output.to_csv('booking_cancellation_predictions.csv', index=False)
print("Predictions saved to booking_cancellation_predictions.csv")

Predictions: [0 0 0 0 0]
Predictions saved to booking_cancellation_predictions.csv


In [27]:
# Display the Booking_ID / Prediction table that was written to CSV in the previous cell
output

Unnamed: 0,Booking_ID,Prediction
0,1,0
1,2,0
2,3,0
3,4,0
4,5,0
