In [4]:
import pickle
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.feature_extraction import DictVectorizer

from sklearn.preprocessing import LabelEncoder

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

In [5]:
# Raw bookings table; the path is resolved relative to the notebook's working directory
DATA_FILE = 'hotel_booking.csv'
df = pd.read_csv(DATA_FILE)

In [6]:
# These features are deleted because they do not influence the outcome
# (judging by their names they are per-guest contact/payment details).
for identifier_col in ('email', 'name', 'phone-number', 'credit_card'):
    del df[identifier_col]

In [7]:
# NOTE(review): the notebook does not say why `company` is removed --
# presumably it is too sparse to be useful; confirm.
df = df.drop(columns=['company'])

In [14]:
# NOTE(review): presumably removed because they record the final state of the
# booking and would leak the target (is_canceled) into the features -- confirm.
df = df.drop(columns=['reservation_status', 'reservation_status_date'])

In [8]:
# Handle missing values
# A missing `children` entry is treated as zero, which lets the column be cast to int.
df['children'] = df['children'].fillna(0).astype(int)

In [9]:
# Missing countries are replaced with the most frequent value in the column
most_common_country = df['country'].mode()[0]
df['country'] = df['country'].fillna(most_common_country)

In [10]:
# Missing agents are replaced with the column median, then the column is cast to int.
# NOTE(review): `agent` looks like an identifier rather than a quantity; if so,
# the median attributes agent-less bookings to an arbitrary value -- consider a
# dedicated sentinel (e.g. 0) instead. Confirm against the data dictionary.
agent_median = df['agent'].median()
df['agent'] = df['agent'].fillna(agent_median).astype(int)

In [11]:
# The arrival month is stored as a month name (string); convert it to its
# calendar number (January -> 1 ... December -> 12) so the model can use it.
month_names = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
]
month_map = {name: number for number, name in enumerate(month_names, start=1)}

# Values that are not keys of month_map come out of .map() as NaN, so this
# cell must be run exactly once on the freshly loaded frame.
df['arrival_date_month'] = df['arrival_date_month'].map(month_map)

In [15]:
# Columns holding string values (object dtype) must be converted to integers
# before they are given to the model; collect their names here.
is_object_column = df.dtypes == 'object'
obj_columns = df.columns[is_object_column].to_list()

In [18]:
# Integer-encode every string column.
# A separate LabelEncoder is fitted per column and kept in `encoders`
# (column name -> fitted encoder). Re-fitting a single shared instance, as was
# done before, throws away each column's string->code mapping as soon as the
# next column is fitted, so the codes could neither be inverted nor re-applied
# to new data. The encoded values written into `df` are unchanged.
# After the loop `encoder` still refers to the last column's encoder.
# NOTE(review): the encoders are fitted on the full frame, i.e. before the
# train/validation/test split further down.
encoders = {}
for col in obj_columns:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])
    encoders[col] = encoder

In [21]:
# Split the dataset, then train the model.
# 20% is held out as the test set; 25% of the remaining 80% becomes the
# validation set, giving a 60/20/20 train/validation/test split.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

# Replace the shuffled row labels left over from the split with a clean 0..n-1 index
df_train, df_test, df_val = (
    part.reset_index(drop=True) for part in (df_train, df_test, df_val)
)

# pop() returns the target column and removes it from the feature frame in one step
y_train = df_train.pop('is_canceled')
y_test = df_test.pop('is_canceled')
y_val = df_val.pop('is_canceled')

# One dict per row; the vectorizer learns its feature layout from the training rows only
train_dict = df_train.to_dict(orient='records')
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dict)

model = RandomForestClassifier(
    n_estimators=50,
    max_depth=20,
    min_samples_leaf=10,
    random_state=42,
    n_jobs=-1,
)
model.fit(X_train, y_train)

In [22]:
# Encode the validation rows with the DictVectorizer that was already fitted
# on the training rows (transform only, no refit).
val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)

# Predicted is_canceled labels for the validation rows
y_pred = model.predict(X_val)

In [25]:
# Share of validation rows whose predicted label equals the true label
val_accuracy = accuracy_score(y_val, y_pred)
print('Accuracy Score', val_accuracy)

Accuracy Score 0.8728955523913225


In [26]:
# Persist the fitted DictVectorizer and the trained model as one pickled
# (dv, model) tuple so they can be loaded together later.
# NOTE(review): only these two objects are saved; the integer encoding applied
# to the string columns earlier in the notebook is not part of this file, so a
# consumer must reproduce that step itself -- confirm this is intended.
output_file = 'capstone1.bin'
artifacts = (dv, model)

with open(output_file, 'wb') as model_file:
    pickle.dump(artifacts, model_file)

print(f'The model is saved to file {output_file}')


The model is saved to file capstone1.bin
