<a href="https://www.kaggle.com/code/araspirbadian/hotel-booking-logistic-regression?scriptVersionId=96793454" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Using logistic regression on the hotel booking dataset to predict which guests are likely to cancel their reservation

Imports and reading the data

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, plot_confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve, plot_precision_recall_curve, plot_roc_curve
import warnings
# Install a global "ignore" filter: every warning issued through the
# `warnings` module after this point is suppressed.
warnings.filterwarnings("ignore")

In [None]:
# Load the bookings CSV from the relative ../input directory into a DataFrame.
data = pd.read_csv(filepath_or_buffer="../input/hotel-booking/hotel_booking.csv")

In [None]:
# Show the first five rows as a quick sanity check of the loaded table.
data.head(n=5)

# Handling the missing data

In [None]:
# Number of missing values in each column.
data.isna().sum()

In [None]:
# Remove the 'company' and 'agent' columns in place, then recount the
# missing values that remain in each column.
data.drop(columns=['company', 'agent'], inplace=True)
data.isna().sum()

In [None]:
# Display the rows whose 'children' value is missing.
data.loc[data['children'].isna()]

# Filling missing values in children with the column mean

In [None]:
# Impute missing 'children' values with the column mean (Series.mean skips
# NaN by default). A single column-level assignment is used instead of the
# chained form `data['children'][mask] = ...`: that form writes through an
# intermediate Series, triggers SettingWithCopyWarning, and leaves `data`
# unchanged when pandas Copy-on-Write is enabled.
data['children'] = data['children'].fillna(data['children'].mean())

In [None]:
# Recount missing values per column after the imputation step.
data.isna().sum()

# Converting categorical columns to numbers and dropping the columns that won't be useful

In [None]:
# Print the frame summary (columns, non-null counts, dtypes) to see which
# columns still need to be converted to numbers.
data.info()

In [None]:
# Columns removed before modelling; they are not used as features.
dropped_columns = [
    'name',
    'email',
    'phone-number',
    'credit_card',
    'arrival_date_month',
    'reservation_status_date',
    'reservation_status',
]
data.drop(columns=dropped_columns, inplace=True)

# Replace each categorical column with integer codes. pd.factorize numbers
# the distinct values 0, 1, 2, ... in order of first appearance and uses -1
# for missing values, so after the +1 shift codes start at 1 and missing
# values become 0.
categorical_columns = [
    'market_segment',
    'deposit_type',
    'assigned_room_type',
    'customer_type',
    'distribution_channel',
    'reserved_room_type',
    'meal',
    'country',
    'hotel',
]
for column in categorical_columns:
    data[column] = pd.factorize(data[column])[0] + 1
data.head()

# Checking the correlations

In [None]:
# Pairwise Pearson correlations between all columns, drawn as a heatmap on a
# 10x10 inch figure.
data_correlations = data.corr(method='pearson')
fig = plt.figure(figsize=(10, 10))
sns.heatmap(
    data_correlations,
    cmap='Wistia',
    linewidths=2,
    linecolor='black',
)

In [None]:
# Print the frame summary again to check the columns and dtypes after the
# drop/encode step.
data.info()

# Making the train and test datasets

In [None]:
# Separate the target ('is_canceled') from the feature columns, then hold out
# 10% of the rows as a test set. The fixed random_state makes the split
# repeatable.
x_all = data.drop(columns='is_canceled')
y_all = data['is_canceled']
x_train, x_test, y_train, y_test = train_test_split(
    x_all,
    y_all,
    test_size=0.1,
    random_state=101,
)

# Scaling the data and using Logistic Regression

In [None]:
# Standardize the features using the mean and variance learned from the
# training split only, and apply that same transformation to the test split.
# Fitting a second scaler on the test data would standardize it with
# different statistics than the ones the model was trained on, so the test
# features would not be on the same scale as the training features.
scaler_train = StandardScaler()
x_train_scaled = scaler_train.fit_transform(x_train)
x_test_scaled = scaler_train.transform(x_test)
# Alias kept so any later cell that refers to `scaler_test` still resolves.
scaler_test = scaler_train

# Fit a logistic-regression classifier (default hyperparameters) on the
# standardized training features.
log_model = LogisticRegression()
log_model.fit(x_train_scaled, y_train)

# Prediction

In [None]:
# Predicted class labels for the standardized test features.
y_pred = log_model.predict(X=x_test_scaled)

# Evaluation

In [None]:
# Fraction of test rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Confusion matrix of true labels (rows) versus predicted labels (columns).
confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# Plot the confusion matrix of the fitted model on the test split.
# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and
# removed in 1.2 in favour of ConfusionMatrixDisplay.from_estimator; confirm
# the scikit-learn version this notebook is pinned to.
plot_confusion_matrix(estimator=log_model, X=x_test_scaled, y_true=y_test)

In [None]:
# Print the per-class precision, recall, F1 score and support.
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# Plot the precision-recall curve of the fitted model on the test split.
# NOTE(review): plot_precision_recall_curve was deprecated in scikit-learn
# 1.0 and removed in 1.2 in favour of PrecisionRecallDisplay.from_estimator;
# confirm the scikit-learn version this notebook is pinned to.
plot_precision_recall_curve(estimator=log_model, X=x_test_scaled, y=y_test)

In [None]:
# Plot the ROC curve of the fitted model on the test split.
# NOTE(review): plot_roc_curve was deprecated in scikit-learn 1.0 and removed
# in 1.2 in favour of RocCurveDisplay.from_estimator; confirm the
# scikit-learn version this notebook is pinned to.
plot_roc_curve(estimator=log_model, X=x_test_scaled, y=y_test)