Data Loading and Initial Exploration

In [None]:
import pandas as pd

# Load the data (path is relative to the notebook's working directory)
df_raw = pd.read_csv('hotel_bookings.csv')

# Print the shape and datatypes of the raw dataframe
print(df_raw.shape)
print(df_raw.dtypes)

# Calculate the cancellation rate on the raw data
# (assumes is_canceled is a 0/1 indicator, so its mean is a proportion - TODO confirm)
cancel_rate = df_raw['is_canceled'].mean() * 100
print(f"Cancellation Rate: {cancel_rate}%")

# Check for null values
print(df_raw.isnull().sum())

# Remove exact duplicate rows. The result is bound to `df` (the name every
# later cell uses) instead of mutating in place, so the raw frame stays
# available for comparison.
df = df_raw.drop_duplicates()

# Everything downstream runs on the de-duplicated frame, so make the effect of
# that step visible instead of dropping rows silently.
rows_dropped = len(df_raw) - len(df)
print(f"Dropped {rows_dropped} duplicate rows; {len(df)} rows remain")
print(f"Cancellation Rate after de-duplication: {df['is_canceled'].mean() * 100}%")


Exploratory Data Analysis

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Bar chart of how many rows fall into each value of the target column
sns.countplot(x='is_canceled', data=df).set_title('Distribution of Cancellations')
plt.show()

# Partition the column labels by dtype: object columns are treated as
# categorical, 64-bit integer/float columns as numerical.
cat_cols = df.select_dtypes(include=['object']).columns
num_cols = df.select_dtypes(include=['int64', 'float64']).columns


Feature Selection

In [None]:
from scipy.stats import chi2_contingency

# Chi-square test of independence between each categorical column and the
# cancellation flag. Only the p-value is kept per column.
chi2_results = {}
for col in cat_cols:
    crosstab = pd.crosstab(df[col], df['is_canceled'])
    chi2_test_results = chi2_contingency(crosstab)
    chi2_results[col] = chi2_test_results[1]  # index 1 of the result is the p-value

# Show the p-values (smallest first); previously they were computed but never displayed
print(pd.Series(chi2_results, dtype=float, name='chi2_p_value').sort_values())

# Correlation of numerical features with cancellations.
# The frame also holds string columns; since pandas 2.0 DataFrame.corr() no
# longer drops them silently and raises instead, so restrict to numeric/bool
# columns explicitly (works on older pandas versions too).
corr = df.select_dtypes(include=['number', 'bool']).corr()
cancellation_corr = corr['is_canceled'].sort_values()
print(cancellation_corr)


Data Visualization

In [None]:
# Visualizing Distributions of numerical and categorical variables
def plot_count(data, x, hue=None, rotation=0):
    """Show a titled bar chart of row counts per value of one column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the columns to plot.
    x : str
        Column whose values go on the x-axis.
    hue : str, optional
        Column used to split each bar into colour-coded sub-groups.
    rotation : int, optional
        Rotation of the x tick labels in degrees; 0 leaves them untouched.
    """
    ax = sns.countplot(data=data, x=x, hue=hue)
    # A title lets each figure stand on its own when the notebook is skimmed.
    ax.set_title(f'Count of {x}' + (f' by {hue}' if hue else ''))
    if rotation:
        plt.xticks(rotation=rotation)
    plt.show()


# Same seven figures as before, in the same order: (x column, hue column, tick rotation)
count_plots = [
    ('is_canceled', None, 0),
    ('hotel', None, 0),
    ('hotel', 'is_canceled', 0),
    ('deposit_type', 'is_canceled', 0),
    ('customer_type', 'is_canceled', 0),
    ('meal', 'is_canceled', 0),
    ('market_segment', 'is_canceled', 45),  # many categories, so tilt the labels
]
for x_col, hue_col, tick_rotation in count_plots:
    plot_count(df, x_col, hue=hue_col, rotation=tick_rotation)


Model Building and Evaluation

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score

# Selected features
selected_features = ['lead_time', 'previous_cancellations', 'adults', 'days_in_waiting_list',
                     'previous_bookings_not_canceled', 'is_repeated_guest', 'booking_changes',
                     'required_car_parking_spaces', 'total_of_special_requests', 'hotel', 'country',
                     'market_segment', 'distribution_channel', 'assigned_room_type', 'deposit_type',
                     'customer_type', 'reservation_status_date', 'reserved_room_type', 'meal']
# NOTE(review): 'reservation_status_date' looks like it is recorded when the
# booking outcome is set, which would leak the target - confirm before trusting
# the scores. It is kept here so the feature set is unchanged.

# Preprocess the data. Take a copy so the cleaning below does not write into `df`.
X = df[selected_features].copy()
y = df['is_canceled']

# CatBoost tries to parse every undeclared column as a number and fails on
# strings, so the non-numeric columns must be declared as categorical. It also
# rejects NaN inside categorical columns, hence the fill + cast to str.
cat_features = X.select_dtypes(exclude=['number', 'bool']).columns.tolist()
X[cat_features] = X[cat_features].fillna('missing').astype(str)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize CatBoostClassifier; cat_features is given by column name
model = CatBoostClassifier(cat_features=cat_features, random_seed=42, verbose=False)

# Fit model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Compute accuracy
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)


Hyperparameter Tuning

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Define the parameter distribution.
# scipy's randint(low, high) samples integers in [low, high), so depth is 4..9
# and iterations is 100..999.
param_dist = {
    'depth': randint(4, 10),
    'learning_rate': [0.01, 0.05, 0.1],
    'iterations': randint(100, 1000)
}

# Non-numeric columns must be declared as categorical, otherwise CatBoost tries
# to parse their strings as floats and every fit in the search fails.
# CatBoost expects the values in these columns to be strings/integers without NaN.
cat_feature_names = X_train.select_dtypes(exclude=['number', 'bool']).columns.tolist()

# Initialize a CatBoostClassifier. cat_features is set on the estimator itself
# so each clone made by the search (and best_estimator_) carries it.
model = CatBoostClassifier(cat_features=cat_feature_names, random_seed=42, verbose=False)

# Initialize RandomizedSearchCV: 10 sampled configurations, 3-fold CV each
random_search = RandomizedSearchCV(model, param_dist, n_iter=10, cv=3, scoring='accuracy', verbose=2, n_jobs=-1, random_state=42)

# Fit RandomizedSearchCV
random_search.fit(X_train, y_train)

# Print the best parameters and the best (mean cross-validated) score
print('Best parameters:', random_search.best_params_)
print('Best score:', random_search.best_score_)

# Use the best model to make predictions
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)

# Compute accuracy on the held-out test set
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)


Feature Importance

In [None]:
# Get feature importance (one value per feature, in the model's feature order)
feature_importances = best_model.get_feature_importance()

# Create a DataFrame for feature importances, sorted with the most important first.
# Feature names are read from the fitted model rather than from a separate
# notebook variable, so labels cannot drift out of sync with the columns the
# model was actually trained on.
feature_importances_df = pd.DataFrame({
    'feature': best_model.feature_names_,
    'importance': feature_importances,
}).sort_values('importance', ascending=False)

# Display
print(feature_importances_df)


ROC Curve

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score

# Keep only the second column of predict_proba as the score for each test row
# (the class-1 probability when the labels are 0/1).
y_scores = best_model.predict_proba(X_test)[:, 1]

# Area under the ROC curve, and the curve's points
roc_auc = roc_auc_score(y_test, y_scores)
false_positive_rate, true_positive_rate, threshold = roc_curve(y_test, y_scores)

# Draw the ROC curve (blue) against the chance diagonal (dashed red)
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(false_positive_rate, true_positive_rate, 'b', label='AUC = %0.2f' % roc_auc)
ax.plot([0, 1], [0, 1], 'r--')
ax.legend(loc='lower right')
ax.set(
    title='Receiver Operating Characteristic',
    xlim=(0, 1),
    ylim=(0, 1),
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
)
plt.show()
