# **Import Libraries**

In [4]:
import pandas as pd  # Data manipulation and analysis
import numpy as np  # Numerical operations

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, cross_val_predict  # Model evaluation and selection
from sklearn.preprocessing import OneHotEncoder  # One-hot encoding categorical variables
from sklearn.ensemble import RandomForestClassifier  # Random forest classifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, roc_curve, precision_score, recall_score, f1_score  # Model evaluation metrics

import plotly.graph_objects as go  # Interactive visualizations
import plotly.io as pio  # Saving plots to file

from imblearn.over_sampling import SMOTE  # Oversampling the minority class

# **Data Preparation**

In [5]:
# Read the data
data = pd.read_csv('/content/customer_booking (1).csv', encoding='latin-1')

# Drop the 'route' column from the DataFrame
data.drop(['route'], axis=1, inplace=True)

# Transform the 'booking_origin' column
thresholds = {'booking_origin': 19}
for column, threshold in thresholds.items():
    category_counts = data[column].map(data[column].value_counts())
    data[column] = data[column].where(category_counts <= threshold, 'other')

# **Model Training & Evaluation**

In [6]:
# Preprocess the data
categorical_features = ['sales_channel', 'trip_type', 'flight_day', 'booking_origin']
X_categorical = data[categorical_features]

# Perform one-hot encoding
one_hot_encoder = OneHotEncoder()
X_categorical_encoded = one_hot_encoder.fit_transform(X_categorical)

# Convert the encoded features back to a DataFrame
X_categorical_encoded_df = pd.DataFrame(
    X_categorical_encoded.toarray(),
    columns=one_hot_encoder.get_feature_names_out(categorical_features)
)

# Combine the encoded categorical features with the numerical features
X_numerical = data.drop(categorical_features + ['booking_complete'], axis=1)
X = pd.concat([X_numerical, X_categorical_encoded_df], axis=1)
y = data['booking_complete']

# Handle class imbalance with SMOTE oversampling
smote = SMOTE(random_state=42)
X, y = smote.fit_resample(X, y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the Random Forest model
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

# Evaluate the Random Forest model on the testing set
rf_preds = rf.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_preds)
rf_precision = precision_score(y_test, rf_preds)
rf_recall = recall_score(y_test, rf_preds)
rf_f1 = f1_score(y_test, rf_preds)
rf_roc_auc = roc_auc_score(y_test, rf_preds)

print("Random Forest Model Performance on Testing Set:")
print(f"Accuracy: {rf_accuracy:.2f}")
print(f"Precision: {rf_precision:.2f}")
print(f"Recall: {rf_recall:.2f}")
print(f"F1 Score: {rf_f1:.2f}")
print(f"ROC AUC: {rf_roc_auc:.2f}")

# Perform stratified cross-validation
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
rf_scores = cross_val_score(rf, X, y, cv=cv)
rf_mean_accuracy = np.mean(rf_scores)
rf_mean_roc_auc = np.mean(cross_val_score(rf, X, y, cv=cv, scoring='roc_auc'))
rf_mean_precision = np.mean(cross_val_score(rf, X, y, cv=cv, scoring='precision'))
rf_mean_recall = np.mean(cross_val_score(rf, X, y, cv=cv, scoring='recall'))
rf_mean_f1 = np.mean(cross_val_score(rf, X, y, cv=cv, scoring='f1'))
print()
print("Random Forest Model Performance using Cross-Validation:")
print(f"Accuracy: {rf_mean_accuracy:.2f}")
print(f"Precision: {rf_mean_precision:.2f}")
print(f"Recall: {rf_mean_recall:.2f}")
print(f"F1 Score: {rf_mean_f1:.2f}")
print(f"ROC AUC: {rf_mean_roc_auc:.2f}")

Random Forest Model Performance on Testing Set:
Accuracy: 0.90
Precision: 0.95
Recall: 0.84
F1 Score: 0.89
ROC AUC: 0.90

Random Forest Model Performance using Cross-Validation:
Accuracy: 0.90
Precision: 0.96
Recall: 0.84
F1 Score: 0.89
ROC AUC: 0.95


# **ROC Curve**

In [7]:
# Calculate the ROC curve and AUC for testing set
y_prob_test = rf.predict_proba(X_test)[:, 1]
fpr_test, tpr_test, thresholds_test = roc_curve(y_test, y_prob_test)
roc_auc_test = roc_auc_score(y_test, y_prob_test)

# Calculate the ROC curve and AUC for cross-validation
y_prob_cv = cross_val_predict(rf, X, y, cv=cv, method='predict_proba')[:, 1]
fpr_cv, tpr_cv, thresholds_cv = roc_curve(y, y_prob_cv)
roc_auc_cv = roc_auc_score(y, y_prob_cv)

# Calculate the ROC curve for random guess
random_guess_fpr = [0, 1]
random_guess_tpr = [0, 1]

# Create the ROC curve trace for testing set
roc_trace_test = go.Scatter(
    x=fpr_test,
    y=tpr_test,
    name=f'Testing Set ROC (AUC = {roc_auc_test:.2f})',
    mode='lines',
    line=dict(color='orange')
)

# Create the ROC curve trace for cross-validation
roc_trace_cv = go.Scatter(
    x=fpr_cv,
    y=tpr_cv,
    name=f'Cross-Validation ROC (AUC = {roc_auc_cv:.2f})',
    mode='lines',
    line=dict(color='#0074D9')
)

# Create the ROC curve trace for random guess
roc_trace_random_guess = go.Scatter(
    x=random_guess_fpr,
    y=random_guess_tpr,
    name='Random Guess ROC',
    mode='lines',
    line=dict(color='#FF4136', dash='dash')
)

# Create the layout with adjusted title positioning
layout = go.Layout(
    title='Receiver Operating Characteristic (ROC)',
    title_x=0.4,  # Set the title position to the center of the x-axis
    xaxis=dict(title='False Positive Rate'),
    yaxis=dict(title='True Positive Rate'),
    showlegend=True
)

# Create the figure and add the traces
fig = go.Figure(data=[roc_trace_test, roc_trace_cv, roc_trace_random_guess], layout=layout)

# Show the plot
pio.show(fig)

# **Feature Importance**

In [8]:
# Get feature importances from the Random Forest model
rf_feature_importances = rf.feature_importances_
feature_names = X.columns

# Sort the feature importances in descending order and select the top 10 features
rf_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': rf_feature_importances})
rf_importance_df = rf_importance_df.sort_values('Importance', ascending=False).head(20)

# Random Forest feature importance plot
fig = go.Figure()
fig.add_trace(go.Bar(x=rf_importance_df['Importance'], y=rf_importance_df['Feature'], orientation='h', marker=dict(color='#0074D9')))

fig.update_layout(title="Top 20 Most Important Features", title_x=0.5, xaxis_title="Importance", yaxis_title="Features")

# Show the plot
pio.show(fig)