# In [None]:

# British Airways Data Science Simulation (Forage)
# Note: CSV file not included due to copyright.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score

# Load dataset.
# The path below is only an example: point it at your own copy of the data,
# or replace this statement with your own loading code.
# The file is decoded as ISO-8859-1 (presumably it contains bytes that are not
# valid UTF-8 -- verify against the actual file).

df = pd.read_csv(
    "customer_booking.csv",
    encoding="ISO-8859-1",
)

# Explore dataset
# print("Dataset Shape:", df.shape)
# print("\nData Types:\n", df.dtypes)
# print("\nMissing Values:\n", df.isnull().sum())
# print("\nSample Data:\n", df.head())

# Preprocessing
# Integer-encode the categorical columns in place.
# Each column gets its own fitted LabelEncoder, stored in `label_encoders`
# under the column name, so the integer codes can later be mapped back to the
# original labels (inverse_transform) or applied to new data (transform).
# Re-fitting one shared encoder would keep only the last column's mapping.
# After the loop `le` still refers to the encoder of the last column.
categorical_cols = ['sales_channel', 'trip_type', 'flight_day', 'route', 'booking_origin']
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Encode cyclical time features
# Map the hour onto a circle with a 24-hour period and keep its sine and
# cosine; the raw `flight_hour` column is removed from the frame.
hour_angle = 2 * np.pi * df.pop('flight_hour') / 24
df['flight_hour_sin'] = np.sin(hour_angle)
df['flight_hour_cos'] = np.cos(hour_angle)

# Feature and target split
# `booking_complete` is the target; every remaining column is a feature.
X = df.drop('booking_complete', axis=1)
y = df['booking_complete']
# Hold out 20% of the rows for testing, with a fixed seed for reproducibility.
# stratify=y keeps the class proportions of the target the same in the train
# and test partitions, so the hold-out metrics are not skewed by an unlucky
# draw of the rarer class (NOTE(review): confirm the class balance of
# `booking_complete`; stratifying is harmless if it is balanced).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Model training
# Fit a seeded 100-tree random forest on the training partition, then predict
# class labels for the held-out rows.
model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluation
# The first three metrics compare the predicted labels with the held-out
# labels. ROC AUC is computed from scores instead: column 1 of predict_proba
# on the test features.
report = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report)

conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy Score:", accuracy)

class1_proba = model.predict_proba(X_test)[:, 1]
print("ROC AUC Score:", roc_auc_score(y_test, class1_proba))

# Cross-validation
# 5-fold cross-validation over the full feature matrix and target (not only
# the training partition); prints the per-fold scores and their mean.
cv_scores = cross_val_score(
    model,
    X,
    y,
    cv=5,
)
print("Cross-Validation Accuracy Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())

# Feature importance visualization
# Rank the fitted forest's feature importances from largest to smallest and
# draw them as a horizontal bar chart, one bar per feature column.
features = X.columns
importances = model.feature_importances_
indices = np.flip(np.argsort(importances))  # positions in descending importance

ranked_importances = importances[indices]
ranked_features = features[indices]

plt.figure(figsize=(12, 6))
ax = sns.barplot(x=ranked_importances, y=ranked_features)
ax.set_title("Feature Importance from Random Forest")
plt.tight_layout()
plt.show()
