In [None]:
# Install packages
!pip install xgboost
!pip install -q imbalanced-learn

In [1]:
# Importing the libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import RobustScaler
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import RandomizedSearchCV

In [2]:
# Load the cleaned data
df = pd.read_csv('data/merged_data.csv')
df.head()

Unnamed: 0,Gender,Date of Birth,Source of Traffic,Onboard Wifi Service,Embarkation/Disembarkation time convenient,Ease of Online booking,Gate location,Logging,Onboard Dining Service,Online Check-in,...,Cleanliness,Ext_Intcode,Age,Cruise Name,Ticket Type,WiFi,Dining,Entertainment,Cruise Distance,Age Group
0,,1973-10-05,Direct - Company Website,2.0,3.0,5.0,3.0,2023-01-01 00:00:00,4.0,2.0,...,3.0,LB446RWOOZI,50.0,Blastoise,,1.0,1,1.0,3567.0,46-60
1,Female,,Indirect - Social Media,1.0,4.0,1.0,,2023-01-01 00:01:00,4.0,,...,4.0,LB138HKBECM,,Blastoise,Deluxe,,0,1.0,672.0,
2,Female,1998-07-22,Indirect - Search Engine,,3.0,0.0,5.0,2023-01-01 00:02:00,,,...,,BL713UHBAAN,25.0,Lapras,Deluxe,,0,0.0,1167.0,18-30
3,Female,1970-05-01,Direct - Company Website,4.0,4.0,4.0,4.0,2023-01-01 00:05:00,3.0,4.0,...,4.0,LB243DMKCFL,53.0,Lapras,Deluxe,,0,1.0,280.0,46-60
4,Male,1960-01-07,Direct - Company Website,3.0,4.0,2.0,,2023-01-01 00:06:00,1.0,2.0,...,,LB218CFLOBS,63.0,Lapras,Standard,,1,,1842.6943,61-80


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 141538 entries, 0 to 141537
Data columns (total 26 columns):
 #   Column                                      Non-Null Count   Dtype  
---  ------                                      --------------   -----  
 0   Gender                                      127308 non-null  object 
 1   Date of Birth                               126056 non-null  object 
 2   Source of Traffic                           141538 non-null  object 
 3   Onboard Wifi Service                        120910 non-null  float64
 4   Embarkation/Disembarkation time convenient  125005 non-null  float64
 5   Ease of Online booking                      122047 non-null  float64
 6   Gate location                               123400 non-null  float64
 7   Logging                                     141538 non-null  object 
 8   Onboard Dining Service                      123793 non-null  float64
 9   Online Check-in                             124909 non-null  float64
 

### Data Preprocessing

In [4]:
# Removing Logging column since the time where the customers information was logged shouldn't be relevant to avoid curse of dimensionality
# Removing Ext_Intcode since it is only relevant for consolidating our datasets
# Removing Date of Birth since we got the Age
# Removing Cruise Name since there was no specification/mention that there were any differences between the two
# Removing Age Group

columns_to_remove = ['Logging', 'Ext_Intcode', 'Date of Birth', 'Cruise Name', 'Age Group']
df.drop(columns_to_remove, axis=1, inplace=True)

In [5]:
# Dropping rows where our target variable 'Ticket Type' has missing values
df.dropna(subset=['Ticket Type'], inplace=True)


In [6]:
# Train Test Split Data

X = df.drop('Ticket Type', axis=1)
y = df['Ticket Type']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=df['Ticket Type']) # Stratify to ensure that the train and test data have the same proportion of ticket types


In [7]:
# Converting categorical columns 'Gender', Source of Traffic' and 'Ticket Type' to numerical with one hot encoding as they should not be ordinal in nature 

# Encode categorical columns separately for training and testing sets
X_train_encoded = pd.get_dummies(X_train, columns=['Gender', 'Source of Traffic'], drop_first=True)
X_test_encoded = pd.get_dummies(X_test, columns=['Gender', 'Source of Traffic'], drop_first=True)


In [8]:
X_train_encoded.head()

Unnamed: 0,Onboard Wifi Service,Embarkation/Disembarkation time convenient,Ease of Online booking,Gate location,Onboard Dining Service,Online Check-in,Cabin Comfort,Onboard Entertainment,Cabin service,Baggage handling,...,Cleanliness,Age,WiFi,Dining,Entertainment,Cruise Distance,Gender_Male,Source of Traffic_Direct - Email Marketing,Source of Traffic_Indirect - Search Engine,Source of Traffic_Indirect - Social Media
128306,2.0,2.0,2.0,2.0,3.0,,3.0,3.0,,4.0,...,,30.0,1.0,1,1.0,,1,0,0,0
19779,5.0,5.0,5.0,5.0,,4.0,,4.0,4.0,4.0,...,3.0,47.0,1.0,1,0.0,1620.0,0,1,0,0
107588,,3.0,3.0,3.0,3.0,4.0,4.0,2.0,2.0,,...,,,0.0,1,1.0,5333.35276,1,1,0,0
129179,5.0,4.0,5.0,3.0,3.0,5.0,3.0,3.0,1.0,4.0,...,3.0,,,0,,580.0,1,0,0,0
44639,3.0,3.0,3.0,3.0,,5.0,4.0,4.0,5.0,5.0,...,4.0,29.0,0.0,0,0.0,3419.0,1,0,0,0


### Filling in Missing Values with KNN Imputation

In [9]:
# Initialize KNN imputer
imputer = KNNImputer(n_neighbors=5)

# Impute missing values in training data
X_train_imputed = imputer.fit_transform(X_train_encoded)
X_test_imputed = imputer.transform(X_test_encoded)


In [11]:
# Round the imputed values to integers 
X_train_imputed = np.round(X_train_imputed)
X_test_imputed = np.round(X_test_imputed)

# Convert imputed data back to DataFrame
X_train = pd.DataFrame(X_train_imputed, columns=X_train_encoded.columns)
X_test = pd.DataFrame(X_test_imputed, columns=X_test_encoded.columns)

# Display the first 5 rows of the imputed training data
X_train.head()

Unnamed: 0,Onboard Wifi Service,Embarkation/Disembarkation time convenient,Ease of Online booking,Gate location,Onboard Dining Service,Online Check-in,Cabin Comfort,Onboard Entertainment,Cabin service,Baggage handling,...,Cleanliness,Age,WiFi,Dining,Entertainment,Cruise Distance,Gender_Male,Source of Traffic_Direct - Email Marketing,Source of Traffic_Indirect - Search Engine,Source of Traffic_Indirect - Social Media
0,2.0,2.0,2.0,2.0,3.0,3.0,3.0,3.0,4.0,4.0,...,3.0,30.0,1.0,1.0,1.0,1540.0,1.0,0.0,0.0,0.0
1,5.0,5.0,5.0,5.0,4.0,4.0,4.0,4.0,4.0,4.0,...,3.0,47.0,1.0,1.0,0.0,1620.0,0.0,1.0,0.0,0.0
2,3.0,3.0,3.0,3.0,3.0,4.0,4.0,2.0,2.0,2.0,...,3.0,47.0,0.0,1.0,1.0,5333.0,1.0,1.0,0.0,0.0
3,5.0,4.0,5.0,3.0,3.0,5.0,3.0,3.0,1.0,4.0,...,3.0,30.0,1.0,0.0,1.0,580.0,1.0,0.0,0.0,0.0
4,3.0,3.0,3.0,3.0,3.0,5.0,4.0,4.0,5.0,5.0,...,4.0,29.0,0.0,0.0,0.0,3419.0,1.0,0.0,0.0,0.0


As our main objective is to predict what type of tickets potential customers will purchase, post-trip features will not be relevant as the survey is only done after purchase of tickets as we will not have those information at the purchasing stage. We have utilized these features for imputation of our other features in the pre-purchase data, so we will drop them off here. 

In [12]:
# Removing post trip columns since they will not be available to our predictor model in real use case

columns_to_remove = ['Cruise Distance', 'WiFi','Dining','Entertainment']
X_train = X_train.drop(columns_to_remove, axis=1)
X_test = X_test.drop(columns_to_remove, axis=1)

#### Scaling our input features

In [13]:
# We will scale our input features since 'Age' differs largely with the rest
# As there are outliers seen in our data and we do not want to remove the outliers due to our data size being limited enough already, we will use RobustScaler, which scales features based on their median and interquartile range, making it robust to outliers

# Instantiate the scaler
scaler = RobustScaler()

# Fit the scaler on your training data and transform it
X_train_scaled = scaler.fit_transform(X_train)

# Transform your test data using the same scaler
X_test_scaled = scaler.transform(X_test)



In [14]:
# Encode the target variable
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

#### Creating a Baseline Model

In [15]:
# Initialize and train the logistic regression model
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train_scaled, y_train_encoded)

# Predict on the test set
y_pred = logreg.predict(X_test_scaled)

# Calculate and print the accuracy
accuracy = accuracy_score(y_test_encoded, y_pred)
print(f'Baseline Model Accuracy: {accuracy * 100:.2f}%')

# Display the classification report
print(classification_report(y_test_encoded, y_pred, target_names=label_encoder.classes_))

Baseline Model Accuracy: 63.81%
              precision    recall  f1-score   support

      Deluxe       0.00      0.00      0.00      1743
      Luxury       0.67      0.69      0.68     11514
    Standard       0.61      0.69      0.64     10800

    accuracy                           0.64     24057
   macro avg       0.43      0.46      0.44     24057
weighted avg       0.59      0.64      0.61     24057



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
# Convert NumPy arrays to DataFrames
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X_train.columns)
y_train_encoded_df = pd.DataFrame(y_train_encoded, columns=['target_column_name'])  # Replace 'target_column_name' with your actual target column name
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X_test.columns)

# Save DataFrames to CSV files
X_train_scaled_df.to_csv('data/X_train.csv', index=False)
y_train_encoded_df.to_csv('data/y_train_encoded.csv', index=False)
X_test_scaled_df.to_csv('data/X_test.csv', index=False)


In [17]:
# Initialize and train the Random Forest model
rf = RandomForestClassifier(class_weight='balanced', random_state=42)
rf.fit(X_train_scaled, y_train_encoded)

# Predict on the test set using Random Forest
y_pred_rf = rf.predict(X_test_scaled)

# Calculate and print the accuracy for Random Forest
accuracy_rf = accuracy_score(y_test_encoded, y_pred_rf)
print(f'Random Forest Model Accuracy: {accuracy_rf * 100:.2f}%')
print(classification_report(y_test_encoded, y_pred_rf, target_names=label_encoder.classes_))



Random Forest Model Accuracy: 81.62%
              precision    recall  f1-score   support

      Deluxe       0.67      0.09      0.16      1743
      Luxury       0.88      0.86      0.87     11514
    Standard       0.77      0.89      0.82     10800

    accuracy                           0.82     24057
   macro avg       0.77      0.61      0.62     24057
weighted avg       0.81      0.82      0.80     24057





In [19]:
# Initialize and train the XGBoost model
xgb = XGBClassifier(scale_pos_weight=20, random_state=42)
xgb.fit(X_train_scaled, y_train_encoded)

# Predict on the test set using XGBoost
y_pred_xgb = xgb.predict(X_test_scaled)

# Calculate and print the accuracy for XGBoost
accuracy_xgb = accuracy_score(y_test_encoded, y_pred_xgb)
print(f'XGBoost Model Accuracy: {accuracy_xgb * 100:.2f}%')
print(classification_report(y_test_encoded, y_pred_xgb, target_names=label_encoder.classes_))

Parameters: { "scale_pos_weight" } are not used.



XGBoost Model Accuracy: 81.24%
              precision    recall  f1-score   support

      Deluxe       0.39      0.02      0.03      1743
      Luxury       0.86      0.87      0.87     11514
    Standard       0.77      0.88      0.82     10800

    accuracy                           0.81     24057
   macro avg       0.67      0.59      0.57     24057
weighted avg       0.79      0.81      0.78     24057



### Resampling & Hyperparameter Tuning

Since our target variable class is imbalanced, we will first use resampling in an attempt to improve our better performing models. 

In [26]:
# Initialize SMOTE
smote = SMOTE(sampling_strategy='auto', random_state=42)

# Resample the training data prior to scaling
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train_encoded)


In [27]:
# Count the frequency of unique values in y_train_resampled
unique_values, counts = np.unique(y_train_resampled, return_counts=True)

# Create a dictionary to represent the distribution
distribution = dict(zip(unique_values, counts))

# Print the distribution
print(distribution)

{0: 46056, 1: 46056, 2: 46056}


In [28]:
# Fit the scaler on resampled training data and transform it
X_train_resampled_scaled = scaler.fit_transform(X_train_resampled)

In [29]:
# Initialize the classifiers
rf_clf_resampled = RandomForestClassifier(random_state=42)
xgb_clf_resampled = XGBClassifier(random_state=42)

# Train the classifiers on the resampled data
rf_clf_resampled.fit(X_train_resampled_scaled, y_train_resampled)
xgb_clf_resampled.fit(X_train_resampled_scaled, y_train_resampled)

# Predict on the test set
rf_preds_resampled = rf_clf_resampled.predict(X_test_scaled)
xgb_preds_resampled = xgb_clf_resampled.predict(X_test_scaled)



In [30]:
# Decode the predicted labels
rf_decoded_preds_resampled = label_encoder.inverse_transform(rf_preds_resampled)
xgb_decoded_preds_resampled = label_encoder.inverse_transform(xgb_preds_resampled)

# Calculate accuracy
rf_acc_resampled = accuracy_score(y_test, rf_decoded_preds_resampled) * 100
xgb_acc_resampled = accuracy_score(y_test, xgb_decoded_preds_resampled) * 100

# Generate classification reports
rf_report_resampled = classification_report(y_test, rf_decoded_preds_resampled, target_names=label_encoder.classes_)
xgb_report_resampled = classification_report(y_test, xgb_decoded_preds_resampled, target_names=label_encoder.classes_)



In [31]:
print(f'Resampled Random Forest Model Accuracy:',rf_acc_resampled)
print(rf_report_resampled)

Resampled Random Forest Model Accuracy: 61.790746975932166
              precision    recall  f1-score   support

      Deluxe       0.08      0.25      0.12      1743
      Luxury       0.89      0.50      0.64     11514
    Standard       0.71      0.80      0.75     10800

    accuracy                           0.62     24057
   macro avg       0.56      0.52      0.51     24057
weighted avg       0.75      0.62      0.65     24057



In [32]:
print(f'Resampled XGBoost Model Accuracy', xgb_acc_resampled)
print(xgb_report_resampled)

Resampled XGBoost Model Accuracy 66.92854470632248
              precision    recall  f1-score   support

      Deluxe       0.13      0.33      0.18      1743
      Luxury       0.86      0.72      0.78     11514
    Standard       0.73      0.67      0.70     10800

    accuracy                           0.67     24057
   macro avg       0.57      0.57      0.56     24057
weighted avg       0.75      0.67      0.70     24057



Models performed worse after resampling. We will perform hyperparameter tuning instead. 

In [33]:
# Due to time constraints, we will use RandomSearchCV

# Define a smaller parameter grid
param_distributions = {
    'n_estimators': [50, 100, 200],
    'max_features': ['auto', 'sqrt'],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 10],
    'min_samples_leaf': [1, 4],
    'bootstrap': [True, False]
}

# Choosing Random Forest which performed slightly better
rf = RandomForestClassifier()

# Instantiate the randomized search model
random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_distributions,
                                   n_iter=50, cv=2, n_jobs=-1, verbose=2, scoring='accuracy')

# Fit the random search to the data
random_search.fit(X_train_scaled, y_train_encoded)

# Get the best parameters
best_params = random_search.best_params_
best_params


Fitting 2 folds for each of 50 candidates, totalling 100 fits


  warn(


{'n_estimators': 100,
 'min_samples_split': 10,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': None,
 'bootstrap': False}

In [43]:
# Initialize the classifier with the best parameters
rf_best = RandomForestClassifier(n_estimators=200, min_samples_split=10, min_samples_leaf=1,
                                 max_features='auto', max_depth=30, bootstrap=False, random_state=42)

# Train the classifier
rf_best.fit(X_train_scaled, y_train_encoded)

# Predict on the test data
y_pred_rf_best = rf_best.predict(X_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test_encoded, y_pred_rf_best)
print(f"Best Params Random Forest Accuracy: {accuracy*100:.2f}%")
print("\nClassification Report:\n", classification_report(y_test_encoded, y_pred_rf_best, target_names=label_encoder.classes_))


  warn(


Best Params Random Forest Accuracy: 82.11%

Classification Report:
               precision    recall  f1-score   support

      Deluxe       0.87      0.06      0.12      1743
      Luxury       0.88      0.86      0.87     11514
    Standard       0.77      0.90      0.83     10800

    accuracy                           0.82     24057
   macro avg       0.84      0.61      0.61     24057
weighted avg       0.83      0.82      0.80     24057



In [35]:
# Define the parameter grid
param_dist = {
    'n_estimators': np.arange(50, 1000, 50),
    'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3],
    'max_depth': np.arange(3, 10, 1),
    'colsample_bytree': [0.5, 0.7, 0.8, 0.9, 1],
    'subsample': [0.5, 0.7, 0.8, 0.9, 1],
    'gamma': [0, 0.1, 0.2, 0.3, 0.4, 0.5],
    'min_child_weight': np.arange(1, 10, 1)
}

# Instantiate the model
xgb = XGBClassifier(objective='multi:softmax', num_class=3)

# Create the RandomizedSearchCV object
random_search_xgb = RandomizedSearchCV(
    xgb, 
    param_distributions=param_dist, 
    n_iter=100, 
    scoring='accuracy', 
    cv=5, 
    verbose=1, 
    n_jobs=-1
)

# Fit the model
random_search_xgb.fit(X_train_scaled, y_train_encoded)

# Get the best parameters and estimator
best_params_xgb = random_search_xgb.best_params_
best_xgb = random_search_xgb.best_estimator_

print("Best Parameters:", best_params_xgb)


Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best Parameters: {'subsample': 0.7, 'n_estimators': 700, 'min_child_weight': 9, 'max_depth': 9, 'learning_rate': 0.05, 'gamma': 0.2, 'colsample_bytree': 0.9}


In [42]:
# Train the best XGBoost model on the entire training dataset
best_xgb.fit(X_train_scaled, y_train_encoded)

# Make predictions on the test data
y_pred = best_xgb.predict(X_test_scaled)


# Evaluate the model
accuracy = accuracy_score(y_test_encoded, y_pred)
print(f"Best Params XGBoost Accuracy: {accuracy*100:.2f}%")
print("\nClassification Report:\n", classification_report(y_test_encoded, y_pred, target_names=label_encoder.classes_))


Best Params XGBoost Accuracy: 82.13%

Classification Report:
               precision    recall  f1-score   support

      Deluxe       0.51      0.06      0.11      1743
      Luxury       0.88      0.88      0.88     11514
    Standard       0.78      0.89      0.83     10800

    accuracy                           0.82     24057
   macro avg       0.72      0.61      0.60     24057
weighted avg       0.80      0.82      0.80     24057



In our use case, we are trying to predict the ticket type potential customers will purchase and aim to customise the experiences and amenities, hence we will evaluate the models based on the precision score, since having false positives will incur unneccessary costs for the business strategies. 

Based on the precision scores, we will pick the better model, Random Forest, with overall higher precision score for all 3 ticket types. 

### Feature Importance

In [37]:
# Extracting feature importances from the trained XGBoost model
feature_importances = rf_best.feature_importances_
features = X_train.columns

# Creating a DataFrame to display the feature importances
importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# Displaying the top features based on their importance
importance_df.head(10)

Unnamed: 0,Feature,Importance
13,Age,0.139612
5,Online Check-in,0.110819
1,Embarkation/Disembarkation time convenient,0.076618
0,Onboard Wifi Service,0.076221
3,Gate location,0.061858
8,Cabin service,0.061613
9,Baggage handling,0.057985
2,Ease of Online booking,0.056662
11,Onboard Service,0.055813
7,Onboard Entertainment,0.053786


# Creating a Pipeline to train entire set of data with chosen models and prepare for Deployment

In [38]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pickle
import json

# Custom transformer for target encoding
class TargetEncoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.label_encoder = LabelEncoder()

    def fit(self, y):
        self.label_encoder.fit(y)
        return self

    def transform(self, y):
        return self.label_encoder.transform(y)

    def inverse_transform(self, y_encoded):
        return self.label_encoder.inverse_transform(y_encoded)



# Load configuration from JSON file
with open('config.json', 'r', encoding='utf-8') as file:
    config = json.load(file)

# Define the classifier based on the configuration
if config['classifier'] == 'xgboost':
    classifier = XGBClassifier(**config['xgboost'])
elif config['classifier'] == 'random_forest':
    classifier = RandomForestClassifier(**config['random_forest'])
else:
    raise ValueError(f"Unknown classifier: {config['classifier']}")

# Define the feature transformer (in this case, just scaling)
feature_transformer = ColumnTransformer(
    transformers=[
        ('scaler', RobustScaler(), list(X_train.columns))
    ],
    remainder='passthrough'
)

# Create the full pipeline
pipeline = Pipeline([
    ('features', feature_transformer),
    ('classifier', classifier)
])

# Combine the training and test sets
X_combined = pd.concat([X_train, X_test])
y_combined = pd.concat([y_train, y_test])

# Fit the target encoder
target_encoder = TargetEncoder()
y_combined_encoded = target_encoder.fit_transform(y_combined)

# Fit the pipeline
pipeline.fit(X_combined, y_combined_encoded)

# Serialize and save the pipeline and target encoder
with open('model_pipeline.pkl', 'wb') as pipeline_file:
    pickle.dump(pipeline, pipeline_file)

with open('target_encoder.pkl', 'wb') as encoder_file:
    pickle.dump(target_encoder, encoder_file)


In [39]:
pip freeze > requirements.txt

Note: you may need to restart the kernel to use updated packages.
