In [None]:
'''
XGBoost: eXtreme Gradient Boosting


Definition:
1. XGBoost Machine Learning Algorithm is an optimized distributed gradient boosting library 
    designed to be highly efficient, flexible, and portable.
2. It implements machine learning algorithms under the Gradient Boosting framework.
3. XGBoost is widely used for supervised learning problems, particularly in classification and regression tasks.
4. It is known for its performance and speed, 
    making it a popular choice in data science competitions and real-world applications.

How it works:
1. XGBoost builds an ensemble of decision trees in a sequential manner, 
    where each new tree corrects the errors made by the previous ones.
2. It uses a gradient descent algorithm to minimize the loss function, 
    which measures the difference between the predicted and actual values.
3. The algorithm can handle both numerical and categorical data, and it supports missing values natively.
4. XGBoost includes regularization techniques to prevent overfitting, 
    making it robust against noisy data.
5. Different from Gradient Boosting, XGBoost uses a more sophisticated tree pruning algorithm
    and can handle sparse data efficiently.


'''

In [1]:
# Data Collection:
#https://www.kaggle.com/datasets/susant4learning/holiday-package-purchase-prediction?resource=download&select=Travel.csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
df = pd.read_csv(r'Travel.xls')
df.head()


Unnamed: 0,CustomerID,ProdTaken,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,Designation,MonthlyIncome
0,200000,1,41.0,Self Enquiry,3,6.0,Salaried,Female,3,3.0,Deluxe,3.0,Single,1.0,1,2,1,0.0,Manager,20993.0
1,200001,0,49.0,Company Invited,1,14.0,Salaried,Male,3,4.0,Deluxe,4.0,Divorced,2.0,0,3,1,2.0,Manager,20130.0
2,200002,1,37.0,Self Enquiry,1,8.0,Free Lancer,Male,3,4.0,Basic,3.0,Single,7.0,1,3,0,0.0,Executive,17090.0
3,200003,0,33.0,Company Invited,1,9.0,Salaried,Female,2,3.0,Basic,3.0,Divorced,2.0,1,5,1,1.0,Executive,17909.0
4,200004,0,,Self Enquiry,1,8.0,Small Business,Male,2,3.0,Basic,4.0,Divorced,1.0,0,5,1,0.0,Executive,18468.0


In [2]:
df.isnull().sum()
df['Gender'] = df['Gender'].apply(lambda x:'Female' if x=='Fe Male' else x)
df['Gender'].value_counts()

Gender
Male      2916
Female    1972
Name: count, dtype: int64

In [3]:
df['MaritalStatus'] = df['MaritalStatus'].replace({'Single':'Unmarried'})
df['MaritalStatus'].value_counts()
## Checking missing values

feature_with_na = [features for features in df.columns if df[features].isnull().sum() > 0]
for feature in feature_with_na:
    #print(f"{feature} has {df[feature].isnull().sum()} missing values.")
    print(feature, np.round(df[feature].isnull().mean()*100, 5), '% missing values')
# statistical summary of numerical columns
df[feature_with_na].select_dtypes(exclude='object').describe()
import warnings
warnings.filterwarnings('ignore')
df.Age.fillna(df.Age.median(), inplace=True)
df.TypeofContact.fillna(df.TypeofContact.mode()[0], inplace=True)
df.DurationOfPitch.fillna(df.DurationOfPitch.median(), inplace=True)
df.NumberOfFollowups.fillna(df.NumberOfFollowups.mode()[0], inplace=True)
df.PreferredPropertyStar.fillna(df.PreferredPropertyStar.mode()[0], inplace=True)
df.NumberOfTrips.fillna(df.NumberOfTrips.median(), inplace=True)
df.NumberOfChildrenVisiting.fillna(df.NumberOfChildrenVisiting.mode()[0], inplace=True)
df.MonthlyIncome.fillna(df.MonthlyIncome.median(), inplace=True)
df.isnull().sum()
df.drop(columns=['CustomerID'], inplace=True)
# Create new columns for feature extraction
df['TotalVisiting'] = df.NumberOfChildrenVisiting + df.NumberOfPersonVisiting
df.drop(columns=['NumberOfChildrenVisiting', 'NumberOfPersonVisiting'], inplace=True)
# get all numerical columns
numerical_cols = [feature for feature in df.columns if df[feature].dtype!= 'O']
print(len(numerical_cols))
# get all numerical columns
categorical_cols = [feature for feature in df.columns if df[feature].dtype == 'O']
print(len(categorical_cols))
# discrete features - also knowns as categorical features
# are those features which have a limited number of unique values
discrete_features = [feature for feature in numerical_cols if len(df[feature].unique()) < 25]
print(len(discrete_features))
# continuous features - are those features which have a large number of unique values
continuous_features = [feature for feature in numerical_cols if feature not in discrete_features]
print(len(continuous_features))
from sklearn.model_selection import train_test_split
X = df.drop(columns=['ProdTaken'], axis=1)
y = df['ProdTaken']
# separate dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape
cat_features = X.select_dtypes(include='object').columns
num_features = X.select_dtypes(exclude='object').columns
print("Categorical Features:", cat_features)
print("Numerical Features:", num_features)
# One Hot Encoding for Categorical Features and Standardization for Numerical Features
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(drop='first')
preprocessor = ColumnTransformer(
    [
    ("OneHotEncoder", categorical_transformer, cat_features),
    ("StandardScaler", numeric_transformer, num_features)
    ]
)
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)
y_train

Age 4.62357 % missing values
TypeofContact 0.51146 % missing values
DurationOfPitch 5.13502 % missing values
NumberOfFollowups 0.92062 % missing values
PreferredPropertyStar 0.53191 % missing values
NumberOfTrips 2.86416 % missing values
NumberOfChildrenVisiting 1.35025 % missing values
MonthlyIncome 4.76678 % missing values
12
6
9
3
Categorical Features: Index(['TypeofContact', 'Occupation', 'Gender', 'ProductPitched',
       'MaritalStatus', 'Designation'],
      dtype='object')
Numerical Features: Index(['Age', 'CityTier', 'DurationOfPitch', 'NumberOfFollowups',
       'PreferredPropertyStar', 'NumberOfTrips', 'Passport',
       'PitchSatisfactionScore', 'OwnCar', 'MonthlyIncome', 'TotalVisiting'],
      dtype='object')


3995    0
2610    0
3083    0
3973    0
4044    0
       ..
4426    0
466     0
3092    0
3772    0
860     1
Name: ProdTaken, Length: 3910, dtype: int64

In [7]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve, precision_score, recall_score, f1_score

In [9]:
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "AdaBoost": AdaBoostClassifier(),
    "XGBoost": XGBClassifier()
}

In [10]:
for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train)

    #Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # training set performance
    model_train_accuracy = accuracy_score(y_train, y_train_pred)
    model_train_f1 = f1_score(y_train, y_train_pred, average='weighted')
    model_train_recall = recall_score(y_train, y_train_pred)
    model_train_precision = precision_score(y_train, y_train_pred)
    model_train_roc_auc = roc_auc_score(y_train, y_train_pred)

    # test set performance
    model_test_accuracy = accuracy_score(y_test, y_test_pred)
    model_test_f1 = f1_score(y_test, y_test_pred, average='weighted')
    model_test_recall = recall_score(y_test, y_test_pred)
    model_test_precision = precision_score(y_test, y_test_pred)
    model_test_roc_auc = roc_auc_score(y_test, y_test_pred)

    print(f"Model: {list(models.keys())[i]}")
    print("Training Set Performance:")
    print(f"Training Accuracy: {model_train_accuracy}")
    print(f"Training F1 Score: {model_train_f1}")
    print(f"Training Recall: {model_train_recall}")
    print(f"Training Precision: {model_train_precision}")
    print(f"Training ROC AUC: {model_train_roc_auc}")

    print("Test Set Performance:")
    print(f"Test Accuracy: {model_test_accuracy}")
    print(f"Test F1 Score: {model_test_f1}")
    print(f"Test Recall: {model_test_recall}")
    print(f"Test Precision: {model_test_precision}")
    print(f"Test ROC AUC: {model_test_roc_auc}")
    print("-"*50)


Model: Logistic Regression
Training Set Performance:
Training Accuracy: 0.8460358056265984
Training F1 Score: 0.8202118738880438
Training Recall: 0.30315500685871055
Training Precision: 0.7015873015873015
Training ROC AUC: 0.6368022755136056
Test Set Performance:
Test Accuracy: 0.83640081799591
Test F1 Score: 0.8086633047343356
Test Recall: 0.2931937172774869
Test Precision: 0.691358024691358
Test ROC AUC: 0.630713758257549
--------------------------------------------------
Model: Random Forest
Training Set Performance:
Training Accuracy: 1.0
Training F1 Score: 1.0
Training Recall: 1.0
Training Precision: 1.0
Training ROC AUC: 1.0
Test Set Performance:
Test Accuracy: 0.9284253578732107
Test F1 Score: 0.9228478076359619
Test Recall: 0.6544502617801047
Test Precision: 0.9689922480620154
Test ROC AUC: 0.8246838348290613
--------------------------------------------------
Model: Decision Tree
Training Set Performance:
Training Accuracy: 1.0
Training F1 Score: 1.0
Training Recall: 1.0
Traini

In [11]:
#  Hyperparameter Tuning for XGBoost

import xgboost


rf_params = {
    "max_depth": [5, 8, 15, None, 10],
    "max_features": [5, 7, "auto", 8],
    "n_estimators": [100, 200, 500, 1000],
    "min_samples_split": [2, 8, 15, 20]
}

xgboost_params = {
    "max_depth": [5, 8, 12, 20, 30],
    "learning_rate": [0.1, 0.01],
    "n_estimators": [100, 200, 300],
    "colsample_bytree": [0.5,0.8, 1, 0.3, 0.4]
    }

In [12]:
# model list for hyperparameter tuning
randomcv_models = [
    ("RF", RandomForestClassifier(), rf_params),
    ("Xgboost", XGBClassifier(), xgboost_params),
]

In [13]:
from sklearn.model_selection import RandomizedSearchCV
import warnings
warnings.filterwarnings("ignore")

model_param = {}
for name, model, params in randomcv_models:
    random = RandomizedSearchCV(estimator=model,
                                param_distributions=params,
                                n_iter=100,
                                cv=3,
                                verbose=2,
                                n_jobs=-1,
                                )
    random.fit(X_train, y_train)
    model_param[name] = random.best_params_

for model_name, params in model_param.items():
    print(f"Best parameters for {model_name}: {params}")


Fitting 3 folds for each of 100 candidates, totalling 300 fits
Fitting 3 folds for each of 100 candidates, totalling 300 fits
Best parameters for RF: {'n_estimators': 200, 'min_samples_split': 2, 'max_features': 7, 'max_depth': None}
Best parameters for Xgboost: {'n_estimators': 300, 'max_depth': 20, 'learning_rate': 0.1, 'colsample_bytree': 0.8}


In [14]:
# remodeling using hyperparameter
models = {
    "Random Forest": RandomForestClassifier(**model_param['RF']),
    "Xgboost": XGBClassifier(**model_param['Xgboost'])
}

In [15]:
for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train)

    #Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # training set performance
    model_train_accuracy = accuracy_score(y_train, y_train_pred)
    model_train_f1 = f1_score(y_train, y_train_pred, average='weighted')
    model_train_recall = recall_score(y_train, y_train_pred)
    model_train_precision = precision_score(y_train, y_train_pred)
    model_train_roc_auc = roc_auc_score(y_train, y_train_pred)

    # test set performance
    model_test_accuracy = accuracy_score(y_test, y_test_pred)
    model_test_f1 = f1_score(y_test, y_test_pred, average='weighted')
    model_test_recall = recall_score(y_test, y_test_pred)
    model_test_precision = precision_score(y_test, y_test_pred)
    model_test_roc_auc = roc_auc_score(y_test, y_test_pred)

    print(f"Model: {list(models.keys())[i]}")
    print("Training Set Performance:")
    print(f"Training Accuracy: {model_train_accuracy}")
    print(f"Training F1 Score: {model_train_f1}")
    print(f"Training Recall: {model_train_recall}")
    print(f"Training Precision: {model_train_precision}")
    print(f"Training ROC AUC: {model_train_roc_auc}")

    print("Test Set Performance:")
    print(f"Test Accuracy: {model_test_accuracy}")
    print(f"Test F1 Score: {model_test_f1}")
    print(f"Test Recall: {model_test_recall}")
    print(f"Test Precision: {model_test_precision}")
    print(f"Test ROC AUC: {model_test_roc_auc}")
    print("-"*50)


Model: Random Forest
Training Set Performance:
Training Accuracy: 1.0
Training F1 Score: 1.0
Training Recall: 1.0
Training Precision: 1.0
Training ROC AUC: 1.0
Test Set Performance:
Test Accuracy: 0.934560327198364
Test F1 Score: 0.9300638588831596
Test Recall: 0.6858638743455497
Test Precision: 0.9703703703703703
Test ROC AUC: 0.8403906411117836
--------------------------------------------------
Model: Xgboost
Training Set Performance:
Training Accuracy: 1.0
Training F1 Score: 1.0
Training Recall: 1.0
Training Precision: 1.0
Training ROC AUC: 1.0
Test Set Performance:
Test Accuracy: 0.9601226993865031
Test F1 Score: 0.9587468438566427
Test Recall: 0.8167539267015707
Test Precision: 0.975
Test ROC AUC: 0.9058356672897941
--------------------------------------------------
