In [None]:
'''
XGBoost: eXtreme Gradient Boosting


Definition:
1. XGBoost is an optimized, distributed gradient boosting library
    designed to be highly efficient, flexible, and portable.
2. It implements machine learning algorithms under the Gradient Boosting framework.
3. XGBoost is widely used for supervised learning problems, particularly in classification and regression tasks.
4. It is known for its performance and speed, 
    making it a popular choice in data science competitions and real-world applications.

How it works:
1. XGBoost builds an ensemble of decision trees in a sequential manner, 
    where each new tree corrects the errors made by the previous ones.
2. It uses a gradient descent algorithm to minimize the loss function, 
    which measures the difference between the predicted and actual values.
3. The algorithm can handle both numerical and categorical data, and it supports missing values natively.
4. XGBoost includes regularization techniques to prevent overfitting, 
    making it robust against noisy data.
5. Unlike a standard Gradient Boosting implementation, XGBoost uses a more sophisticated
    tree pruning algorithm and can handle sparse data efficiently.


'''

In [1]:
# Data Collection:
#https://www.kaggle.com/datasets/susant4learning/holiday-package-purchase-prediction?resource=download&select=Travel.csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
# Load the dataset from the working directory into a DataFrame.
# NOTE(review): the file carries an .xls extension but is parsed as CSV text
# (the Kaggle link above selects Travel.csv) - confirm the intended file name.
df = pd.read_csv(filepath_or_buffer='Travel.xls')
# Preview the first rows as the cell output.
df.head()


Unnamed: 0,CustomerID,ProdTaken,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,Designation,MonthlyIncome
0,200000,1,41.0,Self Enquiry,3,6.0,Salaried,Female,3,3.0,Deluxe,3.0,Single,1.0,1,2,1,0.0,Manager,20993.0
1,200001,0,49.0,Company Invited,1,14.0,Salaried,Male,3,4.0,Deluxe,4.0,Divorced,2.0,0,3,1,2.0,Manager,20130.0
2,200002,1,37.0,Self Enquiry,1,8.0,Free Lancer,Male,3,4.0,Basic,3.0,Single,7.0,1,3,0,0.0,Executive,17090.0
3,200003,0,33.0,Company Invited,1,9.0,Salaried,Female,2,3.0,Basic,3.0,Divorced,2.0,1,5,1,1.0,Executive,17909.0
4,200004,0,,Self Enquiry,1,8.0,Small Business,Male,2,3.0,Basic,4.0,Divorced,1.0,0,5,1,0.0,Executive,18468.0


In [2]:
# Per-column null counts (not rendered: only a cell's last expression is displayed).
df.isnull().sum()
# Fold the 'Fe Male' label into 'Female'; every other value is left untouched.
df['Gender'] = df['Gender'].replace({'Fe Male': 'Female'})
# Show the resulting label counts as the cell output.
df['Gender'].value_counts()

Gender
Male      2916
Female    1972
Name: count, dtype: int64

In [3]:
# Merge the 'Single' label into 'Unmarried'.
df['MaritalStatus'] = df['MaritalStatus'].replace(to_replace='Single', value='Unmarried')
df['MaritalStatus'].value_counts()

## Checking missing values
# Names of every column that holds at least one null entry.
feature_with_na = [column for column in df.columns if df[column].isnull().any()]
# Report the share of null entries per affected column, as a percentage.
for feature in feature_with_na:
    print(feature, np.round(100 * df[feature].isnull().mean(), 5), '% missing values')
# Statistical summary of the non-object columns that have missing values.
df.loc[:, feature_with_na].select_dtypes(exclude='object').describe()
import warnings
warnings.filterwarnings('ignore')
# Impute missing values column by column and assign the result back to df.
#
# The previous form, `df.Age.fillna(..., inplace=True)`, is a chained assignment:
# it fills an intermediate Series object.  pandas 2.2 emits a FutureWarning for it
# (hidden by the blanket warning filter above) and, with Copy-on-Write enabled
# (the default from pandas 3.0), the parent DataFrame is never updated, so the
# nulls would silently survive.  Explicit assignment works on every version.

# Median imputation.
df['Age'] = df['Age'].fillna(df['Age'].median())
df['DurationOfPitch'] = df['DurationOfPitch'].fillna(df['DurationOfPitch'].median())
df['NumberOfTrips'] = df['NumberOfTrips'].fillna(df['NumberOfTrips'].median())
df['MonthlyIncome'] = df['MonthlyIncome'].fillna(df['MonthlyIncome'].median())

# Mode imputation (mode() may return several values; the first one is used).
df['TypeofContact'] = df['TypeofContact'].fillna(df['TypeofContact'].mode()[0])
df['NumberOfFollowups'] = df['NumberOfFollowups'].fillna(df['NumberOfFollowups'].mode()[0])
df['PreferredPropertyStar'] = df['PreferredPropertyStar'].fillna(df['PreferredPropertyStar'].mode()[0])
df['NumberOfChildrenVisiting'] = df['NumberOfChildrenVisiting'].fillna(df['NumberOfChildrenVisiting'].mode()[0])

# Per-column null counts after imputation (not rendered unless it ends a cell).
df.isnull().sum()
# Remove the CustomerID column from the frame.
df.drop(columns=['CustomerID'], inplace=True)

# Feature extraction: combine the two visitor counts into a single column,
# then remove the two columns it was built from.
df['TotalVisiting'] = df['NumberOfPersonVisiting'] + df['NumberOfChildrenVisiting']
df.drop(columns=['NumberOfChildrenVisiting', 'NumberOfPersonVisiting'], inplace=True)

# Numerical columns: every column whose dtype is not object.
numerical_cols = [column for column, dtype in df.dtypes.items() if dtype != 'O']
print(len(numerical_cols))

# Categorical columns: every object-dtype column.
categorical_cols = [column for column, dtype in df.dtypes.items() if dtype == 'O']
print(len(categorical_cols))

# Discrete features: numerical columns with fewer than 25 distinct values
# (a null, if present, counts as one distinct value).
discrete_features = [
    column for column in numerical_cols
    if df[column].nunique(dropna=False) < 25
]
print(len(discrete_features))

# Continuous features: the numerical columns that are not discrete.
continuous_features = [
    column for column in numerical_cols
    if column not in discrete_features
]
print(len(continuous_features))
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# ProdTaken is the target; every remaining column is a predictor.
y = df['ProdTaken']
X = df.drop(columns=['ProdTaken'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

# Split the predictor columns by dtype: object vs. everything else.
num_features = X.select_dtypes(exclude='object').columns
cat_features = X.select_dtypes(include='object').columns
print("Categorical Features:", cat_features)
print("Numerical Features:", num_features)

# One-hot encode the object columns (dropping the first level of each) and
# standardize the remaining columns.
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(drop='first')
preprocessor = ColumnTransformer(
    transformers=[
        ("OneHotEncoder", categorical_transformer, cat_features),
        ("StandardScaler", numeric_transformer, num_features),
    ]
)

# Fit on the training rows only, then apply the fitted transformers to the test
# rows.  Both names are rebound to the transformed matrices from here on.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)
y_train

Age 4.62357 % missing values
TypeofContact 0.51146 % missing values
DurationOfPitch 5.13502 % missing values
NumberOfFollowups 0.92062 % missing values
PreferredPropertyStar 0.53191 % missing values
NumberOfTrips 2.86416 % missing values
NumberOfChildrenVisiting 1.35025 % missing values
MonthlyIncome 4.76678 % missing values
12
6
9
3
Categorical Features: Index(['TypeofContact', 'Occupation', 'Gender', 'ProductPitched',
       'MaritalStatus', 'Designation'],
      dtype='object')
Numerical Features: Index(['Age', 'CityTier', 'DurationOfPitch', 'NumberOfFollowups',
       'PreferredPropertyStar', 'NumberOfTrips', 'Passport',
       'PitchSatisfactionScore', 'OwnCar', 'MonthlyIncome', 'TotalVisiting'],
      dtype='object')


3995    0
2610    0
3083    0
3973    0
4044    0
       ..
4426    0
466     0
3092    0
3772    0
860     1
Name: ProdTaken, Length: 3910, dtype: int64

In [5]:
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [14]:
# Create a function to Evaluate Model
def evaluate_model(true, predicted):
    """Print MAE, RMSE and R2 for a set of predictions.

    Parameters
    ----------
    true : array-like
        Ground-truth target values.
    predicted : array-like
        Model predictions, aligned with ``true``.

    Returns
    -------
    None
        The metrics are printed on a single line, each with four decimals.
    """
    mae = mean_absolute_error(true, predicted)
    # MSE is computed once and only used to derive RMSE (the earlier version
    # computed it twice and kept an unused copy).
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2 = r2_score(true, predicted)
    print(f"MAE: {mae:.4f}, RMSE: {rmse:.4f}, R2: {r2:.4f}")
    

In [15]:
# Baseline regressors, each built with its library defaults, keyed by display label.
# NOTE(review): the target shown earlier (ProdTaken) holds 0/1 values, so these
# regressors are fitting a binary flag - confirm regression is what is intended.
models = {
    label: estimator_cls()
    for label, estimator_cls in (
        ('LinearRegression', LinearRegression),
        ('Ridge', Ridge),
        ('Lasso', Lasso),
        ('KNeighborsRegressor', KNeighborsRegressor),
        ('DecisionTreeRegressor', DecisionTreeRegressor),
        ('RandomForestRegressor', RandomForestRegressor),
        ('AdaBoostRegressor', AdaBoostRegressor),
        ("GradientBoostingRegressor", GradientBoostingRegressor),
        ("Xgboost Regressor", XGBRegressor),
    )
}

In [17]:
import warnings
warnings.filterwarnings("ignore")

# Fit every baseline model and report its metrics on both splits.
# Iterating over items() avoids rebuilding the key/value lists on each pass, and
# the model label is printed before its metrics so each report block is headed
# by the model it describes.
for name, model in models.items():
    model.fit(X_train, y_train)  # train the model

    # make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    print(f"Model: {name}")

    print("Train Set Metrics:")
    evaluate_model(y_train, y_train_pred)

    print("Test Set Metrics:")
    evaluate_model(y_test, y_test_pred)

    print("\n" + "="*35 + "\n")

Train Set Metrics:
MAE: 0.2590, RMSE: 0.3463, R2: 0.2095
Test Set Metrics:
MAE: 0.2660, RMSE: 0.3547, R2: 0.1995
Model: LinearRegression


Train Set Metrics:
MAE: 0.2592, RMSE: 0.3463, R2: 0.2092
Test Set Metrics:
MAE: 0.2661, RMSE: 0.3547, R2: 0.1994
Model: Ridge


Train Set Metrics:
MAE: 0.3034, RMSE: 0.3895, R2: 0.0000
Test Set Metrics:
MAE: 0.3089, RMSE: 0.3965, R2: -0.0005
Model: Lasso


Train Set Metrics:
MAE: 0.1087, RMSE: 0.2107, R2: 0.7074
Test Set Metrics:
MAE: 0.1521, RMSE: 0.2810, R2: 0.4977
Model: KNeighborsRegressor


Train Set Metrics:
MAE: 0.0000, RMSE: 0.0000, R2: 1.0000
Test Set Metrics:
MAE: 0.0818, RMSE: 0.2860, R2: 0.4795
Model: DecisionTreeRegressor


Train Set Metrics:
MAE: 0.0496, RMSE: 0.0839, R2: 0.9535
Test Set Metrics:
MAE: 0.1323, RMSE: 0.2192, R2: 0.6943
Model: RandomForestRegressor


Train Set Metrics:
MAE: 0.2497, RMSE: 0.3568, R2: 0.1608
Test Set Metrics:
MAE: 0.2592, RMSE: 0.3671, R2: 0.1427
Model: AdaBoostRegressor


Train Set Metrics:
MAE: 0.2018, RM

In [18]:
# Hyper-parameter search space for the random forest regressor.
# max_features previously included "auto", which scikit-learn deprecated in 1.1
# and removed in 1.3 (candidates using it fail to fit).  For a regressor "auto"
# meant "use all features", which is what 1.0 expresses on every version.
rf_params = {
    "max_depth": [5, 8, 15, None, 10],
    "n_estimators": [100, 200, 500, 1000],
    "min_samples_split": [2, 8, 15, 20],
    "max_features": [5, 7, 1.0, 8]
}
# Hyper-parameter search space for the XGBoost regressor.
xgboost_params = {
    "max_depth": [5, 8, 12, 20, 30],
    "learning_rate": [0.1, 0.01],
    "n_estimators": [100, 200, 300],
    "colsample_bytree": [0.5, 0.8, 1, 0.3, 0.4]
}

In [19]:
# Candidates for the randomized search, as (label, untuned estimator, search space)
# tuples: the three parallel sequences below are zipped position by position.
randomcv_models = list(zip(
    ("RF", "Xgboosting"),
    (RandomForestRegressor(), XGBRegressor()),
    (rf_params, xgboost_params),
))

In [20]:
from sklearn.model_selection import RandomizedSearchCV

# Best hyper-parameters found for each candidate, keyed by its short label.
model_param = {}
for name, model, params in randomcv_models:
    # Sample 100 parameter combinations and score each with 3-fold CV on all
    # cores.  random_state pins which combinations are sampled so the search is
    # repeatable (same seed as the train/test split); without it every run
    # could evaluate a different set of candidates.
    random_search = RandomizedSearchCV(
        estimator=model,
        param_distributions=params,
        n_iter=100,
        cv=3,
        verbose=2,
        n_jobs=-1,
        random_state=42,
    )
    random_search.fit(X_train, y_train)
    model_param[name] = random_search.best_params_
    print(f"Best parameters for {name}: {random_search.best_params_}")

Fitting 3 folds for each of 100 candidates, totalling 300 fits
Best parameters for RF: {'n_estimators': 500, 'min_samples_split': 2, 'max_features': 7, 'max_depth': 15}
Fitting 3 folds for each of 100 candidates, totalling 300 fits
Best parameters for Xgboosting: {'n_estimators': 100, 'max_depth': 12, 'learning_rate': 0.1, 'colsample_bytree': 0.8}


In [21]:
# Rebuild the two searched estimators with the best parameters found for each.
# Note: this rebinds `models`, replacing the earlier baseline dictionary.
models = {
    label: estimator_cls(**model_param[search_key])
    for label, search_key, estimator_cls in (
        ("Random Forest", 'RF', RandomForestRegressor),
        ("Xgboosting", 'Xgboosting', XGBRegressor),
    )
}

In [22]:
import warnings
warnings.filterwarnings("ignore")

# Fit each tuned model and report its metrics on both splits.
# Iterating over items() replaces the index-based loop that rebuilt
# list(models.values()) and list(models.keys()) on every pass; the printed
# output is unchanged.
for name, model in models.items():
    model.fit(X_train, y_train)  # train the model

    # make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    print(f"Model: {name}")

    print("Train Set Metrics:")
    evaluate_model(y_train, y_train_pred)

    print("Test Set Metrics:")
    evaluate_model(y_test, y_test_pred)

    print("\n" + "="*35 + "\n")

Model: Random Forest
Train Set Metrics:
MAE: 0.0630, RMSE: 0.0978, R2: 0.9370
Test Set Metrics:
MAE: 0.1541, RMSE: 0.2359, R2: 0.6459


Model: Xgboosting
Train Set Metrics:
MAE: 0.0045, RMSE: 0.0088, R2: 0.9995
Test Set Metrics:
MAE: 0.1022, RMSE: 0.1906, R2: 0.7688


