# TRAVEL PACKAGE PREDICTION

# IMPORTING LIBRARIES

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.combine import SMOTEENN
from xgboost import XGBClassifier

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, recall_score, precision_score, f1_score

import warnings
warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# IMPORTING DATASET

In [None]:
df = pd.read_csv('tour_package.csv')
df.head()

In [None]:
df.shape

In [None]:
df.info()

In [None]:
df.describe().T

**Observations:**

*Age:* Min: 18, Max: 61, Mean: 37

*MonthlyIncome:* Min: 1000, Max: 98678, Mean: 23619

*DurationOfPitch:* Min: 5 mins, Max: 127 mins

*NumberOfTrips:* Min: 1, Max: 22



In [None]:
# Columns summarised by the default describe() call are treated as numeric;
# whatever is left over is treated as categorical.
numeric_cols = df.describe().columns
categ_cols = df.drop(columns=numeric_cols).columns

# Print the distinct values of each categorical column, separated by a rule.
separator = '*' * 50
for name in categ_cols:
    print(separator)
    print(f'{name}: {df[name].unique()}')

# DATA PRE PROCESSING

In [None]:
# Normalise the misspelled gender label so it collapses into 'Female'.
df['Gender'] = df['Gender'].replace('Fe Male', 'Female')

# Drop the identifier column. Rebinding with errors='ignore' (rather than
# inplace=True) keeps this cell idempotent: re-running it no longer raises
# KeyError once CustomerID has already been removed.
df = df.drop(columns='CustomerID', errors='ignore')

In [None]:
# Recompute the numeric / categorical split now that CustomerID is gone.
numeric_cols = df.describe().columns
is_categorical = ~df.columns.isin(numeric_cols)
categ_cols = df.columns[is_categorical]
categ_cols

In [None]:
df[categ_cols].describe(include='all').T

## Creating Bins

### Age

In [None]:
df['Age'].describe()

In [None]:
# Bucket customers into age bands.
# pd.cut builds right-closed intervals and, by default, leaves the very first
# edge open, so Age == 18 (the dataset minimum) would fall outside (18, 25] and
# become NaN. include_lowest=True closes the first interval on the left.
# Rows with a missing Age still get NaN. The last label reads '51-65' although
# its interval runs up to 70; the label text is kept unchanged.
age_edges = [18, 25, 30, 40, 50, 70]
age_labels = ['18-25', '26-30', '31-40', '41-50', '51-65']
df['Age_Bin'] = pd.cut(df['Age'], bins=age_edges, labels=age_labels, include_lowest=True)

### Monthly Income

In [None]:
df['MonthlyIncome'].describe()

In [None]:
# Band MonthlyIncome into brackets named after their upper edge.
# Intervals are right-closed, so e.g. '<15000' covers (0, 15000].
income_edges = [0, 15000, 20000, 25000, 30000, 35000, 40000, 50000, 100000]
income_labels = [f'<{upper}' for upper in income_edges[1:]]
df['Salary_Bin'] = pd.cut(df['MonthlyIncome'], bins=income_edges, labels=income_labels)

In [None]:
df['Salary_Bin'].value_counts()

# UNIVARIATE ANALYSIS (PENDING)

In [None]:
df.columns

In [None]:
# def plot_data_distribution(data):
#     """
#     This function plots a combined graph for univariate analysis of a continuous variable.
#     It checks the spread, central tendency, dispersion, and outliers of the data.
#     """
#     variable_name = data.name.upper()
#     fig, (ax_box, ax_dis) = plt.subplots(nrows=2, sharex=True, gridspec_kw={"height_ratios": (.25, .75)}, figsize=(8, 5))
    
#     mean_value = data.mean()
#     median_value = data.median()
#     mode_value = data.mode().tolist()[0] if not data.mode().empty else None

#     sns.set_theme(style="white")
#     fig.suptitle(f"SPREAD OF DATA FOR {variable_name}", fontsize=18, fontweight='bold')
    
#     sns.boxplot(x=data, showmeans=True, orient='h', color="teal", ax=ax_box)
#     ax_box.set_xlabel('')

#     sns.despine(top=True, right=True, left=True)
#     sns.histplot(data, kde=False, color='purple', ax=ax_dis)

#     ax_dis.axvline(mean_value, color='r', linestyle='--', linewidth=2)
#     ax_dis.axvline(median_value, color='g', linestyle='-', linewidth=2)
#     if mode_value is not None:
#         ax_dis.axvline(mode_value, color='y', linestyle='-', linewidth=2)

#     plt.legend({'Mean': mean_value, 'Median': median_value, 'Mode': mode_value})

In [None]:
# cols = ['Age', 'DurationOfPitch', 'NumberOfPersonVisiting', 'NumberOfFollowups', 'PreferredPropertyStar', 'NumberOfTrips',
#         'PitchSatisfactionScore', 'NumberOfChildrenVisiting', 'MonthlyIncome']


# for i in range(len(cols)):
#     plot_data_distribution(df[cols[i]])

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats

cols = ['Age', 'DurationOfPitch', 'NumberOfPersonVisiting', 'NumberOfFollowups', 'PreferredPropertyStar', 'NumberOfTrips',
        'PitchSatisfactionScore', 'NumberOfChildrenVisiting', 'MonthlyIncome']

# One figure per numeric column: a slim boxplot stacked on top of a histogram
# (with KDE) that share the x axis, plus vertical markers for mean/median/mode.
for feature in cols:
    values = df[feature]
    fig, (box_ax, hist_ax) = plt.subplots(
        nrows=2,
        ncols=1,
        figsize=(6, 3),
        sharex=True,
        gridspec_kw={'height_ratios': [0.15, 0.85]},
    )

    # Top panel: boxplot without its own x label (the histogram carries it).
    sns.boxplot(x=values, ax=box_ax, color='#5E8D9F')
    box_ax.set(xlabel=None)

    # Bottom panel: histogram with the three central-tendency markers.
    sns.histplot(values, kde=True, ax=hist_ax)
    markers = [
        (values.mean(), 'r', '--', 'Mean'),
        (values.median(), 'g', '-', 'Median'),
        (values.mode()[0], 'b', '-', 'Mode'),  # first mode when several tie
    ]
    for position, colour, dash, name in markers:
        hist_ax.axvline(position, color=colour, linestyle=dash, label=name)
    hist_ax.legend()

    plt.suptitle(f'Spread and Boxplot of {feature}')
    plt.show()

In [None]:
fig = plt.figure(figsize=(14, 20))

cols=['TypeofContact', 'CityTier',
       'Occupation', 'Gender', 'NumberOfPersonVisiting', 'NumberOfFollowups',
       'ProductPitched', 'PreferredPropertyStar', 'MaritalStatus',
       'NumberOfTrips', 'Passport', 'PitchSatisfactionScore', 'OwnCar',
       'NumberOfChildrenVisiting', 'Designation', 'Age_Bin',
       'Salary_Bin']

colors = sns.color_palette("pastel")

# Percentages are relative to all rows (missing values included in the
# denominator), so the bars of a column with NaNs sum to less than 100%.
n_rows = len(df)

# One countplot per column on a 9x2 grid, each bar annotated with its share.
for i, variable in enumerate(cols):
    ax = fig.add_subplot(9, 2, i + 1)
    sns.countplot(x=variable, hue=variable, data=df, palette=colors, legend=False, ax=ax)
    sns.despine(ax=ax, top=True, right=True, left=False)
    for p in ax.patches:
        percentage = '{:.2f}%'.format(100 * p.get_height() / n_rows)
        x = p.get_x() + p.get_width() / 2 - 0.05
        y = p.get_y() + p.get_height()
        ax.annotate(percentage, (x, y), ha='center')
    ax.set_title(variable.upper())

# Lay the grid out once, after every title exists. Calling tight_layout inside
# the loop recomputed the layout 17 times and ran before the title was set.
fig.tight_layout()
plt.show()

# MULTIVARIATE ANALYSIS (PENDING)

In [None]:
# Feature-only view of the data: a new frame without the ProdTaken target
# (drop returns a copy, so df itself is untouched).
df2 = df.drop(columns=['ProdTaken'])

In [None]:
# Pairwise correlations between the columns that describe() treats as numeric.
numeric_features = df2.describe().columns
correlations = df2[numeric_features].corr()

fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(correlations, annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
ax.set_title('Heatmap of Correlation Matrix')
plt.show()

In [None]:
columns = ['Age', 'DurationOfPitch', 'NumberOfPersonVisiting', 'NumberOfFollowups',
           'PreferredPropertyStar', 'NumberOfTrips', 'PitchSatisfactionScore',
           'NumberOfChildrenVisiting', 'MonthlyIncome']

# Legend text for each value of the ProdTaken target.
outcome_labels = {0: 'ProdTaken=0', 1: 'ProdTaken=1'}

# For every numeric column: overlaid histograms per outcome on the left,
# a boxplot split by outcome on the right.
for col in columns:
    fig, (dist_ax, box_ax) = plt.subplots(1, 2, figsize=(12, 4))

    for outcome, label in outcome_labels.items():
        subset = df.loc[df['ProdTaken'] == outcome, col]
        sns.histplot(subset, label=label, kde=True, ax=dist_ax)
    dist_ax.legend()
    dist_ax.set_title(f'Distribution of {col} by ProdTaken')

    sns.boxplot(x='ProdTaken', y=col, data=df, ax=box_ax)
    box_ax.set_title(f'Boxplot of {col} by ProdTaken')

    plt.show()

# OUTLIER DETECTION AND REMOVAL

In [None]:
def remove_outliers(df, target='ProdTaken', factor=1.5):
    """Drop rows that hold an IQR outlier in any numeric feature column.

    A value is an outlier when it lies below ``Q1 - factor * IQR`` or above
    ``Q3 + factor * IQR`` of its own column. Missing values never compare as
    outliers, so rows containing NaN are kept unless another column flags them.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is not modified.
    target : str, list of str or None, default 'ProdTaken'
        Column(s) excluded from the outlier check. Names that are absent from
        ``df`` are ignored instead of raising KeyError.
    factor : float, default 1.5
        Multiplier applied to the IQR to build the lower/upper fences.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` (all columns, original index) with no outlier.
    """
    features = df.select_dtypes(include=[np.number])
    if target is not None:
        features = features.drop(columns=target, errors='ignore')

    q1 = features.quantile(0.25)
    q3 = features.quantile(0.75)
    iqr = q3 - q1
    lower_fence = q1 - factor * iqr
    upper_fence = q3 + factor * iqr

    # Comparisons align the per-column fences with the frame's columns.
    is_outlier = (features.lt(lower_fence) | features.gt(upper_fence)).any(axis=1)

    return df.loc[~is_outlier]

# Keep only the rows without an IQR outlier in any numeric feature column.
df_clean = remove_outliers(df)

In [None]:
df_clean.describe().T

In [None]:
# Work on an independent copy so the casts below never write into a slice of
# the original frame.
df_clean = df_clean.copy()

# Store every non-numeric column with the pandas 'category' dtype.
non_numeric_cols = df_clean.select_dtypes(exclude=[np.number]).columns
df_clean = df_clean.astype({name: 'category' for name in non_numeric_cols})
df_clean.info()

In [None]:
df_clean.describe(include='category').T

# HANDLING MISSING VALUES

In [None]:
# From this point on `df` is an independent copy of the outlier-filtered frame;
# the unfiltered data previously bound to `df` is no longer reachable by that name.
df = df_clean.copy()

In [None]:
# The binned helper columns were only needed for the exploratory plots.
df = df.drop(columns=['Age_Bin', 'Salary_Bin'])

In [None]:
# Replace missing values in these columns with 0.
# NOTE(review): the observations at the top of the notebook report minimums of
# 5 for DurationOfPitch and 1 for NumberOfTrips, so 0 is a value that never
# occurs in the observed data for those two columns - confirm that 0 is the
# intended stand-in rather than e.g. the median.
zero_fill_cols = ('DurationOfPitch', 'NumberOfTrips', 'NumberOfChildrenVisiting', 'NumberOfFollowups')
for name in zero_fill_cols:
    df[name] = df[name].fillna(0)

In [None]:
df['TypeofContact'].value_counts()

In [None]:
# Fill missing contact types with the fixed label 'Self Enquiry'
# (presumably the most frequent value in the value_counts above - confirm).
df['TypeofContact'] = df['TypeofContact'].fillna('Self Enquiry')

In [None]:
df['PreferredPropertyStar'].value_counts()

In [None]:
# Fill missing star ratings with the column's most frequent value
# (mode()[0] takes the first mode when several values tie).
df['PreferredPropertyStar'] = df['PreferredPropertyStar'].fillna(df['PreferredPropertyStar'].mode()[0])

In [None]:
# Rows still missing Age or MonthlyIncome are discarded rather than imputed,
# then the index is renumbered from 0.
df = df.dropna(subset=['Age', 'MonthlyIncome']).reset_index(drop=True)

In [None]:
df.isnull().sum()

In [None]:
len(df.columns)

# DATA PRE PROCESSING FOR MODELLING

In [None]:
# Separate the target (ProdTaken) from the feature columns.
y = df['ProdTaken']
X = df.drop(columns='ProdTaken')

### Converting categorical variables to numeric

In [None]:
# One-hot encode the categorical feature columns with pandas' defaults
# (one indicator column per category level; numeric columns pass through).
X = pd.get_dummies(X)
X.head()

### Balancing dataset

In [None]:
# Rebalance the classes with SMOTEENN; random_state is fixed so the resampled
# dataset is the same on every run.
# NOTE(review): the resampling is fitted on the full dataset, and the cells
# below hand this resampled X, y to cross_val_score / GridSearchCV. Each
# validation fold therefore contains resampled rows that were generated with
# knowledge of the training folds, which makes the cross-validated scores
# optimistic. Consider moving the sampler into an imblearn.pipeline.Pipeline
# so it is re-fitted on the training part of every fold - TODO confirm/fix.
resample = SMOTEENN(random_state=42)
X_resampled, y_resampled = resample.fit_resample(X, y)

# From here on X and y refer to the resampled data.
X, y = X_resampled, y_resampled

# APPLYING MODELS

## BEFORE HYPERPARAMETER TUNING

In [None]:
# Baseline: every model with default hyperparameters.
# All estimators are seeded (same seed as the SMOTEENN step) so the baseline
# scores are reproducible from one run to the next.
models = {
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Bagging': BaggingClassifier(random_state=42),
    'AdaBoost': AdaBoostClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
}

# 5-fold cross-validation on the resampled X, y. No `scoring` is given, so
# each classifier is scored with its default metric (accuracy).
results = []

for model_name, model in models.items():
    cv_scores = cross_val_score(model, X, y, cv=5)
    results.append({
        'model': model_name,
        'average_cv_score': cv_scores.mean()
    })

results_df = pd.DataFrame(results)
results_df

## AFTER HYPERPARAMETER TUNING

In [None]:
# Search space for the tuning step: one entry per model holding the estimator
# to tune and the hyperparameter grid to explore.
# Estimators are seeded (same seed as the SMOTEENN step) so the search is
# reproducible. XGBClassifier is already imported in the imports cell at the
# top of the notebook, so it is not re-imported here.
model_params = {
    'Decision Tree': {
        'model': DecisionTreeClassifier(random_state=42),
        'params': {
            'max_depth': [None, 5, 10, 15, 20],
            'min_samples_split': [2, 5, 10, 20],
            'min_samples_leaf': [1, 2, 5, 10]
        }
    },
    'Random Forest': {
        'model': RandomForestClassifier(random_state=42),
        'params': {
            'n_estimators': [10, 50, 100, 200],
            'max_depth': [None, 5, 10, 15, 20],
            'min_samples_split': [2, 5, 10, 20],
            'min_samples_leaf': [1, 2, 5, 10]
        }
    },
    'Bagging': {
        'model': BaggingClassifier(random_state=42),
        'params': {
            'n_estimators': [10, 50, 100, 200],
            'max_samples': [0.5, 1.0],
            'max_features': [0.5, 1.0]
        }
    },
    'AdaBoost': {
        'model': AdaBoostClassifier(random_state=42),
        'params': {
            'n_estimators': [50, 100, 200],
            'learning_rate': [0.01, 0.1, 1.0]
        }
    },
    'Gradient Boosting': {
        'model': GradientBoostingClassifier(random_state=42),
        'params': {
            'n_estimators': [50, 100, 200],
            'learning_rate': [0.01, 0.1, 1.0],
            'max_depth': [3, 5, 10],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 5]
        }
    },
    # NOTE: this grid has 3*3*3*3*4*3*3 = 2916 combinations, i.e. 14580 fits
    # with 5-fold CV - by far the most expensive entry.
    'XGBoost': {
        'model': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
        'params': {
            'n_estimators': [50, 100, 200],
            'learning_rate': [0.01, 0.1, 1.0],
            'max_depth': [3, 5, 10],
            'min_child_weight': [1, 5, 10],
            'gamma': [0.5, 1, 1.5, 2],
            'subsample': [0.6, 0.8, 1.0],
            'colsample_bytree': [0.6, 0.8, 1.0]
        }
    }
}

In [None]:
scores = []

# Metrics collected for every candidate. Accuracy is included so the tuned
# models can be compared like-for-like with the baseline cross_val_score run,
# which uses the classifiers' default metric (accuracy).
scoring = {'accuracy': 'accuracy',
           'precision': make_scorer(precision_score),
           'recall': make_scorer(recall_score),
           'f1_score': make_scorer(f1_score)}

for model_name, mp in model_params.items():
    # The winning candidate is still chosen by mean F1 (refit='f1_score').
    # n_jobs=-1 spreads the fits over all cores; it does not change the scores.
    clf = GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False,
                       scoring=scoring, refit='f1_score', n_jobs=-1)
    clf.fit(X, y)

    # Row of cv_results_ that belongs to the selected candidate.
    best = clf.best_index_
    scores.append({
        'model': model_name,
        # Mean CV accuracy of the F1-selected candidate. The comparison cell
        # relabels this column 'after_tuning_accuracy', so it must hold an
        # accuracy; clf.best_score_ would be the F1 score. The F1 value is
        # still reported in the 'f1_score' column.
        'best_score': clf.cv_results_['mean_test_accuracy'][best],
        'best_params': clf.best_params_,
        # Scalars for the selected candidate, not the arrays over all candidates.
        'precision': clf.cv_results_['mean_test_precision'][best],
        'recall': clf.cv_results_['mean_test_recall'][best],
        'f1_score': clf.cv_results_['mean_test_f1_score'][best]
    })

scores_df = pd.DataFrame(scores)
scores_df

In [None]:
# Give the two score columns explicit before/after names so they can sit side
# by side in one table (both frames are renamed in place).
results_df.rename(columns={'average_cv_score': 'before_tuning_accuracy'}, inplace=True)
scores_df.rename(columns={'best_score': 'after_tuning_accuracy'}, inplace=True)

# Join baseline and tuned results per model, with best_params as the last column.
final_df = results_df.merge(scores_df, on='model')
ordered_cols = [name for name in final_df.columns if name != 'best_params'] + ['best_params']
final_df = final_df[ordered_cols]

final_df