In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns

from collections import Counter

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score

In [None]:
# Load the "old clients" records; the CSV is semicolon-delimited.
df = pd.read_csv("/kaggle/input/travel-company-insurance-prediction/Travel Company Old Clients.csv",sep=';')

# Dataset Exploration

In [None]:
# Preview the first rows to check that the file parsed into separate columns.
df.head()

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# Count of missing values per column.
df.isna().sum()

In [None]:
def plots(df, x, y, axes, i):
    """Draw one row of four summary plots for column ``x`` split by column ``y``.

    Row ``i`` of ``axes`` receives, from left to right:

    0. a histogram (with KDE) of ``x``;
    1. a histogram (with KDE) of ``x`` coloured by ``y``;
    2. a bar plot of the mean of ``x`` per ``y`` group, with value labels;
    3. a pie chart of the value counts of ``y``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns named by ``x`` and ``y``.
    x : str
        Column shown in the histograms and averaged for the bar plot.
    y : str
        Column used for the hue, the grouping and the pie chart.
    axes : 2-D grid of matplotlib Axes
        Indexed as ``axes[i][0]`` .. ``axes[i][3]``.
    i : int
        Row of ``axes`` to draw into.
    """
    row = axes[i]

    sns.histplot(data=df, x=x, kde=True, color='g', ax=row[0])
    sns.histplot(data=df, x=x, hue=y, kde=True, ax=row[1])

    # Compute the per-group mean once and reuse it for both bar coordinates.
    group_means = df.groupby(y)[x].mean()
    sns.barplot(x=group_means.index, y=group_means, ax=row[2])

    # Print the mean value above every bar.
    for container in row[2].containers:
        row[2].bar_label(container, size=15, color='black')

    # Count the categories once; used for both the wedges and their labels.
    counts = df[y].value_counts()
    row[3].pie(counts, labels=counts.index, autopct='%0.2f%%')

In [None]:
def plot_group(df, x, y):
    """Show a figure with one row of summary plots of ``x`` per column in ``y``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame passed through to ``plots``.
    x : str
        Column to summarise; also inserted into the figure title.
    y : sequence of str
        Grouping columns; each one produces one row of four plots.
    """
    # squeeze=False keeps `axes` two-dimensional even when `y` has a single
    # entry, so the `axes[i][j]` indexing inside `plots` always works.
    fig, axes = plt.subplots(nrows=len(y), ncols=4, figsize=(20, 20), squeeze=False)
    for i, column in enumerate(y):
        plots(df, x, column, axes, i)
    plt.suptitle("Barplots and histograms of {} grouped by categorical features".format(x),size=20)
    plt.tight_layout()
    plt.show()

In [None]:
# Categorical columns. The last entry, 'TravelInsurance', is skipped (via `[:-1]`)
# when the string-to-integer encoding maps are applied further down.
categoricals = ['Employment Type', 'GraduateOrNot', 'FrequentFlyer', 'EverTravelledAbroad', 'TravelInsurance']
# Numeric columns that are plotted against every categorical column.
numerical = ['Age', 'AnnualIncome']

# Data distribution using histograms, mean values on barplots grouped by categorical values and pie charts

In [None]:
# One figure per numeric column, with one row of plots per categorical column.
for numeric_column in numerical:
    plot_group(df, numeric_column, categoricals)

# Scatterplots

In [None]:
# Pairwise view of the two numeric columns, coloured by 'TravelInsurance'.
sns.pairplot(
    data=df,
    vars=['Age', 'AnnualIncome'],
    hue='TravelInsurance',
)

# Confirming the absence of outliers

In [None]:
# Box plot of 'AnnualIncome'; the frame is passed by keyword.
sns.boxplot(data=df, x='AnnualIncome')

In [None]:
# Box plot of 'Age'; the frame is passed by keyword.
sns.boxplot(data=df, x='Age')

# Defining maps for categorical data

In [None]:
# Integer encodings for the string-valued categorical columns.
# NOTE: in the three Yes/No maps, 'Yes' is encoded as 0 and 'No' as 1.
employment = {
    'Private Sector/Self Employed': 1,
    'Government Sector': 0
}
graduate = {
    'Yes': 0,
    'No': 1
}
frequent = {
    'Yes': 0,
    'No': 1
}
travelled = {
    'Yes': 0,
    'No': 1
}

# Same order as `categoricals[:-1]`: each map is paired with its column by position.
maps = [employment, graduate, frequent, travelled]

for column, mapping in zip(categoricals[:-1], maps):
    encoded = df[column].map(mapping)
    # `Series.map` turns every value absent from the dict into NaN. That happens
    # for an unexpected category and also when this cell is run a second time
    # (the column then already holds 0/1). Fail before overwriting the column.
    if encoded.isna().sum() > df[column].isna().sum():
        raise ValueError(
            "Column {!r} holds values that are not in its encoding map "
            "(was this cell already run?)".format(column)
        )
    df[column] = encoded

In [None]:
# Positional split: every column except the last goes into x, the last column into y.
x, y = df.iloc[:, :-1].values, df.iloc[:, -1].values

# Applying SMOTE to balance the target classes in the training set

In [None]:
# Split first (20% held out) so that only the training part is resampled below.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42, test_size=0.2)
print("Before ", Counter(y_train))
# Seed SMOTE as well: without `random_state` the synthetic samples, and every
# accuracy figure computed from them, change on each run.
sm = SMOTE(random_state=42)
x_train, y_train = sm.fit_resample(x_train, y_train)
print("After ", Counter(y_train))

# Training pipeline

In [None]:
def training(model, name):
    """Fit ``model`` on the training split and print its test accuracy.

    Reads the notebook-level ``x_train``, ``y_train``, ``x_test`` and
    ``y_test`` variables.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    name : str
        Label printed in front of the accuracy.

    Returns
    -------
    The same ``model`` object, now fitted.
    """
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    # accuracy_score is documented as (y_true, y_pred); pass them in that order.
    acc = round(accuracy_score(y_test, y_pred) * 100, 2)
    print("{}: {}%".format(name, acc))
    return model

# Defining models and tuning their hyperparameters

In [None]:
# One seed for every stochastic estimator, so the printed accuracies and the
# final predictions are the same on each Restart & Run All.
RANDOM_STATE = 42

rfr = RandomForestClassifier(max_depth=5, criterion='gini', random_state=RANDOM_STATE)
gnb = GaussianNB()  # takes no seed
etc = ExtraTreesClassifier(max_depth=5, random_state=RANDOM_STATE)
abc = AdaBoostClassifier(learning_rate=0.1, random_state=RANDOM_STATE)
dtc = DecisionTreeClassifier(max_depth=5, random_state=RANDOM_STATE)
xgb = XGBClassifier(n_estimators=1000, max_depth=10, eta=0.1, subsample=0.7,
                    colsample_bytree=0.8, random_state=RANDOM_STATE)

# `models` and `names` are parallel lists: names[k] labels models[k].
models = [rfr, gnb, etc, abc, dtc, xgb]
names = ['Random Forest', 'Naive Bayes', 'Extra Trees',
        'Ada Boost', 'Decision Tree', 'XGB']

# Evaluation

In [None]:
# Fit and score each model in turn; the fitted estimators keep the order of `models`.
trained_models = [
    training(estimator, label)
    for estimator, label in zip(models, names)
]

# Time to predict for new customers

In [None]:
# Load the "new clients" records (semicolon-delimited, like the old-clients file)
# and preview the first rows.
dt = pd.read_csv("/kaggle/input/travel-company-insurance-prediction/Travel Company New Clients.csv", sep=';')
dt.head()

In [None]:
# Count of missing values per column in the new-clients frame.
dt.isna().sum()

In [None]:
# Encode the new clients with the same maps, paired with columns by position.
for column, mapping in zip(categoricals[:-1], maps):
    encoded = dt[column].map(mapping)
    # `Series.map` turns every value absent from the dict into NaN. That happens
    # for an unexpected category and also when this cell is run a second time
    # (the column then already holds 0/1). Fail before overwriting the column.
    if encoded.isna().sum() > dt[column].isna().sum():
        raise ValueError(
            "Column {!r} holds values that are not in its encoding map "
            "(was this cell already run?)".format(column)
        )
    dt[column] = encoded

In [None]:
# Pick the feature columns by name, in training order (every column of `df`
# except its last one). Plain `dt.values` relies on the new file having the same
# column order, and picks up the extra 'TravelInsurance' column if this cell is
# re-run after the predictions have been written back to `dt`.
# Assumes the new-clients file uses the same column names as `df`; a KeyError
# here means it does not.
xt = dt[df.columns[:-1]].values

# Selection of models to predict

<p>
    I decided to use Random Forest
</p>

In [None]:
# Index 0 of `trained_models` is the 'Random Forest' entry of `names`.
chosen_model = trained_models[0]
predicted = chosen_model.predict(xt)

In [None]:
# Store the model's predictions on the new-clients frame as 'TravelInsurance'.
dt['TravelInsurance'] = predicted

# Predictions and EDA

In [None]:
# Display the new-clients frame, now including the predicted 'TravelInsurance' column.
dt

# Data visualisation for new clients with predicted values

In [None]:
# Share of each predicted 'TravelInsurance' value among the new clients.
insurance_counts = dt['TravelInsurance'].value_counts()
plt.pie(
    insurance_counts,
    labels=insurance_counts.index,
    autopct='%0.2f%%',
)

In [None]:
# Pairwise view of the two numeric columns for the new clients, coloured by the predicted 'TravelInsurance'.
sns.pairplot(
    data=dt,
    vars=['Age', 'AnnualIncome'],
    hue='TravelInsurance',
)

In [None]:
# Same grouped plots as for the old clients, drawn from the new-clients frame.
for numeric_column in numerical:
    plot_group(dt, numeric_column, categoricals)