In [99]:
import pandas as pd
import numpy as np  
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [100]:
df = pd.read_csv('data/Travel.csv')
df.head(5)

Unnamed: 0,CustomerID,ProdTaken,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,Designation,MonthlyIncome
0,200000,1,41.0,Self Enquiry,3,6.0,Salaried,Female,3,3.0,Deluxe,3.0,Single,1.0,1,2,1,0.0,Manager,20993.0
1,200001,0,49.0,Company Invited,1,14.0,Salaried,Male,3,4.0,Deluxe,4.0,Divorced,2.0,0,3,1,2.0,Manager,20130.0
2,200002,1,37.0,Self Enquiry,1,8.0,Free Lancer,Male,3,4.0,Basic,3.0,Single,7.0,1,3,0,0.0,Executive,17090.0
3,200003,0,33.0,Company Invited,1,9.0,Salaried,Female,2,3.0,Basic,3.0,Divorced,2.0,1,5,1,1.0,Executive,17909.0
4,200004,0,,Self Enquiry,1,8.0,Small Business,Male,2,3.0,Basic,4.0,Divorced,1.0,0,5,1,0.0,Executive,18468.0


In [101]:
#check for null values
df.isnull().sum()

CustomerID                    0
ProdTaken                     0
Age                         226
TypeofContact                25
CityTier                      0
DurationOfPitch             251
Occupation                    0
Gender                        0
NumberOfPersonVisiting        0
NumberOfFollowups            45
ProductPitched                0
PreferredPropertyStar        26
MaritalStatus                 0
NumberOfTrips               140
Passport                      0
PitchSatisfactionScore        0
OwnCar                        0
NumberOfChildrenVisiting     66
Designation                   0
MonthlyIncome               233
dtype: int64

In [102]:
#check all the categorial categories
df['Gender'].value_counts()

Gender
Male       2916
Female     1817
Fe Male     155
Name: count, dtype: int64

In [103]:
df['MaritalStatus'].value_counts()


MaritalStatus
Married      2340
Divorced      950
Single        916
Unmarried     682
Name: count, dtype: int64

In [104]:
df['TypeofContact'].value_counts()

TypeofContact
Self Enquiry       3444
Company Invited    1419
Name: count, dtype: int64

In [105]:
df['Gender'] = df['Gender'].replace('Fe Male', 'Female')
df['MaritalStatus'] = df['MaritalStatus'].replace('Single', 'Unmarried')

In [106]:
df.head()

Unnamed: 0,CustomerID,ProdTaken,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,Designation,MonthlyIncome
0,200000,1,41.0,Self Enquiry,3,6.0,Salaried,Female,3,3.0,Deluxe,3.0,Unmarried,1.0,1,2,1,0.0,Manager,20993.0
1,200001,0,49.0,Company Invited,1,14.0,Salaried,Male,3,4.0,Deluxe,4.0,Divorced,2.0,0,3,1,2.0,Manager,20130.0
2,200002,1,37.0,Self Enquiry,1,8.0,Free Lancer,Male,3,4.0,Basic,3.0,Unmarried,7.0,1,3,0,0.0,Executive,17090.0
3,200003,0,33.0,Company Invited,1,9.0,Salaried,Female,2,3.0,Basic,3.0,Divorced,2.0,1,5,1,1.0,Executive,17909.0
4,200004,0,,Self Enquiry,1,8.0,Small Business,Male,2,3.0,Basic,4.0,Divorced,1.0,0,5,1,0.0,Executive,18468.0


In [107]:
##checking for missing values
##these are the features with nan values
features_with_na = [features for features in df.columns if df[features].isnull().sum() >= 1]
for feature in features_with_na:
    print(feature, np.round(df[feature].isnull().mean()*100, 5), '% missing values')


Age 4.62357 % missing values
TypeofContact 0.51146 % missing values
DurationOfPitch 5.13502 % missing values
NumberOfFollowups 0.92062 % missing values
PreferredPropertyStar 0.53191 % missing values
NumberOfTrips 2.86416 % missing values
NumberOfChildrenVisiting 1.35025 % missing values
MonthlyIncome 4.76678 % missing values


In [108]:
#statistics on numerical columns
df[features_with_na].select_dtypes(exclude='object').describe()

Unnamed: 0,Age,DurationOfPitch,NumberOfFollowups,PreferredPropertyStar,NumberOfTrips,NumberOfChildrenVisiting,MonthlyIncome
count,4662.0,4637.0,4843.0,4862.0,4748.0,4822.0,4655.0
mean,37.622265,15.490835,3.708445,3.581037,3.236521,1.187267,23619.853491
std,9.316387,8.519643,1.002509,0.798009,1.849019,0.857861,5380.698361
min,18.0,5.0,1.0,3.0,1.0,0.0,1000.0
25%,31.0,9.0,3.0,3.0,2.0,1.0,20346.0
50%,36.0,13.0,4.0,3.0,3.0,1.0,22347.0
75%,44.0,20.0,4.0,4.0,4.0,2.0,25571.0
max,61.0,127.0,6.0,5.0,22.0,3.0,98678.0


In [109]:
#Age
df['Age'].fillna(df['Age'].median(), inplace=True)

#Type of contract
df['TypeofContact'].fillna(df['TypeofContact'].mode()[0], inplace=True)

#Duration of Pitch
df['DurationOfPitch'].fillna(df['DurationOfPitch'].median(), inplace=True)

#NumberofFollowups
df['NumberOfFollowups'].fillna(df['NumberOfFollowups'].mode()[0], inplace=True)

#PreferredPropertyStar
df['PreferredPropertyStar'].fillna(df['PreferredPropertyStar'].mode()[0], inplace=True)

#Number of Trips
df['NumberOfTrips'].fillna(df['NumberOfTrips'].median(), inplace=True)

#Numberofchildrenvisting
df['NumberOfChildrenVisiting'].fillna(df['NumberOfChildrenVisiting'].mode()[0], inplace=True)

#Monthly Income
df['MonthlyIncome'].fillna(df['MonthlyIncome'].median(), inplace=True)

In [110]:
df.drop('CustomerID', inplace=True, axis=1)

In [111]:
df.head()

Unnamed: 0,ProdTaken,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,Designation,MonthlyIncome
0,1,41.0,Self Enquiry,3,6.0,Salaried,Female,3,3.0,Deluxe,3.0,Unmarried,1.0,1,2,1,0.0,Manager,20993.0
1,0,49.0,Company Invited,1,14.0,Salaried,Male,3,4.0,Deluxe,4.0,Divorced,2.0,0,3,1,2.0,Manager,20130.0
2,1,37.0,Self Enquiry,1,8.0,Free Lancer,Male,3,4.0,Basic,3.0,Unmarried,7.0,1,3,0,0.0,Executive,17090.0
3,0,33.0,Company Invited,1,9.0,Salaried,Female,2,3.0,Basic,3.0,Divorced,2.0,1,5,1,1.0,Executive,17909.0
4,0,36.0,Self Enquiry,1,8.0,Small Business,Male,2,3.0,Basic,4.0,Divorced,1.0,0,5,1,0.0,Executive,18468.0


In [112]:
df['TotalVisiting'] = df['NumberOfPersonVisiting'] + df['NumberOfChildrenVisiting']
df.drop(columns=['NumberOfPersonVisiting', 'NumberOfChildrenVisiting'], axis=1, inplace=True)

In [113]:
#get all numeric features
num_features = [feature for feature in df.columns if df[feature].dtype != 'O']
print('Num of numerical features:', len(num_features))

Num of numerical features: 12


In [114]:
#get all categorial features
cat_features = [feature for feature in df.columns if df[feature].dtype == 'O']
print ('Num of categorial features:', len(cat_features))

Num of categorial features: 6


In [115]:
#get all discrete features
discrete_feature = [feature for feature in num_features if len(df[feature].unique()) <= 25]
print ('Number of discrete features:', len(discrete_feature))

Number of discrete features: 9


In [116]:
#continous features
continous_features = [feature for feature in num_features if feature not in discrete_feature]
print('Number of continous feature:', len(continous_features))

Number of continous feature: 3


In [117]:
#Train and test split

from sklearn.model_selection import train_test_split
X = df.drop(['ProdTaken'], axis=1)
y = df['ProdTaken']

In [118]:
y.value_counts()

ProdTaken
0    3968
1     920
Name: count, dtype: int64

In [119]:
X.head()

Unnamed: 0,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,Designation,MonthlyIncome,TotalVisiting
0,41.0,Self Enquiry,3,6.0,Salaried,Female,3.0,Deluxe,3.0,Unmarried,1.0,1,2,1,Manager,20993.0,3.0
1,49.0,Company Invited,1,14.0,Salaried,Male,4.0,Deluxe,4.0,Divorced,2.0,0,3,1,Manager,20130.0,5.0
2,37.0,Self Enquiry,1,8.0,Free Lancer,Male,4.0,Basic,3.0,Unmarried,7.0,1,3,0,Executive,17090.0,3.0
3,33.0,Company Invited,1,9.0,Salaried,Female,3.0,Basic,3.0,Divorced,2.0,1,5,1,Executive,17909.0,3.0
4,36.0,Self Enquiry,1,8.0,Small Business,Male,3.0,Basic,4.0,Divorced,1.0,0,5,1,Executive,18468.0,2.0


In [120]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [121]:
#create column transformers with 3 type of transformers

cat_features = X.select_dtypes(include="object").columns
num_features = X.select_dtypes(exclude="object").columns

print(cat_features)

Index(['TypeofContact', 'Occupation', 'Gender', 'ProductPitched',
       'MaritalStatus', 'Designation'],
      dtype='object')


In [122]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder(drop='first')

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, cat_features),
        ("StandardScaler", numeric_transformer, num_features)
    ]
)

In [123]:
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [127]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report,precision_score,recall_score, f1_score, roc_auc_score, roc_curve

In [128]:
models = {
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier()
}

for i in range (len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train)

    #make prediction
    y_train_pred= model.predict(X_train)
    y_test_pred = model.predict(X_test)

    #training set performance
    model_train_accuracy = accuracy_score(y_train, y_train_pred)
    model_train_f1 = f1_score(y_train, y_train_pred, average='weighted')
    model_train_precision = precision_score(y_train, y_train_pred)
    model_train_recall = recall_score(y_train, y_train_pred)
    model_train_roc_auc_score = roc_auc_score(y_train, y_train_pred)

    #test set performance
    model_test_accuracy = accuracy_score(y_test, y_test_pred)
    model_test_f1 = f1_score(y_test, y_test_pred, average='weighted')
    model_test_precision = precision_score(y_test, y_test_pred)
    model_test_recall = recall_score(y_test, y_test_pred)
    model_test_roc_auc_score = roc_auc_score(y_test, y_test_pred)

    print(list(models.keys())[i])

    print('Model performance of training set')
    print("Accuracy:{:.4f}".format(model_train_accuracy))
    print("F1 Score:{:.4f}".format(model_train_f1))
    print("Precision:{:.4f}".format(model_train_precision))
    print("Recall:{:.4f}".format(model_train_recall))
    print("RoC AUC score:{:.4f}".format(model_train_roc_auc_score))

    print ("--------------------------------------")

    print ('Model performance for test set')
    print("Accuracy:{:.4f}".format(model_test_accuracy))
    print("F1 Score:{:.4f}".format(model_test_f1))
    print("Precision:{:.4f}".format(model_test_precision))
    print("Recall:{:.4f}".format(model_test_recall))
    print("RoC AUC score:{:.4f}".format(model_test_roc_auc_score))



Decision Tree
Model performance of training set
Accuracy:1.0000
F1 Score:1.0000
Precision:1.0000
Recall:1.0000
RoC AUC score:1.0000
--------------------------------------
Model performance for test set
Accuracy:0.9192
F1 Score:0.9185
Precision:0.8077
Recall:0.7696
RoC AUC score:0.8626
Random Forest
Model performance of training set
Accuracy:1.0000
F1 Score:1.0000
Precision:1.0000
Recall:1.0000
RoC AUC score:1.0000
--------------------------------------
Model performance for test set
Accuracy:0.9305
F1 Score:0.9253
Precision:0.9695
Recall:0.6649
RoC AUC score:0.8299


In [129]:
#Hyperparameter training
rf_params = {"max_depth":[5,8,15, None, 10],
             "max_features":[5,7,"auto",8],
             "min_samples_split":[2,8,15,20],
             "n_estimators":[100,200,500,1000]
            }


In [132]:
#model list for hyperparameter tuning
randomcv_models = [("RF", RandomForestClassifier(), rf_params),]

In [135]:
from sklearn.model_selection import RandomizedSearchCV

model_param={}
for name, model, params in randomcv_models:
    random = RandomizedSearchCV(estimator=model,param_distributions=params,n_iter=100,cv=3, verbose=2, n_jobs=-1)

    random.fit(X_train, y_train)
    model_param[name] = random.best_params_

for model_name in model_param:
    print(f"--------------Best params for {model_name}----------------")
    print(model_param[model_name])



Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV] END max_depth=5, max_features=7, min_samples_split=15, n_estimators=500; total time=   3.1s
[CV] END max_depth=5, max_features=7, min_samples_split=15, n_estimators=500; total time=   3.1s
[CV] END max_depth=5, max_features=7, min_samples_split=15, n_estimators=500; total time=   3.2s
[CV] END max_depth=8, max_features=7, min_samples_split=20, n_estimators=500; total time=   4.2s
[CV] END max_depth=None, max_features=8, min_samples_split=15, n_estimators=1000; total time=   9.9s
[CV] END max_depth=8, max_features=7, min_samples_split=20, n_estimators=500; total time=   4.5s
[CV] END max_depth=10, max_features=auto, min_samples_split=2, n_estimators=1000; total time=   0.0s
[CV] END max_depth=10, max_features=auto, min_samples_split=2, n_estimators=1000; total time=   0.0s
[CV] END max_depth=10, max_features=auto, min_samples_split=2, n_estimators=1000; total time=   0.0s
[CV] END max_depth=None, max_features=8, min_sam

In [136]:
models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, min_samples_split=2, max_features=5,max_depth=None),
}

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train)

    #make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    #training set performance
    model_train_accuracy = accuracy_score(y_train, y_train_pred)
    model_train_f1 = f1_score(y_train, y_train_pred, average='weighted')
    model_train_precision = precision_score(y_train, y_train_pred)
    model_train_recall = recall_score(y_train, y_train_pred)
    model_train_roc_auc_score = roc_auc_score(y_train, y_train_pred)

    #test set performance
    model_test_accuracy = accuracy_score(y_test, y_test_pred)
    model_test_f1 = f1_score(y_test, y_test_pred, average='weighted')
    model_test_precision = precision_score(y_test, y_test_pred)
    model_test_recall = recall_score(y_test, y_test_pred)
    model_test_roc_auc_score = roc_auc_score(y_test, y_test_pred)

    print(list(models.keys())[i])

    print('Model performance of training set')
    print("Accuracy:{:.4f}".format(model_train_accuracy))
    print("F1 Score:{:.4f}".format(model_train_f1))
    print("Precision:{:.4f}".format(model_train_precision))
    print("Recall:{:.4f}".format(model_train_recall))
    print("RoC AUC score:{:.4f}".format(model_train_roc_auc_score))

    print ("--------------------------------------")

    print ('Model performance for test set')
    print("Accuracy:{:.4f}".format(model_test_accuracy))
    print("F1 Score:{:.4f}".format(model_test_f1))
    print("Precision:{:.4f}".format(model_test_precision))
    print("Recall:{:.4f}".format(model_test_recall))
    print("RoC AUC score:{:.4f}".format(model_test_roc_auc_score))

Random Forest
Model performance of training set
Accuracy:1.0000
F1 Score:1.0000
Precision:1.0000
Recall:1.0000
RoC AUC score:1.0000
--------------------------------------
Model performance for test set
Accuracy:0.9264
F1 Score:0.9209
Precision:0.9542
Recall:0.6545
RoC AUC score:0.8234
