<a href="https://colab.research.google.com/github/a-donat/Benchmarks_PyCaret/blob/main/Predicting_Telemarketing_Response.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# I. Set-Up

## I.A. Import Libraries and Download Data

In [None]:
! pip install kaggle
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle datasets download -d aguado/telemarketing-jyb-dataset
! unzip '/content/telemarketing-jyb-dataset.zip' -d '/content/telemarketing'

In [None]:
! pip install pycaret

In [3]:
#import matplotlib.pyplot as plt
#import seaborn as sns
import VisualizeDataAbbrev as viz

import numpy as np
import pandas as pd
from pycaret.classification import *
from sklearn.model_selection import train_test_split

## I.B. Load Data and Check Data Integrity

In [32]:
ds_df = pd.read_csv("/content/telemarketing/train.csv", sep=";")

In [33]:
ds_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28645 entries, 0 to 28644
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Unnamed: 0      28645 non-null  int64  
 1   age             28645 non-null  int64  
 2   job             28645 non-null  object 
 3   marital         28645 non-null  object 
 4   education       28645 non-null  object 
 5   default         28645 non-null  object 
 6   housing         28645 non-null  object 
 7   loan            28645 non-null  object 
 8   contact         28645 non-null  object 
 9   month           28645 non-null  object 
 10  day_of_week     28645 non-null  object 
 11  campaign        28645 non-null  int64  
 12  pdays           28645 non-null  int64  
 13  previous        28645 non-null  int64  
 14  poutcome        28645 non-null  object 
 15  emp.var.rate    28645 non-null  float64
 16  cons.price.idx  28645 non-null  float64
 17  cons.conf.idx   28645 non-null 

In [34]:
ds_df.nunique()

Unnamed: 0        28645
age                  78
job                  12
marital               4
education             8
default               3
housing               3
loan                  3
contact               2
month                10
day_of_week           5
campaign             41
pdays                25
previous              8
poutcome              3
emp.var.rate         10
cons.price.idx       26
cons.conf.idx        26
euribor3m           311
nr.employed          11
y                     2
dtype: int64

In [35]:
ds_df["y"].value_counts()

no     25362
yes     3283
Name: y, dtype: int64

In [36]:
ds_df = ds_df[list(ds_df)[1:]].copy()

In [37]:
ds_df["job"].value_counts()

admin.           7213
blue-collar      6459
technician       4700
services         2718
management       2082
retired          1202
self-employed    1003
entrepreneur      998
housemaid         727
unemployed        686
student           615
unknown           242
Name: job, dtype: int64

In [38]:
ds_df["month"].value_counts()

may    9552
jul    5060
aug    4287
jun    3670
nov    2845
apr    1822
oct     514
sep     399
mar     372
dec     124
Name: month, dtype: int64

In [39]:
ds_df["month"] = ds_df["month"].replace(
    {"may": 5, "jun": 6, "jul": 7, "aug": 8, "sep": 9,
     "oct": 10, "nov": 11, "dec": 12})

In [41]:
ds_df["poutcome"].value_counts()

nonexistent    24824
failure         2870
success          951
Name: poutcome, dtype: int64

In [40]:
ds_df.describe().round(5)

Unnamed: 0,age,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
count,28645.0,28645.0,28645.0,28645.0,28645.0,28645.0,28645.0,28645.0,28645.0
mean,39.98352,2.5586,962.63418,0.16855,0.08153,93.57552,-40.48285,3.62197,5167.00143
std,10.42033,2.75994,186.52608,0.48681,1.57405,0.57985,4.63922,1.73553,72.34489
min,17.0,1.0,0.0,0.0,-3.4,92.201,-50.8,0.634,4963.6
25%,32.0,1.0,999.0,0.0,-1.8,93.075,-42.7,1.344,5099.1
50%,38.0,2.0,999.0,0.0,1.1,93.798,-41.8,4.857,5191.0
75%,47.0,3.0,999.0,0.0,1.4,93.994,-36.4,4.961,5228.1
max,98.0,43.0,999.0,7.0,1.4,94.767,-26.9,5.045,5228.1



# II. Preprocessing

In [42]:
train_df, test_df = train_test_split(
    ds_df, test_size=0.20, stratify=ds_df["y"], random_state=1)

# III. Create Models

In [43]:
exp_clf101 = setup(data=train_df, target = "y", session_id=123,
                   fix_imbalance=True)


Unnamed: 0,Description,Value
0,Session id,123
1,Target,y
2,Target type,Binary
3,Target mapping,"no: 0, yes: 1"
4,Original data shape,"(22916, 20)"
5,Transformed data shape,"(35281, 62)"
6,Transformed train set shape,"(28406, 62)"
7,Transformed test set shape,"(6875, 62)"
8,Ordinal features,1
9,Numeric features,9


In [44]:
best_model = compare_models(fold=5)


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.8923,0.7827,0.3362,0.5492,0.4167,0.3612,0.3746,7.636
xgboost,Extreme Gradient Boosting,0.8916,0.7737,0.2873,0.5501,0.377,0.3241,0.3449,8.584
lightgbm,Light Gradient Boosting Machine,0.8916,0.7889,0.2818,0.5534,0.3729,0.3205,0.3427,1.838
rf,Random Forest Classifier,0.8885,0.7688,0.3041,0.5243,0.3847,0.3281,0.343,4.276
dummy,Dummy Classifier,0.8854,0.5,0.0,0.0,0.0,0.0,0.0,0.506
ada,Ada Boost Classifier,0.8805,0.7636,0.4135,0.4752,0.4417,0.3752,0.3766,2.43
et,Extra Trees Classifier,0.8758,0.7498,0.3183,0.4432,0.3701,0.3033,0.3086,3.934
dt,Decision Tree Classifier,0.8313,0.618,0.3341,0.2932,0.3123,0.2166,0.2172,1.282
nb,Naive Bayes,0.8188,0.7585,0.58,0.3335,0.4232,0.325,0.3427,0.578
ridge,Ridge Classifier,0.8142,0.0,0.6284,0.3344,0.4364,0.3373,0.3615,0.83


Processing:   0%|          | 0/65 [00:00<?, ?it/s]