I. Title:Bank Marketing, Url:https://archive.ics.uci.edu/dataset/222/bank+marketing

II. Variable Name,Role,Type,Demographic Description,Units

1. age,Feature,Integer,Age
2. job,Feature,Categorical,Occupation,type of job (categorical: 'admin.','blue-collar','entrepreneur','housemaid','management','retired','self-employed','services','student','technician','unemployed','unknown')	
3. marital,Feature,Categorical,Marital Status,marital status (categorical: 'divorced','married','single','unknown'; note: 'divorced' means divorced or widowed)	
4. education,Feature,Categorical,Education Level (categorical: 'basic.4y','basic.6y','basic.9y','high.school','illiterate','professional.course','university.degree','unknown')	
5. default,Feature,Binary,has credit in default?	
6. balance,Feature,Integer,average yearly balance euros
7. housing,Feature,Binary,has housing loan?	
8. loan,Feature,Binary,has personal loan?	
9. contact,Feature,Categorical,contact communication type (categorical: 'cellular','telephone')
10. day_of_week,Feature,Date,last contact day of the week
11. month,Feature,Date,last contact month of year (categorical: 'jan', 'feb', 'mar', ..., 'nov', 'dec')	
12. duration,Feature,Integer,last contact duration, in seconds (numeric). Important note: this attribute highly affects the output target (e.g., if duration=0 then y='no'). Yet, the duration is not known before a call is performed. Also, after the end of the call y is obviously known. Thus, this input should only be included for benchmark purposes and should be discarded if the intention is to have a realistic predictive model.	
13. campaign,Feature,Integer,number of contacts performed during this campaign and for this client (numeric, includes last contact)	
14. pdays,Feature,Integer,number of days that passed by after the client was last contacted from a previous campaign (numeric; -1 means client was not previously contacted)
15. previous,Feature,Integer,number of contacts performed before this campaign and for this client	
16. poutcome,Feature,Categorical,outcome of the previous marketing campaign (categorical: 'failure','nonexistent','success')
17. y,Target,Binary,has the client subscribed a term deposit?

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score,f1_score,confusion_matrix
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
from ucimlrepo import fetch_ucirepo 

In [3]:
bank_marketing = fetch_ucirepo(id=222) 
X = bank_marketing.data.features 
y = bank_marketing.data.targets 
print(bank_marketing.metadata) 
print(bank_marketing.variables) 

{'uci_id': 222, 'name': 'Bank Marketing', 'repository_url': 'https://archive.ics.uci.edu/dataset/222/bank+marketing', 'data_url': 'https://archive.ics.uci.edu/static/public/222/data.csv', 'abstract': 'The data is related with direct marketing campaigns (phone calls) of a Portuguese banking institution. The classification goal is to predict if the client will subscribe a term deposit (variable y).', 'area': 'Business', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 45211, 'num_features': 16, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Age', 'Occupation', 'Marital Status', 'Education Level'], 'target_col': ['y'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 2014, 'last_updated': 'Fri Aug 18 2023', 'dataset_doi': '10.24432/C5K306', 'creators': ['S. Moro', 'P. Rita', 'P. Cortez'], 'intro_paper': {'ID': 277, 'type': 'NATIVE', 'title': 'A data-driven approach to predict the s

In [63]:
data_fin=X.copy()
data_fin["target"]=y
data_fin.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day_of_week,month,duration,campaign,pdays,previous,poutcome,target
0,58,management,married,tertiary,no,2143,yes,no,,5,may,261,1,-1,0,,0.0
1,44,technician,single,secondary,no,29,yes,no,,5,may,151,1,-1,0,,0.0
2,33,entrepreneur,married,secondary,no,2,yes,yes,,5,may,76,1,-1,0,,0.0
3,47,blue-collar,married,,no,1506,yes,no,,5,may,92,1,-1,0,,0.0
4,33,,single,,no,1,no,no,,5,may,198,1,-1,0,,


In [64]:
data_fin=data_fin.drop(["age","education","pdays","month","day_of_week","campaign","previous","poutcome","duration"],axis=1)

In [65]:
data_fin.shape

(45211, 8)

In [66]:
data_fin.isna().sum()

job          288
marital        0
default        0
balance        0
housing        0
loan           0
contact    13020
target       288
dtype: int64

In [67]:
data_fin=data_fin.drop(["contact"],axis=1)

In [68]:
for i in data_fin.columns:
    print(i,data_fin[i].nunique())

job 11
marital 3
default 2
balance 7168
housing 2
loan 2
target 2


In [69]:
data_fin=data_fin.replace({"yes":1,"no":0})

In [70]:
data_fin["marital"]=data_fin["marital"].map({"single":0,"married":2,"divorced":1})

In [71]:
data_fin["target"].value_counts()

target
0.0    39668
1.0     5255
Name: count, dtype: int64

In [72]:
data_fin=data_fin[~data_fin["job"].isna()]
# data_fin.reset_index(inplace=True,drop=True)

In [73]:
dummies=pd.get_dummies(data_fin["job"]).astype(int)

In [74]:
data_fin=data_fin.drop("job",axis=1)
data_fin=pd.concat([dummies,data_fin],axis=1)

In [75]:
# kbins=KBinsDiscretizer(encode="ordinal",strategy="quantile",n_bins=10)
# data_fin["balance"]=kbins.fit_transform(np.array(data_fin["balance"]).reshape(-1,1))

In [76]:
x=data_fin.drop("target",axis=1)
y=data_fin["target"]
model=RandomForestClassifier()
print(y.sum()/len(y))
for i in range(0,2,1):
    a,b,c,d=train_test_split(x,y,stratify=y,test_size=0.2,random_state=i)
    model.fit(a,c)
    probs=model.predict_proba(b)[:,1]
    pred=model.predict(b)
    print(roc_auc_score(d,probs),f1_score(d,pred))
    print(confusion_matrix(d,pred))

0.11697794003071924
0.6633620086935103 0.30494645588985214
[[7323  611]
 [ 752  299]]
0.6663623802171914 0.3037974683544304
[[7310  624]
 [ 751  300]]


In [111]:
# let's do analysis of calling data, not including poutcome as it is the outcome
data_calling=X[["campaign","previous","duration"]]

In [112]:
data_calling["target"]=bank_marketing.data.targets
data_calling["target"]=data_calling["target"].map({"yes":1,"no":0})

In [113]:
x=data_calling.drop("target",axis=1)
y=data_calling["target"]
model=RandomForestClassifier()
print(y.sum()/len(y))
for i in range(0,2,1):
    a,b,c,d=train_test_split(x,y,stratify=y,test_size=0.2,random_state=i)
    model.fit(a,c)
    probs=model.predict_proba(b)[:,1]
    pred=model.predict(b)
    print(roc_auc_score(d,probs),f1_score(d,pred))
    print(confusion_matrix(d,pred))

0.11698480458295547
0.7505431971335668 0.31410622501427754
[[7567  418]
 [ 783  275]]
0.7478169133287484 0.30929024812463934
[[7578  407]
 [ 790  268]]


In [83]:
# let's check if data size is influencing the decision

In [84]:
x=data_calling.drop("target",axis=1)
y=data_calling["target"]
model=RandomForestClassifier()
print(y.sum()/len(y))
for i in range(0,2,1):
    a,b,c,d=train_test_split(x,y,stratify=y,train_size=0.2,random_state=i)
    model.fit(a,c)
    probs=model.predict_proba(b)[:,1]
    pred=model.predict(b)
    print(roc_auc_score(d,probs),f1_score(d,pred))
    print(confusion_matrix(d,pred))

0.11698480458295547
0.7506445549289329 0.3088296833589064
[[30107  1831]
 [ 3124  1107]]
0.7514272512364013 0.31246582832148717
[[29996  1942]
 [ 3088  1143]]


In [None]:
# this means there is merit in calling data

In [86]:
a,b,c,d=train_test_split(x,y,stratify=y,train_size=0.5,random_state=1)
model.fit(a,c)
probs_test=model.predict_proba(b)[:,1]
pred_test=model.predict(b)
probs_train=model.predict_proba(a)[:,1]
pred_train=model.predict(a)
print(roc_auc_score(c,probs_train),f1_score(c,pred_train))
print(confusion_matrix(c,pred_train))
print(roc_auc_score(d,probs_test),f1_score(d,pred_test))
print(confusion_matrix(d,pred_test))

0.9594271708045514 0.7349424768779608
[[19801   160]
 [ 1015  1629]]
0.7502612419359527 0.30571814497973887
[[18843  1118]
 [ 1966   679]]


In [None]:
# similar results, means there is potential of ml

In [114]:
data_calling=data_calling.loc[data_fin.index]
data_calling_fin=pd.concat([data_calling,data_fin],axis=1)

In [115]:
data_calling_fin=data_calling_fin.loc[:, ~data_calling_fin.columns.duplicated()]

In [91]:
x=data_calling_fin.drop("target",axis=1)
y=data_calling_fin["target"]

a,b,c,d=train_test_split(x,y,stratify=y,train_size=0.5,random_state=1)
model.fit(a,c)
probs_test=model.predict_proba(b)[:,1]
pred_test=model.predict(b)
probs_train=model.predict_proba(a)[:,1]
pred_train=model.predict(a)
print(roc_auc_score(c,probs_train),f1_score(c,pred_train))
print(confusion_matrix(c,pred_train))
print(roc_auc_score(d,probs_test),f1_score(d,pred_test))
print(confusion_matrix(d,pred_test))

1.0 1.0
[[19834     0]
 [    0  2627]]
0.8591121279987672 0.38455935906773486
[[19135   699]
 [ 1836   792]]


In [None]:
# awesome results improved but are still bad, ml can't help