In [1]:
import pandas as pd

In [2]:
bank = pd.read_csv('data/bank_marketing.csv')

In [26]:
bank.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4119 entries, 0 to 4118
Data columns (total 22 columns):
Unnamed: 0        4119 non-null int64
age               4119 non-null int64
job               4119 non-null object
marital           4119 non-null object
education         4119 non-null object
default           4119 non-null object
housing           4119 non-null object
loan              4119 non-null object
contact           4119 non-null object
month             4119 non-null object
day_of_week       4119 non-null object
duration          4119 non-null int64
campaign          4119 non-null int64
pdays             4119 non-null int64
previous          4119 non-null int64
poutcome          4119 non-null object
emp.var.rate      4119 non-null float64
cons.price.idx    4119 non-null float64
cons.conf.idx     4119 non-null float64
euribor3m         4119 non-null float64
nr.employed       4119 non-null float64
y                 4119 non-null int64
dtypes: float64(5), int64(7), object(

In [27]:
X = bank[['age', 'job', 'education', 'day_of_week']]

In [28]:
y = bank['y']

In [29]:
X.describe(include = 'all') #without "include=all" you only get descrive for quantifiable variables

Unnamed: 0,age,job,education,day_of_week
count,4119.0,4119,4119,4119
unique,,12,8,5
top,,admin.,university.degree,thu
freq,,1012,1264,860
mean,40.11362,,,
std,10.313362,,,
min,18.0,,,
25%,32.0,,,
50%,38.0,,,
75%,47.0,,,


In [30]:
X.job.value_counts()

admin.           1012
blue-collar       884
technician        691
services          393
management        324
retired           166
self-employed     159
entrepreneur      148
unemployed        111
housemaid         110
student            82
unknown            39
Name: job, dtype: int64

In [31]:
X.education.value_counts()

university.degree      1264
high.school             921
basic.9y                574
professional.course     535
basic.4y                429
basic.6y                228
unknown                 167
illiterate                1
Name: education, dtype: int64

In [32]:
X.education.value_counts().sum() / X.education.value_counts()

university.degree         3.258703
high.school               4.472313
basic.9y                  7.175958
professional.course       7.699065
basic.4y                  9.601399
basic.6y                 18.065789
unknown                  24.664671
illiterate             4119.000000
Name: education, dtype: float64

In [33]:
X.day_of_week.value_counts()

thu    860
mon    855
tue    841
wed    795
fri    768
Name: day_of_week, dtype: int64

In [34]:
bank_dummies = pd.get_dummies(bank[['age', 'job', 'education', 'day_of_week']], drop_first = True)
#bank_dummies = pd.get_dummies(bank[X], drop_first = True)

In [35]:
bank_dummies.head()

Unnamed: 0,age,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,...,education_basic.9y,education_high.school,education_illiterate,education_professional.course,education_university.degree,education_unknown,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed
0,30,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,39,0,0,0,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
2,25,0,0,0,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,1
3,38,0,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0
4,47,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,0


In [36]:
from sklearn.model_selection import train_test_split

In [37]:
X_train, X_test, y_train, y_test = train_test_split(bank_dummies, y)

In [38]:
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

In [53]:
dummy = DummyClassifier()
lgr_clf = LogisticRegression()
knn_clf = KNeighborsClassifier(n_neighbors=4)

In [40]:
y.value_counts()

0    3668
1     451
Name: y, dtype: int64

In [41]:
perc = y.value_counts()[0] / (y.value_counts()[0]+y.value_counts()[1])
perc

#does this mean that for every 9 guesses you'll get it right?

0.890507404709881

In [54]:
for model in [dummy, lgr_clf, knn_clf]:
    model.fit(X_train, y_train)
    print('The ', model, '\nhas accuracy\n', model.score(X_test, y_test), '\n')

The  DummyClassifier(constant=None, random_state=None, strategy='stratified') 
has accuracy
 0.8116504854368932 

The  LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False) 
has accuracy
 0.8815533980582524 

The  KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=4, p=2,
           weights='uniform') 
has accuracy
 0.8786407766990292 



In [51]:
#dummy = DummyClassifier (strategy = 'most_frequent') #this will change the result quite a lot from strategy = stratified

In [43]:
pd.DataFrame([bank_dummies.columns, lgr_clf.coef_[0]]).T

Unnamed: 0,0,1
0,age,0.0132865
1,job_blue-collar,-0.629681
2,job_entrepreneur,-0.800255
3,job_housemaid,-0.286499
4,job_management,-0.804325
5,job_retired,0.466658
6,job_self-employed,-0.550208
7,job_services,-0.360833
8,job_student,0.843136
9,job_technician,-0.278534


In [44]:
from sklearn.metrics import confusion_matrix

In [45]:
confusion_matrix(y_test, lgr_clf.predict(X_test))

array([[908,   0],
       [122,   0]])

In [46]:
#lgr_imbalanced = LogisticRegression?

In [47]:
lgr_imbalanced = LogisticRegression(class_weight='balanced')

In [48]:
lgr_imbalanced.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [49]:
lgr_imbalanced.score(X_test, y_test)

0.5640776699029126

In [50]:
confusion_matrix(y_test, lgr_imbalanced.predict(X_test))

array([[524, 384],
       [ 65,  57]])

In [56]:
#you see differential performance based on your models and their class of balance or imbalance, class weights play a big role

In [None]:
#you can't score well with your classifier if you don't balance your classes

In [None]:
#class imbalance is an idicator for poor performance