In [1]:
import wget

In [2]:
# Fetch the UCI "Bank Marketing" archive and make sure bank-full.csv is on disk,
# so the read_csv cell below works after Restart Kernel -> Run All.
import io
import zipfile
from pathlib import Path

url = 'https://archive.ics.uci.edu/static/public/222/bank+marketing.zip'
archive_path = Path('bank+marketing.zip')
csv_name = 'bank-full.csv'

# Skip the network round-trip when the archive is already present; this also keeps
# repeated runs from accumulating duplicate copies of the download.
if not archive_path.exists():
    wget.download(url, str(archive_path))


def _extract_member(zf, wanted, dest):
    """Write the member named `wanted` from `zf` to `dest`.

    Members ending in ``.zip`` are opened in memory and searched recursively, so the
    CSV is found whether it sits at the top level or inside a nested archive.
    Returns True when the member was found and written, False otherwise.
    """
    for member in zf.namelist():
        if Path(member).name == wanted:
            dest.write_bytes(zf.read(member))
            return True
        if member.lower().endswith('.zip'):
            with zipfile.ZipFile(io.BytesIO(zf.read(member))) as inner:
                if _extract_member(inner, wanted, dest):
                    return True
    return False


if not Path(csv_name).exists():
    with zipfile.ZipFile(archive_path) as zf:
        if not _extract_member(zf, csv_name, Path(csv_name)):
            raise FileNotFoundError(f"{csv_name} not found inside {archive_path}")

-1 / unknown

'bank+marketing.zip'

In [3]:
import pandas as pd
import numpy as np

In [127]:
# The raw file is semicolon-delimited rather than comma-delimited.
df = pd.read_csv(
    "bank-full.csv",
    delimiter=";",
)

In [20]:
df.head().T

Unnamed: 0,0,1,2,3,4
age,58,44,33,47,33
job,management,technician,entrepreneur,blue-collar,unknown
marital,married,single,married,married,single
education,tertiary,secondary,secondary,unknown,unknown
balance,2143,29,2,1506,1
housing,yes,yes,yes,yes,no
contact,unknown,unknown,unknown,unknown,unknown
day,5,5,5,5,5
month,may,may,may,may,may
duration,261,151,76,92,198


### Data preparation

In [128]:
# Columns kept for the analysis; "y" (the target) stays last.
cols = [
    "age",
    "job",
    "marital",
    "education",
    "balance",
    "housing",
    "contact",
    "day",
    "month",
    "duration",
    "campaign",
    "pdays",
    "previous",
    "poutcome",
    "y",
]

In [129]:
# Keep only the selected columns. The explicit .copy() makes this an independent
# frame, so assigning to its columns later (the target encoding) does not trigger
# pandas' SettingWithCopyWarning.
df = df[cols].copy()

In [130]:
df.head(10)

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,no
5,35,management,married,tertiary,231,yes,unknown,5,may,139,1,-1,0,unknown,no
6,28,management,single,tertiary,447,yes,unknown,5,may,217,1,-1,0,unknown,no
7,42,entrepreneur,divorced,tertiary,2,yes,unknown,5,may,380,1,-1,0,unknown,no
8,58,retired,married,primary,121,yes,unknown,5,may,50,1,-1,0,unknown,no
9,43,technician,single,secondary,593,yes,unknown,5,may,55,1,-1,0,unknown,no


In [131]:
len(df)

45211

### Exploratory Data Analysis

In [132]:
df.isnull().sum()

age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [133]:
df.dtypes

age           int64
job          object
marital      object
education    object
balance       int64
housing      object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object

In [134]:
# Categorical *feature* columns. At this point the target "y" is still a string
# column, so it must be excluded explicitly: "y" is removed from df_train before
# the mutual-information step, and df_train[categorical] would raise a KeyError.
# errors="ignore" keeps the cell re-runnable after "y" has been encoded to integers.
categorical = list(
    df.select_dtypes(include="object").columns.drop("y", errors="ignore")
)

In [71]:
categorical

['job', 'marital', 'education', 'housing', 'contact', 'month', 'poutcome']

In [180]:
# Names of every column whose dtype is exactly int64.
numeric = [
    column_name
    for column_name, column_dtype in df.dtypes.items()
    if column_dtype == 'int64'
]

In [135]:
# Per-column overview: name, first five distinct values, number of distinct values.
for c in df.columns:
    column_values = df[c]
    first_distinct = column_values.unique()[:5]
    distinct_count = column_values.nunique()

    print(c)
    print(first_distinct)
    print(distinct_count)
    print()

age
[58 44 33 47 35]
77

job
['management' 'technician' 'entrepreneur' 'blue-collar' 'unknown']
12

marital
['married' 'single' 'divorced']
3

education
['tertiary' 'secondary' 'unknown' 'primary']
4

balance
[2143   29    2 1506    1]
7168

housing
['yes' 'no']
2

contact
['unknown' 'cellular' 'telephone']
3

day
[5 6 7 8 9]
31

month
['may' 'jun' 'jul' 'aug' 'oct']
12

duration
[261 151  76  92 198]
1573

campaign
[1 2 3 5 4]
48

pdays
[ -1 151 166  91  86]
559

previous
[0 3 1 4 2]
41

poutcome
['unknown' 'failure' 'other' 'success']
4

y
['no' 'yes']
2



### Question 1

In [8]:
df.education.value_counts()

education
secondary    23202
tertiary     13301
primary       6851
unknown       1857
Name: count, dtype: int64

The most frequent value (mode) of `education` is `secondary`.

### Question 2

In [181]:
corr = df[numeric].corr()

In [43]:
print(corr)

               age   balance       day  duration  campaign     pdays  previous
age       1.000000  0.097783 -0.009120 -0.004648  0.004760 -0.023758  0.001288
balance   0.097783  1.000000  0.004503  0.021560 -0.014578  0.003435  0.016674
day      -0.009120  0.004503  1.000000 -0.030206  0.162490 -0.093044 -0.051710
duration -0.004648  0.021560 -0.030206  1.000000 -0.084570 -0.001565  0.001203
campaign  0.004760 -0.014578  0.162490 -0.084570  1.000000 -0.088628 -0.032855
pdays    -0.023758  0.003435 -0.093044 -0.001565 -0.088628  1.000000  0.454820
previous  0.001288  0.016674 -0.051710  0.001203 -0.032855  0.454820  1.000000


In [183]:
# Blank out the diagonal (each feature's correlation with itself, always 1.0) so
# idxmax()/idxmin() report the strongest *other* feature in each column.
# DataFrame.mask returns a new frame instead of writing through `corr.values`,
# which only works when `.values` happens to be a writable view and raises
# "assignment destination is read-only" when pandas Copy-on-Write is enabled.
corr = corr.mask(np.eye(len(corr), dtype=bool))

In [184]:
corr.idxmax()

age          balance
balance          age
day         campaign
duration     balance
campaign         day
pdays       previous
previous       pdays
dtype: object

In [185]:
corr.idxmin()

age            pdays
balance     campaign
day            pdays
duration    campaign
campaign       pdays
pdays            day
previous         day
dtype: object

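The biggest correlation is between `pdays` and `previous` (0.454820). `age` and `balance` (0.097783) is only the largest correlation involving `age`, not the largest in the matrix.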

### Target encoding

In [136]:
# Encode the target as 1 ("yes") / 0 ("no").
# The dtype guard makes the cell idempotent: without it, running the cell a second
# time compares integers to the string 'yes' and silently turns every label into 0.
# assign() builds a new frame rather than setting a column through attribute access.
if not pd.api.types.is_numeric_dtype(df["y"]):
    df = df.assign(y=(df["y"] == 'yes').astype(int))

In [104]:
df.y.head()

0    0
1    0
2    0
3    0
4    0
Name: y, dtype: int32

#### Split the data

In [137]:
from sklearn.model_selection import train_test_split

In [138]:
df_full_train, df_test = train_test_split(df, test_size = 0.2, random_state = 42)

In [139]:
len(df_full_train), len(df_test)

(36168, 9043)

In [140]:
# df_full_train is 80% of the data, so taking 25% of it for validation yields
# 20% of the full dataset: an overall 60/20/20 train/validation/test split.
df_train, df_val = train_test_split(df_full_train, test_size = 0.25, random_state = 42)

In [141]:
len(df_train), len(df_val), len(df_test)

(27126, 9042, 9043)

In [142]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [143]:
y_train = df_train.y.values
y_val = df_val.y.values
y_test = df_test.y.values

In [144]:
y_train

array([0, 0, 0, ..., 0, 1, 0])

In [145]:
# Remove the target from the feature frames so it cannot leak into the model.
# drop(..., errors="ignore") keeps the cell re-runnable; `del df_train["y"]`
# raises a KeyError the second time it is executed.
df_train = df_train.drop(columns="y", errors="ignore")
df_val = df_val.drop(columns="y", errors="ignore")
df_test = df_test.drop(columns="y", errors="ignore")

### Question 3

In [65]:
from sklearn.metrics import mutual_info_score

In [68]:
def mutual_info_y_score(series):
    """Return the mutual information between `series` and the training target.

    `series` is compared against the notebook-level `y_train` array, so it must
    hold one value per training row.
    """
    score = mutual_info_score(labels_true=series, labels_pred=y_train)
    return score

In [74]:
mi = df_train[categorical].apply(mutual_info_y_score)
mi.sort_values(ascending=False)

poutcome     0.029533
month        0.025090
contact      0.013356
housing      0.010343
job          0.007316
education    0.002697
marital      0.002050
dtype: float64

Variable with the biggest mutual information score : poutcome

### Question 4

__One-hot encoding__

In [146]:
from sklearn.feature_extraction import DictVectorizer

In [147]:
train_dicts = df_train.to_dict(orient='records')

In [148]:
train_dicts[0]

{'age': 32,
 'job': 'technician',
 'marital': 'single',
 'education': 'tertiary',
 'balance': 1100,
 'housing': 'yes',
 'contact': 'cellular',
 'day': 11,
 'month': 'aug',
 'duration': 67,
 'campaign': 1,
 'pdays': -1,
 'previous': 0,
 'poutcome': 'unknown'}

In [149]:
dv = DictVectorizer(sparse=False)

In [150]:
X_train = dv.fit_transform(train_dicts)

In [151]:
dv.get_feature_names_out()

array(['age', 'balance', 'campaign', 'contact=cellular',
       'contact=telephone', 'contact=unknown', 'day', 'duration',
       'education=primary', 'education=secondary', 'education=tertiary',
       'education=unknown', 'housing=no', 'housing=yes', 'job=admin.',
       'job=blue-collar', 'job=entrepreneur', 'job=housemaid',
       'job=management', 'job=retired', 'job=self-employed',
       'job=services', 'job=student', 'job=technician', 'job=unemployed',
       'job=unknown', 'marital=divorced', 'marital=married',
       'marital=single', 'month=apr', 'month=aug', 'month=dec',
       'month=feb', 'month=jan', 'month=jul', 'month=jun', 'month=mar',
       'month=may', 'month=nov', 'month=oct', 'month=sep', 'pdays',
       'poutcome=failure', 'poutcome=other', 'poutcome=success',
       'poutcome=unknown', 'previous'], dtype=object)

In [152]:
X_train.shape

(27126, 47)

In [153]:
val_dicts = df_val.to_dict(orient='records')

In [154]:
X_val = dv.transform(val_dicts)

__Logistic regression__

In [27]:
from sklearn.linear_model import LogisticRegression

In [155]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

In [156]:
(y_train == 1).sum()

3128

In [157]:
model.fit(X_train, y_train)

In [78]:
model.intercept_[0]

-0.9075237913978677

In [31]:
model.coef_[0].round(2)

array([-0.  ,  0.  , -0.08,  0.25,  0.07, -1.23,  0.01,  0.  , -0.42,
       -0.23, -0.07, -0.19, -0.1 , -0.81,  0.07, -0.24, -0.24, -0.28,
       -0.09,  0.34, -0.27, -0.15,  0.24, -0.16,  0.02, -0.13, -0.3 ,
       -0.45, -0.16,  0.08, -0.67,  0.32, -0.31, -0.95, -0.95,  0.26,
        1.24, -0.47, -0.9 ,  0.74,  0.71, -0.  , -0.76, -0.55,  1.49,
       -1.08,  0.  ])

In [102]:
y_train

array([0, 0, 0, ..., 0, 1, 0])

In [104]:
X_train[1]

array([ 38.,   0.,   1.,   1.,   0.,   0.,  17., 258.,   0.,   1.,   0.,
         0.,   0.,   1.,   0.,   0.,   1.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   1.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   1.,   0.,   0.,  -1.,   0.,   0.,
         0.,   1.,   0.])

__Accuracy on the validation dataset__

In [160]:
y_pred = model.predict_proba(X_val)[:, 1]

In [161]:
term_deposit_prediction = (y_pred >= 0.5)

In [162]:
term_deposit_prediction.astype(int)

array([0, 0, 0, ..., 0, 0, 0])

In [163]:
y_val

array([0, 0, 1, ..., 0, 0, 1])

In [164]:
(y_val == term_deposit_prediction.astype(int)).mean()

0.9011280690112807

Accuracy on the validation dataset : 0.9011280690112807

In [165]:
df_pred = pd.DataFrame()

In [166]:
df_pred["proba"] = y_pred
df_pred["pred"] = term_deposit_prediction.astype(int)
df_pred["actual"] = y_val

In [167]:
df_pred

Unnamed: 0,proba,pred,actual
0,0.013152,0,0
1,0.010395,0,0
2,0.146835,0,1
3,0.208729,0,0
4,0.423039,0,1
...,...,...,...
9037,0.022816,0,0
9038,0.285076,0,1
9039,0.054195,0,0
9040,0.009990,0,0


In [168]:
df_pred["correct"] = df_pred["pred"] == df_pred["actual"]

In [169]:
df_pred

Unnamed: 0,proba,pred,actual,correct
0,0.013152,0,0,True
1,0.010395,0,0,True
2,0.146835,0,1,False
3,0.208729,0,0,True
4,0.423039,0,1,False
...,...,...,...,...
9037,0.022816,0,0,True
9038,0.285076,0,1,False
9039,0.054195,0,0,True
9040,0.009990,0,0,True


In [170]:
df_pred["correct"].mean()

0.9011280690112807

In [129]:
dict(zip(dv.get_feature_names_out(), model.coef_[0].round(2)))

{'age': -0.0,
 'balance': 0.0,
 'campaign': -0.08,
 'contact=cellular': 0.25,
 'contact=telephone': 0.07,
 'contact=unknown': -1.23,
 'day': 0.01,
 'duration': 0.0,
 'education=primary': -0.42,
 'education=secondary': -0.23,
 'education=tertiary': -0.07,
 'education=unknown': -0.19,
 'housing=no': -0.1,
 'housing=yes': -0.81,
 'job=admin.': 0.07,
 'job=blue-collar': -0.24,
 'job=entrepreneur': -0.24,
 'job=housemaid': -0.28,
 'job=management': -0.09,
 'job=retired': 0.34,
 'job=self-employed': -0.27,
 'job=services': -0.15,
 'job=student': 0.24,
 'job=technician': -0.16,
 'job=unemployed': 0.02,
 'job=unknown': -0.13,
 'marital=divorced': -0.3,
 'marital=married': -0.45,
 'marital=single': -0.16,
 'month=apr': 0.08,
 'month=aug': -0.67,
 'month=dec': 0.32,
 'month=feb': -0.31,
 'month=jan': -0.95,
 'month=jul': -0.95,
 'month=jun': 0.26,
 'month=mar': 1.24,
 'month=may': -0.47,
 'month=nov': -0.9,
 'month=oct': 0.74,
 'month=sep': 0.71,
 'pdays': -0.0,
 'poutcome=failure': -0.76,
 'pou

### Question 5

In [39]:
# Features to drop one at a time in the leave-one-feature-out comparison.
exclude_cols = [
    "age",
    "balance",
    "marital",
    "previous",
]

In [172]:
df_train.head()

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome
0,32,technician,single,tertiary,1100,yes,cellular,11,aug,67,1,-1,0,unknown
1,38,entrepreneur,married,secondary,0,yes,cellular,17,nov,258,1,-1,0,unknown
2,49,blue-collar,married,secondary,3309,yes,cellular,15,may,349,2,-1,0,unknown
3,37,housemaid,married,primary,2410,no,cellular,4,aug,315,1,-1,0,unknown
4,31,self-employed,married,tertiary,3220,no,cellular,26,aug,74,4,-1,0,unknown


In [171]:
# Leave-one-feature-out check: retrain without each candidate column and compare
# the validation accuracy with the full model's accuracy.
baseline_acc = df_pred["correct"].mean()  # identical for every iteration, so compute once

for exc_col in exclude_cols:
    print(exc_col)

    # Use a fresh vectorizer and model per iteration. Refitting the shared `dv` and
    # `model` would leave them fitted on the last reduced feature set, so they would
    # no longer match X_train / X_val in any cell that runs afterwards.
    dv_exc_col = DictVectorizer(sparse=False)
    model_exc_col = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

    # drop() returns a new frame; df_train / df_val themselves are left untouched.
    train_dicts_exc_col = df_train.drop(columns=exc_col).to_dict(orient='records')
    X_train_exc_col = dv_exc_col.fit_transform(train_dicts_exc_col)
    model_exc_col.fit(X_train_exc_col, y_train)

    val_dicts_exc_col = df_val.drop(columns=exc_col).to_dict(orient='records')
    X_val_exc_col = dv_exc_col.transform(val_dicts_exc_col)

    # Same 0.5 decision threshold as the full model; a loop-local name is used so the
    # full model's `term_deposit_prediction` is not overwritten.
    y_pred_exc_col = model_exc_col.predict_proba(X_val_exc_col)[:, 1]
    prediction_exc_col = (y_pred_exc_col >= 0.5)

    acc = (y_val == prediction_exc_col.astype(int)).mean()

    print("acc = ", acc)
    print("diff = ", baseline_acc - acc)


    
    

age
acc =  0.9011280690112807
diff =  0.0
balance
acc =  0.9011280690112807
diff =  0.0
marital
acc =  0.900353904003539
diff =  0.000774165007741745
previous
acc =  0.9006856890068569
diff =  0.00044238000442387015


In [67]:
df_val.head()

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome
0,38,services,divorced,secondary,-10,yes,unknown,17,jun,61,2,-1,0,unknown
1,42,management,single,tertiary,1146,yes,unknown,15,may,98,2,-1,0,unknown
2,43,management,married,tertiary,149,yes,unknown,23,jun,662,2,-1,0,unknown
3,50,management,married,tertiary,8205,yes,telephone,25,oct,293,3,508,1,other
4,43,management,married,tertiary,79,no,cellular,26,may,640,1,-1,0,unknown


### Question 6

__Regularization__

In [175]:
# Candidate values for the LogisticRegression `C` parameter, in ascending order.
arr_c = [
    0.01,
    0.1,
    1,
    10,
    100,
]

In [177]:
# Sweep the `C` parameter: fit one model per value on the full feature matrix and
# report its validation accuracy next to the C=1.0 baseline stored in df_pred.
baseline_acc = df_pred["correct"].mean()  # loop-invariant, so computed once

for par in arr_c:
    print("c = ", par)
    model_reg = LogisticRegression(solver='liblinear', C=par, max_iter=1000, random_state=42)
    model_reg.fit(X_train, y_train)

    # Probability of the positive class, thresholded at 0.5 like the baseline model.
    y_pred_reg = model_reg.predict_proba(X_val)[:, 1]
    term_deposit_prediction_reg = (y_pred_reg >= 0.5)

    acc = (y_val == term_deposit_prediction_reg.astype(int)).mean()
    print("acc = ", acc)
    print("diff = ", baseline_acc - acc)


c =  0.01
acc =  0.898363193983632
diff =  0.002764875027648772
c =  0.1
acc =  0.9014598540145985
diff =  -0.00033178500331776384
c =  1
acc =  0.9011280690112807
diff =  0.0
c =  10
acc =  0.9015704490157045
diff =  -0.00044238000442375913
c =  100
acc =  0.9007962840079629
diff =  0.00033178500331787486


In [178]:
# Gap between the "diff" values printed above for c = 10 and c = 0.1 (numbers
# pasted from that output); negative means c = 10 scored slightly higher.
-0.00044238000442375913 - -0.00033178500331776384

-0.0001105950011059953

Smallest `C` that leads to the best accuracy on the validation set : c=10