In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("./bank-full.csv",delimiter=';')

In [3]:
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [4]:
df.head().T

Unnamed: 0,0,1,2,3,4
age,58,44,33,47,33
job,management,technician,entrepreneur,blue-collar,unknown
marital,married,single,married,married,single
education,tertiary,secondary,secondary,unknown,unknown
default,no,no,no,no,no
balance,2143,29,2,1506,1
housing,yes,yes,yes,yes,no
loan,no,no,yes,no,no
contact,unknown,unknown,unknown,unknown,unknown
day,5,5,5,5,5


In [5]:
df = df.drop(columns = ['default', 'loan'])

In [6]:
df.head().T

Unnamed: 0,0,1,2,3,4
age,58,44,33,47,33
job,management,technician,entrepreneur,blue-collar,unknown
marital,married,single,married,married,single
education,tertiary,secondary,secondary,unknown,unknown
balance,2143,29,2,1506,1
housing,yes,yes,yes,yes,no
contact,unknown,unknown,unknown,unknown,unknown
day,5,5,5,5,5
month,may,may,may,may,may
duration,261,151,76,92,198


In [7]:
df.isnull().sum()

age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [8]:
df.dtypes

age           int64
job          object
marital      object
education    object
balance       int64
housing      object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object

In [9]:
categorical = ['job',          
    'marital',    
    'education',  
    'housing',   
    'contact',    
    'month', 
    'poutcome'
]    

numerical = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

# Question 1

In [10]:
df['education'].value_counts()

education
secondary    23202
tertiary     13301
primary       6851
unknown       1857
Name: count, dtype: int64

# Question 2

In [11]:
corr_mat = df.corr(numeric_only=True)
corr_mat

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
age,1.0,0.097783,-0.00912,-0.004648,0.00476,-0.023758,0.001288
balance,0.097783,1.0,0.004503,0.02156,-0.014578,0.003435,0.016674
day,-0.00912,0.004503,1.0,-0.030206,0.16249,-0.093044,-0.05171
duration,-0.004648,0.02156,-0.030206,1.0,-0.08457,-0.001565,0.001203
campaign,0.00476,-0.014578,0.16249,-0.08457,1.0,-0.088628,-0.032855
pdays,-0.023758,0.003435,-0.093044,-0.001565,-0.088628,1.0,0.45482
previous,0.001288,0.016674,-0.05171,0.001203,-0.032855,0.45482,1.0


In [12]:
sorted_mat = corr_mat.unstack().sort_values() 
sorted_mat.abs()

day       pdays       0.093044
pdays     day         0.093044
          campaign    0.088628
campaign  pdays       0.088628
duration  campaign    0.084570
campaign  duration    0.084570
day       previous    0.051710
previous  day         0.051710
          campaign    0.032855
campaign  previous    0.032855
duration  day         0.030206
day       duration    0.030206
age       pdays       0.023758
pdays     age         0.023758
campaign  balance     0.014578
balance   campaign    0.014578
age       day         0.009120
day       age         0.009120
age       duration    0.004648
duration  age         0.004648
pdays     duration    0.001565
duration  pdays       0.001565
previous  duration    0.001203
duration  previous    0.001203
previous  age         0.001288
age       previous    0.001288
balance   pdays       0.003435
pdays     balance     0.003435
day       balance     0.004503
balance   day         0.004503
age       campaign    0.004760
campaign  age         0.004760
previous

# Target encoding

In [13]:
df.y = (df.y == 'yes').astype(int)

# Split the data

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

In [16]:
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)
len(df_train), len(df_val), len(df_test)

(27126, 9042, 9043)

In [17]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [18]:
y_train = df_train.y.values
y_val = df_val.y.values
y_test = df_test.y.values

del df_train['y']
del df_val['y']
del df_test['y']

# Question 3

In [19]:
from sklearn.metrics import mutual_info_score

In [20]:
def mutual_info_y_score(series):
    return mutual_info_score(series, df_full_train.y)

In [21]:
mi = df_full_train[categorical].apply(mutual_info_y_score)
round(mi.sort_values(ascending=False),2)

poutcome     0.03
month        0.02
contact      0.01
housing      0.01
job          0.01
education    0.00
marital      0.00
dtype: float64

# Question 4

In [22]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

In [23]:
dv = DictVectorizer(sparse=False)

In [24]:
train_dico = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dico)

In [25]:
val_dico = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.fit_transform(val_dico)

In [26]:
dv.get_feature_names_out()

array(['age', 'balance', 'campaign', 'contact=cellular',
       'contact=telephone', 'contact=unknown', 'day', 'duration',
       'education=primary', 'education=secondary', 'education=tertiary',
       'education=unknown', 'housing=no', 'housing=yes', 'job=admin.',
       'job=blue-collar', 'job=entrepreneur', 'job=housemaid',
       'job=management', 'job=retired', 'job=self-employed',
       'job=services', 'job=student', 'job=technician', 'job=unemployed',
       'job=unknown', 'marital=divorced', 'marital=married',
       'marital=single', 'month=apr', 'month=aug', 'month=dec',
       'month=feb', 'month=jan', 'month=jul', 'month=jun', 'month=mar',
       'month=may', 'month=nov', 'month=oct', 'month=sep', 'pdays',
       'poutcome=failure', 'poutcome=other', 'poutcome=success',
       'poutcome=unknown', 'previous'], dtype=object)

In [27]:
dv.get_feature_names_out().shape, X_val.shape

((47,), (9042, 47))

## Logistic Regression

In [28]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

In [29]:
y_pred = model.predict(X_val)    
accuracy = round((y_pred == y_val).mean(),2)

# Question 5

In [30]:
features = categorical + numerical
feats = ['age','balance','marital','previous']
abs_acc = []

for i in range(len(feats)) :
    new_list = list(set(features) - {feats[i]})
    print(f"Removing {feats[i]}")
    
    train_dico = df_train[new_list].to_dict(orient='records')
    X_train = dv.fit_transform(train_dico)

    val_dico = df_val[new_list].to_dict(orient='records')
    X_val = dv.fit_transform(val_dico)
    
    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)    
    acc = (y_pred == y_val).mean()
        
    print(f"The accuracy is {acc}")
    print(f"The difference between original accuracy : {round(accuracy-acc,7)}")
    abs_acc.append(abs(accuracy-acc))

    features = categorical + numerical
    print()
    print()

Removing age
The accuracy is 0.9011280690112807
The difference between original accuracy : -0.0011281


Removing balance
The accuracy is 0.9011280690112807
The difference between original accuracy : -0.0011281


Removing marital
The accuracy is 0.900353904003539
The difference between original accuracy : -0.0003539


Removing previous
The accuracy is 0.9006856890068569
The difference between original accuracy : -0.0006857




In [46]:
print(f"Lowest accuracy difference reached when removing ")
feats[abs_acc. index(min(abs_acc))]

Lowest accuracy difference reached when removing 


'marital'

# Question 6

In [34]:
train_dico = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dico)

val_dico = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.fit_transform(val_dico)

In [42]:
Acc_list =[]
Reg = [0.01, 0.1, 1, 10, 100]
for c in Reg :
    print(f"For c = {c}")
    model = LogisticRegression(solver='liblinear', C=c, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)    
    acc = (y_pred == y_val).mean()
        
    print(f"The accuracy is {acc}")
    acc = round(acc,3)
    Acc_list.append(acc)

    print()
    print()

For c = 0.01
The accuracy is 0.898363193983632


For c = 0.1
The accuracy is 0.9014598540145985


For c = 1
The accuracy is 0.9011280690112807


For c = 10
The accuracy is 0.9015704490157045


For c = 100
The accuracy is 0.9007962840079629




In [41]:
print(f"Max accuracy {max(Acc_list)} reached for c={Reg[Acc_list. index(max(Acc_list))]}")

Max accuracy 0.902 reached for c=10
