In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score, accuracy_score, mean_squared_error
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression, Ridge

In [2]:
bank_full_df = pd.read_csv('bank-full.csv',sep=';')
bank_full_df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [3]:
bank_full_df.columns

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'y'],
      dtype='object')

In [4]:
selected_features = ['age', 'job', 'marital', 'education', 'balance', 'housing', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 
                     'previous', 'poutcome', 'y']

In [5]:
selected_features

['age',
 'job',
 'marital',
 'education',
 'balance',
 'housing',
 'contact',
 'day',
 'month',
 'duration',
 'campaign',
 'pdays',
 'previous',
 'poutcome',
 'y']

In [6]:
bank_full_df[selected_features].isnull().sum()

age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [7]:
bank_full_df.education.describe()

count         45211
unique            4
top       secondary
freq          23202
Name: education, dtype: object

In [8]:
bank_full_df.education.mode()

0    secondary
Name: education, dtype: object

In [9]:
df_full = bank_full_df[selected_features]

In [10]:
## Lets get numerical features from the given set of feature
numerical = df_full.select_dtypes(include = 'int64').columns
numerical

Index(['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous'], dtype='object')

In [11]:
df_full[numerical]

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
0,58,2143,5,261,1,-1,0
1,44,29,5,151,1,-1,0
2,33,2,5,76,1,-1,0
3,47,1506,5,92,1,-1,0
4,33,1,5,198,1,-1,0
...,...,...,...,...,...,...,...
45206,51,825,17,977,3,-1,0
45207,71,1729,17,456,2,-1,0
45208,72,5715,17,1127,5,184,3
45209,57,668,17,508,4,-1,0


In [12]:
df_corr = df_full[numerical].corr()
df_corr

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
age,1.0,0.097783,-0.00912,-0.004648,0.00476,-0.023758,0.001288
balance,0.097783,1.0,0.004503,0.02156,-0.014578,0.003435,0.016674
day,-0.00912,0.004503,1.0,-0.030206,0.16249,-0.093044,-0.05171
duration,-0.004648,0.02156,-0.030206,1.0,-0.08457,-0.001565,0.001203
campaign,0.00476,-0.014578,0.16249,-0.08457,1.0,-0.088628,-0.032855
pdays,-0.023758,0.003435,-0.093044,-0.001565,-0.088628,1.0,0.45482
previous,0.001288,0.016674,-0.05171,0.001203,-0.032855,0.45482,1.0


In [13]:
df_full.y = (df_full.y == 'yes').astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_full.y = (df_full.y == 'yes').astype(int)


In [14]:
df_full

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,0
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,0
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,0
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,0
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,825,no,cellular,17,nov,977,3,-1,0,unknown,1
45207,71,retired,divorced,primary,1729,no,cellular,17,nov,456,2,-1,0,unknown,1
45208,72,retired,married,secondary,5715,no,cellular,17,nov,1127,5,184,3,success,1
45209,57,blue-collar,married,secondary,668,no,telephone,17,nov,508,4,-1,0,unknown,0


In [15]:
df_full_train, df_test = train_test_split(df_full, test_size = 0.2, random_state = 42)
df_train, df_val = train_test_split(df_full_train, test_size = 0.25, random_state = 42)

len(df_train), len(df_test), len(df_val)
df_full_train

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
3344,41,blue-collar,married,primary,849,yes,unknown,15,may,72,1,-1,0,unknown,0
17965,49,technician,married,primary,1415,yes,cellular,30,jul,269,2,-1,0,unknown,0
18299,42,admin.,married,secondary,3842,no,cellular,31,jul,130,4,-1,0,unknown,0
10221,37,management,single,tertiary,-119,yes,unknown,11,jun,375,11,-1,0,unknown,0
32192,56,blue-collar,married,primary,3498,no,cellular,15,apr,264,2,-1,0,unknown,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11284,44,housemaid,single,primary,1059,no,unknown,18,jun,2093,1,-1,0,unknown,1
44732,23,student,single,tertiary,508,no,cellular,8,sep,210,1,92,1,failure,0
38158,34,technician,divorced,tertiary,1317,yes,cellular,15,may,239,1,-1,0,unknown,0
860,33,retired,married,secondary,165,no,unknown,7,may,111,1,-1,0,unknown,0


In [16]:
df_test.y == 1

3776     False
9928     False
33409    False
31885    False
15738    False
         ...  
13353    False
38732     True
5654     False
3779     False
11677    False
Name: y, Length: 9043, dtype: bool

In [17]:
df_test.reset_index(drop=True)
df_train.reset_index(drop=True)
df_val.reset_index(drop=True)

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,38,services,divorced,secondary,-10,yes,unknown,17,jun,61,2,-1,0,unknown,0
1,42,management,single,tertiary,1146,yes,unknown,15,may,98,2,-1,0,unknown,0
2,43,management,married,tertiary,149,yes,unknown,23,jun,662,2,-1,0,unknown,1
3,50,management,married,tertiary,8205,yes,telephone,25,oct,293,3,508,1,other,0
4,43,management,married,tertiary,79,no,cellular,26,may,640,1,-1,0,unknown,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9037,47,technician,married,secondary,356,no,cellular,29,jul,44,5,-1,0,unknown,0
9038,32,admin.,married,secondary,1604,no,cellular,7,jul,625,1,-1,0,unknown,1
9039,45,admin.,married,secondary,857,yes,cellular,21,nov,169,1,92,20,other,0
9040,40,admin.,married,secondary,153,yes,unknown,14,may,159,2,-1,0,unknown,0


In [18]:
y_train = df_train.y
y_test = df_test.y
y_val = df_val.y

len(df_train), len(df_test), len(df_val)

(27126, 9043, 9042)

In [19]:
del df_train['y']
del df_test['y']
del df_val['y']

In [20]:
# Get the list of all categorical variables
categorical = df_train.select_dtypes(include=['object']).columns
categorical

Index(['job', 'marital', 'education', 'housing', 'contact', 'month',
       'poutcome'],
      dtype='object')

In [21]:
def mutual_info_y_score(series):
    return mutual_info_score(series, df_full_train.y)

In [22]:
mutual_info = df_full_train[categorical].apply(mutual_info_y_score)
mi_score = mutual_info.sort_values(ascending=False).to_frame(name="MI_SCORE")
np.round(mi_score, 2)

Unnamed: 0,MI_SCORE
poutcome,0.03
month,0.02
contact,0.01
housing,0.01
job,0.01
education,0.0
marital,0.0


In [23]:
categorical = ['job', 'marital', 'education', 'housing', 'contact', 'month', 'poutcome']
numerical = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
categorical + numerical

['job',
 'marital',
 'education',
 'housing',
 'contact',
 'month',
 'poutcome',
 'age',
 'balance',
 'day',
 'duration',
 'campaign',
 'pdays',
 'previous']

In [24]:
dv = DictVectorizer(sparse=False)
train_dict = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[categorical + numerical].to_dict(orient='records')
X_val =  dv.fit_transform(val_dict)

In [25]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

In [26]:
y_pred = model.predict_proba(X_val)[:,1]
y_pred

array([0.01315207, 0.01039464, 0.14683511, ..., 0.05419486, 0.00999035,
       0.2869727 ])

In [27]:
accuracy = (y_val == (y_pred >= 0.5)).mean()
np.round(accuracy,2)

0.9

In [28]:
y_pred = model.predict(X_val)
accuracy = np.round(accuracy_score(y_val, y_pred),4)
print(f'Accuracy : {accuracy}')

Accuracy : 0.9011


In [29]:
all_features = numerical + categorical
all_features


['age',
 'balance',
 'day',
 'duration',
 'campaign',
 'pdays',
 'previous',
 'job',
 'marital',
 'education',
 'housing',
 'contact',
 'month',
 'poutcome']

In [30]:
actual_acc = accuracy
scores_df = pd.DataFrame(columns = ['Removed Feature','Accuracy Score', 'Difference'])
index = 0
for feature in all_features:
    partial_features = all_features.copy()
    partial_features.remove(feature)

    dv = DictVectorizer(sparse=False)
    train_dict = df_train[partial_features].to_dict(orient='records')
    X_train = dv.fit_transform(train_dict)

    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)

    dict_val = df_val[partial_features].to_dict(orient='records')
    X_val = dv.transform(dict_val)

    y_pred = model.predict(X_val)
    acc_score = accuracy_score(y_pred, y_val)

    scores_df.loc[index] = [feature, acc_score, actual_acc - acc_score]
    index += 1


In [31]:
scores_df

Unnamed: 0,Removed Feature,Accuracy Score,Difference
0,age,0.901128,-2.8e-05
1,balance,0.901128,-2.8e-05
2,day,0.900796,0.000304
3,duration,0.89029,0.01081
4,campaign,0.899801,0.001299
5,pdays,0.900907,0.000193
6,previous,0.900686,0.000414
7,job,0.901128,-2.8e-05
8,marital,0.900354,0.000746
9,education,0.900796,0.000304


In [32]:
min = scores_df.Difference.min()
min

-2.8069011280718037e-05

In [33]:
scores_df[scores_df.Difference == min]

Unnamed: 0,Removed Feature,Accuracy Score,Difference
0,age,0.901128,-2.8e-05
1,balance,0.901128,-2.8e-05
7,job,0.901128,-2.8e-05


In [34]:
y_val

11019    0
3403     0
12260    1
45110    0
39549    1
        ..
17743    0
12796    1
27107    0
2823     0
26490    1
Name: y, Length: 9042, dtype: int32

In [35]:
dv = DictVectorizer(sparse=False)
train_dict = df_train[all_features].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[all_features].to_dict(orient='records')
X_val =  dv.fit_transform(val_dict)

len(X_train), len(X_val)

(27126, 9042)

In [36]:

for reg_value in [0.01, 0.1, 1, 10, 100]:
    model = Ridge(alpha=reg_value, solver='sag', max_iter=1000, random_state=42)
    model.fit(X_train,y_train)

    y_pred = model.predict(X_val)
    #print(y_pred)
    accuracy_score = accuracy_score(y_val, y_pred, normalize=False)
    score = np.round(accuracy_score, 3)


    print(f'Alpha : {reg_value}, accuracy score : {score}')
    



ValueError: Classification metrics can't handle a mix of binary and continuous targets