In [1]:
import pandas as pd
import numpy as np

In [2]:
bank_df = pd.read_csv('bank-full.csv', sep = ';')
bank_df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [3]:
bank_additional_df = pd.read_csv('bank-additional-full.csv', sep = ';')
bank_additional_df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [4]:
bank_df.dtypes

age           int64
job          object
marital      object
education    object
default      object
balance       int64
housing      object
loan         object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object

Data Preparation

In [5]:
categorical_columns = list(bank_df.dtypes[bank_df.dtypes == 'object'].index)
for c in categorical_columns:
    bank_df[c] = bank_df[c].str.lower().str.replace('-','.')
    
df = bank_df[['age','job','marital','education','balance','housing','contact','day','month','duration','campaign','pdays','previous','poutcome','y']]
df.head()

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue.collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,no


Check for missing values

In [6]:
df.isna().sum()

age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

From the above analysis, no missing values are present

Question 1
What is the most frequent observation (mode) for the column education?

In [7]:
mode = df.education.mode()
mode

0    secondary
Name: education, dtype: object

The most frequent observation for the column education is secondary

Question 2
Create the correlation matrix for the numerical features of your dataset. In a correlation matrix, you compute the correlation coefficient between every pair of features.

What are the two features that have the biggest correlation?

- age and balance
- day and campaign
- day and pdays
- pdays and previous

In [8]:
df[['age','balance']].corr()

Unnamed: 0,age,balance
age,1.0,0.097783
balance,0.097783,1.0


In [9]:
df[['day','campaign']].corr()

Unnamed: 0,day,campaign
day,1.0,0.16249
campaign,0.16249,1.0


In [10]:
df[['day','pdays']].corr()

Unnamed: 0,day,pdays
day,1.0,-0.093044
pdays,-0.093044,1.0


In [11]:
df[['pdays','previous']].corr()

Unnamed: 0,pdays,previous
pdays,1.0,0.45482
previous,0.45482,1.0


In [12]:
df_final = df

From the above analysis the features pdays and previous have the biggest correlation

Target encoding
Now we want to encode the y variable.
Let's replace the values yes/no with 1/0.

In [13]:
df_final['z'] = df_final['y'].map( {'yes':1 ,'no':0}) 
df_final.z.unique()
df_final.dtypes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final['z'] = df_final['y'].map( {'yes':1 ,'no':0})


age           int64
job          object
marital      object
education    object
balance       int64
housing      object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
z             int64
dtype: object

In [14]:
del df_final['y']
df_final['y'] = df_final['z']
del df_final['z']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final['y'] = df_final['z']


Split the data
- Split your data in train/val/test sets with 60%/20%/20% distribution.
- Use Scikit-Learn for that (the train_test_split function) and set the seed to 42.
- Make sure that the target value y is not in your dataframe.

In [15]:
from sklearn.model_selection import train_test_split

df_full_train, df_test = train_test_split(df_final, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

len(df_train), len(df_val), len(df_test)

(27126, 9042, 9043)

In [16]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [17]:
y_train = df_train.y.values
y_val = df_val.y.values
y_test = df_test.y.values

del df_train['y']
del df_val['y']
del df_test['y']

In [18]:
df_train.head()

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome
0,32,technician,single,tertiary,1100,yes,cellular,11,aug,67,1,-1,0,unknown
1,38,entrepreneur,married,secondary,0,yes,cellular,17,nov,258,1,-1,0,unknown
2,49,blue.collar,married,secondary,3309,yes,cellular,15,may,349,2,-1,0,unknown
3,37,housemaid,married,primary,2410,no,cellular,4,aug,315,1,-1,0,unknown
4,31,self.employed,married,tertiary,3220,no,cellular,26,aug,74,4,-1,0,unknown


In [19]:
df_val.head()

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome
0,38,services,divorced,secondary,-10,yes,unknown,17,jun,61,2,-1,0,unknown
1,42,management,single,tertiary,1146,yes,unknown,15,may,98,2,-1,0,unknown
2,43,management,married,tertiary,149,yes,unknown,23,jun,662,2,-1,0,unknown
3,50,management,married,tertiary,8205,yes,telephone,25,oct,293,3,508,1,other
4,43,management,married,tertiary,79,no,cellular,26,may,640,1,-1,0,unknown


In [20]:
df_test.head()

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome
0,40,blue.collar,married,secondary,580,yes,unknown,16,may,192,1,-1,0,unknown
1,47,services,single,secondary,3644,no,unknown,9,jun,83,2,-1,0,unknown
2,25,student,single,tertiary,538,yes,cellular,20,apr,226,1,-1,0,unknown
3,42,management,married,tertiary,1773,no,cellular,9,apr,311,1,336,1,failure
4,56,management,married,tertiary,217,no,cellular,21,jul,121,2,-1,0,unknown


Question 3
Calculate the mutual information score between y and other categorical variables in the dataset. Use the training set only.
Round the scores to 2 decimals using round(score, 2).
Which of these variables has the biggest mutual information score?

- contact
- education
- housing
- poutcome

In [21]:
df_full_train.y.value_counts()

y
0    31970
1     4198
Name: count, dtype: int64

In [22]:
from sklearn.metrics import mutual_info_score

def mutual_info_y_score(series):
    return mutual_info_score(series, df_full_train.y)

In [23]:
round(mutual_info_y_score(df_full_train.contact),2)

np.float64(0.01)

In [24]:
round(mutual_info_y_score(df_full_train.education),2)

np.float64(0.0)

In [25]:
round(mutual_info_y_score(df_full_train.housing),2)

np.float64(0.01)

In [26]:
round(mutual_info_y_score(df_full_train.poutcome),2)

np.float64(0.03)

From the above analysis poutcome has the greatest mutual info y score

Question 4
Now let's train a logistic regression.
Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.
Fit the model on the training dataset.
To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
Calculate the accuracy on the validation dataset and round it to 2 decimal digits.
What accuracy did you get?

- 0.6
- 0.7
- 0.8
- 0.9

In [28]:
df_train.head()

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome
0,32,technician,single,tertiary,1100,yes,cellular,11,aug,67,1,-1,0,unknown
1,38,entrepreneur,married,secondary,0,yes,cellular,17,nov,258,1,-1,0,unknown
2,49,blue.collar,married,secondary,3309,yes,cellular,15,may,349,2,-1,0,unknown
3,37,housemaid,married,primary,2410,no,cellular,4,aug,315,1,-1,0,unknown
4,31,self.employed,married,tertiary,3220,no,cellular,26,aug,74,4,-1,0,unknown


In [30]:
from sklearn.feature_extraction import DictVectorizer
categorical = [
    'job',
    'marital',
    'education',
    'housing',
    'contact',
    'month',
    'poutcome'
]

numerical = [
    'balance',
    'day',
    'duration',
    'campaign',
    'pdays',
    'previous']
dv = DictVectorizer(sparse=False)

train_dict = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [32]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

In [33]:
y_pred = model.predict_proba(X_val)[:, 1]

In [55]:
subscribed_term_deposit_decision = (y_pred >= 0.5)

In [56]:
(y_val == subscribed_term_deposit_decision).mean()

np.float64(0.9009068790090687)

In [38]:
round((y_val == subscribed_term_deposit_decision).mean(),2)

np.float64(0.9)

The accuracy on the validation set is 0.9

Question 5
Let's find the least useful feature using the feature elimination technique.
Train a model with all these features (using the same parameters as in Q4).
Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
For each feature, calculate the difference between the original accuracy and the accuracy without the feature.
Which of following feature has the smallest difference?

- age
- balance
- marital
- previous

In [49]:
categorical = [
    'job',
    'marital',
    'education',
    'housing',
    'contact',
    'month',
    'poutcome'
]

numerical = [
    'age',
    'balance',
    'day',
    'duration',
    'campaign',
    'pdays',
    'previous']

In [41]:
def mutual_info_y_score(series):
    return mutual_info_score(series, df_full_train.churn)

In [44]:
mi = df_full_train[categorical].apply(mutual_info_y_score)
mi.sort_values(ascending=False)

poutcome     0.029257
month        0.024774
contact      0.014164
housing      0.009800
job          0.007765
education    0.002458
marital      0.002019
dtype: float64

Marital has the least amount of mutual information with y

In [60]:
categorical_exclude_marital = [
    'job',
    'education',
    'housing',
    'contact',
    'month',
    'poutcome'
]


dv = DictVectorizer(sparse=False)

train_dict = df_train[categorical_exclude_marital + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[categorical_exclude_marital + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)
model_categorical_exclude_marital = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model_categorical_exclude_marital.fit(X_train, y_train)



In [62]:
y_pred = model_categorical_exclude_marital.predict_proba(X_val)[:, 1]
subscribed_term_deposit_decision_categorical_exclude_marital = (y_pred >= 0.5)
(y_val == subscribed_term_deposit_decision_categorical_exclude_marital).mean()

np.float64(0.9009068790090687)

Excluding marital is giving an accuracy of 0.900907

In [50]:
num_df = df_full_train[numerical].corrwith(df_full_train.y).abs()
num_df.sort_values(ascending=False)

duration    0.393402
pdays       0.105742
previous    0.092051
campaign    0.072571
balance     0.052518
age         0.026684
day         0.025887
dtype: float64

In [63]:
numerical_exclude_age = [
    'day',
    'balance',
    'duration',
    'campaign',
    'pdays',
    'previous']

dv = DictVectorizer(sparse=False)

train_dict = df_train[categorical + numerical_exclude_age].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[categorical + numerical_exclude_age].to_dict(orient='records')
X_val = dv.transform(val_dict)
model_numerical_exclude_age = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model_numerical_exclude_age.fit(X_train, y_train)

y_pred = model_numerical_exclude_age.predict_proba(X_val)[:, 1]
subscribed_term_deposit_decision_numerical_exclude_age = (y_pred >= 0.5)
(y_val == subscribed_term_deposit_decision_numerical_exclude_age).mean()

np.float64(0.9013492590134926)

Excluding age is giving an accuracy of 0.90135

In [64]:
numerical_exclude_balance = [
    'age',
    'day',
    'duration',
    'campaign',
    'pdays',
    'previous']

dv = DictVectorizer(sparse=False)

train_dict = df_train[categorical + numerical_exclude_balance].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[categorical + numerical_exclude_balance].to_dict(orient='records')
X_val = dv.transform(val_dict)
model_numerical_exclude_balance = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model_numerical_exclude_balance.fit(X_train, y_train)

y_pred = model_numerical_exclude_balance.predict_proba(X_val)[:, 1]
subscribed_term_deposit_decision_numerical_exclude_balance = (y_pred >= 0.5)
(y_val == subscribed_term_deposit_decision_numerical_exclude_balance).mean()

np.float64(0.9010174740101747)

Excluding balance is giving an accuracy of 0.90102

In [66]:
numerical_exclude_previous = [
    'age',
    'day',
    'balance',
    'duration',
    'campaign',
    'pdays']

dv = DictVectorizer(sparse=False)

train_dict = df_train[categorical + numerical_exclude_previous].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[categorical + numerical_exclude_previous].to_dict(orient='records')
X_val = dv.transform(val_dict)
model_numerical_exclude_previous = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model_numerical_exclude_previous.fit(X_train, y_train)

y_pred = model_numerical_exclude_previous.predict_proba(X_val)[:, 1]
subscribed_term_deposit_decision_numerical_exclude_previous = (y_pred >= 0.5)
(y_val == subscribed_term_deposit_decision_numerical_exclude_previous).mean()

np.float64(0.9009068790090687)

Excluding previous is giving an accuracy of 0.90091

The categorical variable 'marital' is giving the least difference and the exact accuracy as the base model

Question 6
Now let's train a regularized logistic regression.
Let's try the following values of the parameter C: [0.01, 0.1, 1, 10, 100].
Train models using all the features as in Q4.
Calculate the accuracy on the validation dataset and round it to 3 decimal digits.
Which of these C leads to the best accuracy on the validation set?

- 0.01
- 0.1
- 1
- 10
- 100