In [3]:
import pandas as pd

In [10]:
df = pd.read_csv("bank-full.csv", delimiter=";")
df = df[['age',
'job',
'marital',
'education',
'balance',
'housing',
'contact',
'day',
'month',
'duration',
'campaign',
'pdays',
'previous',
'poutcome',
'y']]

In [11]:
df.head()

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,no


In [27]:
df.y = (df.y == "yes").astype(int)

In [28]:
df.head()

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,0
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,0
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,0
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,0
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,0


In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 15 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   balance    45211 non-null  int64 
 5   housing    45211 non-null  object
 6   contact    45211 non-null  object
 7   day        45211 non-null  int64 
 8   month      45211 non-null  object
 9   duration   45211 non-null  int64 
 10  campaign   45211 non-null  int64 
 11  pdays      45211 non-null  int64 
 12  previous   45211 non-null  int64 
 13  poutcome   45211 non-null  object
 14  y          45211 non-null  int64 
dtypes: int64(8), object(7)
memory usage: 5.2+ MB


In [30]:
# question 1, most frequent observation for the column education
df.groupby('education')['education'].agg(['count'])

Unnamed: 0_level_0,count
education,Unnamed: 1_level_1
primary,6851
secondary,23202
tertiary,13301
unknown,1857


In [52]:
# correlation for numerical features 
numerical = [numerical for numerical in list((df.dtypes[df.dtypes != "object"]).index) if numerical not in ['y']]
numerical

['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

In [53]:
print(f"age and balance: {df['age'].corr(df.balance)}")

print(f"day and campaign: {df['day'].corr(df.campaign)}")

print(f"day and pday: {df['day'].corr(df.pdays)}")

print(f"pdays and previous: {df['pdays'].corr(df.previous)}")
        

age and balance: 0.09778273937134742
day and campaign: 0.16249021632619293
day and pday: -0.09304407377294044
pdays and previous: 0.45481963548050164


In [54]:
# mutual information scores
categorical = [category for category in list((df.dtypes[df.dtypes == "object"]).index) if category not in ['y'] + numerical ]
categorical

['job', 'marital', 'education', 'housing', 'contact', 'month', 'poutcome']

In [57]:
# split the dataset
from sklearn.model_selection import train_test_split

In [58]:
df_full_train,df_test = train_test_split(df,test_size=0.2,random_state=42)

df_train,df_val = train_test_split(df_full_train,test_size=0.25,random_state=42)

In [59]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [60]:
y_train = df_train.y.values
y_val = df_val.y.values
y_test = df_test.y.values

In [61]:
del df_train['y']
del df_test['y']
del df_val['y']

In [63]:
from sklearn.metrics import mutual_info_score


def mutual_info_churn_score(series):
    return mutual_info_score(series,df_full_train.y)

mi = df_full_train[categorical].apply(mutual_info_churn_score)
mi.sort_values(ascending=False)

poutcome     0.029257
month        0.024774
contact      0.014164
housing      0.009800
job          0.007765
education    0.002458
marital      0.002019
dtype: float64

In [64]:
# working with the training and validation dataset

# one hot encoding

from sklearn.feature_extraction import DictVectorizer

In [66]:
dv = DictVectorizer(sparse=False)

In [65]:
train_dicts = df_train[categorical + numerical].to_dict(orient = 'records')

In [68]:
X_train = dv.fit_transform(train_dicts)

In [69]:
val_dicts = df_val[categorical + numerical].to_dict(orient = 'records')

In [70]:
X_val = dv.transform(val_dicts)

In [71]:
# train logistic regression model 

from sklearn.linear_model import LogisticRegression

In [72]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

In [73]:
model.fit(X_train,y_train)

In [74]:
y_pred = model.predict_proba(X_val)[:,1]

In [92]:
churn_decision = (y_pred > 0.5)

In [101]:
accuracy = (y_val == churn_decision).mean()

In [85]:
df_pred = pd.DataFrame()
df_pred['probability'] = y_pred
df_pred['prediction'] = churn_decision.astype(int)
df_pred['actual'] = y_val

df_pred['correct'] = df_pred['prediction'] == df_pred['actual']
df_pred

Unnamed: 0,probability,prediction,actual,correct
0,0.013008,0,0,True
1,0.009609,0,0,True
2,0.156794,0,1,False
3,0.238753,0,0,True
4,0.449277,0,1,False
...,...,...,...,...
9037,0.022792,0,0,True
9038,0.270360,0,1,False
9039,0.052184,0,0,True
9040,0.008910,0,0,True


In [97]:
df_pred.correct.mean()

0.9010174740101747

In [98]:
# question 5 
features = df_train.columns.to_list()
features

['age',
 'job',
 'marital',
 'education',
 'balance',
 'housing',
 'contact',
 'day',
 'month',
 'duration',
 'campaign',
 'pdays',
 'previous',
 'poutcome']

In [99]:
from sklearn.metrics import accuracy_score

In [118]:
original_score = accuracy
scores = pd.DataFrame(columns=['eliminated_feature', 'accuracy', 'difference'])
for feature in features:
    subset = features.copy()
    subset.remove(feature)
    
    dv = DictVectorizer(sparse=False)
    train_dict = df_train[subset].to_dict(orient='records')
    X_train = dv.fit_transform(train_dict)

    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    
    val_dict = df_val[subset].to_dict(orient='records')
    X_val = dv.transform(val_dict)
    
    y_pred = model.predict(X_val)
    score = accuracy_score(y_val, y_pred)
    
    scores.loc[len(scores)] = [feature, score, original_score - score]

In [115]:
scores[scores['eliminated_feature'].isin(['age','balance','marital','previous'])].sort_values(by='difference')

Unnamed: 0,eliminated_feature,accuracy,difference
2,marital,0.901239,-0.000221
12,previous,0.901017,0.0
0,age,0.900907,0.000111
4,balance,0.900796,0.000221


In [117]:
accuracy

0.9010174740101747

In [121]:
# question 6
scores = {}

for c in [0.01, 0.1, 1, 10, 100]:
    model = LogisticRegression(solver='liblinear', C=c, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_val)
    
    score = accuracy_score(y_val, y_pred)
    scores[c] = round(score, 3)
    print(f'c = {c}:\t accuracy = {score}')

c = 0.01:	 accuracy = 0.8910639239106393
c = 0.1:	 accuracy = 0.893607608936076
c = 1:	 accuracy = 0.8932758239327583
c = 10:	 accuracy = 0.8933864189338642
c = 100:	 accuracy = 0.8934970139349702


In [122]:
scores

{0.01: 0.891, 0.1: 0.894, 1: 0.893, 10: 0.893, 100: 0.893}