In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
cols = [
    'age',
    'job',
    'marital',
    'education',
    'balance',
    'housing',
    'contact',
    'day',
    'month',
    'duration',
    'campaign',
    'pdays',
    'previous',
    'poutcome',
    'y'
]

In [3]:
df=pd.read_csv('bank-full.csv' ,delimiter=';', usecols=cols)

In [4]:
df.head()

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,no


In [5]:
df.isnull().sum()

age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

### Question 1 

In [6]:
df.education.describe()

count         45211
unique            4
top       secondary
freq          23202
Name: education, dtype: object

In [7]:
num = list(df.dtypes[df.dtypes == 'int64'].index)
num 

['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

### Question 2 

In [8]:
corr_matrix = round(df[num].corr(),4)
corr_matrix

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
age,1.0,0.0978,-0.0091,-0.0046,0.0048,-0.0238,0.0013
balance,0.0978,1.0,0.0045,0.0216,-0.0146,0.0034,0.0167
day,-0.0091,0.0045,1.0,-0.0302,0.1625,-0.093,-0.0517
duration,-0.0046,0.0216,-0.0302,1.0,-0.0846,-0.0016,0.0012
campaign,0.0048,-0.0146,0.1625,-0.0846,1.0,-0.0886,-0.0329
pdays,-0.0238,0.0034,-0.093,-0.0016,-0.0886,1.0,0.4548
previous,0.0013,0.0167,-0.0517,0.0012,-0.0329,0.4548,1.0


In [9]:
df.y = (df['y'] == 'yes').astype(int)

### Question 3

In [10]:
from sklearn.model_selection import train_test_split

df_full , df_test = train_test_split(df , test_size=0.2 , random_state=42)
df_train , df_val = train_test_split(df_full , test_size=0.25 , random_state=42)

In [11]:
len(df_train) , len(df_test) , len(df_val)

(27126, 9043, 9042)

In [12]:
df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)

y_train = df_train.y.values
y_val=df_val.y.values
y_test = df_test.y.values

In [13]:
del df_train['y']
del df_test['y']
del df_val['y']

In [14]:
cat = list(df_train.columns[df_train.dtypes=='object'])
cat

['job', 'marital', 'education', 'housing', 'contact', 'month', 'poutcome']

In [15]:
from sklearn.metrics import mutual_info_score

def mutual_score(categorical):
    return round(mutual_info_score( categorical,y_train) , 2)

mi = df_train[cat].apply(mutual_score)
mi.sort_values(ascending=False).to_frame(name='mutual score')

Unnamed: 0,mutual score
month,0.03
poutcome,0.03
job,0.01
housing,0.01
contact,0.01
marital,0.0
education,0.0


### Question 4

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

In [17]:
ohe = OneHotEncoder(sparse_output=False , handle_unknown='ignore')

X_train_cat = ohe.fit_transform(df_train[cat].values)
X_train_cat.shape

(27126, 40)

In [18]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42) 

In [19]:
X_train = np.column_stack([X_train_cat , df_train[num].values])
X_train

array([[ 0.,  0.,  0., ...,  1., -1.,  0.],
       [ 0.,  0.,  1., ...,  1., -1.,  0.],
       [ 0.,  1.,  0., ...,  2., -1.,  0.],
       ...,
       [ 0.,  0.,  0., ...,  1., -1.,  0.],
       [ 0.,  0.,  0., ...,  2., -1.,  0.],
       [ 0.,  0.,  0., ...,  2., -1.,  0.]])

In [20]:
model.fit(X_train , y_train)

In [21]:
X_val_cat = ohe.transform(df_val[cat].values)

X_val = np.column_stack([X_val_cat , df_val[num].values])


prediction = model.predict_proba(X_val)[:,1]
pred= prediction >= 0.5
g=accuracy_score( pred, y_val)

### Question 5

In [22]:
model1 = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42) 

In [23]:
feature = ['age',
    'balance',
    'marital',
    'previous'
]

In [25]:
accuracy

{'age': -0.0001105950011059953,
 'balance': 0.0001105950011059953,
 'marital': 0.0017695200176951476,
 'previous': 0.0001105950011059953}

### Question 6 

In [37]:
 accuracy={}
for f in feature:
    ohe1 = OneHotEncoder(sparse_output=False , handle_unknown='ignore')
    df_train1=df_train.copy()
    df1_train= df_train1.drop(columns=f)

    df_val1=df_val.copy()
    df_val1=df_val1.drop(columns=f)

    cat = list(df1_train.columns[df1_train.dtypes=='object'])
    num = list(df1_train.dtypes[df1_train.dtypes == 'int64'].index)
    X_train1_cat=ohe1.fit_transform(df1_train[cat])

    X_train1 = np.column_stack([X_train1_cat , df1_train[num]])

    X_val_cat1 = ohe1.fit_transform(df_val1[cat].values)

    X_val1 = np.column_stack([X_val_cat1 , df_val1[num].values])

    model1.fit(X_train1 , y_train)

    

    predict = model1.predict_proba(X_val1)[:,1]
    predict = predict >= 0.5
    score= accuracy_score(predict , y_val)

    accuracy.setdefault(f,g-score) 
    

In [38]:
C =  [0.01, 0.1, 1, 10, 100]

In [39]:
score=[]
for c in C:
    model = LogisticRegression(solver='liblinear', C=c, max_iter=1000, random_state=42)
    model.fit(X_train , y_train)
    prediction = model.predict_proba(X_val)[:,1]
    pred= prediction >= 0.5
    g=accuracy_score( pred, y_val)
    score.append(g)

ValueError: X has 47 features, but LogisticRegression is expecting 46 features as input.

In [32]:
score

[0.8976996239769962,
 0.9009068790090687,
 0.9010174740101747,
 0.9011280690112807,
 0.9015704490157045]

In [36]:

for c in C:

    ohe2 = OneHotEncoder(sparse_output=False , handle_unknown='ignore')

    X_train_cat = ohe2.fit_transform(df_train[cat].values)
    X_train = np.column_stack([X_train_cat , df_train[num].values])
    X_test_cat = ohe2.transform(df_test[cat].values)

    X_test = np.column_stack([X_test_cat , df_test[num].values])
    model2 = LogisticRegression(solver='liblinear', C=c, max_iter=1000, random_state=42)
    model2.fit(X_train , y_train)
    prediction = model2.predict_proba(X_test)[:,1]
    pred= prediction >= 0.5
    print(accuracy_score( pred, y_test))
    

0.8968262744664381
0.899037929890523
0.8989273471193188
0.898595598805706
0.8992590954329316
