In [1]:
import pandas as pd

from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [2]:
# Load the bank data set from the CSV file stored relative to the notebook.
DATA_PATH = 'bank/bank.csv'
df = pd.read_csv(DATA_PATH)

In [3]:
# dropna() returns a new frame rather than modifying df, so the result has to
# be assigned for the call to have any effect. Per the original note this is
# not really required (presumably the file has no missing values -- confirm);
# it is kept as a safeguard and is safe to re-run.
df = df.dropna()

In [4]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(4521, 17)

In [5]:
# List the column labels to see which fields are available for modelling.
df.columns

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'y'],
      dtype='object')

In [6]:
# Keep only the categorical predictors used below, plus the target column 'y'.
selected_columns = ['job', 'marital', 'default', 'housing', 'loan', 'poutcome', 'y']
df_new = df.loc[:, selected_columns]

In [7]:
# Preview the first rows of the selected subset.
df_new.head()

Unnamed: 0,job,marital,default,housing,loan,poutcome,y
0,unemployed,married,no,no,no,unknown,no
1,services,married,no,yes,yes,failure,no
2,management,single,no,yes,no,failure,no
3,management,married,no,yes,yes,unknown,no
4,blue-collar,married,no,yes,no,unknown,no


In [8]:
# One-hot encode every selected column (the target 'y' included); each
# category becomes its own '<column>_<value>' indicator column.
# dtype=int pins the indicators to 0/1 integers. Without it the dtype depends
# on the pandas version (bool from pandas 2.0 on), which would make the frames
# displayed below show True/False instead of 0/1.
data = pd.get_dummies(
    df_new,
    columns=['y', 'job', 'marital', 'default', 'housing', 'loan', 'poutcome'],
    dtype=int,
)

In [9]:
# Inspect the indicator column names produced by the one-hot encoding.
data.columns

Index(['y_no', 'y_yes', 'job_admin.', 'job_blue-collar', 'job_entrepreneur',
       'job_housemaid', 'job_management', 'job_retired', 'job_self-employed',
       'job_services', 'job_student', 'job_technician', 'job_unemployed',
       'job_unknown', 'marital_divorced', 'marital_married', 'marital_single',
       'default_no', 'default_yes', 'housing_no', 'housing_yes', 'loan_no',
       'loan_yes', 'poutcome_failure', 'poutcome_other', 'poutcome_success',
       'poutcome_unknown'],
      dtype='object')

We're trying to predict y. We don't need two columns for y, since it can only have the values 'yes' or 'no'. Let's just use the one-hot encoding from 'y_yes', and rename 'y_yes' to 'y'.

In [10]:
# 'y_no' is the exact complement of 'y_yes', so it carries no extra information.
# The frame is rebound instead of mutated with inplace=True, and
# errors='ignore' lets the cell be re-run after 'y_no' is already gone
# (the in-place version raised KeyError on a second run).
data = data.drop(columns=['y_no'], errors='ignore')

One more column, 'job_admin.', also needs to be renamed just because its trailing period is annoying:

In [11]:
# Give the target its plain name and strip the trailing period from
# 'job_admin.'. rename() returns a new frame, so the result is rebound rather
# than mutating in place; labels that are absent are skipped by default, which
# keeps the cell safe to re-run.
data = data.rename(columns={'y_yes': 'y', 'job_admin.': 'job_admin'})

In [12]:
# Preview the first three rows of the encoded frame after the drop/rename.
data.head(3)

Unnamed: 0,y,job_admin,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,...,default_no,default_yes,housing_no,housing_yes,loan_no,loan_yes,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
0,0,0,0,0,0,0,0,0,0,0,...,1,0,1,0,1,0,0,0,0,1
1,0,0,0,0,0,0,0,0,1,0,...,1,0,0,1,0,1,1,0,0,0
2,0,0,0,0,0,1,0,0,0,0,...,1,0,0,1,1,0,1,0,0,0


Drop the 'unknown' columns because we actually know *a priori* that we do know all the values.

In [13]:
# Keep every column whose name does not contain the substring 'unknown';
# the original column order is preserved.
known_columns = [name for name in data.columns if 'unknown' not in name]
data = data[known_columns]

In [14]:
# Confirm that no 'unknown' indicator columns remain.
data.columns

Index(['y', 'job_admin', 'job_blue-collar', 'job_entrepreneur',
       'job_housemaid', 'job_management', 'job_retired', 'job_self-employed',
       'job_services', 'job_student', 'job_technician', 'job_unemployed',
       'marital_divorced', 'marital_married', 'marital_single', 'default_no',
       'default_yes', 'housing_no', 'housing_yes', 'loan_no', 'loan_yes',
       'poutcome_failure', 'poutcome_other', 'poutcome_success'],
      dtype='object')

In [15]:
# Features: every indicator column except the target. Excluding 'y' by name
# avoids the label slice 'job_admin':, which silently depended on the column
# order and on the earlier 'job_admin.' rename.
x = data.drop(columns=['y'])
# Target as a 1-D Series. data[['y']] would instead give a one-column
# DataFrame, which is not the 1-D shape the classifier's fit() expects for y.
y = data['y']

In [16]:
# Hold out part of the data for evaluation (train_test_split's default size).
# random_state fixes the shuffle so the split -- and therefore the score
# reported at the end -- is reproducible under Restart & Run All.
# stratify=y keeps the yes/no proportion the same in both splits.
xtrain, xtest, ytrain, ytest = train_test_split(
    x, y, random_state=42, stratify=y
)

In [17]:
# Create the (not yet fitted) logistic-regression model, naming the L-BFGS
# solver explicitly.
classifier = LogisticRegression(solver='lbfgs')

In [18]:
# Fit the model on the training split only; the test split stays unseen.
classifier.fit(xtrain, ytrain)

LogisticRegression()

## Testing

In [19]:
# Evaluate on the held-out split; for scikit-learn classifiers score()
# reports mean accuracy.
# NOTE(review): if 'y' is imbalanced, compare this against the majority-class
# baseline before reading it as good performance -- confirm class balance.
classifier.score(xtest, ytest)

0.900972590627763