Predicting whether potential bank customers will participate in the lottery, using a logistic regression algorithm

In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn import model_selection
import seaborn as sns
from sklearn.feature_selection import RFE
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import roc_auc_score, roc_curve
# Apply the seaborn theme first: sns.set() also applies seaborn's plotting
# context, which rewrites matplotlib's font-size rcParams, so the plt.rc()
# font tweak must come afterwards to take effect.
# A single sns.set() call is enough; a preceding sns.set(style="white")
# would be overridden immediately by this one.
sns.set(style="whitegrid", color_codes=True)
plt.rc("font", size=14)

In [None]:
# Load the banking dataset and discard every row that contains a missing
# value, then show the resulting shape and the first few rows.
data = pd.read_csv('banking.txt').dropna()
print(data.shape)
data.head()

In [None]:
# List the distinct values of the education column.
data.education.unique()

In [29]:
# Collapse the three "basic.*" schooling levels into one 'Basic' category;
# every other education value is left unchanged.
basic_levels = ['basic.4y', 'basic.6y', 'basic.9y']
is_basic = data['education'].isin(basic_levels)
data['education'] = np.where(is_basic, 'Basic', data['education'])

In [None]:
# List the distinct values of the education column again.
data.education.unique()

In [None]:
# Number of rows per value of the target column y.
data.y.value_counts()

In [None]:
# Bar chart of how many rows fall into each value of y.
sns.countplot(data=data, x='y', palette='hls')
plt.show()

In [None]:
# Share of each outcome among the rows where y is 0 (no subscription)
# or 1 (subscription).
outcome = data['y']
count_no_sub = int((outcome == 0).sum())
count_sub = int((outcome == 1).sum())
total_labelled = count_no_sub + count_sub

pct_of_no_sub = count_no_sub / total_labelled
pct_of_sub = count_sub / total_labelled

print('Percentage of no subscription: ', pct_of_no_sub*100)
print('Percentage of subscription: ', pct_of_sub*100)

In [None]:
# Summary statistics of `data`, as returned by DataFrame.describe().
data.describe()

In [None]:
# Per-class mean of every numeric column. `data` still holds string columns
# (job, marital, education, ...) at this point, so numeric_only=True is
# required: pandas >= 2.0 raises TypeError instead of silently skipping them.
data.groupby('y').mean(numeric_only=True)

In [None]:
# Visualization
%matplotlib inline
table = pd.crosstab(data.education, data.y)
table.div(table.sum(1).astype(float), axis=0).plot(kind='bar', stacked=True)
plt.title("Stacked bar chart of education vs purchase")
plt.xlabel('education')
plt.ylabel('proporation of customers')

In [None]:
# Histogram of the age column, labelled through the returned Axes.
age_ax = data['age'].hist()
age_ax.set_title("Histogram of age")
age_ax.set_xlabel("Age")
age_ax.set_ylabel("Frequency")

In [38]:
# creating dummy vars
cat_vars = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome']
for var in cat_vars:
    cat_list = 'var_' + var
    cat_list = pd.get_dummies(data[var], prefix=var)
    data1 = data.join(cat_list)
    data = data1

cat_vars = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome']
data_vars = data.columns.values.tolist()
to_keep = [i for i in data_vars if i not in cat_vars]
data_final = data[to_keep]

In [39]:
# Separate the column names into the target ('y') and all other columns,
# which serve as candidate predictors.
data_final_vars = data_final.columns.tolist()
y = ['y']
x = [name for name in data_final_vars if name not in y]

In [None]:
# Recursive feature elimination: keep the 18 predictors ranked most useful
# by a logistic regression model.
rfe = RFE(estimator=LogisticRegression(), n_features_to_select=18)
# `y` is a one-element list here, so data_final[y] is an (n, 1) DataFrame.
# Flatten it to 1-D, which is the target shape scikit-learn expects (a column
# vector triggers a DataConversionWarning).
rfe = rfe.fit(data_final[x], data_final[y].values.ravel())
print(rfe.support_)
print(rfe.ranking_)

In [41]:
# Predictor columns used by the final model.
cols = [
    'previous', 'euribor3m',
    'job_blue-collar', 'job_retired', 'job_services', 'job_student',
    'default_no',
    'contact_cellular',
    'month_apr', 'month_aug', 'month_dec', 'month_mar', 'month_may',
    'month_nov', 'month_oct',
    'day_of_week_mon',
    'poutcome_failure', 'poutcome_success',
]
# From here on x is the feature matrix and y the target column.
x = data_final[cols]
y = data_final['y']

In [42]:
# Hold out 30% of the rows for evaluation. A fixed random_state makes the
# split, and therefore every metric computed below, reproducible across runs.
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    x, y, test_size=0.3, random_state=42
)

In [43]:
# Fit on the training split (fit() returns the estimator itself), then
# predict class labels for the held-out split.
logreg = LogisticRegression().fit(x_train, y_train)
y_pred = logreg.predict(x_test)

In [None]:
# Tabulate true versus predicted labels on the test split.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print('Confusion matrix: \n', cm)

In [None]:
# Per-class precision, recall, F-score and support on the test split.
prfs = precision_recall_fscore_support(y_test, y_pred)
precision, recall, fscore, support = prfs
print('Precision: \n', precision)
print('Recall: \n', recall)
print('F-Score: \n', fscore)
print('Support: \n', support)

In [None]:
# Predicted probability of the positive class (second predict_proba column),
# computed once and used for both the AUC and the curve. Scoring the AUC on
# hard 0/1 predictions would report the area of a single-threshold curve,
# which does not match the curve drawn below.
y_score = logreg.predict_proba(x_test)[:, 1]
logit_roc_auc = roc_auc_score(y_test, y_score)
fpr, tpr, threshold = roc_curve(y_test, y_score)

plt.figure()
plt.plot(fpr, tpr, label='Logistic regression (area = %0.2f)' % logit_roc_auc)
# Diagonal reference line (equal true- and false-positive rates).
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
plt.title("Receiver operating characteristics")
plt.legend(loc='lower right')
plt.show()