In [9]:
from __future__ import division
import numpy as np
import scipy as sp
import pandas as pd
import csv
import random as rn

import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [10]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import KernelPCA, PCA
from sklearn.linear_model import LogisticRegressionCV
from sklearn import svm, cross_validation, grid_search
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import f_classif, RFECV, SelectKBest

In [11]:
from keras.models import Sequential
from keras.utils import np_utils
from keras.layers.core import Dense, Dropout, Activation
from keras.optimizers import SGD, Adagrad, Adadelta

In [12]:
ls ../datasets/bank/bank

bank-full.csv   bank-names.txt  bank.csv


In [13]:
fields = []

bank_csv = "../datasets/bank/bank/bank-full.csv"
with open(bank_csv, 'rb') as bcsv:
    reader = csv.reader(bcsv, delimiter=';')
    for row in reader:
        fields.append(row)

In [14]:
bank_df = pd.DataFrame(fields[1:], columns=fields[0])
bank_df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [15]:
bank_df.describe()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
count,45211,45211,45211,45211,45211,45211,45211,45211,45211,45211,45211,45211,45211,45211,45211,45211,45211
unique,77,12,3,4,2,7168,2,2,3,31,12,1573,48,559,41,4,2
top,32,blue-collar,married,secondary,no,0,yes,no,cellular,20,may,124,1,-1,0,unknown,no
freq,2085,9732,27214,23202,44396,3514,25130,37967,29285,2752,13766,188,17544,36954,36954,36959,39922


In [16]:
bank_df.shape

(45211, 17)

In [17]:
# Non-class features
na_indices = ['age', 'balance', 'duration', 'campaign', 'pdays', 'previous']

In [18]:
indices = bank_df.columns.tolist()
indices

['age',
 'job',
 'marital',
 'education',
 'default',
 'balance',
 'housing',
 'loan',
 'contact',
 'day',
 'month',
 'duration',
 'campaign',
 'pdays',
 'previous',
 'poutcome',
 'y']

In [19]:
indices = list(set(indices) - set(na_indices))
indices

['poutcome',
 'default',
 'loan',
 'marital',
 'job',
 'contact',
 'month',
 'y',
 'education',
 'housing',
 'day']

In [20]:
# Numeric labeling of features
attrs = []

for col in indices:
    attr_vals = set(bank_df[col])
    attrs.append(list(attr_vals))
    val_map = {val:i for val, i in zip(attrs[-1], range(len(attr_vals)))}
    bank_df[col] = bank_df[col].replace(val_map)

In [21]:
bank_df = bank_df.astype(int)

In [22]:
binary_indices = ['default', 'housing', 'loan', 'y']

# 1-hot encoding
for attr, old_col_name in zip(attrs, indices):
    if not old_col_name in binary_indices:
        for label, col_name in zip(range(len(attr)), attr):
            bank_df[col_name] = bank_df[[old_col_name]].applymap(lambda x: 1 if x == label else 0)

        bank_df.drop(old_col_name, axis=1, inplace=True)

In [23]:
# -1 -> 1000
bank_df.pdays = bank_df.pdays.replace({-1:1000})

In [24]:
# Normalizing numeric data
for col in na_indices:
    bank_df[col] = sp.stats.zscore(bank_df[col])

In [25]:
y = bank_df.y.as_matrix()
bank_df.drop('y', axis=1, inplace=True)

X = bank_df.as_matrix()
X.shape

(45211, 75)

In [39]:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
 X,y, train_size=.7, random_state=20)

In [40]:
X_train.shape

(31647, 75)

# Bank Classifers

In [41]:
# Logit
logit_clf = LogisticRegressionCV(solver='liblinear', Cs=[.001,.01,.1,1,10,100], penalty='l1')
logit_clf.fit(X_train, y_train)
logit_clf.score(X_test, y_test)

0.90607490415806546

In [28]:
# Logit
logit_clf = LogisticRegressionCV(solver='newton-cg', Cs=[.001,.01,.1,1,10,100], penalty='l2')
logit_clf.fit(X_train, y_train)
logit_clf.score(X_test, y_test)

0.90087289547636218

In [42]:
# SVM 
svm_params = {'C':[.01,1,10,100,500], 'degree':[1,2,3]}
clf = svm.SVC(kernel='poly')
svm_clf = grid_search.GridSearchCV(clf, svm_params, cv=3, n_jobs=4)
svm_clf.fit(X_train,y_train)
print svm_clf.best_score_
svm_clf.best_params_

0.901665244731


{'C': 500, 'degree': 3}

In [43]:
# Random Forests
rf_params = {"max_features":[1,2,4,6,8,12,16,20], 
             "n_estimators":[1024]}
clf = RandomForestClassifier()
rf_clf = grid_search.GridSearchCV(clf, rf_params, cv=3, n_jobs=4)
rf_clf.fit(X_train,y_train)
print rf_clf.best_score_
rf_clf.best_params_

0.904224729042


{'max_features': 20, 'n_estimators': 1024}

In [44]:
# KNN
knn_params = {"n_neighbors":[1,2,3,5,7,10,15,25,50,100,500]}
clf = KNeighborsClassifier()
knn_clf = grid_search.GridSearchCV(clf, knn_params, cv=3, n_jobs=4)
knn_clf.fit(X_train,y_train);
print knn_clf.best_score_
knn_clf.best_params_

0.897778620406


{'n_neighbors': 25}

In [45]:
# Gradient Boosting
gb_params = {"n_estimators":[512,1024],
            "learning_rate":[.01,.1]}
clf = GradientBoostingClassifier()
gb_clf = grid_search.GridSearchCV(clf, gb_params, cv=3, n_jobs=4)
gb_clf.fit(X_train,y_train);
print gb_clf.best_score_
gb_clf.best_params_

0.905741460486


{'learning_rate': 0.1, 'n_estimators': 512}

In [46]:
for classifer in [logit_clf, svm_clf, rf_clf, knn_clf, gb_clf]:
    print classifer.score(X_test, y_test)

0.906074904158
0.907033323503
0.909982306104
0.904231790032
0.911014450015


In [37]:
print sum(y)/len(y)

0.883015195417
