In [207]:
import pandas as pd
pd.options.mode.chained_assignment = None 
# df[df['age'] > 66]['age'] = 67 vs df.loc[df['age'] > 66, 'age'] = 67

from scipy.io import arff

import plotly
plotly.tools.set_credentials_file(username='artemii-yanushevskyi', api_key='aRmQfG7U4SAlhISYVym7')

import plotly.plotly as py
import plotly.graph_objs as go

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
from IPython.display import Image
import plotly.io as pio

import numpy as np
import seaborn as sns

from scipy.stats.stats import pearsonr
import matplotlib.pyplot as plt
colormap = plt.cm.RdBu

exporting = False

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score

from sklearn.preprocessing import OneHotEncoder

from sklearn.externals.six import StringIO  
from IPython.display import Image  
from sklearn.tree import export_graphviz
import pydotplus

from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [283]:
def plotattributes(df, attributes=['balance', 'newbalance'], size=50):
    for atr in attributes:
        trace0 = go.Histogram(
            x=df[df['termDeposit'] == 0][atr],
            name='No subscribtion',
            xbins=dict(
                size=size,
            ),
            marker=dict(color='red'),
        )
        trace1 = go.Histogram(
            x=df[df['termDeposit'] == 1][atr],
            name='Subscribtion',
            xbins=dict(
                size=size,
            ),
            marker=dict(color='green'),
        )
        data = [trace0, trace1]
        layout = go.Layout(barmode='stack', title=atr.capitalize())
        fig = go.Figure(data=data, layout=layout)
        static_image_bytes = pio.to_image(fig, format='png')
        if exporting == True:
            display(Image(static_image_bytes))
        else:
            display(iplot(fig))

# Feature Processing

In [346]:
data, meta = arff.loadarff('cworkTrain.arff')
df = pd.DataFrame(data)

# convert attribute values with type "object" to regular strings
objects_df = df.select_dtypes([object]) # select only atributes of object type
stack_df = objects_df.stack() # means create one column
decoded_stack_df = stack_df.str.decode('utf-8') # decode the values in the column
decoded_objects_df = decoded_stack_df.unstack() # separate into columns

# replace in df
for col in decoded_objects_df.columns:
    df[col] = decoded_objects_df[col]

# replace month with a value
df['month'] = pd.to_datetime(df.month, format='%b').dt.month
df['termDeposit'] = pd.to_numeric(df['termDeposit'].apply(lambda x: 0 if x == 'no' else 1))
df['poutcome'] = pd.to_numeric([0 if x=='unknown' else 1 if x=='success' else -1 for x in df['poutcome']])
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,termDeposit
0,58.0,management,married,tertiary,no,2143.0,yes,no,unknown,5.0,5,261.0,1.0,-1.0,0.0,0,0
1,44.0,technician,single,secondary,no,29.0,yes,no,unknown,5.0,5,151.0,1.0,-1.0,0.0,0,0
2,33.0,entrepreneur,married,secondary,no,2.0,yes,yes,unknown,5.0,5,76.0,1.0,-1.0,0.0,0,0
3,47.0,blue-collar,married,unknown,no,1506.0,yes,no,unknown,5.0,5,92.0,1.0,-1.0,0.0,0,0
4,35.0,management,married,tertiary,no,231.0,yes,no,unknown,5.0,5,139.0,1.0,-1.0,0.0,0,0


Notice for age:
1. after 62 yo we see that subscription rate is about 50%
2. jobs can be united by success rate
3. unknown education should be made known based on age, job, loan, housing, poutcome etc. KNN with 14 NN

In [347]:
df[df['age'] > 66]['age'] = 67

In [349]:
# sort jobs by success rate
group_job = df.groupby(['job'])[['termDeposit']].mean().sort_values(['termDeposit'])
display(group_job)
# group into categories
groups = []
y = -np.inf
for x in [0.08, 0.1, 0.12, 0.15, 0.2, 0.25, 0.29]:
    group = group_job.index[(y < group_job.termDeposit) & (group_job.termDeposit < x)]
    groups.append(list(group))
    y = x

print(groups)
# create a column name for each group
colnames = []
for group in groups:
    colname = '-'.join([c[:4] for c in group])
    colnames.append(colname)
    
print(colnames)

# create a column for each group
for i in range(len(groups)):
    newcol = [1 if job in groups[i] else 0 for job in df['job']]
    df[colnames[i]] = newcol

# delete job
df = df.drop('job', axis=1)
df.head()

Unnamed: 0_level_0,termDeposit
job,Unnamed: 1_level_1
blue-collar,0.07249
entrepreneur,0.084307
housemaid,0.087702
services,0.088007
unknown,0.103139
technician,0.108542
self-employed,0.115806
admin.,0.119162
management,0.136867
unemployed,0.159378


[['blue-collar'], ['entrepreneur', 'housemaid', 'services'], ['unknown', 'technician', 'self-employed', 'admin.'], ['management'], ['unemployed'], ['retired'], ['student']]
['blue', 'entr-hous-serv', 'unkn-tech-self-admi', 'mana', 'unem', 'reti', 'stud']


Unnamed: 0,age,marital,education,default,balance,housing,loan,contact,day,month,...,previous,poutcome,termDeposit,blue,entr-hous-serv,unkn-tech-self-admi,mana,unem,reti,stud
0,58.0,married,tertiary,no,2143.0,yes,no,unknown,5.0,5,...,0.0,0,0,0,0,0,1,0,0,0
1,44.0,single,secondary,no,29.0,yes,no,unknown,5.0,5,...,0.0,0,0,0,0,1,0,0,0,0
2,33.0,married,secondary,no,2.0,yes,yes,unknown,5.0,5,...,0.0,0,0,0,1,0,0,0,0,0
3,47.0,married,unknown,no,1506.0,yes,no,unknown,5.0,5,...,0.0,0,0,1,0,0,0,0,0,0
4,35.0,married,tertiary,no,231.0,yes,no,unknown,5.0,5,...,0.0,0,0,0,0,0,1,0,0,0


In [354]:
# X_train = df[~(df['education'] == 'unknown')]['marital', 'default', 'housing', 'loan']
# filter_col = [col for col in df if col.startswith('foo')]
# y_train 

Unnamed: 0,age,marital,education,default,balance,housing,loan,contact,day,month,...,previous,poutcome,termDeposit,blue,entr-hous-serv,unkn-tech-self-admi,mana,unem,reti,stud
0,58.0,married,tertiary,no,2143.0,yes,no,unknown,5.0,5,...,0.0,0,0,0,0,0,1,0,0,0
1,44.0,single,secondary,no,29.0,yes,no,unknown,5.0,5,...,0.0,0,0,0,0,1,0,0,0,0
2,33.0,married,secondary,no,2.0,yes,yes,unknown,5.0,5,...,0.0,0,0,0,1,0,0,0,0,0
4,35.0,married,tertiary,no,231.0,yes,no,unknown,5.0,5,...,0.0,0,0,0,0,0,1,0,0,0
5,28.0,single,tertiary,no,447.0,yes,yes,unknown,5.0,5,...,0.0,0,0,0,0,0,1,0,0,0
6,42.0,divorced,tertiary,yes,2.0,yes,no,unknown,5.0,5,...,0.0,0,0,0,1,0,0,0,0,0
7,58.0,married,primary,no,121.0,yes,no,unknown,5.0,5,...,0.0,0,0,0,0,0,0,0,1,0
8,41.0,divorced,secondary,no,270.0,yes,no,unknown,5.0,5,...,0.0,0,0,0,0,1,0,0,0,0
9,29.0,single,secondary,no,390.0,yes,no,unknown,5.0,5,...,0.0,0,0,0,0,1,0,0,0,0
10,53.0,married,secondary,no,6.0,yes,no,unknown,5.0,5,...,0.0,0,0,0,0,1,0,0,0,0


In [234]:
df = df[df['balance'] != 0]

# Balance

In [228]:
plotattributes(df, ['balance'], size=50)

None

## Apply logarithm to ```balance``` attribute.

In [374]:
x = np.linspace(-1000, 1000, num=1000)
y = signn(x) * np.log(1 + np.abs(x))
              
trace1 = go.Scatter(x=x, y=y, marker={'color': 'red'}, 
                    mode="markers+lines")
                                               
data=go.Data([trace1])
layout=go.Layout(title="Graph of $\DeclareMathOperator{\sign}{sign} \sign(x)\cdot\log(1+|x|)$", xaxis={'title':'x'}, yaxis={'title':'y'})
figure=go.Figure(data=data,layout=layout)
iplot(figure, filename='pyguide_1')

In [231]:
signn = np.vectorize(lambda x: -1 if x<0 else 1)

In [235]:
x = df['balance']
df_new['balance log'] = signn(x) * np.log(1 + np.abs(x))
plotattributes(df_new, ['balance log'], size=0.5)

None

Balance is not usually well reported. The people may have big balance but in some other bank. Let's predict balance using linear regression. Let's try to predict balance in case it is zero.
The same should be done with poutcome...

In [176]:
y = df[df['balance'] != 0].drop('balance', axis=1)
X = df[df['balance'] != 0][['balance']]
y.describe()

Unnamed: 0,age,day,month,duration,campaign,pdays,previous,termDeposit
count,33366.0,33366.0,33366.0,33366.0,33366.0,33366.0,33366.0,33366.0
mean,40.77585,15.756279,6.113409,257.933615,2.746149,41.125217,0.590421,0.118504
std,10.614692,8.312469,2.415079,256.78502,3.028759,101.229518,2.433856,0.323209
min,18.0,1.0,1.0,0.0,1.0,-1.0,0.0,0.0
25%,32.0,8.0,5.0,103.0,1.0,-1.0,0.0,0.0
50%,39.0,16.0,6.0,181.0,2.0,-1.0,0.0,0.0
75%,48.0,21.0,8.0,318.0,3.0,-1.0,0.0,0.0
max,94.0,31.0,12.0,4918.0,63.0,871.0,275.0,1.0


# Duration

In [360]:
x = df['duration']
df_new['duration log'] = signn(x) * np.log(1 + np.abs(x))
df_new.loc[df_new['duration log'] <= 2, 'duration log'] = 2
plotattributes(df_new, ['duration log'], size=0.1)

None

1. Link libraries adjust parameters, plotting functions, global variables
2. Import Dataset, separate data and target variable.
3. Conversions
    1. attribute values with type "object" to regular strings
    2. month name to number (1-12)
    3. attribute ```termDeposit``` to 1 if successful and 0 otherwise
    4. attribute ```termDeposit``` to 1 if successful -1 if not and 0 if unknown
3. One Hot encode categorical values
3. Test Run all classifiers


1. re-import Dataset
3. Conversions
1. Groups
    1. group some jobs under a new attribute
5. Predict vague attribute values
    1. Predict possible balance value if it is 0
6. Transformations 
    1. Apply logarithm to ```balance``` attribute.
    2. if age is greater 66, then it is 67
    3. Apply logarithm to ```duration``` attribute.
8. One Hot encode categorical values
7. Remove old columns
9. Apply PCA
8. Run all classifiers on clean data

In [190]:
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,termDeposit
0,58.0,management,married,tertiary,no,2143.0,yes,no,unknown,5.0,5,261.0,1.0,-1.0,0.0,-1,0
1,44.0,technician,single,secondary,no,29.0,yes,no,unknown,5.0,5,151.0,1.0,-1.0,0.0,-1,0
2,33.0,entrepreneur,married,secondary,no,2.0,yes,yes,unknown,5.0,5,76.0,1.0,-1.0,0.0,-1,0
3,47.0,blue-collar,married,unknown,no,1506.0,yes,no,unknown,5.0,5,92.0,1.0,-1.0,0.0,-1,0
4,35.0,management,married,tertiary,no,231.0,yes,no,unknown,5.0,5,139.0,1.0,-1.0,0.0,-1,0


In [184]:
from sklearn.linear_model import LinearRegression

reg = LinearRegression().fit(df[df['balance'] != 0].drop('balance', axis=1).values, df[df['balance'] != 0]['balance'])
reg.score(X, y), reg.coef_, reg.intercept_, reg.predict(df[df['balance'] == 0].drop('balance', axis=1))

ValueError: could not convert string to float: 'other'

Application of logarithm would be more suitable. We can now clearly see that values above and below 0 seem to be normally distributed. This may significantly benefit some classification algorithms like Logistic Regression, although it will have no effect for Decision Tree classifier.

On the other hand, it is easier to see how to apply binning for the attribute: -9 -3 1 4.5 9.5.

We may divide data into two parts for training.

These transformations would result in some improvement if we will create 3 new attributes:
log(x) =,<,> 0 

In [75]:
df['balance_zero'] = [1 if -20 < x < 20 else 0 for x in df['balance']]

## (Later)

In [9]:
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
df_shuffle = shuffle(df_one_hot_ordered)
X_train, X_test, y_train, y_test = train_test_split(df_shuffle.drop('termDeposit', axis=1), df_shuffle['termDeposit'], test_size=0.33, random_state=45)
display(X_train.shape, y_train.shape)

(24233, 40)

(24233,)

In [11]:
# create dummy variables for df_categorical
from sklearn.preprocessing import OneHotEncoder

df_one_hot = pd.concat([df_numerical, df['termDeposit']], axis=1)
catergorical_attributes = dict()
for atr in df_categorical.columns:
    df_dummies = pd.get_dummies(df[atr], prefix = atr)
    catergorical_attributes[atr] = [col.split('_')[1] for col in df_dummies.columns]
    df_one_hot = pd.concat([df_one_hot, df_dummies], axis=1) # the dataset ready to appy decision tree algorithm

attribute_order_one_hot = []
for i in range(len(df.columns)):
    if df.columns[i] in catergorical_attributes.keys():
        categories = [df.columns[i] + '_' + cat for cat in catergorical_attributes[df.columns[i]]]
        attribute_order_one_hot.extend(categories)
    else:
        attribute_order_one_hot.append(df.columns[i])
        
df_one_hot_ordered = df_one_hot[attribute_order_one_hot]
df_one_hot_ordered.head()

Unnamed: 0,age,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,...,month,duration,campaign,pdays,previous,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown,termDeposit
0,58.0,0,0,0,0,1,0,0,0,0,...,5,261.0,1.0,-1.0,0.0,0,0,0,1,0
1,44.0,0,0,0,0,0,0,0,0,0,...,5,151.0,1.0,-1.0,0.0,0,0,0,1,0
2,33.0,0,0,1,0,0,0,0,0,0,...,5,76.0,1.0,-1.0,0.0,0,0,0,1,0
3,47.0,0,1,0,0,0,0,0,0,0,...,5,92.0,1.0,-1.0,0.0,0,0,0,1,0
4,35.0,0,0,0,0,1,0,0,0,0,...,5,139.0,1.0,-1.0,0.0,0,0,0,1,0
