In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import VarianceThreshold

from sklearn.model_selection import train_test_split

import plotly.plotly as py
from plotly.graph_objs import *
import plotly.tools as tls

import time

## Load Excel File

In [2]:
data = pd.read_excel('Data Set for Assignment II - 1000 Records 75 Attributes.xls')

In [3]:
data.head()

Unnamed: 0,Record Serial Number,MOSTYPE,MAANTHUI,MGEMOMV,MGEMLEEF,MOSHOOFD,MGODRK,MGODPR,MGODOV,MGODGE,...,AWALAND,APERSAUT,ABESAUT,AMOTSCO,AVRAAUT,AAANHANG,ATRACTOR,AWERKT,ABROM,OUTCOME
0,1,9,1,3,3,3,1,4,1,5,...,0,1,0,0,0,0,0,0,0,0
1,2,33,1,4,3,8,0,6,0,3,...,0,0,0,0,0,0,0,0,0,0
2,3,8,2,3,3,2,2,4,1,3,...,0,0,0,0,0,0,0,0,0,0
3,4,39,1,3,3,9,1,4,1,5,...,0,1,0,0,0,0,0,0,0,0
4,5,33,1,3,3,8,0,5,0,4,...,0,0,0,0,0,0,0,0,0,0


In [4]:
#Drop Record Serial Number
data.drop('Record Serial Number', axis=1, inplace=True)

In [5]:
data.columns

Index(['MOSTYPE ', 'MAANTHUI ', 'MGEMOMV ', 'MGEMLEEF ', 'MOSHOOFD ',
       'MGODRK ', 'MGODPR ', 'MGODOV ', 'MGODGE ', 'MRELGE ', 'MRELSA ',
       'MRELOV ', 'MFALLEEN ', 'MFGEKIND ', 'MFWEKIND ', 'MOPLHOOG ',
       'MOPLMIDD ', 'MOPLLAAG ', 'MBERHOOG ', 'MBERZELF ', 'MBERBOER ',
       'MBERMIDD ', 'MBERARBG ', 'MBERARBO ', 'MSKA ', 'MSKB1 ', 'MSKB2 ',
       'MSKC ', 'MSKD ', 'MHHUUR ', 'MHKOOP ', 'MAUT1 ', 'MAUT2 ', 'MAUT0 ',
       'MZFONDS ', 'MZPART ', 'MINKM30 ', 'MINK3045 ', 'MINK4575 ',
       'MINK7512 ', 'MINK123M ', 'MINKGEM ', 'MKOOPKLA ', 'PWAPART ',
       'PWABEDR ', 'PWALAND ', 'PPERSAUT ', 'PBESAUT ', 'PMOTSCO ', 'PVRAAUT ',
       'PAANHANG ', 'PTRACTOR ', 'PWERKT ', 'PBROM ', 'PLEVEN ', 'PPERSONG ',
       'PGEZONG ', 'PWAOREG ', 'PBRAND ', 'PZEILPL ', 'PPLEZIER ', 'PFIETS ',
       'PINBOED ', 'PBYSTAND ', 'AWAPART ', 'AWABEDR ', 'AWALAND ',
       'APERSAUT ', 'ABESAUT ', 'AMOTSCO ', 'AVRAAUT ', 'AAANHANG ',
       'ATRACTOR ', 'AWERKT ', 'ABROM ', 'OUTCOME'],

In [6]:
X_all = data.iloc[:,0:76]
y_all = data['OUTCOME']
print(X_all.shape, y_all.shape)

(1000, 76) (1000,)


## Remove Fields with Zero or 5% Variance.

In [7]:
vt = VarianceThreshold(threshold=0.05)
vt_data = vt.fit(X_all)

In [8]:
X_all = X_all.iloc[:,vt_data.get_support()]
print("Shape after removing near-zero variance fields:", X_all.shape)

Shape after removing near-zero variance fields: (1000, 60)


## Normalization

In [9]:
from sklearn.preprocessing import StandardScaler
X_std = StandardScaler().fit_transform(X_all)
print("Shape of X_std: ", X_std.shape)

Shape of X_std:  (1000, 60)


## PCA

In [10]:
from sklearn.decomposition import PCA

pca = PCA()
pca.fit(X_std)

PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [11]:
tls.set_credentials_file(username='abhishek.sparta', api_key='yudmTNxLLBWQrw3yNaQX')

In [12]:
eig_vals = pca.explained_variance_ratio_
tot = sum(eig_vals)
var_exp = [(i / tot)*100 for i in sorted(eig_vals, reverse=True)]
cum_var_exp = np.cumsum(var_exp)

trace1 = Bar(
        x=['PC %s' %i for i in range(1,50)],
        y=var_exp,
        showlegend=False)

trace2 = Scatter(
        x=['PC %s' %i for i in range(1,50)], 
        y=cum_var_exp,
        name='cumulative explained variance')

data = Data([trace1, trace2])

layout=Layout(
        yaxis=YAxis(title='Explained variance in percent'),
        title='Explained variance by different principal components')

fig = Figure(data=data, layout=layout)
py.iplot(fig)

### Selecting First 37 variables

In [13]:
pca = PCA(n_components=37)
X_pca = pca.fit_transform(X_std)
print("Shape of X_pca: ", X_pca.shape)

Shape of X_pca:  (1000, 37)


## Min-Max Normalization

In [14]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

In [15]:
scaler

MinMaxScaler(copy=True, feature_range=(0, 1))

In [16]:
X_pca = scaler.fit_transform(X_pca)

# Data Split - Train, Valid and Test

In [17]:
X_all = pd.DataFrame(X_pca)

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.35, random_state=123)
print("X_train:", X_train.shape, ", X_test: ", X_test.shape, ", y_train: ", y_train.shape, ", y_test: ", y_test.shape)

X_train: (650, 37) , X_test:  (350, 37) , y_train:  (650,) , y_test:  (350,)


In [19]:
X_valid, X_test, y_valid, y_test = train_test_split(X_test, y_test, test_size=0.427, random_state=123)
print("X_valid:", X_valid.shape, ", X_test: ", X_test.shape, ", y_valid: ", y_valid.shape, ", y_test: ", y_test.shape)

X_valid: (200, 37) , X_test:  (150, 37) , y_valid:  (200,) , y_test:  (150,)


# Machine Learning

## Cost Function

In [20]:
def cost(tn, fn, tp, fp):
    return( (fn * -1000) + (tp*9000) + (fp * -9000) )

## Decision Tree

In [21]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score

In [22]:
tic = time.clock()
decisonTree = DecisionTreeClassifier(random_state=121, min_samples_leaf=3)
scores = cross_val_score(decisonTree, X_train, y_train, cv=5, scoring='f1_macro')
decisonTree.fit(X_train, y_train)
timeTaken = time.clock() - tic

In [23]:
print("Time Taken: ", timeTaken)

Time Taken:  0.067631


In [24]:
print("Average F1-Score: ",np.mean(scores))

Average F1-Score:  0.85292428526


In [25]:
pred = decisonTree.predict(X_valid)

In [26]:
con_matrix = confusion_matrix(y_pred=pred, y_true=y_valid)

In [27]:
cost_dt = cost(con_matrix[0][0], con_matrix[0][1], con_matrix[1][1], con_matrix[1][0])
print("Cost of Decision Tree: ", cost_dt)

Cost of Decision Tree:  24000


## Neural Network

In [28]:
from sklearn.neural_network import MLPClassifier

In [29]:
tic = time.clock()
mlp = MLPClassifier(activation="relu", random_state=12321)
scores = cross_val_score(mlp, X_train, y_train, cv=5)
mlp.fit(X_train, y_train)
timeTaken = time.clock() - tic

In [30]:
scores

array([ 0.93129771,  0.93129771,  0.93846154,  0.9379845 ,  0.9379845 ])

In [31]:
print("Time Taken: ", timeTaken)

Time Taken:  1.1209770000000003


In [32]:
print("Average Accuracy: ",np.mean(scores))

Average Accuracy:  0.935405190111


In [33]:
pred = mlp.predict(X_valid)
con_matrix = confusion_matrix(y_pred=pred, y_true=y_valid)
print(con_matrix)

[[193   0]
 [  0   7]]


In [34]:
cost_dt = cost(con_matrix[0][0], con_matrix[0][1], con_matrix[1][1], con_matrix[1][0])
print("Cost of Decision Tree: ", cost_dt)

Cost of Decision Tree:  63000


# Comparision of Neural Network and Neural Network

In [35]:
pred_test_dt = decisonTree.predict(X_test)
con_matrix = confusion_matrix(y_pred=pred_test_dt, y_true=y_test)
print(con_matrix)
print("Cost of Decision Tree: ", cost(con_matrix[0][0], con_matrix[0][1], con_matrix[1][1], con_matrix[1][0]))

[[142   0]
 [  2   6]]
Cost of Decision Tree:  36000


In [36]:
pred_test_mlp = mlp.predict(X_test)
con_matrix = confusion_matrix(y_pred=pred_test_mlp, y_true=y_test)
print(con_matrix)
print("Cost of Neural Network: ", cost(con_matrix[0][0], con_matrix[0][1], con_matrix[1][1], con_matrix[1][0]))

[[142   0]
 [  0   8]]
Cost of Neural Network:  72000


### Final Comparision

Neural Network is showing better results with unseen data. Neural Network is training a little slower compared to decision tree. 