In [46]:
import pandas as pd
import numpy as np
from keras.utils.np_utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Input
from keras.models import Model
from keras.wrappers.scikit_learn import KerasRegressor, KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from utils import get_engine
from sklearn import linear_model
import matplotlib.pyplot as plt

In [72]:
# Load the dataset: pull the first 50000 rows of the table and work on the
# underlying NumPy array.
table = "data_fraud_little"
engine = get_engine()
dataframe = pd.read_sql_query(
    "select * from {table} limit 50000".format(table=table),
    engine,
)
dataset = dataframe.values
print("First one row of the dataset")
print("Shape [{}]".format(dataset.shape))
print(dataset[:2])

# Split into input (X) and output (Y) variables.
# Column 0 is the index and must not be used as a feature, so the features
# are columns 1 .. data_dimensions-1 and the label is column data_dimensions.
data_dimensions = 45
X = dataset[:, 1:data_dimensions]
Y = dataset[:, data_dimensions]

# Share of samples whose label equals 1, as a percentage.
print("Fraud {}% ".format(100.0 * float(np.sum(Y == 1)) / len(Y)))
print("Total #samples:", len(Y))

# One-hot encode the labels (one column per class).
Y = to_categorical(Y, nb_classes=None)

# Number of feature columns; the model builders read this global.
input_dimensions = X.shape[1]
print("shapes: X[{}]=====Y[{}]".format(X.shape, Y.shape))

First one row of the dataset
Shape [(50000, 47)]
[[  4.75200320e+07   7.49524000e+05   1.38868315e+18   6.89620000e+04
    5.83500000e+01   1.77484800e+06   5.77500000e+04   1.87362000e+05
    3.67000000e+02   1.62381800e+06   1.68885100e+06   5.20000000e+02
    5.00000000e+00   0.00000000e+00   0.00000000e+00   8.00000000e+00
    1.00000000e+00   0.00000000e+00   4.00000000e+00   0.00000000e+00
    1.00000000e+00   1.00000000e+02   0.00000000e+00   0.00000000e+00
    0.00000000e+00   5.60000000e+01   1.00000000e+00   1.00000000e+00
    0.00000000e+00   1.00000000e+00   3.00000000e+00   0.00000000e+00
    1.90000000e+01   1.84000000e+03   8.26000000e+02   6.10000000e+07
    5.83500000e+01   5.13265000e+03   7.50000000e+03   7.50000000e+03
    2.18985000e+03  -9.22337204e+18   1.38602880e+18   1.47048000e+03
    1.37419200e+18   0.00000000e+00  -9.22337204e+18]
 [  3.88424760e+07   6.72120000e+05   1.39431354e+18   1.00165000e+05
    1.49900000e+01   4.20536000e+05   1.32586000e+05   0.

In [63]:
# define baseline model
def baseline_model():
    """Return the baseline estimator.

    Delegates to ``logistic_regresion()``; ``linear_regression()`` is the
    alternative baseline defined alongside it.
    """
    estimator = logistic_regresion()
    return estimator


def keras_lin_reg():
    """Build and compile a linear regression expressed as a Keras model.

    The model is a single ``Dense(1)`` unit with a linear activation applied
    to an ``input_dimensions``-wide feature vector, compiled with mean squared
    error loss and the SGD optimizer.

    Returns:
        The compiled Keras ``Model``.
    """
    # `shape` excludes the batch axis. The former (None, input_dimensions)
    # declared an extra variable-length axis, i.e. a 3-D input, which does not
    # match the 2-D feature matrix X sliced from the dataset.
    x = Input(shape=(input_dimensions,))
    y = Dense(1, activation='linear')(x)
    model = Model(x, y, "Linear Regression")
    model.compile(loss='mse', optimizer='sgd')
    return model

def logistic_regresion():
    """Return an unfitted scikit-learn ``LogisticRegression``.

    Configured with the ``sag`` solver, ``max_iter=500`` and ``n_jobs=-1``.
    """
    return linear_model.LogisticRegression(max_iter=500, n_jobs=-1, solver='sag')
def linear_regression():
    """Return an unfitted scikit-learn ``LinearRegression`` with ``n_jobs=-1``."""
    return linear_model.LinearRegression(n_jobs=-1)

def mlp_model(hidden=None, layers=1):
    """Build and compile a multi-layer perceptron with a 2-class softmax output.

    Architecture:
      * an input ``Dense`` layer of width ``input_dimensions`` with ReLU,
      * ``layers`` hidden ``Dense`` layers of width ``hidden`` with ReLU
        (skipped entirely when ``hidden`` is ``None``),
      * a 2-unit softmax output layer.

    Args:
        hidden: Number of units in each hidden layer, or ``None`` to add no
            hidden layers.
        layers: How many hidden layers of width ``hidden`` to stack.

    Returns:
        The compiled Keras ``Sequential`` model (binary cross-entropy loss,
        Adam optimizer).
    """
    model = Sequential()
    model.add(Dense(input_dimensions, input_dim=input_dimensions, init='normal', activation='relu'))
    if hidden is not None:
        for _ in range(layers):
            # Hidden layers need a non-linearity: Dense layers without an
            # activation are linear, and a stack of linear layers collapses
            # into a single linear map, so extra depth would add nothing.
            model.add(Dense(hidden, activation='relu'))
    model.add(Dense(2, init='normal', activation='softmax'))
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='adam')
    return model

def mlp_model_wrap(layers=1):
    """Build an MLP via ``mlp_model`` with hidden width fixed at 100.

    Only ``layers`` (the number of hidden layers) is left as a parameter.
    """
    return mlp_model(hidden=100, layers=layers)
# fix random seed for reproducibility

In [56]:
# Fix the random seed for reproducibility.
seed = 7
np.random.seed(seed)

# Evaluate the MLP on standardized features: the scaler is fitted inside each
# cross-validation fold because it is part of the pipeline.
estimators = [
    ('standardize', StandardScaler()),
    ('mlp', KerasClassifier(build_fn=mlp_model_wrap, nb_epoch=100, batch_size=10000, verbose=0)),
]
pipeline = Pipeline(estimators)

# shuffle=True is required for random_state to have any effect; without it the
# folds are contiguous slices of the data and the seed is ignored (newer
# scikit-learn versions reject random_state combined with shuffle=False).
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
results = cross_val_score(pipeline, X, Y, cv=kfold, scoring='roc_auc', n_jobs=1)
print("Results:", results)
print("Results: %.24f (%.24f) ROC" % (results.mean(), results.std()))
print(pipeline)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [68]:
# Rebuild the scaler + MLP pipeline, then set the number of hidden layers
# through the pipeline's parameter interface (step name "mlp", parameter
# "layers" of the build function).
estimators = [
    ('standardize', StandardScaler()),
    ('mlp', KerasClassifier(build_fn=mlp_model_wrap, nb_epoch=100, batch_size=10000, verbose=0)),
]
pipeline = Pipeline(estimators)
pipeline.set_params(mlp__layers=2)

Pipeline(steps=[('standardize', StandardScaler(copy=True, with_mean=True, with_std=True)), ('mlp', <keras.wrappers.scikit_learn.KerasClassifier object at 0x7fe642fc1d50>)])

In [61]:
# Debugging leftover: a bare attribute lookup whose only effect is to display
# the repr of the function's __call__ wrapper; nothing is invoked here.
mlp_model_wrap.__call__

<method-wrapper '__call__' of function object at 0x7fe63bbcd938>

In [None]:
# Sweep the number of hidden layers and record the cross-validated ROC AUC
# (mean and standard deviation over the folds) for each setting.
score_means = list()
score_stds = list()
layers = range(20)
# shuffle=True is required for random_state to have any effect; without it the
# folds are contiguous slices of the data and the seed is ignored.
kfold = KFold(n_splits=3, shuffle=True, random_state=seed)
for n_layers in layers:
    pipeline.set_params(mlp__layers=n_layers)
    this_scores = cross_val_score(pipeline, X, Y, cv=kfold, scoring='roc_auc', n_jobs=1)
    score_means.append(this_scores.mean())
    score_stds.append(this_scores.std())

# Mean AUC per layer count, with the fold standard deviation as error bars.
plt.errorbar(layers, score_means, np.array(score_stds))

plt.title(
    'Performance of the MLP with Increasing layers')
plt.xlabel('number of layers')
plt.ylabel('AUC')

plt.axis('tight')
plt.show()

In [None]:
# Fit the full pipeline (scaler + MLP) on the whole dataset.
pipeline.fit(X, Y)
# The original second line was indented (an IndentationError at top level) and
# only referenced the method without calling it. Call it and preview the first
# few rows of class probabilities instead of dumping the whole array.
pipeline.predict_proba(X)[:5]