### Prerequisites

In [1]:
import pandas as pd
import numpy as np
from scipy import sparse as ssp
import pylab as plt
from sklearn.preprocessing import LabelEncoder,LabelBinarizer,MinMaxScaler,OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.decomposition import TruncatedSVD,NMF,PCA,FactorAnalysis
from sklearn.feature_selection import SelectFromModel,SelectPercentile,f_classif
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import log_loss,roc_auc_score
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.cross_validation import StratifiedKFold,KFold
from keras.preprocessing import sequence
from keras.callbacks import ModelCheckpoint,Callback
from keras import backend as K
from keras.layers import Input, Embedding, LSTM, Dense,Flatten, Dropout, merge,Convolution1D,MaxPooling1D,Lambda,AveragePooling1D
from keras.layers.normalization import BatchNormalization
from keras.optimizers import SGD
from keras.layers.advanced_activations import PReLU,LeakyReLU,ELU,SReLU
from keras.models import Model

Using TensorFlow backend.


In [2]:
seed = 1
np.random.seed(seed)
path = "data/"
dim = 32
hidden = 64

In [3]:
#A callback is a set of functions to be applied at given stages of the training procedure. 
#You can use callbacks to get a view on internal states and statistics of the model during training. 
#You can pass a list of callbacks (as the keyword argument callbacks) to the .fit() method of the Sequential model. 
#The relevant methods of the callbacks will then be called at each stage of the training.

class AucCallback(Callback):  #inherits from Callback
    
    def __init__(self, validation_data=(), patience=25,is_regression=True,best_model_name='best_keras.mdl',feval='roc_auc_score',batch_size=1024*8):
        super(Callback, self).__init__()
        
        self.patience = patience
        self.X_val, self.y_val = validation_data  #tuple of validation X and y
        self.best = -np.inf
        self.wait = 0  #counter for patience
        self.best_model=None
        self.best_model_name = best_model_name
        self.is_regression = is_regression
        self.y_val = self.y_val#.astype(np.int)
        self.feval = feval
        self.batch_size = batch_size
        
    def on_epoch_end(self, epoch, logs={}):
        p = self.model.predict(self.X_val,batch_size=self.batch_size, verbose=0)#.ravel()
        if self.feval=='roc_auc_score':
            current = roc_auc_score(self.y_val,p)

        if current > self.best:
            self.best = current
            self.wait = 0
            self.model.save_weights(self.best_model_name,overwrite=True)
            
        else:
            if self.wait >= self.patience:
                self.model.stop_training = True
                print('Epoch %05d: early stopping' % (epoch))
                
            self.wait += 1 #incremental the number of times without improvement
        print('Epoch %d Auc: %f | Best Auc: %f \n' % (epoch,current,self.best))

In [4]:
def make_batches(size, batch_size):
    nb_batch = int(np.ceil(size/float(batch_size)))
    return [(i*batch_size, min(size, (i+1)*batch_size)) for i in range(0, nb_batch)]

### Read data

In [5]:
# Read data
train = pd.read_csv(path+'act_train.csv')
test = pd.read_csv(path+'act_test.csv')
people = pd.read_csv(path+'people.csv')
test['outcome'] = np.nan
data = pd.concat([train,test])

In [6]:
# Join people to activities
data = pd.merge(data,people,how='left',on='people_id').fillna('missing')
train = data[:train.shape[0]]
test = data[train.shape[0]:]

In [7]:
# Encode categorical values
columns = train.columns.tolist()
columns.remove('activity_id')
columns.remove('people_id')
columns.remove('outcome')
data = pd.concat([train,test])
for c in columns:
    data[c] = LabelEncoder().fit_transform(data[c].values)

train = data[:train.shape[0]]
test = data[train.shape[0]:]

data = pd.concat([train,test])
columns = train.columns.tolist()
columns.remove('activity_id')
columns.remove('people_id')
columns.remove('outcome')

In [8]:
X = train[columns].values
X_t = test[columns].values
y = train["outcome"].values
people_id = train["people_id"].values
activity_id = test['activity_id']

#del data
#del train
#del test

In [9]:
skf = StratifiedKFold(y, n_folds=4, shuffle=True, random_state=seed)
for ind_tr, ind_te in skf:
    X_train = X[ind_tr]
    X_test = X[ind_te]

    y_train = y[ind_tr]
    y_test = y[ind_te]
    break

X_train = [X_train[:,i] for i in range(X.shape[1])]
X_test = [X_test[:,i] for i in range(X.shape[1])]

#del X

### Create neural network

In [13]:
# Fill in inputs layer
flatten_layers = []
inputs = []

for c in columns:
    inputs_c = Input(shape=(1,), dtype='int32', name="Input_"+c)
    num_c = len(np.unique(data[c].values))
    embed_c = Embedding(
                    num_c,
                    dim,
                    dropout=0.2,
                    input_length=1,
                    name="Embedding_"+c
                    )(inputs_c)
    flatten_c= Flatten(name="Flatten_"+c)(embed_c)
    inputs.append(inputs_c)
    flatten_layers.append(flatten_c)
    
flatten = merge(flatten_layers, mode='concat', name="Merge")

In [15]:
# Close model
fc1 = Dense(hidden, activation='relu', name = 'Dense1')(flatten)
dp1 = Dropout(0.5, name='Dropout1')(fc1)

fc2 = Dense(hidden/2, activation='relu', name = 'Dense2')(dp1)
dp2 = Dropout(0.5, name='Dropout2')(fc2)

outputs = Dense(1, activation='sigmoid')(dp2)

model = Model(input=inputs, output=outputs)
model.compile(
            optimizer='adam',
            loss='binary_crossentropy',
          )

In [16]:
model_name = 'mlp_residual_%s_%s.hdf5'%(dim,hidden)
model_checkpoint = ModelCheckpoint(model_name, monitor='val_loss', save_best_only=True)
auc_callback = AucCallback(validation_data=(X_test,y_test), patience=5,is_regression=True,best_model_name=path+'best_keras.mdl',feval='roc_auc_score')

nb_epoch = 10

batch_size = 1024*8

In [13]:
print('Load Model')
model.load_weights(path+model_name)
# model.load_weights(path+'best_keras.mdl')

Load Model


In [35]:
model.save_weights(path+model_name)

In [17]:
model.fit(
    X_train, 
    y_train,
    batch_size=batch_size, 
    nb_epoch=nb_epoch, 
    verbose=1, 
    shuffle=True,
    validation_data=[X_test,y_test],
    # callbacks = [
        # model_checkpoint,
        # auc_callback,
        # ],
    )

# model.load_weights(model_name)
# model.load_weights(path+'best_keras.mdl')

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 1647967 samples, validate on 549324 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f81729ebda0>

### Model utilization

In [18]:
y_preds = model.predict(X_test,batch_size=1024*8)
print(roc_auc_score(y_train, y_preds))

ValueError: unknown format is not supported

In [20]:
# print('Make submission')
X_t = [X_t[:,i] for i in range(X_t.shape[1])]
outcome = model.predict(X_t,batch_size=1024*8)
submission = pd.DataFrame()
submission['activity_id'] = activity_id
submission['outcome'] = outcome
submission.to_csv('submission_residual_%s_%s.csv'%(dim,hidden),index=False)

In [46]:
###DO NOT DELETE, WORKING EXAMPLE
inputs = [model.get_layer("Input_people_id").input, K.learning_phase()]
outputs = [model.get_layer("Flatten_people_id").output]
func = K.function(inputs, outputs)

sample_size = data.shape[0]
data_ae = []
batches = make_batches(sample_size, batch_size)
for batch_index, (batch_start, batch_end) in enumerate(batches):
    X_0 = np.reshape(data[batch_start:batch_end]["people_id"], (-1, 1))
    yy = func([X_0,0])[0]
    data_ae.append(yy)


data_ae = np.vstack(data_ae)

In [68]:
inputs = [model.get_layer("Input_people_id").input, model.get_layer("Input_date_x").input, K.learning_phase()]
outputs = [model.get_layer("Flatten_people_id").output, model.get_layer("Flatten_date_x").output]
func = K.function(inputs, outputs)

sample_size = data.shape[0]
data_ae = []
batches = make_batches(sample_size, 1024)
for batch_index, (batch_start, batch_end) in enumerate(batches):
    X_0 = np.reshape(data[batch_start:batch_end][["people_id"]], (-1, 1))
    X_1 = np.reshape(data[batch_start:batch_end][["date_x"]], (-1, 1))
    yy = func([X_0,X_1,0])[0]
    data_ae.append(yy)

data_ae = np.vstack(data_ae)

In [16]:
import csv

inputs = [K.learning_phase()]
outputs = [model.get_layer("Dropout").output]

for c in columns:
    inputs_c = model.get_layer("Input_"+c).input
    inputs.append(inputs_c)

func = K.function(inputs, outputs)

sample_size = data.shape[0]
data_ae = []
batches = make_batches(sample_size, 1024*8)

myfile = open(path+'python_embeddings.csv', 'w', newline='')
wrtr = csv.writer(myfile, delimiter=',', quotechar='"')

for batch_index, (batch_start, batch_end) in enumerate(batches):
    print(batch_index, (batch_start, batch_end), sample_size, (sample_size - batch_end)/sample_size)
    X_0 = [0]
    for c in columns:
        X_0.append(np.reshape(data[batch_start:batch_end][c], (-1, 1)))
    yy = func(X_0)[0]
    for row in yy:
        wrtr.writerow(row)
    myfile.flush()
    
myfile.close() 

0 (0, 8192) 2695978 0.9969613995366431
1 (8192, 16384) 2695978 0.9939227990732862
2 (16384, 24576) 2695978 0.9908841986099293
3 (24576, 32768) 2695978 0.9878455981465725
4 (32768, 40960) 2695978 0.9848069976832156
5 (40960, 49152) 2695978 0.9817683972198586
6 (49152, 57344) 2695978 0.9787297967565017
7 (57344, 65536) 2695978 0.9756911962931448
8 (65536, 73728) 2695978 0.9726525958297879
9 (73728, 81920) 2695978 0.969613995366431
10 (81920, 90112) 2695978 0.9665753949030741
11 (90112, 98304) 2695978 0.9635367944397173
12 (98304, 106496) 2695978 0.9604981939763604
13 (106496, 114688) 2695978 0.9574595935130035
14 (114688, 122880) 2695978 0.9544209930496466
15 (122880, 131072) 2695978 0.9513823925862896
16 (131072, 139264) 2695978 0.9483437921229327
17 (139264, 147456) 2695978 0.9453051916595758
18 (147456, 155648) 2695978 0.9422665911962189
19 (155648, 163840) 2695978 0.9392279907328621
20 (163840, 172032) 2695978 0.9361893902695052
21 (172032, 180224) 2695978 0.9331507898061483
22 (1802

In [17]:
data.to_csv(path+'python_activity_ids.csv', columns=["activity_id"])