In [31]:
import time
import numpy as np
import pandas as pd
import shap

from sklearn.utils import shuffle
from tensorflow.keras.backend import backend
from tensorflow.keras.models import *
from tensorflow.keras.layers import * 


# ---------------------------------------------------------------------------
# Data preprocessing.
#
# Reads "table.xlsx", drops rows with missing values, shuffles the rows with a
# fixed seed and splits them 80/20 into train/test parts.  Column 0 holds an
# event sequence whose last character is the class label; columns 1..6 are
# categorical features.  Everything is converted to integer ids:
#   XF_train / XF_test  - encoded feature columns
#   S_train  / S_test   - event sequences as ids, zero-padded to max_seq_len
#   y_train  / y_test   - encoded class labels
#
# Ids are assigned in order of first appearance (features, labels) or in
# sorted order (events), so the encoding is identical on every run.  Building
# the ids from a plain `set` would not be: string hashing is randomized per
# interpreter run, which changes the set iteration order.
# ---------------------------------------------------------------------------
print("============================================================")
print(" CNN for demographic sequences and features.  Anna Muratova ")
print("============================================================")


start_time = time.time()

df = pd.read_excel("table.xlsx")

df.dropna(inplace=True)   # remove empty (NaN) lines
df = shuffle(df, random_state=7)

print("Input data shape =", df.shape)
size = df.shape[0]
t = int(size * 0.8)         # train part size

x = df.values[:size, 0:7]
del df

# Split column 0 into the class label (last character) and the remaining
# event sequence, which is written back into x.
labels = []
for row in x:
    sequence = row[0]
    labels.append(sequence[-1])
    row[0] = sequence[:-1]
y = np.array(labels, dtype=str)


# Encode every column of x as integer category ids.
X = np.empty((x.shape[0], x.shape[1]), dtype='int32')

for d in range(x.shape[1]):
    codes, uniques = pd.factorize(x[:, d])
    if d == 0:
        print("Number of unique sequences =", len(uniques), "\n")
    X[:, d] = codes


XF_train = X[0:t, 1:]           # features only
XF_test  = X[t:size, 1:]

# Encode the class labels as integer ids.
label_codes, _ = pd.factorize(y)
yn = label_codes.astype('int32')

y_train = yn[0:t]
y_test  = yn[t:size]

# All unique event characters; ids start at 1 because 0 is the padding value
# left in seq_events_numbered by np.zeros.
events = sorted({event for seq in x[:, 0] for event in seq})

event_to_id = {event: i + 1 for i, event in enumerate(events)}

max_seq_len = 8
seq_events_numbered = np.zeros((x.shape[0], max_seq_len), dtype='int32')

for i, seq in enumerate(x[:, 0]):
    if len(seq) > max_seq_len:
        raise ValueError(
            "sequence %r in row %d is longer than max_seq_len = %d"
            % (seq, i, max_seq_len)
        )
    for k, event in enumerate(seq):
        seq_events_numbered[i][k] = event_to_id[event]

S_train = seq_events_numbered[0:t, :]       # train sequences
S_test  = seq_events_numbered[t:size, :]    # test  sequences


print("Train features  data shape =", XF_train.shape, y_train.shape)
print("Test  features  data shape =", XF_test.shape,  y_test.shape)
print("Train sequences data shape =", S_train.shape, y_train.shape)
print("Test  sequences data shape =", S_test.shape,  y_test.shape)
print("\nData preprocessing time, sec =  %0.2f" % (time.time() - start_time))

print("============================================================")

print("Keras backend =", backend())

def define_model(seq_length, vocab_size, features_n, classes_n=8):
    """Build and compile the two-input CNN classifier.

    The model has two branches that are concatenated before the dense head:
    an embedding + 1-D convolution over the event sequence, and a dense layer
    over the categorical features.

    Args:
        seq_length: length of every (padded) input sequence.
        vocab_size: size of the embedding vocabulary; every sequence id fed
            to the model must be smaller than this value.
        features_n: number of feature columns in the second input.
        classes_n: number of output classes (softmax units).  Defaults to 8,
            the value that used to be hard-coded.

    Returns:
        The compiled Keras model taking ``[sequences, features]`` as inputs.
    """
    # sequences
    input_seq = Input(shape=(seq_length,))
    embedding_s = Embedding(vocab_size, 40)(input_seq)
    # The kernel cannot be longer than the sequence it slides over, so it is
    # clamped to seq_length (unchanged for seq_length >= 8).
    conv_s = Conv1D(
        filters=100,
        kernel_size=min(8, seq_length),
        activation='relu',
    )(embedding_s)
    drop_s = Dropout(0.05)(conv_s)
    flat_s = Flatten()(drop_s)

    # features
    input_features = Input(shape=(features_n,))
    dens_f = Dense(100, activation='relu')(input_features)
    drop_f = Dropout(0.05)(dens_f)
    flat_f = Flatten()(drop_f)

    # merged sequences and features
    merged = concatenate([flat_s, flat_f])

    dense1 = Dense(200, activation='relu')(merged)
    drop1 = Dropout(0.05)(dense1)
    dense2 = Dense(32, activation='relu')(drop1)
    outputs = Dense(classes_n, activation='softmax')(dense2)
    model = Model(inputs=[input_seq, input_features], outputs=outputs)

    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer='adam',
        metrics=['sparse_categorical_accuracy'],
    )
    # summary() prints the table itself and returns None, so it must not be
    # wrapped in print() (that would emit a stray "None" line).
    model.summary()

    return model


# ---------------------------------------------------------------------------
# Build, train and evaluate the CNN on the preprocessed sequences + features.
# ---------------------------------------------------------------------------
time_01 = time.time()
print("\nCNN classification by sequences and features")

# define model
# The model dimensions are taken from the prepared data instead of being
# hard-coded, so they cannot drift out of sync with the preprocessing:
#   - sequence length  = width of the padded sequence matrix
#   - vocabulary size  = number of distinct events + 1 (id 0 is the padding)
#   - feature count    = width of the feature matrix
model = define_model(S_train.shape[1], len(event_to_id) + 1, XF_train.shape[1])

# fit model
model.fit([S_train, XF_train], y_train, epochs=100, batch_size=40)

time_02 = time.time()

# evaluate model on test dataset
loss, acc = model.evaluate([S_test, XF_test], y_test, verbose=0)
print('Test Accuracy: %f' % acc)

time_03 = time.time()

print("\tModel fitting time .... %0.2f" % (time_02 - time_01))
print("\tPrediction time ....... %0.2f" % (time_03 - time_02))
print("\tTotal time ............ %0.2f" % (time_03 - time_01))
print("Accuracy: %.3f" % (acc))
print("============================================================")


 RNN for demographic sequences and features.  Anna Muratova 
Input data shape = (6626, 7)
Number of unique sequences = 1069 

Train features  data shape = (5300, 6) (5300,)
Test  features  data shape = (1326, 6) (1326,)
Train sequences data shape = (5300, 8) (5300,)
Test  sequences data shape = (1326, 8) (1326,)

Data preprocessing time, sec =  0.40
Keras backend = tensorflow

CNN classification by sequences and features
Model: "model_30"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_61 (InputLayer)           [(None, 8)]          0                                            
__________________________________________________________________________________________________
embedding_30 (Embedding)        (None, 8, 40)        360         input_61[0][0]                   
______________________________________________________________

Epoch 100/100
Test Accuracy: 0.937406
	Model fitting time .... 22.76
	Prediction time ....... 0.12
	Total time ............ 22.88
Accuracy: 0.937
