# Training Specialized Neural Network Agents to Identify Invalid Keys

This notebook train 5 separate specialized agents for detecting each of the rule violated (in total there are 5 rules). Each character in the keys are transformed to one hot vector before feeding into the neural network model through a process called tokenize. 

In [1]:
import os
os.environ['KERAS_BACKEND'] = 'tensorflow'
# os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
# os.environ["CUDA_VISIBLE_DEVICES"]="0"
import tensorflow as tf
CONFIG = tf.ConfigProto(device_count = {'GPU': 1}, log_device_placement=False, allow_soft_placement=False)
CONFIG.gpu_options.allow_growth = True # Prevents tf from grabbing all gpu memory.
sess = tf.Session(config=CONFIG)
from keras import backend as K
K.set_session(sess)

from keras import backend as K
from keras.optimizers import Adam
import keras

import h5py
from sklearn import metrics
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline 
import time
import random

from keras.models import Sequential, Model
from keras.models import load_model
from keras.models import Sequential
from keras.optimizers import SGD, Adam,RMSprop
from keras.layers import Input, Dense, Flatten, Reshape, merge, Activation,Dropout,concatenate,Lambda
from collections import defaultdict
import pandas as pd
import gen_v2
import imp
imp.reload(gen_v2)
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import pickle

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


# Test Tokenizer

In [None]:
_chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890"
tokenizer = Tokenizer(num_words=38) 
tokenizer.fit_on_texts(list(_chars))
print(len(list(_chars)))
# tokenizer.texts_to_sequences(_chars)

# Prepare data

In [239]:
def prepare_data( n , rule_broken = 'r1'):
    """
    Prepare the data consists of half valid keys and half invalid keys with one rule broken. 
    """
    string_list = []
    r1_func = getattr(gen_v2, rule_broken)
    not_r1_func = getattr(gen_v2, 'not_'+rule_broken)
    print(r1_func, not_r1_func)
    for i in range(n):
        seq = gen_v2.random_char_seq()
        seq = r1_func(seq)
        seq0 = gen_v2.random_char_seq()
        seq0 = not_r1_func(seq0)
        string_list.append( seq + ', ' + str(1) )
        string_list.append( seq0 + ', ' + str(0) )
    random.shuffle(string_list)
    return string_list

def prepare_x_y_train():
    """
    wrap string data into h5 file; 
    use keras tokonizer to transform each character into one hot encoded vector.
    """
    for i in range(1, 6):
        print('preparing rule {}'.format(i))
        string_list = prepare_data( n=100000, rule_broken = 'r'+str(i) )
        texts_into_letters = []
        label = []
        for j,letter in enumerate(string_list):
            texts_into_letters.append( list(string_list[j])[:36] )
            label.append( float( list(string_list[j])[38] ) )  # 38th is the label 1 or 0
        # print(texts_into_letters)    
        _chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890"
        tokenizer = Tokenizer(num_words=38) # most frequent 100 words, set it to be large to include all possible words 
#         tokenizer.fit_on_texts(texts_into_letters)
        tokenizer.fit_on_texts(_chars)
        sequences_list = tokenizer.texts_to_sequences(texts_into_letters)
        sequences = np.asarray( [np.asarray(x) for x in sequences_list] )
        # print(sequences.shape)
        cat_sequences = keras.utils.to_categorical(sequences, num_classes=None)[:,:,1:] # the tokenizer starts from 1
        with h5py.File( "/extra/yadongl10/keys/data/r{}.h5".format(i), "w" ) as f:
            f.create_dataset('text', data=cat_sequences)
            f.create_dataset('label', data=label)
            
prepare_x_y_train()

preparing rule 1
<function r1 at 0x2b895b36e158> <function not_r1 at 0x2b895b30f7b8>
preparing rule 2
<function r2 at 0x2b895b30fbf8> <function not_r2 at 0x2b895b30f6a8>
preparing rule 3
<function r3 at 0x2b895b30f730> <function not_r3 at 0x2b895b30f8c8>
preparing rule 4
<function r4 at 0x2b895b30f598> <function not_r4 at 0x2b895b30f840>
preparing rule 5
<function r5 at 0x2b895b30ff28> <function not_r5 at 0x2b895b01cb70>


# Run experiment

In [240]:
hp = {
    'dimension' : 64,
    'n_layer' : 4,
    'init' : 'he_normal',
    'act' : 'relu'
}
hp

def build_model(hp):
    input_text = Input(shape=(36,36,))
    dimensions = [hp['dimension']]* hp['n_layer']
    init = hp['init']
    act = hp['act']

    x= Flatten()(input_text)
    for i in dimensions:
        x = Dense(i,init = init, activation=act)(x)
    output = Dense(1, activation='sigmoid')(x)

    model = Model(inputs=input_text, outputs=output)

    optimizer = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08)

    model.compile(loss= 'binary_crossentropy', # contrastive_loss, #
                  optimizer=  optimizer,
                  metrics=['accuracy'])
    model.summary()
    
    return model

    

def run_experiment(hp):
    # prepare data
    model_list = []
    for i in ['r1', 'r2', 'r3', 'r4', 'r5']:
        print('loading data from: /extra/yadongl10/keys/data/{}.h5'.format(i))
        with h5py.File('/extra/yadongl10/keys/data/{}.h5'.format(i), 'r') as f:
            cat_sequences_combine_shuffle = np.asarray( f['text'] )
            label_seq_combine_shuffle = np.asarray(f['label'])
            
        model = build_model(hp)
        cut = int( 0.8 * cat_sequences_combine_shuffle.shape[0] )
        model.fit(cat_sequences_combine_shuffle[:cut,:,:], label_seq_combine_shuffle[:cut],\
          batch_size=64, epochs=15, validation_data=(cat_sequences_combine_shuffle[cut:,:,:],\
                           label_seq_combine_shuffle[cut:]))
        model_list.append(model)
    return model_list
model_list = run_experiment(hp)

loading data from: /extra/yadongl10/keys/data/r1.h5




_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_19 (InputLayer)        (None, 36, 36)            0         
_________________________________________________________________
flatten_19 (Flatten)         (None, 1296)              0         
_________________________________________________________________
dense_91 (Dense)             (None, 64)                83008     
_________________________________________________________________
dense_92 (Dense)             (None, 64)                4160      
_________________________________________________________________
dense_93 (Dense)             (None, 64)                4160      
_________________________________________________________________
dense_94 (Dense)             (None, 64)                4160      
_________________________________________________________________
dense_95 (Dense)             (None, 1)                 65        
Total para

Train on 160000 samples, validate on 40000 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
loading data from: /extra/yadongl10/keys/data/r4.h5
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_22 (InputLayer)        (None, 36, 36)            0         
_________________________________________________________________
flatten_22 (Flatten)         (None, 1296)              0         
_________________________________________________________________
dense_106 (Dense)            (None, 64)                83008     
_________________________________________________________________
dense_107 (Dense)            (None, 64)                4160      
_________________________________________________________________
dense_108 (Dense)            (None, 64)                4160      


In [195]:
for i in ['r1']:
    print('loading data from: /extra/yadongl10/keys/data/{}.h5'.format(i))
    with h5py.File('/extra/yadongl10/keys/data/{}.h5'.format(i), 'r') as f:
        cat_sequences_combine_shuffle = np.asarray( f['text'] )
        label_seq_combine_shuffle = np.asarray( f['label'] )
    md0_r2_pred = model_list[0].predict(cat_sequences_combine_shuffle)

loading data from: /extra/yadongl10/keys/data/r1.h5


In [197]:
print(md0_r2_pred,label_seq_combine_shuffle)

[[1.0000000e+00]
 [2.2854624e-23]
 [3.1014217e-19]
 ...
 [1.0000000e+00]
 [1.0000000e+00]
 [1.0000000e+00]] [1. 0. 0. ... 1. 1. 1.]


# save model

In [8]:
for i in range(10):
    print('saving model',i)
    model_list[i].save('/extra/yadongl10/keys/trained_model/md_for_rule_{}.h5'.format(i))

# Validate model

In [243]:
def texts_to_cat(texts_valid):
    texts_into_letters = []
    for i,letter in enumerate(texts_valid):
        assert len( list(texts_valid.iloc[i]) ) == 36
        texts_into_letters.append(list(texts_valid.iloc[i]))
    texts_valid = texts_into_letters 
    
    _chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890"
    tokenizer = Tokenizer(num_words=37) # most frequent 100 words, set it to be large to include all possible words 
    tokenizer.fit_on_texts(_chars)
    sequences_valid_list = tokenizer.texts_to_sequences(texts_valid)
    
    sequences_valid = np.asarray( [np.asarray(x) for x in sequences_valid_list] )
    cat_sequences_valid = keras.utils.to_categorical(sequences_valid, num_classes=None)[:,:,1:]
    print(cat_sequences_valid.shape)
    return cat_sequences_valid

pred_list = []
for i in range(5):
    print(i)
    valid = pd.read_table(  '/extra/yadongl10/keys/invalid.txt',sep=',' ,header=None)
#     texts_to_cat(valid.iloc[:,0]) 
    cat_sequences_combine_shuffle = texts_to_cat(valid.iloc[:,0])
    pred = model_list[i].predict(cat_sequences_combine_shuffle) 
    pred[pred<0.5] = 0
    pred[pred>0.5] = 1
    pred_list.append( pred )
    

0
(10000, 36, 36)
1
(10000, 36, 36)
2
(10000, 36, 36)
3
(10000, 36, 36)
4
(10000, 36, 36)


In [270]:
def pred_on_r1_to_r5(model):
    """
    Use a specialized model to predict on r1,r2,r3,r4,r5 and print the predicted number of valid keys
    Ideally, specialized model for ri should predict very few number of valid when rule i are violated. 
    """
    pred_list=[]
    for i in range(1,6):
        print(i)
        valid = pd.read_table( '/extra/yadongl10/keys/invalid_keys_single_char/r{}.txt'.format(i),sep=',' ,header=None)
#         texts_to_cat(valid.iloc[:,0]) 
        cat_sequences_combine_shuffle = texts_to_cat(valid.iloc[:,0])
        pred = model.predict(cat_sequences_combine_shuffle) 
        pred[pred<0.5] = 0
        pred[pred>0.5] = 1
        pred_list.append( pred )
    # Model 0 predicts on r1 to r5.txt
    for i in range(5):
        print(sum(pred_list[i]))
        
pred_on_r1_to_r5(model_list[4])

1
(2000, 36, 36)
2
(2000, 36, 36)
3
(2000, 36, 36)
4
(2000, 36, 36)
5
(2000, 36, 36)
[1984.]
[1987.]
[1988.]
[1987.]
[10.]
