In [None]:
# Show the GPU(s) visible to this runtime before installing the GPU build of TensorFlow.
!nvidia-smi

In [None]:
# Pinned to 1.15: the model code below uses TF1-only APIs (tf.contrib, tf.placeholder, tf.Session).
!pip install tensorflow-gpu==1.15

# Model with adversarial training

*   perturb = epsilon * l2_normalize(mask * gradient(loss, input))
*   perturb_input = input + perturb

*   self.scores: output prediction
*   self.loss: loss on the original input
*   self.perturb_loss: loss on the perturbed input




In [None]:
import tensorflow as tf
from tensorflow.contrib import rnn


class WordRNN(object):
    """Bidirectional-LSTM multi-label text classifier with adversarial training.

    ``__init__`` creates the placeholders; ``build`` creates the embedding
    table and two passes through the same network:

    * a clean pass on the looked-up embeddings -> ``self.scores``,
      ``self.logits``, ``self.loss``
    * an adversarial pass on ``embeddings + perturb`` -> ``self.perturb_probs``,
      ``self.perturb_logits``, ``self.perturb_loss``

    where ``perturb = adv_eps * l2_normalize(mask * d(loss)/d(embeddings))``.
    Both passes share every weight (LSTM cells, dense layer and output layer).
    """

    def __init__(self, max_document_length, num_class,vocab_size,embedding_size):
        """Create the input placeholders.

        :param max_document_length: padded sequence length of every input row
        :param num_class: number of output labels
        :param vocab_size: number of rows of the pretrained embedding matrix
        :param embedding_size: dimension of each word vector
        """
        self.lr = 0.001           # replaced by a non-trainable tf.Variable in build()
        self.bag_num = 64         # only used to size the `shapes` placeholder in build()

        # self.embedding_size = 256
        self.num_hidden = 512     # units of each LSTM direction
        self.fc_num_hidden = 256  # units of the dense layer after the LSTM

        self.x = tf.placeholder(tf.int32, [None, max_document_length],name = 'input_x')
        # Per-row count of positive ids; assumes id 0 is reserved for padding.
        self.x_len = tf.reduce_sum(tf.sign(self.x), 1)
        self.y = tf.placeholder(tf.float32, [None,num_class], name = 'input_y')
        self.keep_prob = tf.placeholder(tf.float32, [],name = 'dropout')
        # Initial value of the pretrained embedding table; must be fed when
        # the variables are initialised.
        self.x_init = tf.placeholder(tf.float32, shape=(vocab_size,embedding_size),name = 'x_init')
        self.global_step = tf.Variable(0, trainable=False, name="Global_Step")

        self.adv_eps = tf.placeholder(tf.float32, shape=(),name = 'adv_eps') # adversarial training parameter eps
        self.mask = mask = tf.placeholder(tf.float32, [None, max_document_length],name="mask") # sentence mask

    def build(self,max_document_length,num_class,trainset_embedding):
        """Build the clean and the adversarial forward passes.

        :param max_document_length: unused here (kept for interface compatibility)
        :param num_class: number of output labels
        :param trainset_embedding: embedding rows for words that are missing
            from the pretrained vocabulary; appended after the pretrained rows
        """
        self.shapes = shapes = tf.placeholder(tf.int32, [self.bag_num + 1])
        self.lr = tf.Variable(0.001, name='learning_rate',trainable=False)
        embeddings = tf.Variable(self.x_init, dtype=tf.float32,trainable=True,name="pretrained_embedding")
        train_embeddings = tf.Variable(trainset_embedding, dtype=tf.float32,
                                        trainable=True,name="embs_only_in_train")

        # Lookup table = pretrained rows followed by the training-set OOV rows.
        embeddings = tf.concat([embeddings, train_embeddings], axis=0)
        self.x_emb = embedded = tf.nn.embedding_lookup(embeddings, self.x)

        def neural_net(x_emb, name = 'neural_net', reuse = False):
            """Run the classifier on `x_emb`; returns (scores, loss, logits)."""
            with tf.variable_scope("lstm",reuse = reuse):
                lstm_fw_cell = tf.nn.rnn_cell.LSTMCell(self.num_hidden)  # forward direction cell
                lstm_bw_cell = tf.nn.rnn_cell.LSTMCell(self.num_hidden)  # backward direction cell

                rnn_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
                    lstm_fw_cell,lstm_bw_cell, x_emb, sequence_length=self.x_len, dtype=tf.float32)

                # Concat output
                lstm_concat = tf.concat(rnn_outputs, axis=2)  # [batch_size, sequence_length, lstm_hidden_size * 2]
                lstm_out = tf.reduce_mean(lstm_concat, axis=1)  # [batch_size, lstm_hidden_size * 2]

                fc_output = tf.layers.dense(lstm_out, self.fc_num_hidden, activation=tf.nn.relu)
                dropout = tf.nn.dropout(fc_output, self.keep_prob)

                # tf.get_variable (unlike tf.Variable) honours `reuse`, so the
                # clean and the adversarial pass share one output layer.
                # With tf.Variable the second pass silently got fresh weights
                # and the layer behind `self.scores` was never trained.
                W = tf.get_variable(
                    "W", shape=[self.fc_num_hidden, num_class], dtype=tf.float32,
                    initializer=tf.truncated_normal_initializer(stddev=0.1))
                b = tf.get_variable(
                    "b", shape=[num_class], dtype=tf.float32,
                    initializer=tf.constant_initializer(0.1))
                logits = tf.nn.xw_plus_b(dropout, W, b, name="logits")
                scores = tf.sigmoid(logits)

                # Multi-label loss: sum the per-class sigmoid cross-entropy,
                # then average over the batch.
                loss = tf.reduce_mean(tf.reduce_sum(
                    tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=self.y),axis = 1))

            return scores, loss, logits

        # Clean pass. `self.logits` belongs to this pass so that it is
        # consistent with `self.scores`.
        self.scores, self.loss, self.logits = neural_net(embedded, reuse=False)

        ### adversarial training
        raw_perturb = tf.gradients(self.loss, embedded)[0]  # [batch, L, dim]
        print('input shape, {}'.format(embedded.shape[0]))
        print('raw_perturb shape, {}'.format(raw_perturb.shape))

        # Zero the gradient where mask == 0, normalise over the whole batch
        # and scale by eps. stop_gradient makes the perturbation a constant
        # with respect to the optimiser.
        self.perturb = perturb = self.adv_eps * tf.stop_gradient(
            tf.nn.l2_normalize(raw_perturb * tf.expand_dims(self.mask, axis=-1), axis=[0,1,2]))

        print('perturb shape, {}'.format(perturb.shape))

        self.perturb_inputs = perturb_inputs = embedded + perturb
        # Adversarial pass, sharing every weight with the clean pass.
        self.perturb_probs, self.perturb_loss, self.perturb_logits = neural_net(
            perturb_inputs, reuse=True)


      

       


In [None]:
!pip install nltk
!pip install fasttext
!pip install tflearn
!pip install sklearn
import nltk
nltk.download('punkt')

# Util function
*  load data 
*  generate embedding matrix
*  generate batch 

In [15]:
import fasttext
import pandas as pd
import numpy as np
import re
import json
from nltk.tokenize import word_tokenize


def clean_str(text):
    '''
    normalise a raw text value before tokenisation

    :param text: any object; it is converted with str()
    :return: lower-cased text with punctuation replaced by spaces,
             whitespace runs collapsed and square brackets removed
    '''
    # punctuation -> single space
    cleaned = re.sub(r"[_{},:.!?%’\'\"]", " ", str(text))
    # collapse runs of whitespace, then trim and lower-case
    cleaned = re.sub(r"\s{2,}", " ", cleaned)
    cleaned = cleaned.strip().lower()
    # brackets are dropped last, so they may leave a double space behind
    for bracket in ('[', ']'):
        cleaned = cleaned.replace(bracket, '')
    return cleaned

def create_onehot_labels(labels_index,num_labels):
    '''
    build a multi-hot label vector

    :param labels_index: iterable of label ids; id k sets position k-1
    :param num_labels: length of the returned vector
    :return: list of 0/1 of length num_labels
    '''
    onehot = [0 for _ in range(num_labels)]
    for label_id in labels_index:
        position = int(label_id - 1)
        onehot[position] = 1
    return onehot


def cos_sim(vector_a, vector_b):
    """
    calculate the cosine similarity between two vectors

    :param vector_a: vector a (any array-like; flattened to 1-D)
    :param vector_b: vector b (any array-like with the same number of elements)
    :return: cosine similarity; nan/inf if either vector has zero norm
    """
    # Plain arrays instead of np.mat, which was removed in NumPy 2.0.
    vector_a = np.asarray(vector_a, dtype=float).ravel()
    vector_b = np.asarray(vector_b, dtype=float).ravel()
    num = float(np.dot(vector_a, vector_b))
    denom = np.linalg.norm(vector_a) * np.linalg.norm(vector_b)
    sim = num / denom
    return sim




def train_data_word2vec(TRAIN_PATH,num_class,vocab_size, embed_size, embedding_model):
    '''
    create the training set(train_x) and labels(train_y)

    Every token that is missing from the pretrained vocabulary (OOV) gets the
    id ``vocab_size + k``, where k is the order in which the OOV word was first
    seen. This matches row k of the returned OOV embedding matrix once that
    matrix is appended after the vocab_size pretrained rows.

    :param TRAIN_PATH: training data file (JSON lines), relative to the global folder_path
    :param num_class: total number of classes(rows + attribute)
    :param vocab_size: number of total vocabulary in pretrained embedding
    :param embed_size: embedding size for each word(word vector dimension)
    :param embedding_model: pretrained embedding model file
    :return: content_index_list, word vector index in embedding matrix
             onehot_labels_list, word label vector
             label_num_list, the 'label_number' field of every record
             trainset_embedding_matrix, oov word(from training dataset) embedding matrix
             oov_word, oov word in training dataset
    '''

    model = fasttext.load_model(embedding_model)
    vocab = {word: model.get_word_id(word) for word in model.get_words()}

    content_index_list = []
    onehot_labels_list = []
    label_num_list = []

    oov_word = []      # OOV words in first-seen order (returned to the caller)
    oov_ids = {}       # OOV word -> assigned id, for O(1) lookup
    oov_vectors = []   # fastText vector of every OOV word, same order as oov_word

    # df = pd.read_csv(TRAIN_PATH, names=["content", "label1", "label2","label3"], sep=',', header=0,encoding='utf-8')
    with open(os.path.join(folder_path,TRAIN_PATH), encoding='utf-8') as f:
        for line in f:
            record = json.loads(line)

            result = []
            for item in word_tokenize(clean_str(record['value'])):
                word2id = vocab.get(item)
                if word2id is None:
                    word2id = oov_ids.get(item)
                    if word2id is None:
                        # First occurrence: the same formula is used for later
                        # occurrences, so a word always maps to one id.
                        word2id = vocab_size + len(oov_word)
                        oov_ids[item] = word2id
                        oov_word.append(item)
                        oov_vectors.append(model.get_word_vector(item))
                result.append(word2id)
            content_index_list.append(result)

            onehot_labels_list.append(create_onehot_labels(record["label_index"], num_class))
            label_num_list.append(record['label_number'])

    # Build the matrix once instead of growing it row by row.
    if oov_vectors:
        trainset_embedding_matrix = np.array(oov_vectors, dtype=np.float64)
    else:
        trainset_embedding_matrix = np.zeros((0, embed_size))

    print('oov_word',len(oov_word),oov_word)

    with open('/content/trainset_embedding_matrix.npy', 'wb') as f:
        np.save(f, trainset_embedding_matrix)
    return content_index_list, onehot_labels_list, label_num_list,trainset_embedding_matrix,oov_word


def test_data_word2vec(TEST_PATH,num_class,vocab_size,embedding_model,oov_word):
    '''
    create the testing set(test_x) and labels(test_y)

    :param TEST_PATH: test dataset (JSON lines), relative to the global folder_path
    :param num_class: total number of classes(rows + attribute)
    :param vocab_size: number of total vocabulary in pretrained embedding
    :param embedding_model: pretrained embedding model file
    :param oov_word: oov word in training dataset; the word at position k has id vocab_size + k

    :return: content_index_list, word vector index in embedding matrix
             onehot_labels_list, word label vector
             label_number_list, the 'label_number' field of every record

    '''

    model = fasttext.load_model(embedding_model)
    vocab = {word: model.get_word_id(word) for word in model.get_words()}

    # Position-based ids in a single pass; setdefault keeps the first
    # position if a word were ever listed twice (what list.index returned).
    oov_vocab = {}
    for position, word in enumerate(oov_word):
        oov_vocab.setdefault(word, position + vocab_size)

    whole_vocab = {}
    whole_vocab.update(vocab)
    whole_vocab.update(oov_vocab)

    # df = pd.read_csv(TEST_PATH, names=[ "content", "label1", "label2","label3"], sep=',', header=0,encoding='utf-8')

    content_index_list = []
    onehot_labels_list = []
    label_number_list = []

    with open(os.path.join(folder_path,TEST_PATH), encoding='utf-8') as f:
        for line in f:
            record = json.loads(line)

            # Words unknown to both vocabularies are mapped to id 0.
            result = []
            for item in word_tokenize(clean_str(record['value'])):
                word2id = whole_vocab.get(item)
                if word2id is None:
                    word2id = 0
                result.append(word2id)
            content_index_list.append(result)

            onehot_labels_list.append(create_onehot_labels(record["label_index"], num_class))
            label_number_list.append(record['label_number'])

    return content_index_list, onehot_labels_list,label_number_list


def load_word2vec_matrix(embedding_model):
    '''
    build the pretrained word embedding matrix from a fastText model

    The matrix is also saved to /content/embedding_matrix.npy.

    :param embedding_model: pretrained embedding model file

    :return:
             vocab_size, number of rows of the model's output matrix
             embedding_size, embedding size for each word(word vector dimension)
             embedding_matrix, word embedding matrix (row = fastText word id)
    '''
    model = fasttext.load_model(embedding_model)
    embedding_size = model.get_dimension()
    vocab_size = model.get_output_matrix().shape[0]

    embedding_matrix = np.zeros([vocab_size, embedding_size])
    for word in model.get_words():
        if word is None:
            continue
        embedding_matrix[model.get_word_id(word)] = model[word]

    with open('/content/embedding_matrix.npy', 'wb') as out_file:
        np.save(out_file, embedding_matrix)

    return vocab_size, embedding_size, embedding_matrix



def batch_iter(inputs, outputs, batch_size, num_epochs):
    '''
    yield consecutive (inputs, outputs) mini-batches, in order and without shuffling

    :param inputs: samples (converted to a numpy array)
    :param outputs: labels aligned with inputs (converted to a numpy array)
    :param batch_size: size of every data batch (the last one of an epoch may be smaller)
    :param num_epochs: number of passes over the data

    :return:
           A batch iterator for data set
    '''
    samples = np.array(inputs)
    labels = np.array(outputs)
    total = len(samples)

    # ceil(total / batch_size) slices per epoch
    batches_per_epoch = (total - 1) // batch_size + 1
    bounds = [(k * batch_size, min((k + 1) * batch_size, total))
              for k in range(batches_per_epoch)]

    for _ in range(num_epochs):
        for lo, hi in bounds:
            yield samples[lo:hi], labels[lo:hi]






# Accuracy metrics
*   top-k based
*   threshold based

In [None]:

def get_onehot_label_topk(scores, top_num,threshold = 0.1):
    '''

    get the top k score from testing result,
    use threshold to filter the irrelevant data.

    If fewer than top_num scores reach the threshold, the sample is treated as
    irrelevant and the LAST THREE labels are set instead (presumably those
    classes mean "irrelevant" in this dataset - confirm; this needs at least
    three classes). Otherwise the top_num highest-scoring labels are set;
    ties are broken in favour of the lower index.

    :param scores:  predicted scores of ONE sample (1-D array or sequence)
    :param top_num: number of labels for each data(corresponding to top k scores)
    :param threshold: score of irrelevant data < threshold

    :return:
        predicted_onehot_labels: Predict labels (onehot format), a list holding one label list
    '''

    score = np.asarray(scores).tolist()
    onehot_labels_list = [0] * len(score)

    count = sum(1 for predict_score in score if predict_score >= threshold)
    if count < top_num:
        onehot_labels_list[-3] = 1
        onehot_labels_list[-2] = 1
        onehot_labels_list[-1] = 1
    else:
        # Rank indices, not values: looking values up with score.index()
        # returned the same index for tied scores, so fewer than top_num
        # labels could be set.
        top_indices = heapq.nlargest(top_num, range(len(score)), key=score.__getitem__)
        for i in top_indices:
            onehot_labels_list[i] = 1

    return [onehot_labels_list]

def get_onehot_label_threshold(scores, threshold):
    """
    Get the predicted one-hot labels of one sample based on the threshold.
    If there is no predict score greater than or equal to the threshold,
    the label with the max predict score (first one on ties) is chosen.
    Args:
        scores: 1-D numpy array with the predicted score of every class.
        threshold: The threshold (required; callers in this notebook use THRESHOLD).
    Returns:
        predicted_onehot_labels: list holding one list of 0/1 labels.
    """
    score = np.ndarray.tolist(scores)
    onehot = [1 if value >= threshold else 0 for value in score]
    if 1 not in onehot:
        # nothing passed the threshold: fall back to the best-scoring class
        onehot[score.index(max(score))] = 1
    return [onehot]

# Training and Testing function

In [None]:

def train(train_x, train_y, test_x, test_y,vocab_size,
          embedding_size, pretrained_embedding, trainset_embedding_matrix):
    '''
    training process + testing process

    Builds a WordRNN, minimises its adversarial loss (perturb_loss) with Adam
    and evaluates on the test set every 300 steps and once more at the end.
    Reads the notebook-level constants MAX_DOCUMENT_LEN, NUM_CLASS, BATCH_SIZE,
    NUM_EPOCHS and ADV_EPS.

    :param train_x: training dataset (padded word-id matrix, 0 = padding)
    :param train_y: training label
    :param test_x: testing dataset (padded word-id matrix, 0 = padding)
    :param test_y: testing label
    :param vocab_size: number of vocabulary in embedding matrix
    :param embedding_size: embedding size for each word
    :param pretrained_embedding: pretrained word embedding matrix
    :param trainset_embedding_matrix: oov word(from train set) embedding matrix

    :return:
            None; prints testing result by fixed step interval
    '''

    model = WordRNN(MAX_DOCUMENT_LEN, NUM_CLASS,vocab_size = vocab_size,embedding_size= embedding_size)
    model.build(MAX_DOCUMENT_LEN,NUM_CLASS,trainset_embedding=trainset_embedding_matrix)

    def sentence_mask(batch_x):
        # 1.0 for real tokens, 0.0 for padding (id 0). Built per batch so it
        # always has the batch's own number of rows; the previous fixed
        # all-zero (BATCH_SIZE, L) mask cancelled the perturbation entirely
        # and did not fit a smaller final batch.
        return (np.asarray(batch_x) != 0).astype(np.float32)

    with tf.Session() as sess:

        # Define training procedure
        global_step = tf.Variable(0,trainable=False)
        optimizer = tf.train.AdamOptimizer(model.lr)

        train_op = optimizer.minimize(model.perturb_loss, global_step=global_step)

        # Initialize all variables (the embedding variable is initialised
        # from the x_init placeholder, so it has to be fed here).
        feed_dict_emb = {
            model.x_init: np.float32(pretrained_embedding)
        }
        sess.run(tf.global_variables_initializer(),feed_dict=feed_dict_emb)

        def train_step(batch_x, batch_y):
            '''Run one optimisation step and return the adversarial loss.'''
            feed_dict = {
                model.x: batch_x,
                model.y: batch_y,
                model.adv_eps: ADV_EPS,
                model.mask: sentence_mask(batch_x),
                model.keep_prob: 0.8,
            }
            _, step, loss = sess.run([train_op, global_step, model.perturb_loss], feed_dict=feed_dict)

            return loss

        def test_accuracy(test_x, test_y):
            '''
            evaluate on the whole test set (dropout disabled)

            :param test_x: testing dataset
            :param test_y: testing label
            :return:
                eval_loss, avg_precision, accuracy, precision, recall, f1, roc_auc, ham_loss
                (eval_loss is the mean adversarial loss over the test batches;
                all averaged metrics use micro averaging)
            '''

            true_onehot_labels = []
            predicted_onehot_scores = []
            predicted_onehot_labels_t2 = []

            test_batches = batch_iter(test_x, test_y, BATCH_SIZE, 1)
            eval_loss, eval_counter = 0., 0

            for test_batch_x, test_batch_y in test_batches:
                scores, cur_loss = sess.run([model.scores, model.perturb_loss],
                                            feed_dict={model.x: test_batch_x, model.y: test_batch_y,
                                                       model.adv_eps: ADV_EPS,
                                                       model.mask: sentence_mask(test_batch_x),
                                                       model.keep_prob: 1.0})

                for labels,score in zip(test_batch_y,scores):
                    true_onehot_labels.append(labels)
                    predicted_onehot_scores.append(score)

                    #  using topk or threshold
                    batch_predicted_onehot_labels = get_onehot_label_topk(scores=score, top_num=1)
                    # batch_predicted_onehot_labels = get_onehot_label_threshold(scores=score,threshold = THRESHOLD)

                    predicted_onehot_labels_t2.append(batch_predicted_onehot_labels[0])

                eval_loss = eval_loss + cur_loss
                eval_counter = eval_counter + 1

            #metrics
            eval_loss = float(eval_loss / eval_counter)

            y_true = np.array(true_onehot_labels)
            y_pred = np.array(predicted_onehot_labels_t2)
            y_score = np.array(predicted_onehot_scores)

            accuracy = accuracy_score(y_true, y_pred)
            # Ranking metrics need the continuous scores; binarised
            # predictions collapse the curve to a single operating point.
            roc_auc = roc_auc_score(y_true, y_score, average = 'micro')
            avg_precision = average_precision_score(y_true, y_score,average = 'micro')
            precision = precision_score(y_true, y_pred,average = 'micro')
            recall = recall_score(y_true, y_pred,average = 'micro')

            f1 = f1_score(y_true, y_pred,average='micro')
            ham_loss = hamming_loss(y_true, y_pred)

            return eval_loss,avg_precision, accuracy,precision,recall,f1,roc_auc,ham_loss

        # Training loop
        start = time.time()

        batches = batch_iter(train_x, train_y, BATCH_SIZE, NUM_EPOCHS)

        st = time.time()
        # Same ceiling that batch_iter uses, so the epoch counter does not
        # drift; at least 1 so the division below cannot fail.
        steps_per_epoch = max(1, (len(train_x) - 1) // BATCH_SIZE + 1)

        for batch_x, batch_y in batches:
            # Step counter BEFORE this update, so the first evaluation
            # happens right after the very first training step.
            step = tf.train.global_step(sess, global_step)
            num_epoch = int(step / steps_per_epoch)

            loss = train_step(batch_x, batch_y)

            if step % 300 == 0:

                eval_loss,avg_precision, acc,precision,recall,f1,roc_auc,ham_loss = test_accuracy(test_x, test_y)

                print("epoch: {}, step: {}, loss: {}, steps_per_epoch: {}, batch size: {}".
                    format(num_epoch, step, eval_loss, steps_per_epoch, BATCH_SIZE))

                print("avg_precision:{}, accuracy:{}, precision: {}, f1: {}, recall: {}".format(avg_precision,acc,precision,f1,recall))
                print("time of one epoch: {}\n".format(time.time()-st))
                st = time.time()

        print('training time',time.time()-start)
        # #
        # Final evaluation. A plain call replaces the `%time` line magic,
        # which is not valid Python inside a function; the elapsed time is
        # already printed below.
        test_start_time = time.time()
        eval_loss,avg_precision, acc,precision,recall,f1,roc_auc,ham_loss = test_accuracy(test_x, test_y)
        print('testing time',time.time()-test_start_time)
        print("avg_precision:{}, accuracy:{}, precision: {}, f1: {}, recall: {}".format(avg_precision,acc,precision,f1,recall))
        # print(eval_loss,acc)
        # print(eval_loss,acc)


In [None]:
# Mount Google Drive at /content/drive; the dataset paths configured below live under it.
from google.colab import drive
drive.mount('/content/drive')


In [None]:
import json
import os
import numpy as np
import heapq
import time

from tflearn.data_utils import pad_sequences
from sklearn.metrics import average_precision_score,accuracy_score,roc_auc_score,precision_score, recall_score, f1_score,hamming_loss

# Training the fastText word embedding model

In [None]:
# !pip install fasttext
# import fasttext
# model = fasttext.train_unsupervised('/content/drive/MyDrive/Research/dataset/machine_log_data/corpus.txt', 'skipgram',
#                                     epoch=10, minn=2, maxn=10, dim=150, thread=16)
# model.save_model('/content/machine.bin')

In [None]:
import json
import os

# Locations of the fastText model and of the JSON-lines train/test splits.
MODEL_PATH = '/content/machine.bin'
folder_path = '/content/drive/MyDrive/Research/dataset/key_index_based_dataset/cell_based_dataset/machine_log_data/no_numerical_value'
TRAIN_PATH = "training_machine_log_data_with_column_label_prediction.json"
TEST_PATH = "testing_machine_log_data_with_column_label_prediction.json"

# Read the training split once: the line count is the number of samples and
# every line is one JSON record.
with open(os.path.join(folder_path,TRAIN_PATH), encoding='utf-8') as f1:
  train_lines = f1.readlines()
num_Train = len(train_lines)
train_data_json = [json.loads(raw_line) for raw_line in train_lines]

with open(os.path.join(folder_path,TEST_PATH), encoding='utf-8') as f:
  num_Test = sum(1 for _ in f)

# Number of distinct label ids that occur in the training split.
dats = [label for record in train_data_json for label in record['label_index']]
NUM_LABELS = len(set(dats))
NUM_LABELS

In [None]:
# Size of the model's output layer and of every one-hot label vector.
NUM_CLASS = NUM_LABELS  

BATCH_SIZE = 100        # samples per training / evaluation batch
NUM_EPOCHS = 100        # passes over the training set
MAX_DOCUMENT_LEN = 30   # every sample is padded / truncated to this many tokens
num_train = num_Train   # read as a global inside train() to derive steps per epoch
num_test = num_Test 
THRESHOLD = 0.5         # only referenced by the commented-out threshold-based prediction in train()
ADV_EPS = 0.05          # scale of the adversarial perturbation (fed to model.adv_eps)

In [None]:
# Pretrained embedding matrix, then the training set as word-id sequences.
# Words missing from the pretrained vocabulary get ids after vocab_size and
# their vectors are collected in trainset_embedding_matrix.
vocab_size, embedding_size, embedding_matrix = load_word2vec_matrix(MODEL_PATH)
train_x_index_list, train_y, train_label_num_list,trainset_embedding_matrix, oov_word = train_data_word2vec(
    TRAIN_PATH,NUM_CLASS,vocab_size,embedding_size, MODEL_PATH)
# Pad with id 0, which the model treats as padding when it computes sequence lengths.
train_x = pad_sequences(train_x_index_list, maxlen=MAX_DOCUMENT_LEN, value=0.)


# The test set reuses the training OOV list so that shared words map to the same ids.
test_x_index_list, test_y, test_label_num_list= test_data_word2vec(TEST_PATH,NUM_CLASS, vocab_size,MODEL_PATH, oov_word)
test_x = pad_sequences(test_x_index_list, maxlen=MAX_DOCUMENT_LEN, value=0.)

In [None]:
# Train and evaluate; %time reports the wall-clock time of the whole run.
%time train(train_x, train_y, test_x, test_y,vocab_size,embedding_size, embedding_matrix,trainset_embedding_matrix)