In [1]:
import numpy as np
import pandas as pd

In [2]:
def get_malicious():
    """Load the labelled malicious-domain list into a DataFrame.

    Reads ``./data/dga.txt`` and, for every line shaped like
    ``<label><TAB(s)><domain><anything else>``, captures the label (first
    field) and the domain (second field). Lines that do not contain that
    shape produce no row.

    Returns:
        pandas.DataFrame: columns ``Domain`` (second field) and ``Label``
        (first field), one row per matched line, in file order.
    """
    import re
    # '-' is part of the domain character class: without it the capture
    # stopped at the first hyphen and silently truncated the domain.
    # (?:\n|$) lets the final record match when the file lacks a trailing
    # newline; a mandatory '\n' would drop that record.
    pattern = re.compile(r'(\w+)\t+([\w.-]+).*(?:\n|$)')
    with open('./data/dga.txt') as f:
        records = pattern.findall(f.read())
    df_malicious = pd.DataFrame({
        'Domain': [domain for _, domain in records],
        'Label': [label for label, _ in records],
    })
    return df_malicious

In [3]:
def get_benign():
    """Load the benign domain list into a DataFrame.

    Reads ``./data/top-1m.csv`` as a header-less CSV whose first column is
    used as the index, names the single remaining column ``Domain`` and tags
    every row with the label ``'benign'``.

    Returns:
        pandas.DataFrame: columns ``Domain`` and ``Label``, indexed by the
        file's first column.
    """
    benign = pd.read_csv('./data/top-1m.csv', header = None, index_col = 0)
    # Direct assignment (rather than rename) keeps the length check: it
    # raises if the file has more than one non-index column.
    benign.columns = ['Domain']
    return benign.assign(Label = 'benign')

In [4]:
def prepare_data():
    """Assemble the combined, de-duplicated domain table.

    Stacks the malicious rows on top of the benign rows, adds a binary
    ``Target`` column (0 for the ``'benign'`` label, 1 for anything else),
    replaces each ``Domain`` with the ``.domain`` component reported by
    ``tldextract.extract`` and drops rows whose reduced ``Domain`` repeats.

    Returns:
        pandas.DataFrame: columns ``Domain``, ``Label`` and ``Target``.
    """
    import tldextract

    def to_target(label):
        return 0 if label == 'benign' else 1

    def registered_name(host):
        return tldextract.extract(host).domain

    # Malicious rows are concatenated first, so with drop_duplicates' default
    # keep-first policy they win whenever a name appears in both sources.
    combined = pd.concat([get_malicious(), get_benign()], axis = 0)
    combined['Target'] = combined['Label'].map(to_target)
    combined['Domain'] = combined['Domain'].map(registered_name)
    return combined.drop_duplicates(subset = ['Domain'])

In [5]:
def make_data(df_data, n_benign = 500000, n_malicious = 300000, random_state = 2019):
    """Sample, split and character-encode the domains for training.

    Args:
        df_data: DataFrame with a ``Domain`` column of strings and a
            ``Target`` column holding 0 (benign) or 1 (malicious).
        n_benign: Number of ``Target == 0`` rows to sample.
        n_malicious: Number of ``Target == 1`` rows to sample.
        random_state: Seed used for both class samples and for the
            train/test split, so repeated runs produce the same dataset.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test, word_index)`` where the
        ``X_*`` arrays are post-padded integer sequences (one id per
        character), the ``y_*`` arrays are the matching targets and
        ``word_index`` is the tokenizer's character-to-id mapping.

    Note:
        Sampling is without replacement, so pandas is expected to raise if a
        class holds fewer rows than requested.
    """
    from sklearn.model_selection import train_test_split
    from keras.preprocessing.sequence import pad_sequences
    from keras.preprocessing.text import Tokenizer

    df_benign = df_data[df_data['Target'] == 0]
    df_malicious = df_data[df_data['Target'] == 1]
    # Seed the sampling too: seeding only train_test_split still left the
    # selected rows different on every run.
    df_data_small = pd.concat([
        df_benign.sample(n_benign, random_state = random_state),
        df_malicious.sample(n_malicious, random_state = random_state),
    ], axis = 0)
    X = df_data_small['Domain'].values
    y = df_data_small['Target'].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = random_state, test_size = 0.2)

    # The vocabulary is fitted on the training split only.
    tokenizer = Tokenizer(char_level = True)
    tokenizer.fit_on_texts(X_train)
    X_train = tokenizer.texts_to_sequences(X_train)
    X_test = tokenizer.texts_to_sequences(X_test)
    X_train = pad_sequences(X_train, padding = 'post')
    # Force the test matrix to the training width so both fit the same model input.
    X_test = pad_sequences(X_test, padding = 'post', maxlen = X_train.shape[1])
    return X_train, X_test, y_train, y_test, tokenizer.word_index

In [6]:
def build_model(words_num, max_length, feature_num):
    """Build and compile the baseline Embedding -> LSTM binary classifier.

    Args:
        words_num: Embedding ``input_dim`` (vocabulary size).
        max_length: Embedding ``input_length`` (padded sequence length).
        feature_num: Embedding ``output_dim``.

    Returns:
        A compiled Keras ``Sequential`` model (binary cross-entropy loss,
        rmsprop optimizer) reporting accuracy, recall, precision and F1.
    """
    import keras.backend as K
    from keras.models import Sequential
    from keras.layers import Embedding, LSTM, Dense, Dropout, Activation

    model = Sequential([
        Embedding(input_dim = words_num, output_dim = feature_num, input_length = max_length),
        LSTM(128),
        Dropout(0.5),
        Dense(1),
        Activation('sigmoid'),
    ])

    def _rounded_count(tensor):
        # Clip to [0, 1], round to {0, 1}, then count the ones.
        return K.sum(K.round(K.clip(tensor, 0, 1)))

    # The metric function names are kept as-is: Keras uses them as the
    # metric names it reports during training.
    def calc_recall_score(y_true, y_pred):
        hits = _rounded_count(y_true * y_pred)
        actual = _rounded_count(y_true)
        return hits / (actual + K.epsilon())

    def calc_precision_score(y_true, y_pred):
        hits = _rounded_count(y_true * y_pred)
        flagged = _rounded_count(y_pred)
        return hits / (flagged + K.epsilon())

    def calc_f1_score(y_true, y_pred):
        precision = calc_precision_score(y_true, y_pred)
        recall = calc_recall_score(y_true, y_pred)
        # epsilon keeps the ratio finite when precision and recall are both zero
        return 2 * ((precision * recall) / (precision + recall + K.epsilon()))

    model.compile(
        loss = 'binary_crossentropy',
        optimizer = 'rmsprop',
        metrics = ['acc', calc_recall_score, calc_precision_score, calc_f1_score],
    )
    return model

In [7]:
def train(X_train, X_test, y_train, y_test, word_index):
    """Fit the baseline LSTM model on the training split.

    Args:
        X_train: 2-D array of padded sequences; its second dimension sets the
            model's input length.
        X_test: Unused here; accepted so the output of ``make_data`` can be
            passed straight through.
        y_train: Targets matching ``X_train``.
        y_test: Unused here.
        word_index: Token-to-id mapping; its size plus one is the vocabulary
            size given to the embedding.

    Returns:
        The fitted Keras model.
    """
    vocab_size = len(word_index) + 1
    sequence_length = X_train.shape[1]
    model = build_model(vocab_size, sequence_length, 128)
    # 30% of the training rows are held out by Keras for validation.
    model.fit(X_train, y_train, batch_size = 128, epochs = 5, validation_split = 0.3)
    return model

In [8]:
# Build the combined malicious + benign domain table (labelled and de-duplicated by prepare_data).
df_data = prepare_data()

In [9]:
# Sample, split and character-encode the domains; word_index is the tokenizer's character-to-id mapping.
X_train, X_test, y_train, y_test, word_index = make_data(df_data)

Using TensorFlow backend.


In [42]:
# Fit the baseline LSTM classifier built by build_model.
model = train(X_train, X_test, y_train, y_test, word_index)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 448000 samples, validate on 192000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [35]:
import keras

class WeightedSelfAttention(keras.layers.Layer):
    r"""Y = \text{softmax}(XW + b) X

    Collapses a 3-D input ``(batch, steps, features)`` to ``(batch, features)``
    by taking a softmax-weighted sum over the ``steps`` axis. Each step's
    score is a learned linear projection of its feature vector.

    See: https://arxiv.org/pdf/1708.00524.pdf

    Args:
        use_bias: Add a learned scalar bias to every step's score.
        return_attention: When true, ``call`` returns
            ``[result, attention_weights]`` instead of just ``result``.
        **kwargs: Forwarded to ``keras.layers.Layer`` (e.g. ``name``).
    """

    def __init__(self, use_bias = True, return_attention = False, **kwargs):
        super(WeightedSelfAttention, self).__init__(**kwargs)
        self.supports_masking = True
        self.use_bias = use_bias
        self.return_attention = return_attention
        # Created in build() once the feature dimension is known.
        self.W, self.b = None, None

    def get_config(self):
        """Return the layer config, including this layer's own options."""
        config = {
            'use_bias' : self.use_bias,
            'return_attention': self.return_attention
        }
        base_config = super(WeightedSelfAttention, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

    def build(self, input_shape):
        """Create the projection weight (features, 1) and the optional bias (1,)."""
        self.W = self.add_weight(shape = (int(input_shape[2]), 1),
                                name = '{}_W'.format(self.name),
                                initializer = keras.initializers.get('uniform'))
        if self.use_bias:
            self.b = self.add_weight(shape=(1,),
                                    name = '{}_b'.format(self.name),
                                    initializer = keras.initializers.get('zeros'))
        # Must run whether or not a bias exists; it was previously nested
        # under the use_bias branch and skipped when use_bias was false.
        super(WeightedSelfAttention, self).build(input_shape)

    def call(self, x, mask = None):
        """Compute the attention-weighted sum of ``x`` over its step axis.

        Args:
            x: Tensor of shape ``(batch, steps, features)``.
            mask: Optional ``(batch, steps)`` mask; masked steps get weight 0.

        Returns:
            ``(batch, features)`` tensor, or ``[that, (batch, steps) weights]``
            when ``return_attention`` is set.
        """
        # Resolve the backend through the `keras` module this class already
        # requires; a bare global `K` is not bound at module scope.
        K = keras.backend
        logits = K.dot(x, self.W)
        if self.use_bias:
            logits += self.b
        x_shape = K.shape(x)
        # Drop the trailing singleton axis: (batch, steps, 1) -> (batch, steps).
        logits = K.reshape(logits, (x_shape[0], x_shape[1]))
        # Subtracting the row max before exp keeps the exponent from overflowing.
        ai = K.exp(logits - K.max(logits, axis = -1, keepdims = True))
        if mask is not None:
            mask = K.cast(mask, K.floatx())
            ai = ai * mask

        # epsilon guards the division when every step of a row is masked out.
        att_weights = ai / (K.sum(ai, axis = 1, keepdims = True) + K.epsilon())
        weighted_input = x * K.expand_dims(att_weights)
        result = K.sum(weighted_input, axis = 1)
        if self.return_attention:
            return [result, att_weights]
        return result

    def compute_output_shape(self, input_shape):
        """Return ``(batch, features)``, plus ``(batch, steps)`` if weights are returned."""
        output_len = input_shape[2]
        if self.return_attention:
            return [(input_shape[0], output_len), (input_shape[0], input_shape[1])]
        return input_shape[0], output_len

    def compute_mask(self, _, input_mask = None):
        """Stop mask propagation: the step axis no longer exists in the output."""
        if self.return_attention:
            return [None, None]
        return None

    @staticmethod
    def get_custom_objects():
        """Return the ``custom_objects`` mapping needed to deserialize this layer."""
        # staticmethod: the function takes no `self`, so without the decorator
        # calling it on an instance raised TypeError.
        return {'WeightedSelfAttention':WeightedSelfAttention}
    
    
def build_model_att(words_num, max_length, feature_num):
    """Build and compile the Embedding -> Bi-LSTM -> attention classifier.

    Args:
        words_num: Embedding ``input_dim`` (vocabulary size).
        max_length: Length of the padded input sequences.
        feature_num: Embedding ``output_dim``.

    Returns:
        A compiled Keras functional ``Model`` (binary cross-entropy loss,
        rmsprop optimizer) reporting accuracy, recall, precision and F1.
    """
    import keras.backend as K
    from keras.models import Model
    from keras.layers import Input, Embedding, LSTM, Dense, Bidirectional

    inputs = Input(shape = (max_length,), name = 'Input')
    embd = Embedding(input_dim = words_num, output_dim = feature_num, input_length = max_length, name = 'Embedding')(inputs)
    # return_sequences = True: the attention layer needs one vector per step.
    lstm = Bidirectional(LSTM(units = 128, return_sequences = True, name = 'Bi-LSTM'))(embd)
    # The name must be passed by keyword: the layer's first positional
    # parameter is use_bias, so a bare 'Attention' string was being taken as
    # the bias flag and the layer was left with an auto-generated name.
    att = WeightedSelfAttention(name = 'Attention')(lstm)
    outputs = Dense(1, activation = 'sigmoid', name = 'Output')(att)
    model = Model(inputs = inputs, outputs = outputs)

    def calc_recall_score(y_true, y_pred):
        # Rounded true positives over rounded actual positives.
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall


    def calc_precision_score(y_true, y_pred):
        # Rounded true positives over rounded predicted positives.
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision


    def calc_f1_score(y_true, y_pred):
        # Harmonic mean of the two scores; epsilon avoids 0/0.
        precision = calc_precision_score(y_true, y_pred)
        recall = calc_recall_score(y_true, y_pred)
        return 2 * ((precision * recall) / (precision + recall + K.epsilon()))

    model.compile(loss = 'binary_crossentropy', optimizer = 'rmsprop', 
                  metrics = ['acc', calc_recall_score, calc_precision_score, calc_f1_score])
    return model

In [36]:
def train_with_att(X_train, X_test, y_train, y_test, word_index):
    """Fit the attention model on the training split.

    Args:
        X_train: 2-D array of padded sequences; its second dimension sets the
            model's input length.
        X_test: Unused here; accepted so the output of ``make_data`` can be
            passed straight through.
        y_train: Targets matching ``X_train``.
        y_test: Unused here.
        word_index: Token-to-id mapping; its size plus one is the vocabulary
            size given to the embedding.

    Returns:
        tuple: ``(model, hist)`` — the fitted model and the object returned
        by ``model.fit``.
    """
    vocab_size = len(word_index) + 1
    sequence_length = X_train.shape[1]
    att_model = build_model_att(vocab_size, sequence_length, 128)
    # 30% of the training rows are held out by Keras for validation.
    fit_result = att_model.fit(X_train, y_train, batch_size = 128, epochs = 5, validation_split = 0.3)
    return att_model, fit_result

In [38]:
# Fit the Bi-LSTM + attention model. NOTE: this rebinds `model`, replacing the baseline model assigned by the earlier train(...) cell.
model, hist = train_with_att(X_train, X_test, y_train, y_test, word_index)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 448000 samples, validate on 192000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
