In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import keras

import sklearn
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM


In [None]:
DGA_DATASET_URL = 'https://elephantscale-public.s3.amazonaws.com/data/dga/dga-dataset-clean.csv.gz'


def _normalize_label(raw_label):
    """Collapse a raw label to 'legit' (text starting with "l") or 'dga'."""
    if str(raw_label).startswith("l"):
        return 'legit'
    return 'dga'


# Read the gzipped CSV directly from its URL into a DataFrame.
dga = pd.read_csv(DGA_DATASET_URL)

# Cleanup: force the label column down to exactly two values.
dga['label'] = dga['label'].apply(_normalize_label)

dga


In [None]:
# `source` and `label` are not numeric, so encode each one as integer codes.
# pd.factorize returns (codes, uniques); codes are assigned in order of
# first appearance, so which value becomes 0 depends on the row order.
source_codes, _source_values = pd.factorize(dga['source'])
label_codes, _label_values = pd.factorize(dga['label'])

dga['source_fact'] = source_codes
dga['label_fact'] = label_codes


In [None]:
# Display the frame again to check the newly added `source_fact` and
# `label_fact` columns.
dga

In [None]:
# Summary statistics of the frame, using pandas' default describe() settings.
dga.describe()

In [None]:

# Extract data and labels
domains = dga['domain']
labels = dga['label_fact']

# Generate a dictionary of valid characters, numbered from 1 so that 0 stays
# free for the padding value written by pad_sequences.
# The characters are sorted first: iterating a set of strings follows hash
# order, which changes between interpreter runs, so without sorting the
# char -> index mapping (and hence the encoded data) would differ after every
# kernel restart.
alphabet = sorted(set(''.join(domains)))
valid_chars = {char: idx for idx, char in enumerate(alphabet, start=1)}

# Vocabulary size seen by the embedding: every character plus index 0.
max_features = len(valid_chars) + 1
# Every sequence is padded up to the longest domain in the dataset.
maxlen = max(len(domain) for domain in domains)

# Convert characters to int and pad
X = [[valid_chars[char] for char in domain] for domain in domains]
X = keras.preprocessing.sequence.pad_sequences(X, maxlen=maxlen)


In [None]:


def build_model(max_features, maxlen, embedding_dim=128, lstm_units=128,
                dropout_rate=0.5):
    """Build and compile an LSTM binary classifier over integer sequences.

    Args:
        max_features: Vocabulary size handed to the Embedding layer as its
            input dimension.
        maxlen: Length of each input sequence (passed as ``input_length``).
        embedding_dim: Width of the learned embedding vectors.
        lstm_units: Number of units in the LSTM layer.
        dropout_rate: Dropout fraction applied to the LSTM output.

    Returns:
        A compiled ``Sequential`` model with a single sigmoid output, using
        binary cross-entropy loss and the RMSprop optimizer.
    """
    model = Sequential()
    model.add(Embedding(max_features, embedding_dim, input_length=maxlen))
    model.add(LSTM(lstm_units))
    model.add(Dropout(dropout_rate))
    # A single unit followed by a sigmoid yields one score per sample.
    model.add(Dense(1))
    model.add(Activation('sigmoid'))

    model.compile(loss='binary_crossentropy',
                  optimizer='rmsprop')

    return model

# Instantiate the network with the vocabulary size and padded sequence length
# computed during preprocessing.
model = build_model(max_features, maxlen)


## Train LSTM Model
Let's train the character-level LSTM on the domain names alone, without using any of our other features.

In [None]:

# Hold out 33% of the samples for evaluation; the fixed random_state keeps
# the split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    labels,
    test_size=0.33,
    random_state=42,
)


In [None]:
# Train on the training split only. No epochs, batch size or validation data
# are passed, so Keras' defaults for `fit` apply.
model.fit(X_train, y_train)

In [None]:

# Score the held-out samples. `Sequential.predict_proba` was deprecated and
# then removed from Keras; for this sigmoid-output model `predict` returns the
# same per-sample probabilities.
t_probs = model.predict(X_test)

# Area under the ROC curve of the predicted probabilities on the test split.
t_auc = sklearn.metrics.roc_auc_score(y_test, t_probs)
t_auc

In [None]:
# Inspect the raw predicted scores for the test set.
t_probs

In [None]:
# Turn the predicted probabilities into hard class labels with a 0.5 cut-off.
# The model ends in Dense(1), so `t_probs` holds one column per sample;
# flatten it so the predictions are 1-D like `y_test`.
t_preds = (t_probs > 0.5).ravel()

# Rows are the true classes, columns the predicted classes.
confusion_matrix(y_test, t_preds)

### Results

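99+% accuracy is pretty good here. (Note that the score computed above is the ROC AUC; accuracy itself can be read off the confusion matrix.)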