In [None]:
import gensim
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline


In [None]:
# Load the DGA word dataset (a gzipped CSV) straight from its public URL.
dga = pd.read_csv('https://elephantscale-public.s3.amazonaws.com/data/dga/dga-dataset-words.csv.gz')

# Replace missing `words` entries with empty strings so that later string
# operations on this column (splitting, vectorizing) only ever see text.
dga['words'] = dga['words'].fillna('')
dga

In [None]:
# `source`, `toplevel` and `label` are not numeric, so encode each of them as
# integer codes (one code per distinct value).
dga['source_fact'] = pd.factorize(dga['source'])[0]
dga['toplevel_fact'] = pd.factorize(dga['toplevel'])[0]
dga['label_fact'] = pd.factorize(dga['label'])[0]

# Engineered feature: number of characters in the site name.
dga['url_length'] = dga['site'].apply(len)

# Engineered feature: number of whitespace-separated tokens in `words`.
dga['word_num'] = dga['words'].apply(lambda text: len(text.split()))

In [None]:
# Inspect the frame now that the factorized and engineered columns have been added.
dga

In [None]:
# Summary statistics (count, mean, spread, quartiles) for the frame's columns.
dga.describe()

## TF/IDF Pipeline

Let's try a basic tf/idf pipeline without using any of our other features

In [None]:
# Baseline model: classify each domain from its word tokens alone.
# SGDClassifier is imported in the notebook's top imports cell.

# Hold out 33% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    dga['words'], dga['label'], test_size=0.33, random_state=42
)

# Token counts -> TF-IDF weights -> linear classifier trained with SGD
# (hinge loss with an L2 penalty).
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42)),
])
text_clf.fit(X_train, y_train)
predicted = text_clf.predict(X_test)

# Accuracy: fraction of held-out rows whose predicted label equals the true label.
np.mean(predicted == y_test)

In [None]:
# Per-class breakdown of the baseline predictions.
# Rows correspond to the true labels, columns to the predicted labels.
# confusion_matrix is imported in the notebook's top imports cell.
confusion_matrix(y_test, predicted)

### Results

77% accuracy: not bad, but not great. It looks like the model was much better at identifying one class than the other.

## Extract features

In [None]:
# Rebuild `text_clf` as a feature extractor only: token counts -> TF-IDF weights,
# with no classifier stage. Note that this rebinds the name that held the trained
# classifier pipeline in the previous section.
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer())])

# Fit and transform in a single pass. The result is a scipy sparse matrix with one
# row per row of `dga`. It is kept in its own variable instead of a DataFrame
# column: assigning a sparse matrix to a column does not give one value per row
# (depending on library versions it raises or stores the matrix object itself).
# NOTE(review): fitting on every row means the rows later held out for testing
# also influence the vocabulary and IDF weights; fit on the training rows only
# if a strict evaluation is required.
tfidf = text_clf.fit_transform(dga['words'])
tfidf

## Train/Test Split

Let's do a basic random train/test split: roughly 80% training / 20% test.

In [None]:
 
    
# Draw one uniform number per row from a *seeded* generator so the split (and
# every accuracy figure derived from it) is identical on each run. Rows whose
# draw is below 0.8 go to training (about 80% of rows); the rest go to test.
msk = np.random.RandomState(42).rand(len(dga)) < 0.8

# Apply the same boolean mask to the frame and to the TF-IDF matrix so that the
# rows of both stay aligned.
train = dga[msk]
test = dga[~msk]

train_tfidf = tfidf[msk]
test_tfidf = tfidf[~msk]



In [None]:
# Inspect the training partition.
train

In [None]:
# Build the training feature matrix: TF-IDF columns first, followed by the four
# numeric columns. `sparse` (scipy.sparse) is imported in the notebook's top
# imports cell.
text_features = train_tfidf

# NOTE(review): `source_fact` encodes the `source` column. If `source` identifies
# where a domain was collected from, it may reveal the label directly -- confirm
# that this is not label leakage before trusting the accuracy.
other_features = train[['source_fact', 'toplevel_fact', 'url_length', 'word_num']]

# hstack places the blocks side by side; convert to CSR for the estimators.
all_features = sparse.hstack((text_features, other_features)).tocsr()



In [None]:
# Sanity check of the dimensions, one shape per line:
# full frame, training TF-IDF block, training numeric block, full TF-IDF matrix.
print(dga.shape, text_features.shape, other_features.shape, tfidf.shape, sep='\n')

In [None]:
# Same SGD configuration as the text-only baseline, now trained on the combined
# TF-IDF + numeric feature matrix.
mixed_classifier = SGDClassifier(
    loss='hinge',
    penalty='l2',
    alpha=1e-3,
    random_state=42,
).fit(X=all_features, y=train['label'])

In [None]:

# Assemble the held-out feature matrix with the same column layout as the
# training matrix: TF-IDF block first, then the four numeric columns.
text_features_test = test_tfidf
other_features_test = test[['source_fact', 'toplevel_fact', 'url_length', 'word_num']]
all_features_test = sparse.hstack(
    (text_features_test, other_features_test)
).tocsr()

predicted = mixed_classifier.predict(all_features_test)

# Accuracy: share of held-out rows whose prediction equals the true label.
(predicted == test['label']).mean()

In [None]:
# Per-class breakdown for the combined-feature model (rows: true labels,
# columns: predicted labels). The original run noted about 86% accuracy here,
# better than the text-only baseline.
confusion_matrix(y_true=test['label'], y_pred=predicted)

## Results

86% accuracy.
The results are also much more balanced across the classes, which suggests that the engineered features helped.


TODO:

We should try some other methods, like random forest classifier or a DNN classifier.

In [None]:
# Random forest trained on the same combined feature matrix, for comparison with
# the SGD model. RandomForestClassifier is already imported in the notebook's
# first cell, so it is not imported again here.
# oob_score=True additionally computes an out-of-bag accuracy estimate, which is
# available afterwards as `rf.oob_score_`.
rf = RandomForestClassifier(n_estimators=100, oob_score=True, random_state=123456)
rf.fit(all_features, train['label'])
predicted_rf = rf.predict(all_features_test)

# Accuracy: share of held-out rows whose prediction equals the true label.
np.mean(predicted_rf == test['label'])