## BERT Classifier to detect DGA domains.
### Author: Abdulkarim Abdulkadir

In [1]:
import pandas as pd
import numpy as np
import os
import utils
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
import ktrain

In [2]:
# Input locations: a folder of generated-domain CSV files and the CSV of
# benign domains.
dgaLocation = 'generated_domains/'
benignDomains = 'benign_domains/top-1m.csv'

# Collect every CSV file name found directly inside the DGA folder.
dgaDomains = []
for entry in os.listdir(dgaLocation):
    if entry.endswith(".csv"):
        dgaDomains.append(entry)
print("Total amount of DGA types: ", len(dgaDomains))

Total amount of DGA types:  19


Next we build the dataset. It has three columns: `domain`, `type` and `class`. `domain` is the domain name, `type` is the name of the DGA that generated it (or `benign`), and `class` indicates whether the domain is DGA-generated: class `1` is a DGA domain and class `0` is a benign domain.

In [3]:
# Number of rows drawn from each class for the final dataset.
N_DGA_SAMPLES = 67500
N_BENIGN_SAMPLES = 82500
# Fixed seed so the sampled dataset is identical after Restart & Run All.
SHUFFLE_SEED = 69

# Load every DGA CSV and tag its rows with the DGA name (file name without
# extension) and class 1.
dgaFrames = []
for dga in dgaDomains:
    dgaDataFrame = pd.read_csv(dgaLocation + dga)
    dgaDataFrame.insert(1, 'type', dga.split(".")[0])
    dgaDataFrame.insert(2, 'class', 1)
    dgaFrames.append(dgaDataFrame)

# A single concat replaces DataFrame.append inside the loop: append was
# deprecated in pandas 1.4 and removed in 2.0, and re-copied the whole frame
# on every iteration. With no input files we keep the empty-frame behaviour.
dataset = pd.concat(dgaFrames, ignore_index=True) if dgaFrames else pd.DataFrame()

# Random subset of the DGA rows.
datasetTrimmed = shuffle(dataset, random_state=SHUFFLE_SEED)[:N_DGA_SAMPLES]

# Benign rows: the first N_BENIGN_SAMPLES entries of the benign list, tagged
# with class 0. 'class' goes to position 2 so the column order matches the
# DGA frames (concat aligns by name, so this is for consistency only).
benignDataFrame = pd.read_csv(benignDomains)
benignDataFrame.insert(1, 'type', 'benign')
benignDataFrame.insert(2, 'class', 0)
benignSample = shuffle(benignDataFrame[:N_BENIGN_SAMPLES], random_state=SHUFFLE_SEED)

# Merge both classes, then shuffle once more so they are interleaved.
datasetTrimmed = pd.concat([datasetTrimmed, benignSample], ignore_index=True)
datasetTrimmed = shuffle(datasetTrimmed, random_state=SHUFFLE_SEED).reset_index(drop=True)

In [4]:
# Count rows per class once; .get() reports 0 instead of raising KeyError
# when a class is missing from the frame.
classCounts = datasetTrimmed['class'].value_counts()
print("Total amount of DGA domains: ", classCounts.get(1, 0))
print("Total amount of benign domains: ", classCounts.get(0, 0))
# This line reports the size of the whole dataset, not only the DGA rows.
print("Total amount of domains: ", len(datasetTrimmed))
# Persist the assembled dataset as CSV (without the index column).
datasetTrimmed.to_csv('dataset', index=False)

Total amount of DGA domains:  67500
Total amount of benign domains:  82500
Total amount of DGA domains:  150000


In [5]:
# Reload the persisted dataset so this cell does not depend on the
# in-memory frame built earlier.
dataset = pd.read_csv('dataset')
X = dataset['domain']

labels = dataset['class']
# Series.unique() returns values in order of first appearance, which depends
# on how the rows were shuffled. Sorting gives a stable [0, 1] list.
# NOTE(review): ktrain appears to pair position i of class_names with integer
# label i, so an unsorted [1, 0] would swap the class names - confirm.
class_names = sorted(labels.unique().tolist())
# Hold out 20% of the rows for validation; fixed seed for a repeatable split.
x_train, x_test, y_train, y_test = train_test_split(X,
                                                    labels,
                                                    test_size=0.20,
                                                    random_state=69)

In [6]:
# Report the size of the training split and its per-class row counts.
n_train_rows = len(x_train)
train_label_counts = y_train.value_counts()
print("Rows in x_train %d : " % n_train_rows)
print("Rows in y_train: ", train_label_counts)

Rows in x_train 120000 : 
Rows in y_train:  0    65810
1    54190
Name: class, dtype: int64


In [7]:
model_name = 'distilbert-base-uncased'

# Convert the pandas Series to plain Python lists before handing them to ktrain.
train_texts, train_labels = x_train.tolist(), y_train.tolist()
test_texts, test_labels = x_test.tolist(), y_test.tolist()

# Preprocess both splits in 'distilbert' mode with a maximum length of 350.
trn, val, preproc = ktrain.text.texts_from_array(
    x_train=train_texts,
    y_train=train_labels,
    x_test=test_texts,
    y_test=test_labels,
    class_names=class_names,
    preprocess_mode='distilbert',
    maxlen=350,
)

preprocessing train...
language: en
train sequence lengths:
	mean : 1
	95percentile : 1
	99percentile : 1


Is Multi-Label? False
preprocessing test...
language: en
test sequence lengths:
	mean : 1
	95percentile : 1
	99percentile : 1


task: text classification


In [8]:
# Create the 'distilbert' text classifier from the preprocessed training data
# and its matching preprocessor.
model = ktrain.text.text_classifier(
    'distilbert',
    train_data=trn,
    preproc=preproc,
)

Is Multi-Label? False
maxlen is 350


In [1]:
# Wrap the model with the training and validation data in a ktrain learner
# (batch size 6). Needs `ktrain`, `model`, `trn` and `val` from earlier cells:
# the recorded NameError is what a fresh kernel gives when this cell is run
# before the cells above it.
learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=6)
# Run the learning-rate finder for at most 4 epochs, then plot its result.
# NOTE(review): no training call (e.g. a fit method on the learner) appears in
# the visible cells after the finder - confirm the model is actually trained
# before it is exported.
learner.lr_find(max_epochs=4)
learner.lr_plot()

NameError: name 'ktrain' is not defined

In [None]:
# Build a predictor from the learner's model and the preprocessor, then save
# it under 'model/'.
# NOTE(review): the visible cells only run the learning-rate finder on the
# learner; verify that training has happened before relying on this export.
predictor = ktrain.get_predictor(learner.model,preproc=preproc)
predictor.save('model/')