# Contanimate DNS Data

In [113]:
"""
Make dataset pipeline
"""
import pandas as pd
import numpy as np
import os
from collections import Counter
import math

In [61]:
import torch
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from dga.models.dga_classifier import DGAClassifier
from dga.datasets.domain_dataset import DomainDataset

In [44]:
!pip install tldextract



In [45]:
import tldextract

In [78]:
df = pd.read_csv("../data/raw/dns.csv")

a_aaaa_df = df.loc[(df.qtype_name == 'A') | (df.qtype_name == 'AAAA')]

# Take subset by nxdomain response
nxdomain_df = a_aaaa_df.loc[(df['rcode_name'] == 'NXDOMAIN')]

# Drop subset from full records 
a_aaaa_df = a_aaaa_df[a_aaaa_df['rcode_name'] != 'NXDOMAIN'] 

In [79]:
# Load known DGAs
mal_df = pd.read_csv("../data/processed/validation.csv")
mal_df = mal_df.loc[mal_df['label'] == 1]

In [80]:
# Inject dga domains randomly
nxdomain_df['query'] = np.random.choice(list(mal_df['domain'].values), len(nxdomain_df))

In [92]:
# Put dataset back together
a_aaaa_df = pd.concat([a_aaaa_df, nxdomain_df])
# a_aaaa_df['domain_name'] = a_aaaa_df['query'].str.replace('www.', '')

a_aaaa_df.drop(['QR', 'AA', 'TC', 'RD', 'Z', 'answers'], axis=1, inplace=True)
a_aaaa_df.sort_values(by=['ts'])
# a_aaaa_df['domain_name'].unique()
a_aaaa_df = a_aaaa_df.reset_index(drop=True)

In [93]:
def extract_domain(url):
    return tldextract.extract(url).domain

a_aaaa_df['domain'] = a_aaaa_df['query'].apply(extract_domain)

In [94]:
def extract_tld(url):
    return tldextract.extract(url).suffix

a_aaaa_df['tld'] = a_aaaa_df['query'].apply(extract_tld)

In [95]:
a_aaaa_df['domain_name'] = a_aaaa_df['domain'] + '.' + a_aaaa_df['tld']

In [96]:
a_aaaa_df.head()

Unnamed: 0,ts,uid,id_orig_h,id_orig_p,id_resp_h,id_resp_p,proto,port,query,qclass,qclass_name,qtype,qtype_name,rcode,rcode_name,TTLs,rejected,domain_name,domain,tld
0,1331901000.0,Cgrcup1c5uGRx428V7,192.168.202.93,60821,172.19.1.100,53,udp,3550,www.apple.com,1,C_INTERNET,28,AAAA,-,-,-,F\r\n,apple.com,apple,com
1,1331901000.0,Cgrcup1c5uGRx428V7,192.168.202.93,60821,172.19.1.100,53,udp,3550,www.apple.com,1,C_INTERNET,28,AAAA,-,-,-,F\r\n,apple.com,apple,com
2,1331901000.0,Cgrcup1c5uGRx428V7,192.168.202.93,60821,172.19.1.100,53,udp,35599,www.apple.com,1,C_INTERNET,28,AAAA,-,-,-,F\r\n,apple.com,apple,com
3,1331901000.0,Cgrcup1c5uGRx428V7,192.168.202.93,60821,172.19.1.100,53,udp,35599,www.apple.com,1,C_INTERNET,28,AAAA,-,-,-,F\r\n,apple.com,apple,com
4,1331901000.0,CN3iol3Ge5ULjbEFph,192.168.202.93,61184,172.19.1.100,53,udp,40931,www.apple.com,1,C_INTERNET,1,A,-,-,-,F\r\n,apple.com,apple,com


In [97]:
model_dir = '../models/'
model_info = {}
model_info_path = os.path.join(model_dir, '1595825381_dga_model_info.pth')

with open(model_info_path, 'rb') as f:
    model_info = torch.load(f)

print("model_info: {}".format(model_info))

# Determine the device and construct the model.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DGAClassifier(input_features=model_info['input_features'],
                      hidden_dim=model_info['hidden_dim'],
                      n_layers=model_info['n_layers'],
                      output_dim=model_info['output_dim'],
                      embedding_dim=model_info['embedding_dim'],
                      batch_size=model_info['batch_size'])

# Load the stored model parameters.
model_path = os.path.join(model_dir, '1595825381_dga_model.pth')
with open(model_path, 'rb') as f:
    model.load_state_dict(torch.load(f))

# set to eval mode, could use no_grad
model.to(device).eval()

model_info: {'input_features': 68, 'hidden_dim': 30, 'n_layers': 2, 'embedding_dim': 5, 'batch_size': 32, 'output_dim': 1}


DGAClassifier(
  (embedding): Embedding(68, 5)
  (rnn): RNN(5, 30, num_layers=2, batch_first=True, dropout=0.3)
  (fc): Linear(in_features=30, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [98]:
def entropy(s):
    p, lns = Counter(s), float(len(s))
    return -sum( count/lns * math.log(count/lns, 2) for count in p.values())

In [99]:
def pad_collate_pred(batch):
    x_lens = [len(x) for x in batch]
    xx_pad = pad_sequence(batch, batch_first=True, padding_value=0)
    return xx_pad, x_lens

In [100]:
def get_predict_loader(batch_size, df):
    print("Getting test and train data loaders.")
    dataset = DomainDataset(df, train=False)
    predict_dl = DataLoader(dataset, batch_size=batch_size, shuffle=False, collate_fn=pad_collate_pred)
    return predict_dl

In [101]:
def get_prediction(df):
    predict_dl = get_predict_loader(1000, df)
    classes = {0: 'Benign', 1: 'DGA'}
    model.eval()
    predictions = []

    with torch.no_grad():
        for batch_num, (x_padded,  x_lens) in enumerate(predict_dl):
            output = model(x_padded, x_lens)
            y_hat = torch.round(output.data)
            predictions += [classes[int(key)] for key in y_hat.flatten().numpy()]

    return predictions

In [102]:
a_aaaa_df = a_aaaa_df[~a_aaaa_df['domain_name'].str.contains('\(')].reset_index(drop=True)

In [107]:
a_aaaa_df = a_aaaa_df[~a_aaaa_df['domain_name'].str.contains(',')].reset_index(drop=True)

In [108]:
a_aaaa_df[['domain_name']]

Unnamed: 0,domain_name
0,apple.com
1,apple.com
2,apple.com
3,apple.com
4,apple.com
...,...
301027,suunthoodivettewl.com
301028,unmrhfjrmmswr.biz
301029,litzwsqtokmfud.org
301030,derivativearenot.club


In [115]:
a_aaaa_df['dga'] = get_prediction(a_aaaa_df[['domain_name']])

Getting test and train data loaders.


In [121]:
a_aaaa_df['entropy'] = a_aaaa_df['domain_name'].apply(entropy)

In [118]:
print(a_aaaa_df.shape)
a_aaaa_df.head(25)

(301032, 22)


Unnamed: 0,ts,uid,id_orig_h,id_orig_p,id_resp_h,id_resp_p,proto,port,query,qclass,...,qtype_name,rcode,rcode_name,TTLs,rejected,domain_name,domain,tld,dga,entropy
0,1331901000.0,Cgrcup1c5uGRx428V7,192.168.202.93,60821,172.19.1.100,53,udp,3550,www.apple.com,1,...,AAAA,-,-,-,F\r\n,apple.com,apple,com,Benign,2.947703
1,1331901000.0,Cgrcup1c5uGRx428V7,192.168.202.93,60821,172.19.1.100,53,udp,3550,www.apple.com,1,...,AAAA,-,-,-,F\r\n,apple.com,apple,com,Benign,2.947703
2,1331901000.0,Cgrcup1c5uGRx428V7,192.168.202.93,60821,172.19.1.100,53,udp,35599,www.apple.com,1,...,AAAA,-,-,-,F\r\n,apple.com,apple,com,Benign,2.947703
3,1331901000.0,Cgrcup1c5uGRx428V7,192.168.202.93,60821,172.19.1.100,53,udp,35599,www.apple.com,1,...,AAAA,-,-,-,F\r\n,apple.com,apple,com,Benign,2.947703
4,1331901000.0,CN3iol3Ge5ULjbEFph,192.168.202.93,61184,172.19.1.100,53,udp,40931,www.apple.com,1,...,A,-,-,-,F\r\n,apple.com,apple,com,Benign,2.947703
5,1331901000.0,CN3iol3Ge5ULjbEFph,192.168.202.93,61184,172.19.1.100,53,udp,40931,www.apple.com,1,...,A,-,-,-,F\r\n,apple.com,apple,com,Benign,2.947703
6,1331901000.0,CN3iol3Ge5ULjbEFph,192.168.202.93,61184,172.19.1.100,53,udp,25983,www.apple.com,1,...,A,-,-,-,F\r\n,apple.com,apple,com,Benign,2.947703
7,1331901000.0,CN3iol3Ge5ULjbEFph,192.168.202.93,61184,172.19.1.100,53,udp,25983,www.apple.com,1,...,A,-,-,-,F\r\n,apple.com,apple,com,Benign,2.947703
8,1331901000.0,CDzHPo17B429xLtaVb,192.168.202.97,59011,156.154.70.22,53,udp,58389,www.comodo.com,1,...,A,-,-,-,F\r\n,comodo.com,comodo,com,Benign,2.121928
9,1331901000.0,CDzHPo17B429xLtaVb,192.168.202.97,59011,156.154.70.22,53,udp,58389,www.comodo.com,1,...,A,-,-,-,F\r\n,comodo.com,comodo,com,Benign,2.121928


In [119]:
a_aaaa_df.to_csv('../data/processed/demo_dns_logs.csv', index=False)