# DGA Detection, Feature Engineering

This notebook is broken down into the following tasks:

* Clean and pre-process the data.
* Standardize and normalize numerical variables
* Define features for harmonic sequencing
* Create train/test `.csv` files that hold the relevant features and class labels for train/test data points


In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os

In [2]:
import whois

In [4]:
from datetime import datetime


# Sentinel returned when a WHOIS date field is missing or cannot be parsed.
_MISSING_DAYS = -1
# Date substituted for textual values containing 'Aug'.
# NOTE(review): presumably targets registry placeholders such as
# "before Aug-1996" -- confirm against real WHOIS responses.
_AUG_FALLBACK_DATE = "1996-07-01"
# Format expected for date fields that arrive as plain strings.
_WHOIS_DATE_FORMAT = '%Y-%m-%d'


def _days_before_today(raw_date):
    """
    Returns the signed number of days from a WHOIS date value to today.

    Args:
        raw_date: value of a WHOIS date field. Accepted shapes are a
            ``datetime``, a string in ``YYYY-MM-DD`` form, a string containing
            'Aug' (replaced by the 1996-07-01 fallback), a list/tuple of any of
            these (only the first element is used), or ``None``.

    Returns:
        ``(today - date).days`` as an int (negative for future dates), or
        ``None`` when the value is missing, an empty list, or unparseable.
    """
    # A field may hold several values; only the first one is considered.
    if isinstance(raw_date, (list, tuple)):
        raw_date = raw_date[0] if raw_date else None
    if raw_date is None:
        return None

    if isinstance(raw_date, datetime):
        # Already parsed: strptime() would raise TypeError on a datetime.
        parsed = raw_date.date()
    else:
        text = str(raw_date)
        # Membership test, not str.find(): find() returns -1 (truthy) when the
        # substring is absent, which previously replaced every date.
        if 'Aug' in text:
            text = _AUG_FALLBACK_DATE
        try:
            parsed = datetime.strptime(text, _WHOIS_DATE_FORMAT).date()
        except ValueError:
            return None

    return (datetime.today().date() - parsed).days


def days_since_creation(domain):
    """
    Gets days since the domain was created.

    Args:
        domain: WHOIS record object exposing a ``creation_date`` attribute.

    Returns:
        Number of days between the creation date and today, or -1 when the
        creation date is missing or cannot be parsed.
    """
    delta = _days_before_today(domain.creation_date)
    return _MISSING_DAYS if delta is None else delta


def days_since_updated(domain):
    """
    Gets days since the domain record was last updated.

    Args:
        domain: WHOIS record object exposing an ``updated_date`` attribute.

    Returns:
        Number of days between the last update and today, or -1 when the
        update date is missing or cannot be parsed.
    """
    delta = _days_before_today(domain.updated_date)
    return _MISSING_DAYS if delta is None else delta


def days_till_expired(domain):
    """
    Gets the number of days between today and the domain's expiration date.

    Args:
        domain: WHOIS record object exposing an ``expiration_date`` attribute.

    Returns:
        Absolute distance in days between today and the expiration date, or -1
        when the expiration date is missing or cannot be parsed. Because the
        distance is absolute, an already-expired domain is not distinguished
        from one that expires the same number of days in the future.
    """
    delta = _days_before_today(domain.expiration_date)
    return _MISSING_DAYS if delta is None else abs(delta)
    
def whois_features(domain_name):
    """
    Looks up the WHOIS record of a domain and derives its date-based features.

    Args:
        domain_name: domain to query, e.g. "google.com".

    Returns:
        A three-item list ``[days_since_creation, days_since_updated,
        days_till_expired]``, each entry computed by the helper of the same
        name from the fetched record.

    Raises:
        Whatever ``whois.whois`` or the date helpers raise; errors are not
        caught here.
    """
    whois_info = whois.whois(domain_name)
    # The raw record is deliberately not printed: when this runs over a whole
    # DataFrame it would flood the cell output, and WHOIS records may carry
    # registrant contact details.
    return [
        days_since_creation(whois_info),
        days_since_updated(whois_info),
        days_till_expired(whois_info),
    ]
        

In [5]:
# whois_features("popcornvod.com")

In [9]:
# whois_features("google.com")

In [10]:
#has_registrarname
#has_contactemail
#days_since_created
#days_since_updated
#days_until_expiration
#has_registrant_info
#has_admincontact_info
#has_billingcontact_info

In [11]:
# whois.whois("google.com").creation_date

In [12]:
# whois.whois("google.com")

In [13]:
# Directory (relative to the working directory) that holds the input CSVs and
# receives the generated data files; create it before running the notebook
data_dir = 'data'

In [14]:
# Preview ten domains produced by the Matsnu generator, one per line
from dga import matsnu

print("\n".join(matsnu.generate_domain() for _ in range(10)))

establishmenthesitatedetailed.com
familyropeinfluencecakelunch.com
peopledoubtreportfaultbreak.com
partytermprogramkickhost.com
brainconditionwashnegotiate.com
eyereasondealfearsendsize.com
requirementallowremember.com
tackleinstallreservecontribute.com
kindintendmatterwordclub.com
racetrustmilkdiscoverbirth.com


In [15]:
# Build the DGA half of the dataset: 20,000 Matsnu-generated domains
# NOTE(review): the generated domains presumably differ between runs unless
# the generator is seeded -- confirm if reproducibility matters.
matsnu_list = [matsnu.generate_domain() for _ in range(20000)]

matsnu_df = pd.DataFrame({'domain': matsnu_list})

# Shape is reported before the label column is attached
print("Matsnu Shape:", matsnu_df.shape)

# Every generated domain belongs to the 'dga' class
matsnu_df['label'] = 'dga'

matsnu_df.head()

Matsnu Shape: (20000, 1)


Unnamed: 0,domain,label
0,stuffregistershelterpitch.com,dga
1,cookiedisappointedreserve.com,dga
2,insectpraypayestimatespite.com,dga
3,mediahurrydealgolfbringemploy.com,dga
4,yardlandapologizedetermine.com,dga


In [16]:
# Persist the generated Matsnu domains under the data directory
# (the DataFrame index is written as the first, unnamed column)
matsnu_df.to_csv(f"{data_dir}/matsnu.csv")

In [17]:
# Alexa top 1 million domains, used as the benign class
alexa_csv = f"{data_dir}/alexa_top_1m.csv"
alexa_df = pd.read_csv(alexa_csv, names=['domain'])

# Every Alexa domain is labelled 'benign'
alexa_df['label'] = 'benign'

alexa_df.head()

Unnamed: 0,domain,label
1,google.com,benign
2,youtube.com,benign
3,facebook.com,benign
4,baidu.com,benign
5,wikipedia.org,benign


In [18]:
# Stack the first 20,000 rows of each source into one frame with a fresh 0..N-1 index
train_df = pd.concat(
    [alexa_df.head(20000), matsnu_df.head(20000)],
    axis=0,
    ignore_index=True,
)

In [19]:
# Preview the first rows of the combined training frame
train_df.head()

Unnamed: 0,domain,label
0,google.com,benign
1,youtube.com,benign
2,facebook.com,benign
3,baidu.com,benign
4,wikipedia.org,benign


In [6]:
# test[['days_since_creation', 'days_since_updated', 'days_till_expired']] = test.apply(
#     lambda x: whois_features(test['domain']), 
#     axis=1
# )

In [8]:
# test['domain'].apply(lambda x: whois_features(x))

In [7]:
# test = train_df.sample(10)

In [20]:
# !pip uninstall whois

In [21]:
# import whois

In [22]:
# dir(whois.query)

In [23]:
!pip install sklearn



In [24]:
# LabelEncoder converts class labels into integer codes
from sklearn.preprocessing import LabelEncoder

# Encoder instance; it is fitted when applied to the `label` column of train_df
le = LabelEncoder()

In [25]:
# Replace the string class labels with integer codes. LabelEncoder numbers the
# classes in sorted order, so 'benign' -> 0 and 'dga' -> 1.
# NOTE: the column is overwritten in place, so re-running this cell refits `le`
# on the integer codes and le.classes_ no longer holds the original strings.
train_df['label'] = le.fit_transform(train_df['label'])

train_df['label'].head(10)

0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    0
9    0
Name: label, dtype: int32