In [1]:
import pandas as pd
from numpy import prod
import math
import numpy as np

In [2]:
# Training data: one domain per row; both files are read without a header row.
dga_domain_file = 'DGA_domains.csv'
non_dga_domain_file = 'non_DGA_domains.csv'
# Validation data: read by the threshold-selection cell to compute FPR / FNR.
validation_dga_domain_file = 'validation_DGA_domains.csv'
validation_non_dga_domain_file = 'validation_non_DGA_domains.csv'
# Domains classified in the final cell once a threshold has been chosen.
test_domains_file = 'test_domains.csv'

In [3]:
# Substring lengths (n-gram sizes) extracted from every domain name:
# 2 -> all 2-character substrings, 3 -> all 3-character substrings.
n_gram_split = [2,3]

In [4]:
def get_domain_names_without_tld(df):
    """Return a one-column frame ('domain') holding the text before the first dot.

    The input frame must carry the raw domain strings in column 0. Only the
    first dot-separated label of each value is kept; the original index is
    preserved.
    """
    first_labels = df[0].str.split('.').map(lambda parts: parts[0])
    return first_labels.to_frame(name='domain')

In [5]:
# Read non_dga_domain_file
# Training set of legitimate (non-DGA) domains: strip everything after the
# first dot, then keep the first occurrence of each name.
df_non_dga = get_domain_names_without_tld(
    pd.read_csv(non_dga_domain_file, sep=',', header=None)
).drop_duplicates(keep='first')

# Training set of DGA domains, prepared the same way.
df_dga = get_domain_names_without_tld(
    pd.read_csv(dga_domain_file, sep=',', header=None)
).drop_duplicates(keep='first')

In [6]:
def split_domain_names_acc_to_ngram_param(df, n_gram_split):
    """Count every n-gram of the requested lengths across ``df['domain']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'domain'`` column of strings. A ``'len'`` column holding
        each domain's length is written onto this frame (kept from the
        original implementation so existing callers see the same frame).
    n_gram_split : iterable of int
        Substring lengths to extract (e.g. ``[2, 3]``).

    Returns
    -------
    dict
        Maps each length in ``n_gram_split`` to a DataFrame with columns
        ``'domains_substr'`` (the substring) and ``'counts'`` (its number of
        occurrences), ordered as ``Series.value_counts`` returns them. A length
        for which no domain is long enough maps to an empty frame with the
        same two columns.
    """
    df['len'] = df['domain'].astype(str).map(len)
    # max() of an empty column is NaN, which range() cannot consume.
    max_len = int(df['len'].max()) if len(df) else 0

    list_of_counts_of_various_ngrams = {}
    for i in n_gram_split:
        # Collect one Series per start offset and concatenate once;
        # DataFrame.append was removed in pandas 2.0 and was quadratic anyway.
        pieces = []
        # Offsets beyond max_len - i cannot start a full n-gram in any domain.
        for j in range(0, max_len - i + 1):
            con = df['len'] >= i + j
            pieces.append(df.loc[con, 'domain'].str[j:j + i])

        if pieces:
            counts = (
                pd.concat(pieces)
                .value_counts()
                .rename_axis('domains_substr')
                .reset_index(name='counts')
            )
        else:
            counts = pd.DataFrame({
                'domains_substr': pd.Series(dtype=object),
                'counts': pd.Series(dtype='int64'),
            })
        list_of_counts_of_various_ngrams[i] = counts
    return list_of_counts_of_various_ngrams

In [7]:
""" Generate n gram splits of domains for DGA and non DGA datasets"""
# Each result is a dict keyed by n-gram length (the values of n_gram_split);
# every value is a DataFrame with columns 'domains_substr' and 'counts'.
# Note: the call also writes a 'len' column onto the frame passed in.
list_of_counts_of_various_ngrams_non_dga = split_domain_names_acc_to_ngram_param(df_non_dga,n_gram_split)
list_of_counts_of_various_ngrams_dga = split_domain_names_acc_to_ngram_param(df_dga,n_gram_split)

In [8]:
# Total number of n-gram occurrences across both training sets
# (non-DGA tables first, then DGA tables).
sum_of_all_substring_counts = 0
for ngram_tables in (list_of_counts_of_various_ngrams_non_dga,
                     list_of_counts_of_various_ngrams_dga):
    for ngram_table in ngram_tables.values():
        sum_of_all_substring_counts = sum_of_all_substring_counts + ngram_table['counts'].sum()
print(sum_of_all_substring_counts)

1500784


In [9]:
def get_substrings(domain,split):
    """Return every contiguous substring of ``domain`` with length ``split``.

    Substrings are listed in left-to-right order of their start position.
    When ``domain`` is shorter than ``split`` the result is an empty list.
    """
    last_start = len(domain) - split
    return [domain[start:start + split] for start in range(last_start + 1)]

In [10]:
def get_score_from_training_dataset(training_data, sub_strs):
    """Score substrings by their relative frequency in a training n-gram table.

    Parameters
    ----------
    training_data : pandas.DataFrame
        Table with columns ``'domains_substr'`` and ``'counts'``.
    sub_strs : iterable of str
        Substrings of the domain being scored. A substring that appears
        several times contributes its training count once per appearance.

    Returns
    -------
    dict
        Maps each distinct substring (in first-seen order) to its accumulated
        training count divided by the sum of the ``'counts'`` column, or to
        ``0`` when the substring is absent from the table.
    """
    sum_of_all_substring_counts = training_data['counts'].sum()

    # Build the lookup once per call instead of scanning the whole table
    # twice for every substring. setdefault keeps the first row for a
    # substring, matching a first-match row lookup.
    count_by_substr = {}
    for known_substr, known_count in zip(training_data['domains_substr'],
                                         training_data['counts']):
        count_by_substr.setdefault(known_substr, known_count)

    sub_str_scores = {}
    for sub_str in sub_strs:
        count = count_by_substr.get(sub_str, 0)
        sub_str_scores[sub_str] = sub_str_scores.get(sub_str, 0) + count

    # A zero count stays a plain 0, which also avoids 0/0 on an empty table.
    return {
        sub_str: 0 if total == 0 else total / sum_of_all_substring_counts
        for sub_str, total in sub_str_scores.items()
    }

In [11]:
""" Determine threshold """
thresholds = [0.10,0.15,0.20,0.25,0.3,0.35,0.4,0.45,0.5,0.55,0.6,0.65,0.7]


def _dga_probabilities(domains_df):
    """Return [p_dga, p_non_dga] for each scorable domain in ``domains_df``.

    For every domain the n-gram scores from the DGA and non-DGA training
    tables are summed over all lengths in ``n_gram_split`` and normalised so
    the two values add up to 1. A domain whose n-grams appear in neither
    training table is reported on stdout and left out of the result.
    """
    probabilities = []
    for domain in domains_df["domain"]:
        sum_scores_dga = 0
        sum_scores_non_dga = 0

        for split in n_gram_split:
            sub_strs = get_substrings(domain, split)

            # Scores for the substrings from the non-DGA training dataset
            sub_str_scores_non_dga = get_score_from_training_dataset(
                list_of_counts_of_various_ngrams_non_dga[split], sub_strs)
            sum_scores_non_dga = sum_scores_non_dga + sum(sub_str_scores_non_dga.values())

            # Scores for the substrings from the DGA training dataset
            sub_str_scores_dga = get_score_from_training_dataset(
                list_of_counts_of_various_ngrams_dga[split], sub_strs)
            sum_scores_dga = sum_scores_dga + sum(sub_str_scores_dga.values())

        total_score = sum_scores_dga + sum_scores_non_dga
        if total_score == 0:
            print(domain + " origin is uncertain.")
        else:
            probabilities.append([sum_scores_dga / total_score,
                                  sum_scores_non_dga / total_score])
    return probabilities


dict_of_thresholds_and_fpr_fnr = {}

# --- False-positive rate: known non-DGA domains that would be flagged as DGA ---
df_test = pd.read_csv(validation_non_dga_domain_file, sep=',',header=None)
df_test = get_domain_names_without_tld(df_test).drop_duplicates(keep='first')
probability_list = _dga_probabilities(df_test)
# Built from a comprehension so an empty probability_list yields an empty
# array instead of an IndexError on [:, 0].
p_array = np.array([p[0] for p in probability_list])
for threshold in thresholds:
    # The classifier flags p_dga >= threshold as DGA, so a score exactly at
    # the threshold must count as a false positive too.
    count = (p_array>=threshold).sum()
    # Denominator includes "uncertain" domains, which are never flagged.
    fpr = count/len(df_test)
    dict_of_thresholds_and_fpr_fnr[threshold] = [fpr,-1]

# --- False-negative rate: known DGA domains that would be missed ---
df_test = pd.read_csv(validation_dga_domain_file, sep=',',header=None)
df_test = get_domain_names_without_tld(df_test).drop_duplicates(keep='first')
probability_list = _dga_probabilities(df_test)
p_array = np.array([p[0] for p in probability_list])
for threshold in thresholds:
    count = (p_array<threshold).sum()
    fnr = count/len(df_test)
    dict_of_thresholds_and_fpr_fnr[threshold][1] = fnr

# One column per threshold, rows FPR / FNR
df_fpr_fnr_thresh = pd.DataFrame.from_dict(dict_of_thresholds_and_fpr_fnr)
df_fpr_fnr_thresh.index = ['FPR','FNR']
df_fpr_fnr_thresh

Unnamed: 0,0.10,0.15,0.20,0.25,0.30,0.35,0.40,0.45,0.50,0.55,0.60,0.65,0.70
FPR,1.0,0.881188,0.465347,0.247525,0.138614,0.09901,0.059406,0.049505,0.039604,0.039604,0.009901,0.0,0.0
FNR,0.0,0.0,0.0,0.0,0.0,0.0,0.047619,0.047619,0.095238,0.238095,0.333333,0.428571,0.571429


In [13]:
"""Set threshold and classify new data"""
threshold = 0.35
df_test = get_domain_names_without_tld(
    pd.read_csv(test_domains_file, sep=',', header=None)
).drop_duplicates(keep='first')
count = 0

# Score every test domain against both training n-gram tables and label it
for domain in df_test["domain"]:
    sum_scores_dga = 0
    sum_scores_non_dga = 0

    for split in n_gram_split:
        sub_strs = get_substrings(domain, split)

        # Relative n-gram frequencies in the non-DGA training set
        sub_str_scores_non_dga = get_score_from_training_dataset(
            list_of_counts_of_various_ngrams_non_dga[split], sub_strs)
        sum_scores_non_dga += sum(sub_str_scores_non_dga.values())

        # Relative n-gram frequencies in the DGA training set
        sub_str_scores_dga = get_score_from_training_dataset(
            list_of_counts_of_various_ngrams_dga[split], sub_strs)
        sum_scores_dga += sum(sub_str_scores_dga.values())

    combined_score = sum_scores_dga + sum_scores_non_dga
    if combined_score == 0:
        # None of the domain's n-grams occur in either training set
        print(domain + " origin is uncertain.")
        print("#####")
        continue

    p_dga = sum_scores_dga / combined_score
    p_non_dga = sum_scores_non_dga / combined_score
    print("P(DGA)= " + str(p_dga))

    verdict = " is generated by DGA." if p_dga >= threshold else " is not generated by DGA."
    print(domain + verdict)
    print("#####")

P(DGA)= 0.30063821944681335
6ped2nd3yp is not generated by DGA.
#####
P(DGA)= 0.7272150322721345
7fkm2r4pzi is generated by DGA.
#####
P(DGA)= 0.6655234887504209
bzvlwmputpz is generated by DGA.
#####
P(DGA)= 0.7756749861436567
bzvnjpks is generated by DGA.
#####
P(DGA)= 0.7304263750449718
c29kub68m69avkukycwi45hsb68gqctcufybz is generated by DGA.
#####
P(DGA)= 0.6021101759288413
c29nrnuhwjvd60pqk37ate41pynzh64o61atg43 is generated by DGA.
#####
P(DGA)= 0.6680638244410797
c39p22n20ozowiqc59lvnsb18c59dsk17pse31ks is generated by DGA.
#####
P(DGA)= 0.18927146184009436
rediff is not generated by DGA.
#####
P(DGA)= 0.2525020168396866
kaixin001 is not generated by DGA.
#####
P(DGA)= 0.4524070667116768
java is generated by DGA.
#####
P(DGA)= 0.23960113258288945
download is not generated by DGA.
#####
P(DGA)= 0.17814489660468907
google is not generated by DGA.
#####
P(DGA)= 0.3508064029827192
it168 is not generated by DGA.
#####
P(DGA)= 0.2137127351954888
cam4 is not generated by DGA.
#####
P