- 使用ngram进行恶意域名识别
- 参考论文：https://www.researchgate.net/publication/330843380_Malicious_Domain_Names_Detection_Algorithm_Based_on_N_-Gram

In [1]:
import numpy as np
import pandas as pd
import tldextract
import matplotlib.pyplot as plt
import os
import re
import time
from scipy import sparse
%matplotlib inline

## 加载数据

- 加载正常域名

In [2]:
# Load the benign-domain list. The CSV is read without a header row; its first
# column is consumed as the index and then discarded by reset_index(drop=True).
df_benign_domain = pd.read_csv('top-1m.csv', index_col=0, header=None).reset_index(drop=True)

In [3]:
# Name the remaining column; this assignment requires the frame to have exactly one column.
df_benign_domain.columns = ['domain']

In [4]:
# Class label 0 marks every row loaded from the benign list.
df_benign_domain['label'] = 0

- 加载恶意域名

In [5]:
# Load the malicious-domain list (no header row) using pandas' Python parsing engine.
df_malicious_domain = pd.read_csv('malicious-domain.csv', engine='python', header=None)

In [6]:
# Keep only the column labelled 1; the double brackets keep the result a DataFrame.
df_malicious_domain = df_malicious_domain[[1]]

In [7]:
# Rename the single kept column so it matches the benign frame's schema.
df_malicious_domain.columns = ['domain']

In [8]:
# Drop rows whose domain field is the literal '-'.
# .copy() makes the filtered result an independent frame, so the column
# assignment performed on it afterwards does not raise SettingWithCopyWarning.
df_malicious_domain = df_malicious_domain[df_malicious_domain['domain'] != '-'].copy()

In [9]:
# Class label 1 marks every row loaded from the malicious list.
df_malicious_domain['label'] = 1

In [10]:
# Stack benign and malicious rows into one frame.
# ignore_index=True rebuilds a clean 0..n-1 index; without it both inputs keep
# their own row labels and the combined frame ends up with duplicate index values.
df_domain = pd.concat([df_benign_domain, df_malicious_domain], axis=0, ignore_index=True)

In [11]:
def remove_tld(domain):
    """Return ``domain`` without the suffix part reported by ``tldextract``.

    When ``tldextract`` finds a non-empty subdomain the result is
    ``"<subdomain>.<domain>"``; otherwise it is just the ``domain`` field.
    """
    parts = tldextract.extract(domain)
    if parts.subdomain == '':
        return parts.domain
    return parts.subdomain + '.' + parts.domain

In [12]:
# Reduce every name to the `domain` field of tldextract's result (subdomain and suffix are dropped).
# NOTE(review): the remove_tld() helper defined in this notebook keeps the subdomain
# but is not used here — confirm which behavior is intended.
df_domain['domain'] = df_domain['domain'].map(lambda x: tldextract.extract(x).domain)

## 提取ngram特征

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

In [14]:
# Collect the domain strings of all benign rows (label == 0) as a plain Python list.
is_benign = df_domain['label'] == 0
domain_list = df_domain.loc[is_benign, 'domain'].tolist()

In [15]:
# Concatenate all benign domains into one string, using '.' as the separator.
benign_text_str = '.'.join(domain_list)

In [16]:
# Break the joined string into tokens at every '.' or '-'.
benign_text = re.split(r'[.-]', benign_text_str)

In [17]:
# Keep only tokens of at least 3 characters.
benign_text = [token for token in benign_text if len(token) >= 3]

In [69]:
def get_ngram_weight_dict(benign_text):
    """Build a weight table for character n-grams seen in benign text.

    Every character n-gram of length 3 to 7 found in ``benign_text`` gets the
    weight ``log2(count * n)``, where ``count`` is its total number of
    occurrences over all tokens and ``n`` is the n-gram's length.

    Parameters
    ----------
    benign_text : iterable of str
        Tokens used as the benign corpus.

    Returns
    -------
    dict
        Maps each n-gram string to its float weight.
    """
    cv = CountVectorizer(ngram_range=(3, 7), analyzer='char')
    # Learn the vocabulary and count occurrences in a single pass.
    count_matrix = cv.fit_transform(benign_text)

    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when present and fall back to the old name on older releases.
    if hasattr(cv, 'get_feature_names_out'):
        feature_names = cv.get_feature_names_out().tolist()
    else:
        feature_names = cv.get_feature_names()

    # Column sums give total occurrences per n-gram; flatten the 1 x N result.
    ngram_count = np.asarray(count_matrix.sum(axis=0)).ravel()
    window_sizes = np.array([len(name) for name in feature_names])
    ngram_weights = np.log2(ngram_count * window_sizes)

    return dict(zip(feature_names, ngram_weights.tolist()))

In [70]:
# Build the n-gram -> weight lookup table from the benign tokens.
ngram_weights_dict = get_ngram_weight_dict(benign_text)

## 计算域名的信誉值

In [71]:
def get_reputation_value(ngram_weights_dict, domain, min_n=3, max_n=7, short_domain_score=1000):
    """Score a domain by summing the weights of its character n-grams.

    The domain is split on '.' and '-'; within each resulting label every
    substring of length ``min_n`` to ``max_n`` is looked up in
    ``ngram_weights_dict`` and the weights found are added together.
    N-grams missing from the table contribute 0.

    Parameters
    ----------
    ngram_weights_dict : dict
        Maps n-gram strings to numeric weights.
    domain : str
        Domain string to score.
    min_n, max_n : int, optional
        Inclusive range of n-gram lengths to look up (defaults 3 and 7).
    short_domain_score : number, optional
        Value returned when ``domain`` has fewer than ``min_n`` characters
        (default 1000).

    Returns
    -------
    number
        The summed weight, or ``short_domain_score`` for too-short input.
    """
    if len(domain) < min_n:
        return short_domain_score

    reputation = 0
    # A separate loop name keeps the `domain` argument from being shadowed.
    for label in re.split(r'[.-]', domain):
        for window_size in range(min_n, max_n + 1):
            for start in range(len(label) - window_size + 1):
                reputation += ngram_weights_dict.get(label[start:start + window_size], 0)
    return reputation

In [72]:
# Score every domain and report how long it took.
# perf_counter() is a monotonic clock meant for measuring elapsed time; time.time()
# follows the wall clock and can jump if the system time is adjusted.
start = time.perf_counter()
df_domain['reputation'] = df_domain['domain'].map(lambda x: get_reputation_value(ngram_weights_dict, x))
end = time.perf_counter()
print('cost time : {}'.format(end - start))

cost time : 16.099355936050415


In [75]:
import joblib

In [76]:
# Persist the n-gram weight table to disk.
# The stray third argument `a` was removed: that name is not defined anywhere in
# the notebook, so the call would fail with NameError on a fresh kernel.
joblib.dump(ngram_weights_dict, 'ngram_weights_dict.m')

['ngram_weights_dict.m']

In [78]:
import sys

In [79]:
# Size in bytes of the dict object itself. Per the Python docs, sys.getsizeof does
# not count the objects the dict refers to, so the keys and weights are excluded.
sys.getsizeof(ngram_weights_dict)

167772264

In [73]:
# Summary statistics of the reputation score for benign rows (label == 0).
df_domain.loc[df_domain['label'] == 0, 'reputation'].describe()

count    768588.000000
mean        253.291323
std         186.056861
min           0.000000
25%         115.290124
50%         218.319232
75%         346.338493
max        2347.132833
Name: reputation, dtype: float64

In [74]:
# Summary statistics of the reputation score for malicious rows (label == 1).
df_domain.loc[df_domain['label'] == 1, 'reputation'].describe()

count    1712.000000
mean      218.276114
std       196.666275
min         0.000000
25%        61.773118
50%       165.462918
75%       306.836576
max      1104.137633
Name: reputation, dtype: float64