- 使用ngram进行恶意域名识别
- 参考论文：https://www.researchgate.net/publication/330843380_Malicious_Domain_Names_Detection_Algorithm_Based_on_N_-Gram

In [1]:
import numpy as np
import pandas as pd
import tldextract
import matplotlib.pyplot as plt
import os
import re
import time
from scipy import sparse
%matplotlib inline

## 加载数据

- 加载正常域名

In [2]:
# Load the benign domain list. The file has no header row; its first column is
# consumed as the index and then discarded by reset_index(drop=True), so only
# the remaining column(s) are kept.
# NOTE(review): presumably the file is in "rank,domain" format — confirm.
df_benign_domain = pd.read_csv('top-1m.csv', index_col=0, header=None).reset_index(drop=True)

In [3]:
# Name the single remaining column so it matches the malicious frame's schema.
df_benign_domain.columns = ['domain']

In [4]:
# Benign domains form the negative class (label 0).
df_benign_domain['label'] = 0

- 加载恶意域名

In [5]:
# Load the malicious domain list with the pure-Python CSV parser; the file has
# no header row, so columns receive integer labels (0, 1, ...).
df_malicious_domain = pd.read_csv('malicious-domain.csv', engine='python', header=None)

In [6]:
# Keep only the column labelled 1 (as a one-column DataFrame).
# NOTE(review): this cell rebinds its own input, so re-running it after the
# column has been renamed will fail — restart the kernel before a re-run.
df_malicious_domain = df_malicious_domain[[1]]

In [7]:
# Give the retained column the same name the benign frame uses.
df_malicious_domain.columns = ['domain']

In [8]:
# Drop placeholder rows whose domain is the literal '-'.
# .copy() makes the filtered result an independent frame, so adding columns to
# it later does not operate on a slice of the original (which would raise
# pandas' SettingWithCopyWarning).
df_malicious_domain = df_malicious_domain[df_malicious_domain['domain'] != '-'].copy()

In [9]:
# Malicious domains form the positive class (label 1).
df_malicious_domain['label'] = 1

In [10]:
# Stack benign and malicious rows into one labelled frame.
# ignore_index=True renumbers the rows 0..n-1; without it the result keeps the
# index labels of both inputs, which collide and make any later index-aligned
# operation on df_domain ambiguous.
df_domain = pd.concat([df_benign_domain, df_malicious_domain], axis=0, ignore_index=True)

In [11]:
def remove_tld(domain):
    """Return ``domain`` without its suffix, keeping any subdomain part.

    The name is parsed with ``tldextract.extract``. When the parse reports a
    non-empty subdomain the result is ``"<subdomain>.<domain>"``; otherwise it
    is just the ``domain`` component.

    Args:
        domain: Host name string to parse.

    Returns:
        The subdomain and domain components joined by a dot, or the domain
        component alone when there is no subdomain.
    """
    parts = tldextract.extract(domain)
    if parts.subdomain == '':
        return parts.domain
    return parts.subdomain + '.' + parts.domain

In [12]:
# Replace every entry with only the `domain` component reported by tldextract;
# the subdomain and suffix components are dropped.
# NOTE(review): remove_tld() defined in the previous cell keeps the subdomain
# but is never called — confirm which behaviour is intended here.
df_domain['domain'] = df_domain['domain'].map(lambda x: tldextract.extract(x).domain)

## 提取ngram特征

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

In [14]:
# Collect the benign (label 0) domain strings as a plain Python list.
domain_list = df_domain.loc[df_domain['label'] == 0, 'domain'].values.tolist()

In [15]:
# Join all benign domains into one dot-separated string so they can be
# re-split on both '.' and '-' in a single pass.
benign_text_str = '.'.join(domain_list)

In [16]:
# Break the joined string into tokens at every '.' or '-'.
benign_text = re.split(r'[.-]', benign_text_str)

In [17]:
# Discard tokens shorter than 3 characters.
benign_text = [token for token in benign_text if len(token) >= 3]

In [45]:
def get_ngram_weight_dict(benign_text):
    """Build a weight table for character n-grams found in benign tokens.

    A ``CountVectorizer`` extracts character n-grams of length 3 to 7 from
    ``benign_text``, keeping at most 100000 features. Each retained n-gram is
    weighted as ``log2(total occurrences) * len(ngram)``, so n-grams that are
    both frequent and long score highest. An n-gram that occurs exactly once
    gets weight 0 because ``log2(1) == 0``.

    Args:
        benign_text: Iterable of strings (tokens from benign domain names).

    Returns:
        dict mapping each n-gram string to its float weight.
    """
    cv = CountVectorizer(ngram_range=(3, 7), analyzer='char', max_features=100000)
    # Learn the vocabulary and count in one pass instead of fit() + transform().
    benign_text_vectors = cv.fit_transform(benign_text)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on releases that predate it.
    if hasattr(cv, 'get_feature_names_out'):
        feature_names = cv.get_feature_names_out().tolist()
    else:
        feature_names = cv.get_feature_names()

    # Column sums give the total count of each n-gram over all tokens;
    # flatten the 1 x n_features result into a plain 1-D array.
    ngram_count = np.asarray(benign_text_vectors.sum(axis=0)).ravel()
    window_sizes = np.array([len(name) for name in feature_names])
    ngram_weights = np.log2(ngram_count) * window_sizes

    return dict(zip(feature_names, ngram_weights.tolist()))

In [46]:
# Build the n-gram -> weight lookup table from the benign tokens.
ngram_weights_dict = get_ngram_weight_dict(benign_text)

## 计算域名的信誉值

In [47]:
def get_reputation_value(ngram_weights_dict, domain):
    """Score a domain by the weights of the known n-grams it contains.

    The domain is split on '.' and '-'. For every resulting label, each
    substring of length 3 to 7 is looked up in ``ngram_weights_dict`` (unknown
    substrings contribute 0). The sum is divided by the total number of label
    characters, so longer names are not favoured just for being longer.

    Args:
        ngram_weights_dict: Mapping of n-gram string to numeric weight.
        domain: Domain string to score (without its suffix).

    Returns:
        1000 when ``domain`` has fewer than 3 characters (too short to contain
        any n-gram, so it is given a fixed high score); 0.0 when the domain
        contains only separator characters; otherwise the length-normalised
        weight sum as a float.
    """
    if len(domain) < 3:
        return 1000

    # The weight table built by CountVectorizer has lower-cased keys (its
    # default), so compare case-insensitively.
    labels = re.split(r'[.-]', domain.lower())
    total_chars = sum(len(label) for label in labels)
    if total_chars == 0:
        # Input such as '---' has no label characters; return the same 0.0 that
        # a domain with no matching n-grams gets instead of dividing by zero.
        return 0.0

    reputation = 0
    for label in labels:
        for window_size in range(3, 8):
            for i in range(len(label) - window_size + 1):
                reputation += ngram_weights_dict.get(label[i:i + window_size], 0)
    return reputation / total_chars

In [48]:
# Sanity check: a well-known benign name (recorded output is about 60.98).
get_reputation_value(ngram_weights_dict, 'google')

60.98470329782399

In [49]:
# Sanity check: a digit-substituted look-alike name (recorded output is about 2.38).
get_reputation_value(ngram_weights_dict, 'ta0ba0')

2.377443751081734

In [53]:
# Sanity check: a long random-looking string (recorded output is about 15.43).
get_reputation_value(ngram_weights_dict, 'dskdjisuowerwdfskdfj000')

15.426971096928998

In [54]:
# Score every row of df_domain (benign and malicious) and report how long the
# full pass took in seconds.
start = time.time()
df_domain['reputation'] = df_domain['domain'].map(lambda x: get_reputation_value(ngram_weights_dict, x))
end = time.time()
print('cost time : {}'.format(end - start))

cost time : 9.624819040298462


In [55]:
# Summary statistics of the reputation score for benign domains.
# The maximum of 1000 is the fixed score get_reputation_value() returns for
# names shorter than 3 characters, so it inflates the mean and std.
df_domain[df_domain['label'] == 0]['reputation'].describe()

count    768588.000000
mean         62.794891
std          73.951918
min           0.000000
25%          34.411518
50%          53.056123
75%          76.226724
max        1000.000000
Name: reputation, dtype: float64

In [56]:
# Summary statistics of the reputation score for malicious domains.
# The maximum of 1000 is the fixed score get_reputation_value() returns for
# names shorter than 3 characters, so it inflates the mean and std.
df_domain[df_domain['label'] == 1]['reputation'].describe()

count    1712.000000
mean       52.669588
std        67.904200
min         0.000000
25%        23.658419
50%        47.085018
75%        67.917240
max      1000.000000
Name: reputation, dtype: float64

## 保存模型文件

In [57]:
import joblib

In [58]:
# Persist the n-gram weight table to disk with joblib at compression level 4.
# NOTE(review): joblib files are pickle-based — only load this file from a
# trusted source.
joblib.dump(ngram_weights_dict, 'ngram_weights_dict.m', compress=4)

['ngram_weights_dict.m']