# Web安全之机器学习
## 第8章 逻辑回归算法
### 8.3 示例：使用逻辑回归算法检测Java溢出攻击

#### 1. 数据搜集和数据清洗
这次我们仍然使用ADFA-LD数据集：正常样本来自Training_Data_Master，攻击样本只使用攻击数据集（Attack_Data_Master）中Java溢出攻击（Java_Meterpreter）的相关数据。

In [1]:
import os
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

def load_normal_data(rootdir):
    """Load every file in ``rootdir`` as one whitespace-normalised string.

    Args:
        rootdir: Directory whose entries are all readable text files.

    Returns:
        A list with one string per file, in ``os.listdir`` order. Each
        string is the file's non-empty lines, stripped and joined with a
        single space.
    """
    x = []
    for name in os.listdir(rootdir):
        path = os.path.join(rootdir, name)
        with open(path, 'r') as f:
            # Join with a space: gluing stripped lines together directly
            # would fuse the last token of one line with the first token of
            # the next (e.g. "45" + "33" -> "4533").
            stripped = (line.strip() for line in f)
            x.append(" ".join(part for part in stripped if part))
    return x

def load_attack_data(rootdir):
    """Load the Java Meterpreter attack traces found under ``rootdir``.

    Files directly inside ``rootdir`` are read as-is. Sub-directories are
    descended into only when their path contains a component starting with
    ``Java_Meterpreter_<digits>``; all other directories are skipped.

    Args:
        rootdir: Directory to scan (with or without a trailing separator).

    Returns:
        A list with one string per file read: the file content with leading
        and trailing whitespace stripped.
    """
    # Raw string so that "\d" reaches the regex engine instead of being an
    # invalid string escape. The pattern targets a path component rather than
    # one hard-coded root, so any rootdir works; once inside a matching
    # directory, nested directories keep matching through their parent.
    pattern = re.compile(r"(?:^|[\\/])Java_Meterpreter_\d+")
    x = []
    for name in os.listdir(rootdir):
        path = os.path.join(rootdir, name)
        if os.path.isdir(path):
            if pattern.search(path):
                x += load_attack_data(path)
        else:
            with open(path, 'r') as f:
                x.append(f.read().strip())
    return x

#### 2.特征化
使用词袋模型（CountVectorizer 默认统计每个词的出现次数）进行向量化

In [2]:
def get_feature():
    """Build the feature matrix and label list for the classifier.

    Normal samples are loaded from the Training_Data_Master directory and
    attack samples from the Attack_Data_Master directory, then vectorised
    together with ``CountVectorizer``.

    Returns:
        A ``(X, y)`` tuple: ``X`` is the dense array produced by
        ``fit_transform(...).toarray()``; ``y`` is a list holding 0 for each
        normal sample followed by 1 for each attack sample.
    """
    normal_samples = load_normal_data("../data/ADFA-LD/Training_Data_Master/")
    attack_samples = load_attack_data("../data/ADFA-LD/Attack_Data_Master/")

    # Labels follow the concatenation order below: normal first, then attack.
    labels = [0] * len(normal_samples) + [1] * len(attack_samples)

    vectorizer = CountVectorizer(decode_error='ignore', min_df=1)
    matrix = vectorizer.fit_transform(normal_samples + attack_samples)
    return matrix.toarray(), labels

#### 3.训练样本和效果验证

In [3]:
def main():
    """Cross-validate a logistic regression model and print the fold scores."""
    features, labels = get_feature()

    # Large C means weak regularisation (in scikit-learn, C is the inverse of
    # the regularisation strength).
    model = LogisticRegression(C=1e5)

    # cv=10 requests 10-fold cross-validation; one score per fold is returned.
    fold_scores = cross_val_score(model, features, labels, cv=10)
    print("score=", fold_scores)


main()

score= [ 0.92783505  0.94845361  0.95876289  0.90625     0.96842105  0.77894737
  0.95789474  0.95789474  0.96842105  0.94736842]
