In [1]:
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import TensorDataset, DataLoader
import argparse
import os
from tqdm import tqdm

# Device configuration: use CUDA when torch reports it as available, otherwise fall back to the CPU.
# Every cell below moves the model and the input batches to this device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 训练阶段

In [8]:
def generate_train_data(name):
    """Build the next-key prediction training set from a session file.

    Every line of ``name`` is read as one session of whitespace-separated
    integer log keys. The session is wrapped with a leading 0 and a trailing
    30 and cut into sliding windows of ``window_size`` keys (``window_size``
    is read from the notebook's module-level configuration); the key that
    follows a window is that window's target.

    Args:
        name: path of the session file to read.

    Returns:
        TensorDataset of (float window, integer target key) pairs.
    """
    windows = []
    targets = []
    session_total = 0
    with open(name, 'r') as handle:
        for raw in tqdm(handle, "loading data"):
            session_total += 1
            # start marker + parsed keys + end marker
            keys = (0, *(int(token) for token in raw.strip().split()), 30)
            # Sessions with fewer than window_size + 1 keys yield no window at all.
            for begin in range(len(keys) - window_size):
                end = begin + window_size
                windows.append(keys[begin:end])
                targets.append(keys[end])
    print('Number of sessions({}): {}'.format(name, session_total))
    print('Number of seqs({}): {}'.format(name, len(windows)))
    features = torch.tensor(windows, dtype=torch.float)
    return TensorDataset(features, torch.tensor(targets))


class Model(nn.Module):
    """Bidirectional multi-layer LSTM that scores every log key at every time step.

    Args:
        input_size: number of features per time step.
        hidden_size: hidden units per LSTM direction.
        num_layers: number of stacked LSTM layers.
        num_keys: number of output classes (log keys).
    """

    def __init__(self, input_size, hidden_size, num_layers, num_keys):
        super(Model, self).__init__()
        # Kept as attributes so existing code that inspects them keeps working.
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        # Forward and backward hidden states are concatenated, hence 2 * hidden_size.
        self.fc = nn.Linear(2 * hidden_size, num_keys)

    def forward(self, x):
        """Map ``x`` of shape (batch, seq_len, input_size) to logits of shape
        (batch, seq_len, num_keys)."""
        # nn.LSTM starts from all-zero hidden/cell states when none are passed.
        # Relying on that default keeps the states on the same device/dtype as
        # the module instead of depending on a notebook-level `device` global.
        out, _ = self.lstm(x)
        return self.fc(out)

In [9]:
def train(model,dataloader,current_epoch=0,num_epochs=10):
    """Train ``model`` to predict, at every position of a window, the key that follows it.

    Relies on the notebook-level ``criterion``, ``optimizer``, ``writer``,
    ``device``, ``window_size`` and ``input_size``.

    Args:
        model: network returning per-time-step logits.
        dataloader: yields (window, next_key) batches.
        current_epoch: index of the first epoch to run (for resumed training).
        num_epochs: number of epochs to run in this call.

    Raises:
        ValueError: if ``dataloader`` yields no batches.
    """
    # Average over the loader actually passed in, not over a global step count
    # that may belong to a different loader.
    num_batches = len(dataloader)
    if num_batches == 0:
        raise ValueError('dataloader yields no batches; nothing to train on')

    model.train()
    graph_logged = False
    last_epoch = current_epoch + num_epochs
    start_time = time.time()
    for epoch in range(current_epoch, last_epoch):  # Loop over the dataset multiple times
        train_loss = 0
        for seq, label in dataloader:
            # Forward pass
            seq = seq.view(-1, window_size, input_size).to(device)
            # Targets are the inputs shifted left by one step, completed with the
            # key that follows the window. Built directly on the training device.
            target = torch.cat([seq[:, 1:, :].long(), label.to(device).view(-1, 1, 1)], 1).reshape(-1)
            output = model(seq)
            loss = criterion(output.reshape(-1, output.size(-1)), target)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

            # Tracing the model for TensorBoard is expensive; one graph per call is enough.
            if not graph_logged:
                writer.add_graph(model, seq)
                graph_logged = True
        mean_loss = train_loss / num_batches
        print('Epoch [{}/{}], train_loss: {:.4f}'.format(epoch + 1, last_epoch, mean_loss))
        writer.add_scalar('train_loss', mean_loss, epoch + 1)
    elapsed_time = time.time() - start_time
    print('elapsed_time: {:.3f}s'.format(elapsed_time))

In [10]:
# Hyperparameters
# -- data / windowing
file_dir = 'data/'
window_size = 10
input_size = 1
num_classes = 31
# -- network
num_layers = 2
hidden_size = 64
# -- optimisation
batch_size = 2048
# NOTE: num_epochs only feeds the run name below; the training cell passes its own epoch count.
num_epochs = 300
# -- outputs: checkpoint directory and run name (also used as the checkpoint file stem)
model_dir = 'model'
log = 'bd_test_total_loss_batch_size={}_epoch={}'.format(batch_size, num_epochs)

In [12]:
# Data: sliding windows over the training sessions, reshuffled on every pass
train_dataset = generate_train_data(file_dir+'hdfs_train')
dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, pin_memory=True)
total_step = len(dataloader)

# Network, loss and optimizer
model = Model(input_size, hidden_size, num_layers, num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

# TensorBoard writer; the run directory is named after the run configuration
writer = SummaryWriter(log_dir='log/' + log)

loading data: 4855it [00:00, 6741.44it/s]


Number of sessions(data/hdfs_train): 4855
Number of seqs(data/hdfs_train): 56285


In [13]:
# Train for 10 epochs, then store the weights in model_dir under the run name.
train(model, dataloader, num_epochs=10)

os.makedirs(model_dir, exist_ok=True)
checkpoint_path = model_dir + '/' + log + '.pt'
torch.save(model.state_dict(), checkpoint_path)

writer.close()
print('Finished Training')

Epoch [1/10], train_loss: 2.5356
Epoch [2/10], train_loss: 1.6723
Epoch [3/10], train_loss: 1.1599
Epoch [4/10], train_loss: 0.8698
Epoch [5/10], train_loss: 0.7130
Epoch [6/10], train_loss: 0.6048
Epoch [7/10], train_loss: 0.4954
Epoch [8/10], train_loss: 0.3651
Epoch [9/10], train_loss: 0.2875
Epoch [10/10], train_loss: 0.2538
elapsed_time: 105.884s
Finished Training


### 简单检测一下训练的结果

In [107]:
# Sanity check on the training loader: how often is the true next key among the
# 3 highest-scoring keys, counted over every position of every window?
correct = 0
num_of_seq = 0
with torch.no_grad():  # evaluation only, no autograd graph needed
    # Iterating the loader directly lets tqdm know the total and show real progress.
    for seq, label in tqdm(dataloader, desc="测试对下一标签预测准确率"):
        # Forward pass
        seq = seq.view(-1, window_size, input_size).to(device)
        # Targets: inputs shifted left by one step, completed with the key after the window.
        label1 = seq[:, 1:, :].cpu().long()
        label2 = label.view(-1, 1, 1)
        label = torch.cat([label1, label2], 1).reshape(-1)
        output = model(seq)
        output = output.reshape(-1, output.size(-1))
        _, predicted = torch.topk(output, 3, dim=1)
        predicted = predicted.cpu()
        # Vectorised membership test replaces the per-element Python loop.
        hit = (predicted == label.view(-1, 1)).any(dim=1)
        correct += int(hit.sum())
        num_of_seq += len(label)
        # Show the context of correctly predicted end tokens (30). The start is
        # clamped so that positions i < 10 do not wrap around to a negative index.
        for i in torch.nonzero(hit & (label == 30)).view(-1).tolist():
            print(label[max(i - 10, 0):i + 1])
print('对下一标签预测准确率为: '+str(correct/max(num_of_seq, 1)))

测试对下一标签预测准确率: 0it [00:00, ?it/s]

tensor([10, 25, 25, 25, 22, 22, 22, 20, 20, 20, 30])
tensor([22, 10,  8, 10,  8, 10,  8, 25, 25, 25, 30])
tensor([25, 25, 25, 25, 22, 22, 22, 20, 20, 20, 30])
tensor([20, 10,  8, 25, 22, 22, 22, 20, 20, 20, 30])
tensor([20,  8, 25, 25, 22, 22, 22, 20, 20, 20, 30])
tensor([20, 10,  8, 10,  8, 25, 25, 10,  8, 25, 30])
tensor([22,  2,  3,  2, 22, 22, 22, 20, 20, 20, 30])
tensor([ 2, 10,  8, 10,  8, 10,  8, 25, 25, 25, 30])
tensor([ 3, 25, 10, 25, 22, 22, 22, 20, 20, 20, 30])
tensor([ 2,  2,  3,  2, 22, 22, 22, 20, 20, 20, 30])
tensor([20, 25, 25, 25, 22, 22, 22, 20, 20, 20, 30])
tensor([25, 10,  8, 25, 22, 22, 22, 20, 20, 20, 30])
tensor([25, 25, 25, 25, 22, 22, 22, 20, 20, 20, 30])
tensor([22, 25, 25, 25, 22, 22, 22, 20, 20, 20, 30])
tensor([22,  2,  3,  2, 22, 22, 22, 20, 20, 20, 30])
tensor([ 3,  8, 10,  8, 22, 22, 22, 20, 20, 20, 30])
tensor([20, 10,  8, 25, 22, 22, 22, 20, 20, 20, 30])
tensor([20, 10,  8,  1, 22, 22, 22, 20, 20, 20, 30])
tensor([20,  2,  3,  2, 22, 22, 22, 20, 20, 20

测试对下一标签预测准确率: 0it [00:02, ?it/s]

tensor([22,  2,  2,  3, 22, 22, 22, 20, 20, 20, 30])
tensor([20,  3,  3,  2, 22, 22, 22, 20, 20, 20, 30])
tensor([ 8,  2,  3,  2, 22, 22, 22, 20, 20, 20, 30])
tensor([20, 25, 25, 25, 22, 22, 22, 20, 20, 20, 30])
tensor([ 8, 10,  8, 10,  8, 10,  8, 25, 25, 25, 30])
tensor([10, 10,  8, 10,  8, 10,  8, 25, 25, 25, 30])
tensor([30,  3,  2,  2, 22, 22, 22, 20, 20, 20, 30])
tensor([ 8, 25, 10,  8, 22, 22, 22, 20, 20, 20, 30])
tensor([25, 25, 25, 25, 22, 22, 22, 20, 20, 20, 30])
tensor([10,  2,  1,  1, 22, 22, 22, 20, 20, 20, 30])
tensor([20, 10,  8, 10,  8, 25, 25, 10,  8, 25, 30])





KeyboardInterrupt: 

In [67]:

# Flattened inputs of the last evaluated batch next to their shifted-by-one targets.
preview = slice(70, 140)
print(seq.reshape(-1)[preview])
print(label.reshape(-1)[preview])

tensor([ 0.,  4., 21.,  4.,  4., 10.,  8., 10.,  8., 25.,  8., 25., 25., 25.,
        22., 22., 22., 20., 20., 20.,  0., 21.,  4.,  4.,  4., 25., 25., 10.,
         8., 10., 21.,  4.,  4.,  4., 25., 25., 25., 10.,  8., 10.,  3.,  3.,
         3.,  2., 22., 22., 22., 20., 20., 20.,  8., 10.,  8., 25., 25., 25.,
        22., 22., 22., 20.,  4., 10.,  8., 10.,  8., 25., 25., 10.,  8., 25.],
       device='cuda:0')
tensor([ 4, 21,  4,  4, 10,  8, 10,  8, 25, 25, 25, 25, 25, 22, 22, 22, 20, 20,
        20, 30, 21,  4,  4,  4, 25, 25, 10,  8, 10,  8,  4,  4,  4, 25, 25, 25,
        10,  8, 10,  8,  3,  3,  2, 22, 22, 22, 20, 20, 20, 30, 10,  8, 25, 25,
        25, 22, 22, 22, 20, 20, 10,  8, 10,  8, 25, 25, 10,  8, 25,  1])


### 清理缓存释放空间 

In [16]:
import gc
# Collect unreachable Python objects first so tensors they held can be released ...
gc.collect()
# ... then ask PyTorch to release the cached GPU memory it no longer uses.
torch.cuda.empty_cache()

# 测试阶段

In [32]:
def generate_test_data(name):
    """Load a test session file from ``data/`` and cut it into sliding windows.

    Each line is parsed as whitespace-separated integer keys, wrapped with a
    leading 0 and a trailing 30, and right-padded with -1 up to
    ``window_size + 1`` keys so that every session yields at least one window.
    Identical sessions are kept only once.

    Args:
        name: file name inside the ``data/`` directory.

    Returns:
        (session_to_seq, dataset, seqs) where ``session_to_seq[k]`` lists the
        window indices belonging to session ``k``, ``dataset`` is a
        TensorDataset of (float window, next key) and ``seqs`` is the list of
        raw window tuples.
    """
    unique_sessions = set()
    with open('data/' + name, 'r') as handle:
        for raw in handle:
            keys = [0] + [int(token) for token in raw.strip().split()] + [30]
            # A non-positive repeat count gives an empty list, so long sessions are unchanged.
            padding = [-1] * (window_size + 1 - len(keys))
            unique_sessions.add(tuple(keys + padding))

    session_to_seq = []
    seqs = []
    labels = []
    for session_keys in tqdm(unique_sessions, "normal:"):
        num_windows = len(session_keys) - window_size
        first_id = len(seqs)
        for offset in range(num_windows):
            seqs.append(session_keys[offset:offset + window_size])
            labels.append(session_keys[offset + window_size])
        # Window ids are global positions in `seqs`, contiguous per session.
        session_to_seq.append(list(range(first_id, first_id + num_windows)))
    print('Number of sessions({}): {}'.format(name, len(session_to_seq)))
    print('Number of seqs({}): {}'.format(name, len(seqs)))
    dataset = TensorDataset(torch.tensor(seqs, dtype=torch.float), torch.tensor(labels))
    return session_to_seq, dataset, seqs

In [15]:
# Rebuild the network and restore the trained weights for inference.
model = Model(input_size, hidden_size, num_layers, num_classes)
# map_location remaps the stored tensors onto the current device, so a checkpoint
# written during a GPU run also loads on a CPU-only machine.
model.load_state_dict(torch.load(model_dir + '/' + log + '.pt', map_location=device))
model.to(device)
model.eval()

Model(
  (lstm): LSTM(1, 64, num_layers=2, batch_first=True)
  (fc): Linear(in_features=64, out_features=31, bias=True)
)

In [33]:
# Evaluation uses a larger batch and keeps the window order, because the session
# index lists returned by generate_test_data refer to positions in that order.
# NOTE: this rebinds the `batch_size` name that the training cells also read.
batch_size = 10000
loader_options = dict(batch_size=batch_size, shuffle=False, pin_memory=True)

test_normal_session, test_normal_dataset, test_normal_seq = generate_test_data('hdfs_test_normal')
test_abnormal_session, test_abnormal_dataset,test_abnormal_seq = generate_test_data('hdfs_test_abnormal')

normal_dataloader = DataLoader(test_normal_dataset, **loader_options)
abnormal_dataloader = DataLoader(test_abnormal_dataset, **loader_options)

normal:: 100%|████████████████████████████████████████████████████████████████| 14177/14177 [00:00<00:00, 22030.95it/s]


Number of sessions(hdfs_test_normal): 14177
Number of seqs(hdfs_test_normal): 269570


normal:: 100%|██████████████████████████████████████████████████████████████████| 4123/4123 [00:00<00:00, 19548.57it/s]

Number of sessions(hdfs_test_abnormal): 4123
Number of seqs(hdfs_test_abnormal): 88410





### 快速预测

In [18]:
# fast predict
def _flag_unexpected_windows(model, dataloader, num_candidates, desc):
    """Return one bool per window: True when its true next key is not among the
    ``num_candidates`` highest-scoring keys predicted at the last time step."""
    flags = []
    with torch.no_grad():
        for seq, labels in tqdm(dataloader, desc=desc):
            seq = seq.view(-1, window_size, input_size).to(device)
            last_step = model(seq)[:, -1, :].cpu()
            k = min(num_candidates, last_step.size(1))
            _, top = torch.topk(last_step, k, dim=1)
            # Vectorised membership test; padded labels (-1) never match a class index.
            hit = (top == labels.view(-1, 1)).any(dim=1)
            flags.extend(not h for h in hit.tolist())
    return flags


def _count_flagged_sessions(sessions, flags):
    """Count the sessions that contain at least one flagged window."""
    return sum(1 for session in sessions if any(flags[seq_id] for seq_id in session))


# fast predict
def fast_predict(model,normal_dataloader,abnormal_dataloader,
                 normal_sessions=None,abnormal_sessions=None,num_candidates=5):
    """Session-level anomaly detection on the normal and abnormal test sets.

    A window is flagged when its true next key is outside the model's top
    ``num_candidates`` predictions; a session is reported anomalous as soon
    as one of its windows is flagged. Prints FP, FN, precision, recall and F1.

    Relies on the notebook-level ``device``, ``window_size`` and ``input_size``.

    Args:
        model: trained network returning per-time-step logits.
        normal_dataloader: unshuffled loader over the normal test windows.
        abnormal_dataloader: unshuffled loader over the abnormal test windows.
        normal_sessions: per-session window index lists for the normal set;
            defaults to the notebook-level ``test_normal_session``.
        abnormal_sessions: same for the abnormal set; defaults to the
            notebook-level ``test_abnormal_session``.
        num_candidates: how many top-ranked keys count as "expected".
    """
    if normal_sessions is None:
        normal_sessions = test_normal_session
    if abnormal_sessions is None:
        abnormal_sessions = test_abnormal_session

    # Test the model
    start_time = time.time()
    test_normal_result = _flag_unexpected_windows(model, normal_dataloader, num_candidates, 'normal')
    FP = _count_flagged_sessions(normal_sessions, test_normal_result)
    test_abnormal_result = _flag_unexpected_windows(model, abnormal_dataloader, num_candidates, 'abnormal')
    TP = _count_flagged_sessions(abnormal_sessions, test_abnormal_result)
    elapsed_time = time.time() - start_time
    print('elapsed_time: {:.3f}s'.format(elapsed_time))

    # Compute precision, recall and F1-measure. Empty denominators report 0
    # instead of raising ZeroDivisionError.
    FN = len(abnormal_sessions) - TP
    P = 100 * TP / (TP + FP) if (TP + FP) else 0.0
    R = 100 * TP / (TP + FN) if (TP + FN) else 0.0
    F1 = 2 * P * R / (P + R) if (P + R) else 0.0
    print('false positive (FP): {}, false negative (FN): {}, Precision: {:.3f}%, Recall: {:.3f}%, F1-measure: {:.3f}%'.format(FP, FN, P, R, F1))
    print('Finished Predicting')

In [19]:
# Score both test sets and print the session-level FP / FN / precision / recall / F1.
fast_predict(model,normal_dataloader,abnormal_dataloader)

normal: 27it [00:40,  1.48s/it]
normal: 9it [00:12,  1.43s/it]

elapsed_time: 52.979s
false positive (FP): 1513, false negative (FN): 31, Precision: 73.006%, Recall: 99.248%, F1-measure: 84.128%
Finished Predicting





# 生成序列

In [254]:
import random
def generate_seq(start,window_size=10,num_candidates=5):
    """Extend ``start`` by ``window_size`` sampled keys (first draft).

    NOTE: a later cell defines ``generate_seq`` again; once that cell has run,
    it replaces this version.

    At each step the model scores the sequence so far, the ``num_candidates``
    best-ranked keys at the last position are kept, and one of them is drawn
    uniformly at random and appended.

    Args:
        start: tensor of shape (1, length) holding the keys generated so far.
        window_size: number of keys to append.
        num_candidates: size of the candidate pool sampled from at each step.

    Returns:
        (extended sequence, candidates of the final step, full model output of
        the final step).
    """
    initial_len = start.size(1)
    for step in range(window_size):
        current_len = initial_len + step
        seq = start.clone().detach().view(-1, current_len, input_size).to(device)
        output = model(seq).cpu()
        # Rank the keys at every position of the last batch item, keep the last position.
        ranked = torch.argsort(output[-1], 1)
        predicted = ranked[-1, -num_candidates:]
        nxt = random.randint(0, num_candidates - 1)
        chosen = predicted[nxt].reshape(1, -1).float()
        start = torch.cat([start, chosen], 1)
    return start, predicted, output

In [259]:
def generate_seq(start,window_size=10,num_candidates=5):
    """Extend ``start`` by ``window_size`` keys sampled from the model's top candidates.

    At each step the model scores the sequence so far, the ``num_candidates``
    best-ranked keys at the last position are kept (in ascending score order),
    and one of them is drawn uniformly at random and appended.

    Relies on the notebook-level ``model``, ``device`` and ``input_size``.

    Args:
        start: CPU tensor of shape (1, length) holding the keys generated so far.
        window_size: number of keys to append.
        num_candidates: size of the candidate pool sampled from at each step.

    Returns:
        (extended sequence, candidates of the final step, 1-D logits of the
        final step). The last two are None when ``window_size`` is 0.

    Raises:
        ValueError: if ``num_candidates`` is smaller than 1.
    """
    if num_candidates < 1:
        raise ValueError('num_candidates must be at least 1')
    # Defined up front so a zero-step call returns cleanly instead of failing
    # on unbound names.
    predicted = None
    output = None
    with torch.no_grad():  # generation is inference only; do not build an autograd graph
        for _ in range(window_size):
            seq = start.view(-1, start.size(1), input_size).to(device)
            output = model(seq).cpu()[:, -1, :].reshape(-1)
            predicted = torch.argsort(output)[-num_candidates:]
            # Sample within the pool actually returned, which is smaller than
            # num_candidates when the model has fewer classes than requested.
            nxt = random.randrange(len(predicted))
            start = torch.cat([start, predicted[nxt].reshape(1, -1).float()], 1)
    return start, predicted, output

In [260]:
# Softmax along dim 0, for turning a 1-D vector of logits into probabilities.
softmax = nn.Softmax(dim=0)

In [261]:
# One generation step from the prefix [0, 1], keeping the 30 best-ranked keys
# so that nearly the whole ranking can be inspected below.
t = torch.FloatTensor([0, 1]).reshape(1, -1)
t, predicted, output = generate_seq(t, window_size=1, num_candidates=30)

In [263]:
# Shape of the logits returned by the last generation step.
output.shape

torch.Size([31])

In [221]:
# Turn the flattened logits of the last step into a probability per key.
prob = softmax(output.reshape(-1))
print(prob)

tensor([2.3170e-08, 9.2716e-05, 2.3222e-04, 1.1103e-04, 7.3306e-01, 4.2008e-05,
        2.2293e-08, 2.1097e-08, 3.2225e-05, 2.1193e-08, 2.9007e-04, 2.3265e-08,
        2.2084e-08, 2.4458e-08, 2.2978e-08, 4.1722e-05, 2.3293e-08, 6.9324e-05,
        2.5993e-08, 2.2954e-08, 4.8183e-05, 2.6592e-01, 1.2300e-05, 2.1440e-08,
        4.2136e-05, 1.0084e-05, 2.1850e-08, 2.1220e-08, 2.2338e-08, 2.5002e-08,
        3.0127e-07], grad_fn=<SoftmaxBackward>)


In [266]:
# Step-by-step generation from a fixed prefix, printing the candidate keys and
# their probabilities at every step until the end token (30) or max_len is reached.

# `pattern` is only created by a later cell; create it here when it does not exist
# yet so this cell also runs on a fresh kernel (Restart & Run All).
if 'pattern' not in globals():
    pattern = set()

t = torch.FloatTensor([0,4,21,4]).reshape(1,-1)
max_len = 60
while t.size(1)<max_len:
    t,predicted,output = generate_seq(t,1,3)
    prob = softmax(output)
    print(t.int().cpu().numpy()[0])
    print("预测的序号排序:",end=' ')
    print(predicted)
    print("对应的可能性:",end=' ')
    print(prob[predicted])
    print()
    if 30 in t[0]:
        break
print(t.int().cpu().numpy()[0])
pattern.add(tuple(t.int().cpu().numpy()[0]))

[ 0  4 21  4  8]
预测的序号排序: tensor([ 8, 10,  4])
对应的可能性: tensor([1.9306e-04, 3.0414e-03, 9.9664e-01], grad_fn=<IndexBackward>)

[ 0  4 21  4  8  8]
预测的序号排序: tensor([ 8, 25, 10])
对应的可能性: tensor([0.1988, 0.2978, 0.4863], grad_fn=<IndexBackward>)

[ 0  4 21  4  8  8 25]
预测的序号排序: tensor([ 8, 25, 10])
对应的可能性: tensor([0.0050, 0.4196, 0.5752], grad_fn=<IndexBackward>)

[ 0  4 21  4  8  8 25 25]
预测的序号排序: tensor([ 8, 25, 10])
对应的可能性: tensor([0.0015, 0.1662, 0.8322], grad_fn=<IndexBackward>)

[ 0  4 21  4  8  8 25 25 10]
预测的序号排序: tensor([ 8, 25, 10])
对应的可能性: tensor([0.0067, 0.2272, 0.7661], grad_fn=<IndexBackward>)

[ 0  4 21  4  8  8 25 25 10 25]
预测的序号排序: tensor([25, 10,  8])
对应的可能性: tensor([0.0011, 0.1003, 0.8986], grad_fn=<IndexBackward>)

[ 0  4 21  4  8  8 25 25 10 25 10]
预测的序号排序: tensor([25,  8, 10])
对应的可能性: tensor([8.8978e-04, 1.2369e-02, 9.8655e-01], grad_fn=<IndexBackward>)

[ 0  4 21  4  8  8 25 25 10 25 10  8]
预测的序号排序: tensor([ 3,  8, 10])
对应的可能性: tensor([6.0998e-04, 9.5934e-02, 9.0262e

In [224]:
# Candidate key indices kept by the last generate_seq call.
print(predicted)

tensor([ 9, 27, 23, 26, 12,  6, 28, 19, 14,  0, 11, 16, 13, 29, 18, 30, 25, 22,
         8, 15,  5, 24, 20, 17,  1,  3,  2, 10, 21,  4])


In [222]:
# Probabilities reordered to follow the candidate ranking in `predicted`.
prob[predicted.reshape(-1)]

tensor([2.1193e-08, 2.1220e-08, 2.1440e-08, 2.1850e-08, 2.2084e-08, 2.2293e-08,
        2.2338e-08, 2.2954e-08, 2.2978e-08, 2.3170e-08, 2.3265e-08, 2.3293e-08,
        2.4458e-08, 2.5002e-08, 2.5993e-08, 3.0127e-07, 1.0084e-05, 1.2300e-05,
        3.2225e-05, 4.1722e-05, 4.2008e-05, 4.2136e-05, 4.8183e-05, 6.9324e-05,
        9.2716e-05, 1.1103e-04, 2.3222e-04, 2.9007e-04, 2.6592e-01, 7.3306e-01],
       grad_fn=<IndexBackward>)

In [239]:
# Sample 100 sequences from the start token, one key at a time from the two
# best-ranked candidates, and collect the distinct sequences in `pattern`.
pattern = set()
max_len = 60
for i in range(100):
    t = torch.FloatTensor([0]).reshape(1, -1)
    # Stop at the end token (30) or once max_len keys have been generated.
    while t.size(1) < max_len and 30 not in t[0]:
        t, predicted, output = generate_seq(t, 1, 2)
    tokens = t.int().cpu().numpy()[0]
    print(tokens)
    pattern.add(tuple(tokens))

[ 0 21  4 35 66 35 35 35 35 35 66 35 35 35 35 66 66 66 66 66 35 66 66 35
 66 66 35 66 35 66 35 66 66 35 66 66 66 66 66 35 35 35 66 66 66 66 35 35
 66 35 66 35 35 35 66 66 35 66 66 35]
[  0  21  35  35   4  35  35  35   4 258  35  35  35  35  35  35 258  35
  35 258  35 258 258  35  35  35  35  35 258 258 258 258  35 258  35  35
 258  35 258 258  35 258  35  35  35  35 258 258  35 258  35  35 258  35
 258  35 258 258  35 258]
[ 0  4 52 35 52 35 52 35 52 35 35 52 35 35 52 52 52 35 35 52 35 52 52 52
 35 52 35 35 35 35 35 35 52 35 52 35 35 35 35 52 35 52 52 52 52 35 35 52
 52 35 52 52 52 35 35 52 52 35 52 52]
[ 0  4 52 52 35 52 35 52 35 52 35 52 52 52 52 52 52 35 35 52 52 52 52 35
 52 52 52 35 35 52 52 52 52 35 52 35 35 52 52 52 52 52 35 35 52 35 35 35
 52 52 52 35 52 52 35 52 52 35 52 52]
[ 0 21 35  4 97 97 97 97 35 35 35 35 97 35 35 97 35 35 97 35 97 35 35 97
 97 97 35 35 35 35 97 35 97 35 97 97 97 35 97 35 97 97 35 97 35 35 97 97
 35 97 97 97 35 97 97 97 35 35 35 97]
[ 0  4 52 52 52 35 

KeyboardInterrupt: 

In [190]:
# Compare real test windows with sequences generated from their first key.
# Only the first batch is needed, so fetch it directly instead of starting a
# loop and breaking out of it.
seq, labels = next(iter(normal_dataloader))
for s in seq[100:110]:
    t = s[:1].reshape(1,-1)
    # generate_seq returns (sequence, candidates, logits); only the sequence is used here.
    res, _, _ = generate_seq(t)
    print(s)
    print(res)

    print(s[t.size(1):])
    print(res[0,t.size(1):])
    print()

normal: 0it [00:00, ?it/s]


tensor([ 8., 10.,  8., 10.,  8.,  1.,  2.,  2.,  3.,  2.])
tensor([[ 8.,  8., 25.,  2.,  2., 22.,  1., 22.,  1.,  1.,  3.]])
tensor([10.,  8., 10.,  8.,  1.,  2.,  2.,  3.,  2.])
tensor([ 8., 25.,  2.,  2., 22.,  1., 22.,  1.,  1.,  3.])

tensor([10.,  8., 10.,  8.,  1.,  2.,  2.,  3.,  2.,  3.])
tensor([[10.,  3.,  1.,  3.,  3., 22., 22., 24., 22.,  5.,  5.]])
tensor([ 8., 10.,  8.,  1.,  2.,  2.,  3.,  2.,  3.])
tensor([ 3.,  1.,  3.,  3., 22., 22., 24., 22.,  5.,  5.])

tensor([ 8., 10.,  8.,  1.,  2.,  2.,  3.,  2.,  3.,  2.])
tensor([[ 8.,  2.,  2.,  3.,  2.,  3.,  1., 21.,  1., 22.,  2.]])
tensor([10.,  8.,  1.,  2.,  2.,  3.,  2.,  3.,  2.])
tensor([ 2.,  2.,  3.,  2.,  3.,  1., 21.,  1., 22.,  2.])

tensor([10.,  8.,  1.,  2.,  2.,  3.,  2.,  3.,  2.,  2.])
tensor([[10.,  8.,  2., 22., 25., 20., 22., 30., 30., 20., 30.]])
tensor([8., 1., 2., 2., 3., 2., 3., 2., 2.])
tensor([ 8.,  2., 22., 25., 20., 22., 30., 30., 20., 30.])

tensor([8., 1., 2., 2., 3., 2., 3., 2., 2., 2.])
tens