In [1]:
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import TensorDataset, DataLoader
import argparse
import os
from tqdm import tqdm

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
def generate_train_data(name):
    num_sessions = 0
    inputs = []
    outputs = []
    with open(name, 'r') as f:
        for line in tqdm(f,"loading data"):
            num_sessions += 1
            seq = [0]+list(map(lambda n: n, map(int, line.strip().split())))+[30]+[31]*(window_size-1)
            line = tuple(seq)
            
            for i in range(len(line) - window_size):
                inputs.append(line[i:i + window_size])
                outputs.append(line[i + window_size])
    print('Number of sessions({}): {}'.format(name, num_sessions))
    print('Number of seqs({}): {}'.format(name, len(inputs)))
    dataset = TensorDataset(torch.tensor(inputs, dtype=torch.float), torch.tensor(outputs))
    return dataset

In [3]:
class Model(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_keys):
        super(Model, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_keys)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out[:, :, :])
        return out

In [4]:
def train(model,dataloader,current_epoch=0,num_epochs=10):
    start_time = time.time()
    for epoch in range(current_epoch,current_epoch+num_epochs):  # Loop over the dataset multiple times
        train_loss = 0
        for step, (seq, label) in enumerate(dataloader):
            # Forward pass
            seq = seq.clone().detach().view(-1, window_size, input_size).to(device)
            label1= seq[:,1:,:].cpu().long()
            label2 = label.view(-1,1,1)
            label = torch.cat([label1,label2],1).view(-1,window_size)
            label = label.reshape(label.size(0)*label.size(1))
            output = model(seq)
            output = output.reshape(output.size(0)*output.size(1),-1)
            loss = criterion(output, label.to(device))

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            train_loss += loss.item()
            optimizer.step()
#             writer.add_graph(model, seq)
        print('Epoch [{}/{}], train_loss: {:.4f}'.format(epoch + 1, current_epoch+num_epochs, train_loss / total_step))
#         writer.add_scalar('train_loss', train_loss / total_step, epoch + 1)
    elapsed_time = time.time() - start_time
    print('elapsed_time: {:.3f}s'.format(elapsed_time))

In [5]:
# Hyperparameters
num_classes = 32
num_epochs = 300
batch_size = 2048
input_size = 1
model_dir = 'model'
window_size = 10
file_dir = 'data_official'
log = 'file_dir={}_version={}'.format(file_dir,'old_padding')
num_layers = 2
hidden_size = 64

model = Model(input_size, hidden_size, num_layers, num_classes).to(device)

In [7]:
train_dataset = generate_train_data(file_dir+'/hdfs_train')
dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, pin_memory=True)
# writer = SummaryWriter(log_dir='log/' + log)
# Loss and optimizer
criterion = nn.CrossEntropyLoss(ignore_index=31)
optimizer = optim.Adam(model.parameters())
# Train the model
total_step = len(dataloader)

loading data: 4855it [00:00, 12190.99it/s]


Number of sessions(data_official/hdfs_train): 4855
Number of seqs(data_official/hdfs_train): 99602


In [19]:
model.train()
# model.load_state_dict(torch.load(model_dir + '/' + log + '.pt'))
train(model,dataloader,current_epoch=100,num_epochs=300)
if not os.path.isdir(model_dir):
    os.makedirs(model_dir)
torch.save(model.state_dict(), model_dir + '/' + log + '.pt')
writer.close()
print('Finished Training')

Epoch [101/400], train_loss: 0.4311
Epoch [102/400], train_loss: 0.4317
Epoch [103/400], train_loss: 0.4309
Epoch [104/400], train_loss: 0.4309
Epoch [105/400], train_loss: 0.4309
Epoch [106/400], train_loss: 0.4305
Epoch [107/400], train_loss: 0.4308
Epoch [108/400], train_loss: 0.4308
Epoch [109/400], train_loss: 0.4302
Epoch [110/400], train_loss: 0.4302
Epoch [111/400], train_loss: 0.4301
Epoch [112/400], train_loss: 0.4304
Epoch [113/400], train_loss: 0.4300
Epoch [114/400], train_loss: 0.4299
Epoch [115/400], train_loss: 0.4298
Epoch [116/400], train_loss: 0.4298
Epoch [117/400], train_loss: 0.4515
Epoch [118/400], train_loss: 0.5675
Epoch [119/400], train_loss: 0.4365
Epoch [120/400], train_loss: 0.4322
Epoch [121/400], train_loss: 0.4314
Epoch [122/400], train_loss: 0.4307
Epoch [123/400], train_loss: 0.4304
Epoch [124/400], train_loss: 0.4301
Epoch [125/400], train_loss: 0.4299
Epoch [126/400], train_loss: 0.4297
Epoch [127/400], train_loss: 0.4294
Epoch [128/400], train_loss:

KeyboardInterrupt: 

In [6]:
model.load_state_dict(torch.load(model_dir + '/' + log + '.pt'))
train_dataset = generate_train_data(file_dir+'/hdfs_test_normal')
dataloader = DataLoader(train_dataset, batch_size=20000, shuffle=True, pin_memory=True)

loading data: 553365it [00:36, 15277.38it/s]


Number of sessions(data_official/hdfs_test_normal): 553365
Number of seqs(data_official/hdfs_test_normal): 11345956


测试对下一标签预测准确率: 5it [02:08, 26.05s/it]

KeyboardInterrupt: 

In [7]:
correct = 0
num_of_seq = 0
model.eval()
with torch.no_grad():
    for step, (seq, label) in tqdm(enumerate(dataloader),desc="测试对下一标签预测准确率"):
        # Forward pass
        seq = seq.clone().detach().view(-1, window_size, input_size).to(device)
        label1= seq[:,1:,:].cpu().long()
        label2 = label.view(-1,1,1)
        label = torch.cat([label1,label2],1).view(-1,window_size)
        label = label.reshape(label.size(0)*label.size(1))
        output = model(seq)
        output = output.reshape(output.size(0)*output.size(1),-1)
        predicted = torch.argsort(output, 1)[:, -6:].cpu()
        num_of_seq+=len(label)
        for i in range(len(label)):
            if label[i]==31:
                num_of_seq-=1
                continue
        #     print(label[i],predicted[i])
            if label[i] in predicted[i]:
        #         print(label[i],predicted[i])
                correct+=1
    #         else:
    #             print(label[i-3:i],label[i],predicted[i])
    print('对下一标签预测准确率为: '+str(correct/num_of_seq))


测试对下一标签预测准确率: 0it [00:00, ?it/s][A
测试对下一标签预测准确率: 1it [00:25, 25.71s/it][A
测试对下一标签预测准确率: 2it [00:50, 25.50s/it][A
测试对下一标签预测准确率: 3it [01:11, 24.00s/it][A

KeyboardInterrupt: 

In [16]:
print(seq[3000%2048-1])
print(label[3000:3005])

tensor([[ 9.],
        [11.],
        [ 9.],
        [26.],
        [ 2.],
        [ 4.],
        [ 4.],
        [ 3.],
        [23.],
        [23.]], device='cuda:0')
tensor([9, 4, 4, 4, 4])


In [23]:
if not os.path.isdir(model_dir):
    os.makedirs(model_dir)
torch.save(model.state_dict(), model_dir + '/' + log + '.pt')
# writer.close()
print('Finished Training')

Finished Training


In [9]:
def generate_test_data(name,window_size=10):
    hdfs = set()
    # hdfs = []
    with open('data/' + name, 'r') as f:
        for ln in f.readlines():
            ln = [0]+list(map(lambda n: n, map(int, ln.strip().split())))+[30]
            ln = ln + [-1] * (window_size + 1 - len(ln))
            hdfs.add(tuple(ln))
            # hdfs.append(tuple(ln))
    session_to_seq = []
    seqs = []
    labels = []
    seq_count = 0
    for line in tqdm(hdfs, "normal:"):
        session = []
        for i in range(len(line) - window_size):
            seq = line[i:i + window_size]
            label = line[i + window_size]
            seqs.append(seq)
            session.append(seq_count)
            labels.append(label)
            seq_count += 1
        session_to_seq.append(session)
    print('Number of sessions({}): {}'.format(name, len(session_to_seq)))
    print('Number of seqs({}): {}'.format(name, len(seqs)))
    dataset = TensorDataset(torch.tensor(seqs, dtype=torch.float), torch.tensor(labels))

    # print('Number of sessions({}): {}'.format(name, len(hdfs)))
    return session_to_seq, dataset, seqs,labels

In [10]:
# fast predict
def fast_predict(model,normal_dataloader,abnormal_dataloader,num_candidates=5,window_size=10):
    TP = 0
    FP = 0
    # Test the model
    start_time = time.time()
    test_normal_result = []
    test_abnormal_result = []
    with torch.no_grad():
        result = []
        with torch.no_grad():
            for step, (seq, labels) in tqdm(enumerate(normal_dataloader), desc='normal'):
                seq = seq.clone().detach().view(-1, window_size, input_size).to(device)
                output = model(seq).cpu()

                predicted = torch.argsort(output[:,-1,:], 1)[:,-num_candidates:]
                for i, label in enumerate(labels):
                    if label not in predicted[i]:
                        test_normal_result.append(True)
                    else:
                        test_normal_result.append(False)
    for session in test_normal_session:
        for seq_id in session:
            if test_normal_result[seq_id] == True:
                FP += 1
                break

    with torch.no_grad():
        for step, (seq, labels) in tqdm(enumerate(abnormal_dataloader), desc='abnormal'):
            seq = seq.clone().detach().view(-1, window_size, input_size).to(device)
            output = model(seq).cpu()

            predicted = torch.argsort(output[:,-1,:], 1)[:,-num_candidates:]
            for i, label in enumerate(labels):
                if label not in predicted[i]:
                    test_abnormal_result.append(True)
                else:
                    test_abnormal_result.append(False)
        for session in test_abnormal_session:
            for seq_id in session:
                if test_abnormal_result[seq_id] == True:
                    TP += 1
                    break
    elapsed_time = time.time() - start_time
    print('elapsed_time: {:.3f}s'.format(elapsed_time))
    # Compute precision, recall and F1-measure
    FN = len(test_abnormal_session) - TP
    P = 100 * TP / (TP + FP)
    R = 100 * TP / (TP + FN)
    F1 = 2 * P * R / (P + R)
    print('false positive (FP): {}, false negative (FN): {}, Precision: {:.3f}%, Recall: {:.3f}%, F1-measure: {:.3f}%'.format(FP, FN, P, R, F1))
    print('Finished Predicting')
    return test_normal_result,test_abnormal_result

In [11]:
# model.load_state_dict(torch.load(model_dir + '/' + log + '.pt'))
model.eval()
batch_size = 10000
window_size = 10
test_normal_session, test_normal_dataset, test_normal_seq,test_normal_label = generate_test_data('hdfs_test_normal',window_size)
normal_dataloader = DataLoader(test_normal_dataset, batch_size=batch_size, shuffle=False, pin_memory=True)
test_abnormal_session, test_abnormal_dataset,test_abnormal_seq,test_abnormal_label = generate_test_data('hdfs_test_abnormal',window_size)
abnormal_dataloader = DataLoader(test_abnormal_dataset, batch_size=batch_size, shuffle=False, pin_memory=True)




normal::   0%|                                                                               | 0/14177 [00:00<?, ?it/s][A[A

normal::  19%|████████████                                                     | 2637/14177 [00:00<00:00, 25125.06it/s][A[A

normal::  36%|███████████████████████▍                                         | 5109/14177 [00:00<00:00, 24788.66it/s][A[A

normal::  53%|██████████████████████████████████▏                              | 7459/14177 [00:00<00:00, 23846.68it/s][A[A

normal::  68%|████████████████████████████████████████████▍                    | 9691/14177 [00:00<00:00, 23319.12it/s][A[A

normal::  86%|███████████████████████████████████████████████████████▏        | 12213/14177 [00:00<00:00, 23376.58it/s][A[A

normal:: 100%|████████████████████████████████████████████████████████████████| 14177/14177 [00:00<00:00, 22235.86it/s][A[A


Number of sessions(hdfs_test_normal): 14177
Number of seqs(hdfs_test_normal): 269570




normal::   0%|                                                                                | 0/4123 [00:00<?, ?it/s][A[A

normal:: 100%|██████████████████████████████████████████████████████████████████| 4123/4123 [00:00<00:00, 20584.00it/s][A[A

Number of sessions(hdfs_test_abnormal): 4123
Number of seqs(hdfs_test_abnormal): 88410





In [12]:
test_normal_result,test_abnormal_result = fast_predict(model,normal_dataloader,abnormal_dataloader,10,window_size)



normal: 0it [00:00, ?it/s][A[A

normal: 1it [00:02,  2.21s/it][A[A

normal: 2it [00:03,  1.94s/it][A[A

normal: 3it [00:04,  1.75s/it][A[A

normal: 4it [00:06,  1.87s/it][A[A

normal: 5it [00:08,  1.72s/it][A[A

normal: 6it [00:10,  1.75s/it][A[A

normal: 7it [00:11,  1.66s/it][A[A

normal: 8it [00:13,  1.75s/it][A[A

normal: 9it [00:14,  1.65s/it][A[A

normal: 10it [00:16,  1.64s/it][A[A

normal: 11it [00:18,  1.70s/it][A[A

normal: 12it [00:19,  1.61s/it][A[A

normal: 13it [00:21,  1.68s/it][A[A

normal: 14it [00:22,  1.55s/it][A[A

normal: 15it [00:24,  1.62s/it][A[A

normal: 16it [00:26,  1.54s/it][A[A

normal: 17it [00:27,  1.49s/it][A[A

normal: 18it [00:29,  1.58s/it][A[A

normal: 19it [00:30,  1.47s/it][A[A

normal: 20it [00:32,  1.57s/it][A[A

normal: 21it [00:33,  1.49s/it][A[A

normal: 22it [00:35,  1.60s/it][A[A

normal: 23it [00:36,  1.52s/it][A[A

normal: 24it [00:38,  1.47s/it][A[A

normal: 25it [00:39,  1.55s/it][A[A

elapsed_time: 56.581s
false positive (FP): 320, false negative (FN): 1374, Precision: 89.573%, Recall: 66.675%, F1-measure: 76.446%
Finished Predicting





In [None]:
test_normal_result,test_abnormal_result = fast_predict(model,normal_dataloader,abnormal_dataloader,10,window_size)

In [15]:
with open('FP.txt','w') as f:
    for i in range(len(test_normal_result)):
        if test_normal_result[i]:
            f.write(str(test_normal_seq[i])+'\n')
# test_normal_result[0]

In [18]:
with open('FT.txt','w') as f:
    for i in range(len(test_abnormal_result)):
        if test_abnormal_result[i]:
            f.write(str(test_abnormal_seq[i])+'\n')
# test_normal_result[0]

In [20]:
with open('FT.txt','w') as f:
    for session in test_abnormal_session:
        for seq_id in session:
            if test_abnormal_result[seq_id] == True:
#                 TP += 1
                f.write(str(test_abnormal_seq[seq_id])+'\n')
                break

In [13]:
import gc
gc.collect()
torch.cuda.empty_cache()

In [5]:
model.load_state_dict(torch.load(model_dir + '/' + log + '.pt'))
model.to(device)
model.eval()

Model(
  (lstm): LSTM(1, 64, num_layers=2, batch_first=True)
  (fc): Linear(in_features=64, out_features=32, bias=True)
)

In [6]:
import random

In [10]:
def generate_seq(start,window_size=10,num_candidates=5,scope=None):
    if isinstance(start,list):
        start = torch.FloatTensor(start).reshape(1,-1)
    bg = start.size(1) 
    if scope==None:
        scope=num_candidates
    for i in range(bg,bg+window_size):
#         start = torch.FloatTensor(start)
        seq = start.clone().detach().view(-1, i, input_size).to(device)
        output = model(seq).cpu()[:,-1,:]
        output = output.reshape(-1)
        predicted = torch.argsort(output)[-num_candidates:]
        nxt = random.randint(1,scope)
        start = torch.cat([start,predicted[-nxt].reshape(1,-1).float()],1)
    return start,predicted,output

In [7]:
softmax = nn.Softmax(dim=0)

In [8]:
def showNext(t,num_candidates=10,ts=0.005):
    _,predicted,output = generate_seq(t,1,num_candidates)
    prob = softmax(output)
    scope = 0
    for i in range(num_candidates):
        if prob[predicted[num_candidates-i-1]]<ts:
            scope=num_candidates-i-1
#             print(scope)
            break
            
    #     print(t.int().cpu().numpy()[0])
    predicted = predicted[scope+1:].cpu().numpy().tolist()
    prob = prob[predicted].detach().cpu().numpy().tolist()
    print("预测的序号排序:",end=' ')
    print(predicted)
    print("对应的可能性:",end=' ')
    print(prob)
    return predicted,prob

In [11]:
seq = [21]
predicted,prob = showNext(seq,ts=0.001)

预测的序号排序: [5, 11, 30, 21]
对应的可能性: [0.001603407901711762, 0.00365528860129416, 0.3520480692386627, 0.6404494643211365]


In [None]:
t = torch.FloatTensor([0]).reshape(1,-1)
max_len = 60
pattern = set()
while t.size(1)<max_len:
    t,predicted,output = generate_seq(t,1,3,1)
    prob = softmax(output)
    print(t.int().cpu().numpy()[0])
    print("预测的序号排序:",end=' ')
    print(predicted)
    print("对应的可能性:",end=' ')
    print(prob[predicted])
    print()
    if 30 in t[0]:
        break
print(t.int().cpu().numpy()[0])
pattern.add(tuple(t.int().cpu().numpy()[0]))

# 发现并发结构

In [40]:
seq = [21]
pre_predict,prob = showNext(seq,ts=0.005)

预测的序号排序: [30, 9, 11, 23, 5, 21]
对应的可能性: [0.006809363141655922, 0.011109961196780205, 0.032306522130966187, 0.10339812934398651, 0.13575124740600586, 0.6980828642845154]


In [162]:
concorrent_set = set()
for i,event in enumerate(pre_predict):
    cur_seq = seq+[event]
    cur_predicted,prob = showNext(cur_seq,ts=0.001)
    for j in pre_predict[i+1:]:
        pre1,_ = showNext(seq+[j])
        if event in pre1 and j in cur_predicted:
            concorrent_set.add(event)  
            concorrent_set.add(j)   

预测的序号排序: [11, 5]
对应的可能性: [0.0012148055247962475, 0.9984245300292969]
预测的序号排序: [22, 5]
对应的可能性: [0.24863998591899872, 0.7509623765945435]
预测的序号排序: [22, 5]
对应的可能性: [0.24863998591899872, 0.7509623765945435]


In [163]:
concorrent_set

{5, 22}

In [102]:
concorrent_set = list(concorrent_set)
seq+[concorrent_set[0]]

[0, 5]

In [131]:
concorrent_set

set()

In [160]:
seq

[0, 5, 5, 5]

In [166]:
concurrent_path = [i for i in seq]
concurrent_path_list = []
concorrent_tmp = list(concorrent_set)
for i,event in enumerate(concorrent_tmp):
    con_path = seq+[event]
    print(con_path)
    pre,_ = showNext(con_path)
    pre = set(pre)
    print(pre)
    concorrent_set.remove(event)
    while pre!=concorrent_set:
        for j in concorrent_set:
            pre.remove(j)
        con_path = con_path+[pre.pop()]
        pre,_ = showNext(con_path)
        pre = set(pre)
    concorrent_set.add(event)
    concurrent_path.extend(con_path[len(seq):])

[0, 5]
预测的序号排序: [22, 5]
对应的可能性: [0.24863998591899872, 0.7509623765945435]
{5, 22}
预测的序号排序: [22, 5]
对应的可能性: [0.34808650612831116, 0.6518101096153259]
预测的序号排序: [22]
对应的可能性: [0.9969695210456848]
[0, 22]
预测的序号排序: [5]
对应的可能性: [0.9984245300292969]
{5}


In [32]:
def generate_train_data(name):
    num_sessions = 0
    inputs = []
    outputs = []
    sessions = []
    with open(name, 'r') as f:
        for line in tqdm(f,"loading data"):
            num_sessions += 1
            seq = [0]+list(map(lambda n: n, map(int, line.strip().split())))+[30]
            sessions.append(seq)
            line = tuple(seq)
            
            for i in range(len(line) - window_size):
                inputs.append(line[i:i + window_size])
                outputs.append(line[i + window_size])
    print('Number of sessions({}): {}'.format(name, num_sessions))
    print('Number of seqs({}): {}'.format(name, len(inputs)))
    dataset = TensorDataset(torch.tensor(inputs, dtype=torch.float), torch.tensor(outputs))
    return dataset,sessions

In [33]:
dataset,sessions = generate_train_data('./data/hdfs_train')

loading data: 4855it [00:00, 18530.29it/s]


Number of sessions(./data/hdfs_train): 4855
Number of seqs(./data/hdfs_train): 56285


In [34]:
count_event = dict()
for session in sessions:
    event_set = list(set(session))
    for event in event_set:
        if event not in count_event:
            count_event[event]=0
        count_event[event]=count_event[event]+1
print(count_event)
for event in count_event:
    count_event[event] = count_event[event]/len(sessions)
print(count_event)

{0: 4855, 5: 4855, 9: 4855, 11: 4855, 21: 3976, 22: 4855, 23: 3960, 26: 4855, 30: 4855, 2: 721, 3: 1227, 4: 1153, 6: 17, 16: 17, 18: 17, 25: 17}
{0: 1.0, 5: 1.0, 9: 1.0, 11: 1.0, 21: 0.8189495365602472, 22: 1.0, 23: 0.815653964984552, 26: 1.0, 30: 1.0, 2: 0.14850669412976314, 3: 0.25272914521112255, 4: 0.23748712667353244, 6: 0.003501544799176107, 16: 0.003501544799176107, 18: 0.003501544799176107, 25: 0.003501544799176107}


In [64]:
def find_concurrent(seq):
    concorrent_set = set()
    concurrent_path_list= []
    pre_predict,prob = showNext(seq,ts=0.01)
    if seq[-1] in pre_predict:
        pre_predict.remove(seq[-1])
    if len(pre_predict)<2:
        return concorrent_set
    for i,event in enumerate(pre_predict):
        cur_seq = seq+[event]
        cur_predicted,prob = showNext(cur_seq,ts=0.01)
        for j in pre_predict[i+1:] :
            pre1,_ = showNext(seq+[j])
            if event in pre1 and j in cur_predicted:
                concorrent_set.add(event)  
                concorrent_set.add(j) 
    concurrent_path = [i for i in seq]
    concorrent_tmp = list(concorrent_set)
    for i,event in enumerate(concorrent_tmp):
        con_path = seq+[event]
        print(con_path)
        pre,_ = showNext(con_path)
        pre = set(pre)
        print(pre)
        concorrent_set.remove(event)
        while pre!=concorrent_set and not pre.issubset(concorrent_set):
            for j in concorrent_set:
                if j in pre:
                    pre.remove(j)
            con_path = con_path+[pre.pop()]
            pre,_ = showNext(con_path)
            pre = set(pre)
        concorrent_set.add(event)
        concurrent_path.extend(con_path[len(seq):])
        concurrent_path_list.append(con_path)
    merge,_ = showNext(concurrent_path)
    return concorrent_set,merge,concurrent_path_list

In [62]:
showNext([0,5,22,5,5])

预测的序号排序: [9, 26, 11]
对应的可能性: [0.01762796938419342, 0.036271460354328156, 0.9450459480285645]


([9, 26, 11], [0.01762796938419342, 0.036271460354328156, 0.9450459480285645])

In [65]:
find_concurrent([0,5,22,5,5])

预测的序号排序: [9, 26, 11]
对应的可能性: [0.01762796938419342, 0.036271460354328156, 0.9450459480285645]
预测的序号排序: [9, 26, 11]
对应的可能性: [0.05149395763874054, 0.3727625012397766, 0.575558602809906]
预测的序号排序: [9, 11, 26]
对应的可能性: [0.006680076010525227, 0.4850876033306122, 0.5082016587257385]
预测的序号排序: [11, 9]
对应的可能性: [0.010778561234474182, 0.988262951374054]
预测的序号排序: [11, 26]
对应的可能性: [0.4850876033306122, 0.5082016587257385]
预测的序号排序: [11, 9]
对应的可能性: [0.010778561234474182, 0.988262951374054]
预测的序号排序: [11, 9]
对应的可能性: [0.010778561234474182, 0.988262951374054]
[0, 5, 22, 5, 5, 9]
预测的序号排序: [9, 26, 11]
对应的可能性: [0.05149395763874054, 0.3727625012397766, 0.575558602809906]
{9, 26, 11}
预测的序号排序: [9, 26, 11]
对应的可能性: [0.011504905298352242, 0.2595791518688202, 0.7285224199295044]
预测的序号排序: [26, 11]
对应的可能性: [0.09336795657873154, 0.9018100500106812]
[0, 5, 22, 5, 5, 26]
预测的序号排序: [9, 11, 26]
对应的可能性: [0.006680076010525227, 0.4850876033306122, 0.5082016587257385]
{9, 26, 11}
预测的序号排序: [9, 11, 26]
对应的可能性: [0.01052768062800169,

({9, 11, 26},
 [11, 9],
 [[0, 5, 22, 5, 5, 9, 9, 9],
  [0, 5, 22, 5, 5, 26, 26, 26],
  [0, 5, 22, 5, 5, 11, 11, 11]])

In [177]:
def find_loop(seq):
    pass

In [178]:
def find_branch(seq):
    pass

In [68]:
seq = [11, 9, 11, 9, 11, 9, 26, 26, 26]
concorrent_set = set()
concurrent_path_list= []
pre_predict,prob = showNext(seq,ts=0.01)
if seq[-1] in pre_predict:
    pre_predict.remove(seq[-1])
for i,event in enumerate(pre_predict):
    cur_seq = seq+[event]
    cur_predicted,prob = showNext(cur_seq,ts=0.01)
    for j in pre_predict[i+1:]:
        pre1,_ = showNext(seq+[j])
        if event in pre1 and j in cur_predicted:
            concorrent_set.add(event)  
            concorrent_set.add(j) 
concurrent_path = [i for i in seq]


预测的序号排序: [2, 3, 4, 30, 23]
对应的可能性: [0.05369764193892479, 0.05526859685778618, 0.06050111725926399, 0.2377098798751831, 0.5877823829650879]
预测的序号排序: [2, 4, 3, 30, 23]
对应的可能性: [0.08384238183498383, 0.11713362485170364, 0.14755822718143463, 0.20119619369506836, 0.4497317969799042]
预测的序号排序: [2, 23, 4, 3]
对应的可能性: [0.028665073215961456, 0.033793333917856216, 0.3921320140361786, 0.5424069762229919]
预测的序号排序: [4, 3]
对应的可能性: [0.3911248743534088, 0.603672981262207]
预测的序号排序: [25, 11, 4, 2, 3, 30, 23]
对应的可能性: [0.005198594182729721, 0.020473534241318703, 0.05300910398364067, 0.08968948572874069, 0.1056489497423172, 0.34301844239234924, 0.3784100115299225]
预测的序号排序: [23]
对应的可能性: [0.9999033212661743]
预测的序号排序: [2, 23, 4, 3]
对应的可能性: [0.028665073215961456, 0.033793333917856216, 0.3921320140361786, 0.5424069762229919]
预测的序号排序: [4, 3]
对应的可能性: [0.3911248743534088, 0.603672981262207]
预测的序号排序: [25, 11, 4, 2, 3, 30, 23]
对应的可能性: [0.005198594182729721, 0.020473534241318703, 0.05300910398364067, 0.0896894857287406

In [70]:
concorrent_set.remove(30)

In [71]:
concorrent_tmp = list(concorrent_set)
for i,event in enumerate(concorrent_tmp):
    con_path = seq+[event]
    print("cur_path:",con_path)
    pre,_ = showNext(con_path)
    pre = set(pre)
    print(pre)
    concorrent_set.remove(event)
    while pre!=concorrent_set and not pre.issubset(concorrent_set):
        for j in concorrent_set:
            if j in pre:
                pre.remove(j)
        con_path = con_path+[pre.pop()]
        pre,_ = showNext(con_path)
        pre = set(pre)
    concorrent_set.add(event)
    concurrent_path.extend(con_path[len(seq):])
    concurrent_path_list.append(con_path)
merge,_ = showNext(concurrent_path)

cur_path: [11, 9, 11, 9, 11, 9, 26, 26, 26, 2]
预测的序号排序: [2, 4, 3, 30, 23]
对应的可能性: [0.08384238183498383, 0.11713362485170364, 0.14755822718143463, 0.20119619369506836, 0.4497317969799042]
{2, 3, 4, 23, 30}
预测的序号排序: [3, 23, 2, 4]
对应的可能性: [0.02893654815852642, 0.045643649995326996, 0.38048607110977173, 0.5404440760612488]
预测的序号排序: [18, 2, 3, 4]
对应的可能性: [0.007617548108100891, 0.012384964153170586, 0.021838082000613213, 0.9568265080451965]
预测的序号排序: [3, 2, 4, 23]
对应的可能性: [0.005741815082728863, 0.0421714261174202, 0.08599156141281128, 0.8649125099182129]
预测的序号排序: [3, 2, 23, 30, 4]
对应的可能性: [0.0055436501279473305, 0.00710405083373189, 0.025522660464048386, 0.02755546011030674, 0.9339141249656677]
预测的序号排序: [3, 30, 18, 23, 2, 4]
对应的可能性: [0.00868134293705225, 0.008983652107417583, 0.011883899569511414, 0.03815518319606781, 0.3577229678630829, 0.5738712549209595]
预测的序号排序: [18, 3, 23, 2, 4]
对应的可能性: [0.005704065319150686, 0.009191661141812801, 0.015055007301270962, 0.01958266645669937, 0.949583470821

预测的序号排序: [6, 23, 30, 5, 4, 3]
对应的可能性: [0.008524603210389614, 0.011513873934745789, 0.013502707704901695, 0.013898352161049843, 0.022668810561299324, 0.9280903935432434]
预测的序号排序: [4, 3, 23, 5]
对应的可能性: [0.011033434420824051, 0.12491286545991898, 0.35380470752716064, 0.5010837316513062]
预测的序号排序: [3, 23, 5]
对应的可能性: [0.024538634344935417, 0.10347607731819153, 0.8638625144958496]
预测的序号排序: [2, 23, 4, 3, 5]
对应的可能性: [0.01334332674741745, 0.014628302305936813, 0.05419773980975151, 0.1446402221918106, 0.7704848051071167]
预测的序号排序: [30, 2, 4, 3]
对应的可能性: [0.005858671385794878, 0.008532148785889149, 0.3840721547603607, 0.5989634990692139]
预测的序号排序: [30, 4, 3]
对应的可能性: [0.005984411109238863, 0.47227945923805237, 0.5205335021018982]
预测的序号排序: [30, 3, 4]
对应的可能性: [0.008617321960628033, 0.2426387071609497, 0.7486110329627991]
预测的序号排序: [30, 4, 3]
对应的可能性: [0.13855521380901337, 0.2444543093442917, 0.6167486906051636]
预测的序号排序: [4, 30, 3]
对应的可能性: [0.06648334860801697, 0.09203603118658066, 0.8411422967910767]
预测的序

预测的序号排序: [3, 4]
对应的可能性: [0.033352818340063095, 0.9665488600730896]
预测的序号排序: [3, 4]
对应的可能性: [0.033189352601766586, 0.9667131304740906]
预测的序号排序: [3, 4]
对应的可能性: [0.03303997591137886, 0.9668632745742798]
预测的序号排序: [3, 4]
对应的可能性: [0.032902851700782776, 0.9670009016990662]
预测的序号排序: [3, 4]
对应的可能性: [0.03277641162276268, 0.9671278595924377]
预测的序号排序: [3, 4]
对应的可能性: [0.032659418880939484, 0.9672455191612244]
预测的序号排序: [3, 4]
对应的可能性: [0.0325508750975132, 0.9673545956611633]
预测的序号排序: [3, 4]
对应的可能性: [0.03244968503713608, 0.9674562215805054]
预测的序号排序: [3, 4]
对应的可能性: [0.03235539793968201, 0.9675511121749878]
预测的序号排序: [3, 4]
对应的可能性: [0.032267119735479355, 0.9676398038864136]
预测的序号排序: [3, 4]
对应的可能性: [0.032184239476919174, 0.96772301197052]
预测的序号排序: [3, 4]
对应的可能性: [0.032106515020132065, 0.9678011536598206]
预测的序号排序: [3, 4]
对应的可能性: [0.032033346593379974, 0.9678747057914734]
预测的序号排序: [3, 4]
对应的可能性: [0.031964465975761414, 0.9679438471794128]
预测的序号排序: [3, 4]
对应的可能性: [0.031899593770504, 0.9680091142654419]
预测的序号排

KeyboardInterrupt: 

In [74]:
print(concurrent_path_list)

[[11, 9, 11, 9, 11, 9, 26, 26, 26, 2, 2, 2, 18, 2, 2, 2, 2, 2, 23, 23, 23, 21, 21, 21, 30, 18, 2, 30, 2, 21, 2, 21, 21, 2, 2, 2, 2, 2], [11, 9, 11, 9, 11, 9, 26, 26, 26, 3, 3, 3, 3, 3, 3, 3, 3, 3, 23, 23, 23, 21, 21, 21, 30, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]]


In [66]:
showNext([26,26,26,23,23,23,21,21,21])

预测的序号排序: [30]
对应的可能性: [0.9997605681419373]


([30], [0.9997605681419373])

In [77]:
find_concurrent([22, 5, 5, 5])

预测的序号排序: [9, 11, 26]
对应的可能性: [0.01060451753437519, 0.3045552968978882, 0.6845642328262329]
预测的序号排序: [9, 11, 26]
对应的可能性: [0.10653172433376312, 0.18710242211818695, 0.7063077688217163]
预测的序号排序: [9]
对应的可能性: [0.9990488886833191]
预测的序号排序: [9, 11, 26]
对应的可能性: [0.008123203180730343, 0.06586132943630219, 0.92600017786026]
预测的序号排序: [9]
对应的可能性: [0.9990488886833191]
预测的序号排序: [9, 11, 26]
对应的可能性: [0.008123203180730343, 0.06586132943630219, 0.92600017786026]
预测的序号排序: [11, 26]
对应的可能性: [0.06586132943630219, 0.92600017786026]
[22, 5, 5, 5, 9]
预测的序号排序: [9, 11, 26]
对应的可能性: [0.10653172433376312, 0.18710242211818695, 0.7063077688217163]
{9, 26, 11}
预测的序号排序: [9, 26, 11]
对应的可能性: [0.02290080487728119, 0.4846383333206177, 0.4920712411403656]
预测的序号排序: [9, 26, 11]
对应的可能性: [0.007098402362316847, 0.10752785205841064, 0.8851501941680908]
预测的序号排序: [9, 26, 11]
对应的可能性: [0.007510153576731682, 0.2285967767238617, 0.7633930444717407]
预测的序号排序: [11, 26]
对应的可能性: [0.04344955459237099, 0.9561227560043335]
[22, 5, 5, 5, 26]
预测

({9, 11, 26},
 [3, 30, 4, 26, 2, 23, 11],
 [[22, 5, 5, 5, 9, 9, 9, 9, 9], [22, 5, 5, 5, 26, 26, 26], [22, 5, 5, 5, 11]])

In [49]:
concorrent_set

{9, 11}

In [50]:
concorrent_set.issubset({1,9,11})

True

In [None]:
global_wf = []

In [81]:
workflow_construction()

预测的序号排序: [22, 5]
对应的可能性: [0.2586425244808197, 0.7405361533164978]


[0]

In [83]:
concorrent_set.copy()

{2, 3}

In [106]:
def recognize_branch(seq,next_event):
    concurrent_group=[]
    event_to_group = [False for _ in next_event]
    for i,event in enumerate(next_event):
        if event_to_group[i]:
            continue
        cur_seq = seq+[event]
        cur_group = [event]
        concurrent_group.append(cur_group)
        event_to_group[i] = True
        cur_predicted,_ = showNext(cur_seq,ts=0.01)
        for j,event2 in enumerate(next_event[i+1:]):
            pre1,_ = showNext(seq+[event2])
            if event in pre1 and event2 in cur_predicted:
                event_to_group[j+i+1]=True
                concurrent_group[-1].append(event2)
    return concurrent_group

In [109]:
recognize_branch([0],[22,5])

预测的序号排序: [5]
对应的可能性: [0.9994300007820129]
预测的序号排序: [22, 5]
对应的可能性: [0.2932291626930237, 0.705966591835022]
[True, True]


[[22, 5]]

In [136]:
def workflow_construction(seq=[0],end={30}):
    next_event,_ = showNext(seq,ts=0.01)
    if set(next_event)==end:
        return seq
    while seq[-1] in next_event:
        seq =seq+[seq[-1]]
        next_event,_ = showNext(seq,ts=0.01)
    if len(next_event)>=2:
        concorrent_group = recognize_branch(seq,next_event)
    for group in concorrent_group:
        if len(group)>2:
            print("合并",seq,group)
            new_seq,end = res.append(find_merge_point(seq,set(group)))
            workflow_construction(new_seq,set(end))
        else:
            workflow_construction(seq+[group[0]],{30})
    return res

In [137]:
def find_merge_point(seq,concorrent_group):
    concorrent_set = set(concorrent_group)
    concurrent_path = [i for i in seq]
    for i,event in enumerate(concorrent_group):
        con_path = seq+[event]
        # print("cur_path:",con_path)
        pre,_ = showNext(con_path)
        pre = set(pre)
        # print(pre)
        concorrent_set.remove(event)
        while pre!=concorrent_set and not pre.issubset(concorrent_set):
            for j in concorrent_set:
                if j in pre:
                    pre.remove(j)
            con_path = con_path+[pre.pop()]
            pre,_ = showNext(con_path)
            pre = set(pre)
        concorrent_set.add(event)
        concurrent_path.extend(con_path[len(seq):])
    merge,_ = showNext(concurrent_path)
    return concurrent_path,merge

In [138]:
print(workflow_construction([0],{30}))

预测的序号排序: [22, 5]
对应的可能性: [0.2586425244808197, 0.7405361533164978]
预测的序号排序: [5]
对应的可能性: [0.9994300007820129]
预测的序号排序: [22, 5]
对应的可能性: [0.2932291626930237, 0.705966591835022]
[True, True]
预测的序号排序: [5]
对应的可能性: [0.9994300007820129]


UnboundLocalError: local variable 'concorrent_group' referenced before assignment

In [132]:
find_merge_point([0],{22,5})

预测的序号排序: [22, 5]
对应的可能性: [0.2932291626930237, 0.705966591835022]
预测的序号排序: [22, 5]
对应的可能性: [0.42156782746315, 0.5777947306632996]
预测的序号排序: [22]
对应的可能性: [0.9989638328552246]
预测的序号排序: [5]
对应的可能性: [0.9994300007820129]
预测的序号排序: [26, 9, 11]
对应的可能性: [0.015241644345223904, 0.021479245275259018, 0.9622746109962463]


([0, 5, 5, 5, 22], [26, 9, 11])