In [1]:
import time
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from tqdm import tqdm

In [2]:
# Hyper-parameters handed to Model(...) when the network is built
input_size, hidden_size, num_layers, num_classes = 1, 64, 2, 32

# Evaluation settings: length of each input window and DataLoader batch size
window_size = 10
batch_size = 20000

# Directory holding the saved checkpoints, and directory holding the test files
model_dir = 'model'
file_dir = 'data_official'

# Device configuration: use the GPU when PyTorch can see one, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [3]:
class Model(nn.Module):
    """Stacked LSTM followed by a linear layer applied at every time step.

    Args:
        input_size: number of features per time step.
        hidden_size: width of each LSTM layer.
        num_layers: number of stacked LSTM layers.
        num_keys: number of output classes produced by the linear layer.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_keys):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_keys)

    def forward(self, x):
        """Return per-step class scores.

        Args:
            x: tensor laid out as (batch, sequence, input_size), because the
                LSTM is built with ``batch_first=True``.

        Returns:
            Tensor of shape (batch, sequence, num_keys).
        """
        # Allocate the zero initial state next to the input (same device and
        # dtype) instead of relying on a notebook-level ``device`` variable.
        h0 = x.new_zeros(self.num_layers, x.size(0), self.hidden_size)
        c0 = x.new_zeros(self.num_layers, x.size(0), self.hidden_size)
        out, _ = self.lstm(x, (h0, c0))
        return self.fc(out)


In [4]:
def generate_test_data(name, window_size=10, desc=None):
    """Read sessions from ``name`` and cut them into fixed-length windows.

    Every line is parsed as whitespace-separated integers, wrapped with a
    leading 0 and a trailing 30, and right-padded with -1 up to
    ``window_size + 1`` entries so that it yields at least one window.
    Identical sessions are collapsed because they are collected in a set.

    NOTE: this notebook defines ``generate_test_data`` again in the following
    cell; after Run All the later definition is the one that is bound.

    Args:
        name: path of the text file to read.
        window_size: number of events in each input window.
        desc: progress-bar label; defaults to ``name`` so the bar identifies
            the file being processed.

    Returns:
        Tuple ``(session_to_seq, dataset, seqs, labels)``:
        ``session_to_seq`` holds, per unique session, the indices of its
        windows in ``seqs``; ``dataset`` is a ``TensorDataset`` of float
        windows and their labels; ``seqs`` is the list of window tuples;
        ``labels`` is the list of events that follow each window.
    """
    hdfs = set()
    with open(name, 'r') as f:
        # Iterate the file lazily instead of materialising every line first.
        for ln in f:
            # NOTE(review): a blank line becomes the padded session
            # (0, 30, -1, ...) - confirm the input files contain none.
            events = [0] + [int(tok) for tok in ln.strip().split()] + [30]
            events += [-1] * (window_size + 1 - len(events))
            hdfs.add(tuple(events))
    session_to_seq = []
    seqs = []
    labels = []
    for line in tqdm(hdfs, name if desc is None else desc):
        first = len(seqs)
        for i in range(len(line) - window_size):
            seqs.append(line[i:i + window_size])
            labels.append(line[i + window_size])
        # Window indices of this session are the ones just appended.
        session_to_seq.append(list(range(first, len(seqs))))
    print('Number of sessions({}): {}'.format(name, len(session_to_seq)))
    print('Number of seqs({}): {}'.format(name, len(seqs)))
    dataset = TensorDataset(torch.tensor(seqs, dtype=torch.float), torch.tensor(labels))

    return session_to_seq, dataset, seqs, labels

# fast predict
def _flag_top_k_misses(model, dataloader, num_candidates, window_size, desc):
    """Return one bool per window: True when its label is outside the top-k scores."""
    flagged = []
    with torch.no_grad():
        for seq, labels in tqdm(dataloader, desc=desc):
            seq = seq.reshape(-1, window_size, input_size).to(device)
            # Only the scores at the last time step are used for the prediction.
            logits = model(seq).cpu()[:, -1, :]
            top_k = torch.argsort(logits, 1)[:, -num_candidates:]
            # Whole-batch membership test; replaces a per-label Python loop.
            hit = (top_k == labels.reshape(-1, 1)).any(dim=1)
            flagged.extend((~hit).tolist())
    return flagged


def fast_predict(model, normal_dataloader, abnormal_dataloader, num_candidates=5, window_size=10,
                 *, normal_sessions=None, abnormal_sessions=None):
    """Flag windows whose label is not among the top-k predictions and report metrics.

    A session counts as detected when at least one of its windows is flagged.
    Detected normal sessions are false positives, detected abnormal sessions
    are true positives.

    NOTE: this notebook defines ``fast_predict`` again in the following cell
    (with an extra ``ts`` parameter); after Run All the later one is bound.

    Reads the module-level names ``input_size`` and ``device``.

    Args:
        model: network called on each batch of windows.
        normal_dataloader: yields ``(windows, labels)`` for normal sessions.
        abnormal_dataloader: yields ``(windows, labels)`` for abnormal sessions.
        num_candidates: number of highest-scoring classes treated as expected.
        window_size: number of events per window.
        normal_sessions: per-session lists of window indices for the normal
            data; defaults to the global ``test_normal_session``.
        abnormal_sessions: same for the abnormal data; defaults to the global
            ``test_abnormal_session``.

    Returns:
        ``(test_normal_result, test_abnormal_result)``: lists of bools, one per
        window, True where the window was flagged.
    """
    if normal_sessions is None:
        normal_sessions = test_normal_session
    if abnormal_sessions is None:
        abnormal_sessions = test_abnormal_session

    # Test the model
    start_time = time.time()
    test_normal_result = _flag_top_k_misses(
        model, normal_dataloader, num_candidates, window_size, 'normal')
    FP = sum(1 for session in normal_sessions
             if any(test_normal_result[seq_id] for seq_id in session))

    test_abnormal_result = _flag_top_k_misses(
        model, abnormal_dataloader, num_candidates, window_size, 'abnormal')
    TP = sum(1 for session in abnormal_sessions
             if any(test_abnormal_result[seq_id] for seq_id in session))
    elapsed_time = time.time() - start_time
    print('elapsed_time: {:.3f}s'.format(elapsed_time))

    # Compute precision, recall and F1-measure; a zero denominator yields 0
    # instead of raising ZeroDivisionError.
    FN = len(abnormal_sessions) - TP
    P = 100 * TP / (TP + FP) if TP + FP else 0.0
    R = 100 * TP / (TP + FN) if TP + FN else 0.0
    F1 = 2 * P * R / (P + R) if P + R else 0.0
    print('false positive (FP): {}, false negative (FN): {}, Precision: {:.3f}%, Recall: {:.3f}%, F1-measure: {:.3f}%'.format(FP, FN, P, R, F1))
    print('Finished Predicting')
    return test_normal_result, test_abnormal_result

In [5]:
def generate_test_data(name, window_size=10, desc=None):
    """Read sessions from ``name`` and cut them into fixed-length windows.

    Every line is parsed as whitespace-separated integers, wrapped with a
    leading 0 and a trailing 30, and right-padded with -1 up to
    ``window_size + 1`` entries so that it yields at least one window.
    Identical sessions are collapsed because they are collected in a set.

    NOTE: the previous cell of this notebook also defines
    ``generate_test_data``; this later definition is the one bound after
    Run All.

    Args:
        name: path of the text file to read.
        window_size: number of events in each input window.
        desc: progress-bar label; defaults to ``name`` so the bar identifies
            the file being processed.

    Returns:
        Tuple ``(session_to_seq, dataset, seqs, labels)``:
        ``session_to_seq`` holds, per unique session, the indices of its
        windows in ``seqs``; ``dataset`` is a ``TensorDataset`` of float
        windows and their labels; ``seqs`` is the list of window tuples;
        ``labels`` is the list of events that follow each window.
    """
    hdfs = set()
    with open(name, 'r') as f:
        # Iterate the file lazily instead of materialising every line first.
        for ln in f:
            # NOTE(review): a blank line becomes the padded session
            # (0, 30, -1, ...) - confirm the input files contain none.
            events = [0] + [int(tok) for tok in ln.strip().split()] + [30]
            events += [-1] * (window_size + 1 - len(events))
            hdfs.add(tuple(events))
    session_to_seq = []
    seqs = []
    labels = []
    for line in tqdm(hdfs, name if desc is None else desc):
        first = len(seqs)
        for i in range(len(line) - window_size):
            seqs.append(line[i:i + window_size])
            labels.append(line[i + window_size])
        # Window indices of this session are the ones just appended.
        session_to_seq.append(list(range(first, len(seqs))))
    print('Number of sessions({}): {}'.format(name, len(session_to_seq)))
    print('Number of seqs({}): {}'.format(name, len(seqs)))
    dataset = TensorDataset(torch.tensor(seqs, dtype=torch.float), torch.tensor(labels))

    return session_to_seq, dataset, seqs, labels

# fast predict
def _flag_unlikely_labels(model, dataloader, num_candidates, window_size, ts, desc):
    """Return one bool per window: True when its label is outside the top-k
    scores or its softmax probability is below ``ts``."""
    flagged = []
    with torch.no_grad():
        for seq, labels in tqdm(dataloader, desc=desc):
            seq = seq.reshape(-1, window_size, input_size).to(device)
            # Only the scores at the last time step are used for the prediction.
            logits = model(seq).cpu()[:, -1, :]
            probs = torch.softmax(logits, dim=1)
            top_k = torch.argsort(logits, 1)[:, -num_candidates:]
            # Whole-batch membership test; replaces a per-label Python loop.
            in_top_k = (top_k == labels.reshape(-1, 1)).any(dim=1)
            # Labels that are not valid class indices (e.g. the -1 padding) can
            # never be in top_k, so those rows are flagged regardless; clamping
            # only keeps the gather index in range for them.
            index = labels.long().clamp(0, logits.size(1) - 1).reshape(-1, 1)
            label_prob = probs.gather(1, index).squeeze(1)
            flagged.extend((~in_top_k | (label_prob < ts)).tolist())
    return flagged


def fast_predict(model, normal_dataloader, abnormal_dataloader, num_candidates=5, window_size=10, ts=0.001,
                 *, normal_sessions=None, abnormal_sessions=None):
    """Flag unexpected windows and report session-level detection metrics.

    A window is flagged when its label is not among the ``num_candidates``
    highest-scoring classes, or when the softmax probability of its label is
    below ``ts``. A session counts as detected when at least one of its
    windows is flagged. Detected normal sessions are false positives,
    detected abnormal sessions are true positives.

    Reads the module-level names ``input_size`` and ``device``.

    Args:
        model: network called on each batch of windows.
        normal_dataloader: yields ``(windows, labels)`` for normal sessions.
        abnormal_dataloader: yields ``(windows, labels)`` for abnormal sessions.
        num_candidates: number of highest-scoring classes treated as expected.
        window_size: number of events per window.
        ts: probability threshold below which a label is flagged.
        normal_sessions: per-session lists of window indices for the normal
            data; defaults to the global ``test_normal_session``.
        abnormal_sessions: same for the abnormal data; defaults to the global
            ``test_abnormal_session``.

    Returns:
        ``(test_normal_result, test_abnormal_result)``: lists of bools, one per
        window, True where the window was flagged.
    """
    if normal_sessions is None:
        normal_sessions = test_normal_session
    if abnormal_sessions is None:
        abnormal_sessions = test_abnormal_session

    # Test the model
    start_time = time.time()
    test_normal_result = _flag_unlikely_labels(
        model, normal_dataloader, num_candidates, window_size, ts, 'normal')
    FP = sum(1 for session in normal_sessions
             if any(test_normal_result[seq_id] for seq_id in session))

    test_abnormal_result = _flag_unlikely_labels(
        model, abnormal_dataloader, num_candidates, window_size, ts, 'abnormal')
    TP = sum(1 for session in abnormal_sessions
             if any(test_abnormal_result[seq_id] for seq_id in session))
    elapsed_time = time.time() - start_time
    print('elapsed_time: {:.3f}s'.format(elapsed_time))

    # Compute precision, recall and F1-measure; a zero denominator yields 0
    # instead of raising ZeroDivisionError.
    FN = len(abnormal_sessions) - TP
    P = 100 * TP / (TP + FP) if TP + FP else 0.0
    R = 100 * TP / (TP + FN) if TP + FN else 0.0
    F1 = 2 * P * R / (P + R) if P + R else 0.0
    print('false positive (FP): {}, false negative (FN): {}, Precision: {:.3f}%, Recall: {:.3f}%, F1-measure: {:.3f}%'.format(FP, FN, P, R, F1))
    print('Finished Predicting')
    return test_normal_result, test_abnormal_result

In [6]:
def output_result(name,sessions,seqs,results):
    """Write selected sessions to ``name + '.txt'``.

    For ``name == 'FN'`` the sessions without any flagged window are written,
    one reconstructed event line each. For ``name == 'FP'`` the sessions with
    a flagged window are written, each followed by a line holding the first
    flagged window. For any other ``name`` the file is created empty.

    Args:
        name: 'FN' or 'FP'; also the stem of the output file.
        sessions: per-session lists of indices into ``seqs`` / ``results``.
        seqs: list of windows (sequences of events).
        results: per-window flags, compared against ``True``.
    """
    def render(session):
        # First window without its leading element, then the last element of
        # every following window.
        events = list(seqs[session[0]][1:])
        events += [seqs[idx][-1] for idx in session[1:]]
        return ' '.join(str(event) for event in events) + '\n'

    with open(name+'.txt','w') as f:
        for session in sessions:
            # Index of the first flagged window, or -1 when none is flagged.
            flagged = next((idx for idx in session if results[idx]==True), -1)
            if flagged == -1:
                if name == 'FN':
                    f.write(render(session))
            elif name == 'FP':
                f.write(render(session))
                f.write(' '.join(str(value) for value in seqs[flagged]) + '\n')
            

In [7]:
# Build the network from the config-cell hyper-parameters and move it to `device`.
model = Model(input_size, hidden_size, num_layers, num_classes).to(device)
# NOTE(review): this checkpoint name is reassigned to the 'data_dev' / 'v0.0'
# name in a later cell before the checkpoint is loaded.
model_name = 'data_dir={}_version={}'.format('data_official', 'v0.3')

In [10]:
# Display the checkpoint name currently bound in the kernel.
# NOTE(review): the recorded output shows the 'data_dev' name, i.e. this cell
# was executed after the reassignment cell; a fresh Run All would differ.
model_name

'data_dir=data_dev_version=v0.0'

In [8]:
# Select which checkpoint and which data directory to evaluate; these
# assignments replace the values set in earlier cells.
# model_name ='add_padding_batch_size=2048_epoch=300_window_size=10'
model_name = 'data_dir={}_version={}'.format('data_dev', 'v0.0')
file_dir = 'data_dev'

In [9]:
# Restore the trained weights from <model_dir>/<model_name>.pt.
checkpoint_path = model_dir + '/' + model_name + '.pt'
# map_location remaps the stored tensors onto `device`, so a checkpoint saved
# on a GPU can also be loaded on a CPU-only machine.
# NOTE: torch.load unpickles the file - only load checkpoints you trust.
model.load_state_dict(torch.load(checkpoint_path, map_location=device))
model.to(device)
# Switch to inference mode; as the last expression this also displays the model.
model.eval()

Model(
  (lstm): LSTM(1, 64, num_layers=2, batch_first=True)
  (fc): Linear(in_features=64, out_features=32, bias=True)
)

In [11]:
# Re-state the evaluation settings (same values as in the config cell).
batch_size = 20000
window_size = 10
# Normal test sessions. `test_normal_session` is read as a global by
# fast_predict, so this cell must run before the prediction cells.
test_normal_session, test_normal_dataset, test_normal_seq, test_normal_label = generate_test_data(
    file_dir+'/hdfs_test_normal', window_size)
# shuffle=False keeps windows in creation order, so positions in the result
# lists line up with the window indices stored per session.
normal_dataloader = DataLoader(test_normal_dataset, batch_size=batch_size, shuffle=False, pin_memory=True)
# Abnormal test sessions, loaded the same way; `test_abnormal_session` is
# likewise read as a global by fast_predict.
test_abnormal_session, test_abnormal_dataset, test_abnormal_seq, test_abnormal_label = generate_test_data(
    file_dir+'/hdfs_test_abnormal', window_size)
abnormal_dataloader = DataLoader(test_abnormal_dataset, batch_size=batch_size, shuffle=False, pin_memory=True)

normal:: 100%|████████████████████████████████████████████████████████████████| 14195/14195 [00:00<00:00, 20632.21it/s]


Number of sessions(data_dev/hdfs_test_normal): 14195
Number of seqs(data_dev/hdfs_test_normal): 269979


normal:: 100%|███████████████████████████████████████████████████████████████████| 4121/4121 [00:00<00:00, 8749.69it/s]


Number of sessions(data_dev/hdfs_test_abnormal): 4121
Number of seqs(data_dev/hdfs_test_abnormal): 88562


In [12]:
# Evaluate with the 10 highest-scoring classes as candidates and a probability
# threshold of 1e-4. The sixth positional argument is `ts`, which only the
# later definition of fast_predict accepts.
test_normal_result, test_abnormal_result = fast_predict(model, normal_dataloader, abnormal_dataloader, 10,
                                                        window_size,0.0001)

normal: 14it [01:22,  5.90s/it]
abnormal: 5it [00:22,  4.45s/it]

elapsed_time: 104.908s
false positive (FP): 918, false negative (FN): 151, Precision: 81.219%, Recall: 96.336%, F1-measure: 88.134%
Finished Predicting





In [23]:
# Inspect the first window generated from the normal test file.
test_normal_seq[0]

(0, 1, 1, 2, 1, 3, 4, 3, 4, 4)

In [60]:
# Same evaluation with ts=0: a softmax probability is never below 0, so only
# the top-10 membership test can flag a window here.
# NOTE(review): this overwrites the result lists produced by the earlier run.
test_normal_result, test_abnormal_result = fast_predict(model, normal_dataloader, abnormal_dataloader, 10,
                                                        window_size,0)


normal: 0it [00:00, ?it/s][A
normal: 1it [00:05,  5.30s/it][A
normal: 2it [00:10,  5.40s/it][A
normal: 3it [00:15,  5.29s/it][A
normal: 4it [00:20,  5.14s/it][A
normal: 5it [00:25,  5.05s/it][A
normal: 6it [00:30,  4.93s/it][A
normal: 7it [00:34,  4.87s/it][A
normal: 8it [00:40,  5.02s/it][A
normal: 9it [00:45,  5.13s/it][A
normal: 10it [00:51,  5.26s/it][A
normal: 11it [00:56,  5.22s/it][A
normal: 12it [01:01,  5.32s/it][A
normal: 13it [01:07,  5.35s/it][A
normal: 14it [01:10,  5.02s/it][A

abnormal: 0it [00:00, ?it/s][A
abnormal: 1it [00:04,  4.95s/it][A
abnormal: 2it [00:09,  4.83s/it][A
abnormal: 3it [00:14,  4.87s/it][A
abnormal: 4it [00:19,  4.86s/it][A
abnormal: 5it [00:21,  4.25s/it][A

elapsed_time: 91.692s
false positive (FP): 389, false negative (FN): 1581, Precision: 86.728%, Recall: 61.654%, F1-measure: 72.073%
Finished Predicting





In [40]:
# name,sessions,seqs,results
name = 'FN'
sessions = test_abnormal_session.copy()
seqs = test_abnormal_seq.copy()
results = test_abnormal_result.copy()

In [43]:
# name,sessions,seqs,results
name = 'TP'
sessions = test_abnormal_session.copy()
seqs = test_abnormal_seq.copy()
results = test_abnormal_result.copy()

In [38]:
# name,sessions,seqs,results
name = 'FP'
sessions = test_normal_session.copy()
seqs = test_normal_seq.copy()
results = test_normal_result.copy()

In [44]:
# Export the sessions selected by `name` to '<name>.txt'.
# Reads the kernel variables `name`, `sessions`, `seqs` and `results`.
#   'FN' / 'TN' in name: sessions without any flagged window, one event line each.
#   'FP' / 'TP' in name: sessions with a flagged window, written as
#       '<session index> <events>' followed by a line with the first flagged window.
# The original test `name=='TN' in name` was a chained comparison that only
# matched the exact name 'TN'; it now uses the same substring test as the
# other three labels.
want_unflagged = 'FN' in name or 'TN' in name
want_flagged = 'FP' in name or 'TP' in name
with open(name+'.txt','w') as f:
    for i, session in enumerate(sessions):
        # Index of the first flagged window of this session, or -1 when none is flagged.
        seq = next((seq_id for seq_id in session if results[seq_id] == True), -1)
        if seq == -1:
            if not want_unflagged:
                continue
        elif not want_flagged:
            continue
        # Rebuild the event list: the first window without its leading
        # element, then the last element of every following window.
        session_events = list(seqs[session[0]][1:])
        session_events.extend(seqs[seq_id][-1] for seq_id in session[1:])
        if seq != -1:
            f.write(str(i)+' ')
        f.write(' '.join(map(str, session_events))+'\n')
        if seq != -1:
            f.write(' '.join(map(str, seqs[seq]))+'\n')