In [8]:
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import TensorDataset, DataLoader
import argparse
import os
from tqdm import tqdm

# Select the compute device once: the first CUDA GPU when PyTorch can see one,
# otherwise the CPU. Later cells move the model and input tensors onto it.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [9]:
def generate(name):
    """Build a sliding-window dataset from a file of log-key sessions.

    Each line of the file at path ``name`` is one session: whitespace-separated
    integer keys, which are shifted down by one. Every run of ``window_size``
    consecutive keys (``window_size`` is read from module scope) becomes one
    input row and the key that follows it becomes the target.

    Returns:
        TensorDataset pairing the float input windows with their target keys.
    """
    windows = []
    next_keys = []
    session_total = 0
    with open(name, 'r') as handle:
        for raw in tqdm(handle, "loading data"):
            session_total += 1
            keys = tuple(int(token) - 1 for token in raw.strip().split())
            # Sessions no longer than the window contribute no rows.
            for begin in range(len(keys) - window_size):
                end = begin + window_size
                windows.append(keys[begin:end])
                next_keys.append(keys[end])
    print('Number of sessions({}): {}'.format(name, session_total))
    print('Number of seqs({}): {}'.format(name, len(windows)))
    features = torch.tensor(windows, dtype=torch.float)
    targets = torch.tensor(next_keys)
    return TensorDataset(features, targets)


class Model(nn.Module):
    """Two stacked LSTM cells followed by a linear classifier over log keys.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the hidden/cell state of both LSTM cells.
        num_layers: Stored on the instance only; the architecture is fixed to
            the two cells ``cell_0`` and ``cell_1`` regardless of this value.
        num_keys: Number of output classes produced by the final linear layer.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_keys):
        super(Model, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.cell_0 = nn.LSTMCell(input_size, hidden_size)
        self.cell_1 = nn.LSTMCell(hidden_size, hidden_size)
        self.fc = nn.Linear(hidden_size, num_keys)

    def forward(self, x):
        """Run the sequence through both cells and classify the last state.

        Args:
            x: Tensor laid out as (seq_len, batch, input_size); the loop below
                iterates over dim 0 and dim 1 sizes the recurrent state.

        Returns:
            Tensor of shape (batch, num_keys) with unnormalised class scores.
        """
        batch = x.size(1)
        # Allocate the initial states next to the input (same device and
        # dtype) rather than on a module-level global device, so the module
        # works wherever the caller placed the model and its input.
        h0 = x.new_zeros(batch, self.hidden_size)
        c0 = x.new_zeros(batch, self.hidden_size)
        h1 = x.new_zeros(batch, self.hidden_size)
        c1 = x.new_zeros(batch, self.hidden_size)
        for xt in x:
            h0, c0 = self.cell_0(xt, (h0, c0))  # layer 0 consumes the raw input xt directly
            h1, c1 = self.cell_1(h0, (h1, c1))  # layer 1 consumes layer 0's hidden output h

        # Only the top layer's hidden state after the final step is classified.
        out = self.fc(h1)
        return out

In [10]:
# Hyperparameters

# Network shape
input_size = 1
hidden_size = 64
num_layers = 2
num_classes = 28

# Sliding window length read by the dataset builders
window_size = 10

# Training-run settings; they also determine the checkpoint/log name below
num_epochs = 300
batch_size = 1024

# Locations on disk
model_dir = 'model'
file_dir = 'data/'
log = 'Lstm_Cell_Adam_batch_size={}_epoch={}'.format(batch_size, num_epochs)

In [4]:
# Rebuild the network with the configured sizes and restore its weights from
# the checkpoint named after the training run.
model = Model(input_size, hidden_size, num_layers, num_classes)
# map_location remaps the stored tensors onto the device selected above, so a
# checkpoint written on a CUDA machine still loads on a CPU-only host.
model.load_state_dict(torch.load(model_dir + '/' + log + '.pt', map_location=device))
model.to(device)

Model(
  (cell_0): LSTMCell(1, 64)
  (cell_1): LSTMCell(64, 64)
  (fc): Linear(in_features=64, out_features=28, bias=True)
)

In [6]:
# Switch the module to evaluation mode before running inference. Model as
# defined in this notebook contains only LSTMCell and Linear layers, so this
# is a safeguard rather than something that changes its outputs.
model.eval()

Model(
  (cell_0): LSTMCell(1, 64)
  (cell_1): LSTMCell(64, 64)
  (fc): Linear(in_features=64, out_features=28, bias=True)
)

In [13]:
def generate_test_data(name):
    """Load test sessions from ``data/<name>`` and cut them into windows.

    Each non-blank line is one session of whitespace-separated integer keys,
    shifted down by one. Sessions shorter than ``window_size + 1`` (read from
    module scope) are right-padded with -1 so that every session yields at
    least one window; the label of such a window can therefore be -1.
    Identical sessions are kept only once.

    Args:
        name: File name inside the ``data/`` directory.

    Returns:
        A tuple ``(session_to_seq, dataset, hdfs)`` where ``session_to_seq``
        holds, per session, the row indices of its windows in ``dataset``;
        ``dataset`` is a TensorDataset of float windows and their next-key
        labels; and ``hdfs`` is the set of deduplicated, padded sessions.
    """
    # A set (not a list) so that duplicate sessions are only evaluated once.
    hdfs = set()
    with open('data/' + name, 'r') as f:
        # Stream the file instead of materialising every line with readlines().
        for ln in f:
            keys = [int(token) - 1 for token in ln.split()]
            if not keys:
                # A blank line (e.g. a trailing newline) is not a session;
                # padding it would create a fake session made only of -1.
                continue
            keys += [-1] * (window_size + 1 - len(keys))
            hdfs.add(tuple(keys))
    session_to_seq = []
    seqs = []
    labels = []
    seq_count = 0
    for line in tqdm(hdfs, "normal:"):
        session = []
        for i in range(len(line) - window_size):
            seqs.append(line[i:i + window_size])
            labels.append(line[i + window_size])
            # Remember which dataset row belongs to this session.
            session.append(seq_count)
            seq_count += 1
        session_to_seq.append(session)
    print('Number of sessions({}): {}'.format(name, len(session_to_seq)))
    print('Number of seqs({}): {}'.format(name, len(seqs)))
    dataset = TensorDataset(torch.tensor(seqs, dtype=torch.float), torch.tensor(labels))

    return session_to_seq, dataset, hdfs

In [14]:
# Load the normal test sessions: per-session row indices, the windowed
# dataset, and the set of deduplicated sessions.
test_normal_session, test_normal_dataset, hdfs = generate_test_data(name='hdfs_test_normal')

normal:: 100%|████████████████████████████████████████████████████████████████| 14177/14177 [00:00<00:00, 16351.09it/s]


Number of sessions(hdfs_test_normal): 14177
Number of seqs(hdfs_test_normal): 241216


In [25]:
# Take the session at position 8 of the set's iteration order as a mutable
# list, then show it in its original tuple form.
seq = list(list(hdfs)[8])
print(tuple(seq))

(21, 4, 4, 4, 10, 8, 8, 25, 25, 25, 10, 10, 8, 3, 2, 2, 2, 3, 2, 3, 2, 2, 2, 3, 2, 22, 22, 22, 20, 20, 20)


In [43]:
# The numpy patterns is transformed into a tesor-type and reshaped
pattern = torch.FloatTensor(seq)
pattern = pattern.view(1,-1)

In [59]:
# NOTE(review): this rebinds the module-level `window_size` that generate()
# and generate_test_data() read, replacing the value 10 set in the
# hyperparameter cell. Re-running those functions after this cell will cut
# windows of length 8 - confirm that this is intended.
window_size = 8

In [66]:
# Probe the model with growing prefixes (1..19 keys) of the selected session
# and print the 10 highest-scoring next keys for each prefix.
#
# The loop variable is deliberately not named `window_size`: that module-level
# name is read by generate() and generate_test_data(), and looping over it
# would silently leave it at 19 for every cell executed afterwards.
# Inference only, so gradient tracking is switched off for the whole loop.
with torch.no_grad():
    for prefix_len in range(1, 20):
        test_data = pattern[:, :prefix_len]
        # (1, prefix_len) -> (1, prefix_len, input_size) -> (prefix_len, 1, input_size):
        # the model iterates over dim 0 as the time axis.
        start = test_data.clone().detach().view(-1, prefix_len, input_size).permute(1, 0, 2).to(device)
        output = model(start).cpu()
        # argsort is ascending, so the last 10 columns are the top-10 keys,
        # with the best candidate in the final column.
        predicted = torch.argsort(output, 1)[:, -10:]
        print('window size:' + str(prefix_len))
        print(test_data)
        print(pattern)
        print(predicted)
        print()

window size:1
tensor([[21.]])
tensor([[21.,  4.,  4.,  4., 10.,  8.,  8., 25., 25., 25., 10., 10.,  8.,  3.,
          2.,  2.,  2.,  3.,  2.,  3.,  2.,  2.,  2.,  3.,  2., 22., 22., 22.,
         20., 20., 20.]])
tensor([[ 8,  2,  1, 25,  3, 10, 24, 20, 17, 22]])

window size:2
tensor([[21.,  4.]])
tensor([[21.,  4.,  4.,  4., 10.,  8.,  8., 25., 25., 25., 10., 10.,  8.,  3.,
          2.,  2.,  2.,  3.,  2.,  3.,  2.,  2.,  2.,  3.,  2., 22., 22., 22.,
         20., 20., 20.]])
tensor([[10,  5, 22, 20, 24,  4,  8, 15,  2,  3]])

window size:3
tensor([[21.,  4.,  4.]])
tensor([[21.,  4.,  4.,  4., 10.,  8.,  8., 25., 25., 25., 10., 10.,  8.,  3.,
          2.,  2.,  2.,  3.,  2.,  3.,  2.,  2.,  2.,  3.,  2., 22., 22., 22.,
         20., 20., 20.]])
tensor([[20, 10, 24, 21, 25,  1, 15, 22,  2,  3]])

window size:4
tensor([[21.,  4.,  4.,  4.]])
tensor([[21.,  4.,  4.,  4., 10.,  8.,  8., 25., 25., 25., 10., 10.,  8.,  3.,
          2.,  2.,  2.,  3.,  2.,  3.,  2.,  2.,  2.,  3.,  2.,

In [None]:
# Define the softmax function
softmax = nn.Softmax(dim=1)

# Randomly is selected the index from the set of sequences
start = logseq[0]
seq = [start]
full_prediction = []

# The prediction starts, it is going to be predicted a given
# number of characters
for i in range(20):

    # The numpy patterns is transformed into a tesor-type and reshaped
    pattern = torch.from_numpy(seq).type(torch.LongTensor)
    pattern = pattern.view(1,-1)
    with torch.no_grad():
        start = pattern.clone().detach().view(-1, len(pattern), input_size).permute(1,0,2).to(device)
        output = model(start).cpu()
        predicted = torch.argsort(output, 1)[:,-num_candidates:]
        print(predicted)

    # The full prediction is saved
    full_prediction = np.append(full_prediction, arg_max)

print("Prediction: \n")
print(''.join([idx_to_char[value] for value in full_prediction]), "\"")