In [2]:
import numpy as np
import torch
from torch.utils.data import DataLoader

from ADFALD_READ import ADFA_DataLoader
from ignite_trainer import SkipGramTrainer

import SkipGram_dataset
from skip_gram import co_matrix_add_SkipGram, BCE_loss_func

# 하이퍼파라미터 설정
start_window = 1
end_window = -1
hidden_size = 10
batch_size = 128
max_epoch = 100
learning_rate = 0.001

%load_ext autoreload
%autoreload 2


### 데이터 읽기, 전처리


In [58]:
loader = ADFA_DataLoader()

normal_data, normal_label, attack_data, attack_label  = loader.get_all_data()

corpus, word_to_id, id_to_word = loader.preprocess(normal_data + attack_data)

vocab_size = len(word_to_id)


window_size_list = [(3, 3), (5, 5), (1, 0), (3, 0), (5, 0)]

for window_size in window_size_list :

    contexts, target = loader.create_contexts_target(corpus, window_size)

    co_matrix = loader.create_co_matrix(corpus, vocab_size, window_size)

    for idx, vector in enumerate(co_matrix):
        co_matrix[idx] /= np.sum(vector)

    add_corpus = []


    for i in corpus :
        add_corpus += i


    skipgram_loader = SkipGram_dataset.SkipGram_Dataset(contexts, target, add_corpus)

    torch.save(skipgram_loader, "./w " + str(window_size) + "_skip_dataset")

[[ 0  0  1 ... 14 55 10]
 [ 0  1  0 ... 10 11 44]
 [ 1  0  2 ... 52 59 18]
 ...
 [19 18 18 ... 18  4 22]
 [18 18 20 ... 13 54 59]
 [18 20 10 ... 20 33 56]]
[[1 1 1 ... 0 0 0]
 [1 1 1 ... 0 0 0]
 [1 1 1 ... 0 0 0]
 ...
 [1 1 1 ... 0 0 0]
 [1 1 1 ... 0 0 0]
 [1 1 1 ... 0 0 0]]
[[ 0  0  1 ...  4 26 54]
 [ 0  1  0 ... 32 10 26]
 [ 1  0  2 ...  0 19 23]
 ...
 [52 19 10 ... 59 14 19]
 [19 10 10 ... 26 54  8]
 [10 10 19 ... 10 65 17]]
[[1 1 1 ... 0 0 0]
 [1 1 1 ... 0 0 0]
 [1 1 1 ... 0 0 0]
 ...
 [1 1 1 ... 0 0 0]
 [1 1 1 ... 0 0 0]
 [1 1 1 ... 0 0 0]]
[[ 0 14 58 33 17 89]
 [ 0 31 10 55 54  4]
 [ 1 13 37 10  9 19]
 ...
 [19 56 26 16 19  8]
 [ 0 54  9 65 14 18]
 [19 10 20 19 13 37]]
[[1 0 0 0 0 0]
 [1 0 0 0 0 0]
 [1 0 0 0 0 0]
 ...
 [1 0 0 0 0 0]
 [1 0 0 0 0 0]
 [1 0 0 0 0 0]]
[[  0   0   1 ...   5  10  98]
 [  0   1   0 ... 132  23  22]
 [  1   0   2 ...  19  31  70]
 ...
 [ 20  10  19 ...  54  52  13]
 [ 10  19   0 ...  20  54  59]
 [ 19   0  19 ...  28  59  16]]
[[1 1 1 ... 0 0 0]
 [1 1 1 .

### 모델 생성

In [96]:
#데이터에 네가티브 셈플링 해주기


window_size_list = [(1, 1), (3, 3), (5, 5), (1, 0), (3, 0), (5, 0)]


skipgram = torch.load("./w " + str(window_size_list[0]) + "_skip_dataset")

skipgram_loader = DataLoader(skipgram, batch_size = batch_size)


model = co_matrix_add_SkipGram(vocab_size, hidden_size, torch.tensor(co_matrix))
optim = torch.optim.Adam(model.parameters(), lr=learning_rate)
loss_func = BCE_loss_func()

save_name = "./" + str(hidden_size) + str(window_size) + "_skipgram.model"

trainer = SkipGramTrainer(model, optim, loss_func, skipgram_loader, save_name)

trainer.run(max_epoch = 100)

torch.save(model, save_name)


trainer.print_loss_graph()


Epoch 1/100 : 50 - batch loss: 4.7903
Epoch 1/100 : 100 - batch loss: 4.6432
Epoch 1/100 : 150 - batch loss: 4.3169
Epoch 1/100 : 200 - batch loss: 4.1366
Epoch 1/100 : 250 - batch loss: 4.2726
Epoch 1/100 : 300 - batch loss: 3.5154
Epoch 1/100 : 350 - batch loss: 3.0440
Epoch 1/100 : 400 - batch loss: 4.2024
Epoch 1/100 : 450 - batch loss: 1.9612
Epoch 1/100 : 500 - batch loss: 4.3042
Epoch 1/100 : 550 - batch loss: 2.7005
Epoch 1/100 : 600 - batch loss: 3.0124
Epoch 1/100 : 650 - batch loss: 3.7194
Epoch 1/100 : 700 - batch loss: 2.4326
Epoch 1/100 : 750 - batch loss: 3.6600
Epoch 1/100 : 800 - batch loss: 3.2790
Epoch 1/100 : 850 - batch loss: 2.9288
Epoch 1/100 : 900 - batch loss: 1.5251
Epoch 1/100 : 950 - batch loss: 3.7011
Epoch 1/100 : 1000 - batch loss: 3.9348
Epoch 1/100 : 1050 - batch loss: 3.4524
Epoch 1/100 : 1100 - batch loss: 3.1631
Epoch 1/100 : 1150 - batch loss: 3.4936
Epoch 1/100 : 1200 - batch loss: 3.0293
Epoch 1/100 : 1250 - batch loss: 2.9420
Epoch 1/100 : 1300 -

Engine run is terminating due to exception: 


Epoch 16/100 : 333650 - batch loss: 3.7663


KeyboardInterrupt: 

In [9]:
from dataset import ptb

window_size = 5
hidden_size = 100
batch_size = 100
max_epoch = 10

# 데이터 읽기
corpus, word_to_id, id_to_word = ptb.load_data('train')
vocab_size = len(word_to_id)

loader = ADFA_DataLoader()

target = corpus[window_size:-window_size]
contexts = []

for idx in range(window_size, len(corpus)-window_size):
    cs = []
    for t in range(-window_size, window_size + 1):
        if t == 0:
            continue
        cs.append(corpus[idx + t])
    contexts.append(cs)

co_matrix = np.zeros((vocab_size, vocab_size))

skipgram = SkipGram_dataset.SkipGram_Dataset(contexts, target, corpus)

In [None]:
torch.save(skipgram, "./dataset/(5,5)test_dataset")

In [None]:
skipgram = torch.load("./dataset/(5,5)test_dataset")

skipgram_loader = DataLoader(skipgram, batch_size = batch_size)


model = co_matrix_add_SkipGram(vocab_size, hidden_size, torch.tensor(co_matrix))
optim = torch.optim.Adam(model.parameters(), lr=learning_rate)
loss_func = BCE_loss_func()

save_name = "./test.model"

trainer = SkipGramTrainer(model, optim, loss_func, skipgram_loader, save_name)

trainer.run(max_epoch = 100)

torch.save(model, save_name)


trainer.print_loss_graph()