In [42]:
# 输出每个cell的运行时间
%load_ext autotime
# https://github.com/cpcloud/ipython-autotime

# TorchText学习

In [43]:
import pickle
import torch
from torchtext import data
from tqdm import tqdm
from sklearn.utils import shuffle

time: 1.07 ms


## 构建自定义DataSet

In [44]:
class DGA2019(data.Dataset):
    def __init__(self,path,test = True):
        
        tokenize = lambda x : [c for c in x]
        self._text_field = data.Field(sequential=True, 
                                      tokenize=tokenize,
                                      lower=True)

        self._label_field = data.Field(sequential=False, use_vocab=False)
        self.ds_len = 0
        
        fields = [("url",self._text_field),
                 ("label",self._label_field)]
        examples = []
        
        print('read data from:{}'.format(path))
        with open(path,"rb") as f:
            urls_data,label_data = pickle.load(f)
        self.ds_len = len(urls_data)
        
        if test:
            for url in urls_data:
                examples.append(data.Example.fromlist([url,None],fields))
        else:
            for url,label in zip(urls_data,label_data):
                 examples.append(data.Example.fromlist([url,label],fields))

        # 调用super调用父类构造方法，产生标准Dataset
        super(DGA2019,self).__init__(examples,fields)
    
    @staticmethod
    def sort_key(ex):
        return len(ex.text)
    
    def __len__(self):
        return self.ds_len

time: 7.92 ms


In [45]:
base_root = "pkl_data/"
train_root = base_root + "train_data"
val_root = base_root + "val_data"
test_root = base_root + "test_data"

time: 843 µs


In [46]:
train=DGA2019(train_root,test=False)
valid=DGA2019(val_root,test=False)
test=DGA2019(test_root)

read data from:pkl_data/train_data
read data from:pkl_data/val_data
read data from:pkl_data/test_data
time: 2min 6s


In [47]:
TEXT = train.fields['url']
TEXT.build_vocab(train)

time: 11.9 s


In [48]:
len(TEXT.vocab)

41

time: 9.98 ms


## 初始化vocab.vector

In [49]:
import torch
matrix = torch.randn(len(TEXT.vocab),128)
TEXT.vocab.set_vectors(TEXT.vocab.stoi,matrix,128)
TEXT.vocab.vectors.size()

torch.Size([41, 128])

time: 64.4 ms


In [50]:
print(TEXT.vocab.stoi)

defaultdict(<bound method Vocab._default_unk_index of <torchtext.vocab.Vocab object at 0x1f1236cc0>>, {'<unk>': 0, '<pad>': 1, 'o': 2, 'e': 3, '.': 4, 'c': 5, 'a': 6, 'm': 7, 'i': 8, 'n': 9, 'r': 10, 't': 11, 's': 12, 'u': 13, 'l': 14, 'd': 15, 'b': 16, 'p': 17, 'g': 18, 'h': 19, 'y': 20, 'k': 21, 'f': 22, 'v': 23, 'w': 24, 'x': 25, 'q': 26, 'j': 27, 'z': 28, '1': 29, '2': 30, '-': 31, '4': 32, '3': 33, '5': 34, '8': 35, '6': 36, '7': 37, '0': 38, '9': 39, '_': 40})
time: 2.66 ms


In [59]:
from torchtext.data import Iterator,BucketIterator
train_iter = data.BucketIterator(dataset=train, batch_size=128, shuffle=True, 
                                 sort_within_batch=False, repeat=False)
valid_iter = data.BucketIterator(dataset=valid, batch_size=128, shuffle=True, 
                                 sort_within_batch=False, repeat=False)

time: 2.81 ms


In [60]:
len(train_iter)

14204

time: 4.37 ms


In [53]:
# 接下来就是构造一个LSTM模型，然后训练一下
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

weight_matrix=TEXT.vocab.vectors
class LSTM(nn.Module):
    def __init__(self):
        super(LSTM,self).__init__()
        self.word_embedding=nn.Embedding(len(TEXT.vocab),128)
        # 给Embedding进行初始化
        self.word_embedding.weight.data.copy_(weight_matrix)
        
        self.lstm=nn.LSTM(input_size=128,hidden_size=64,num_layers=1)
        self.decoder=nn.Linear(64,2)
    
    def forward(self, sentence):
        embeds=self.word_embedding(sentence)
        print(embeds.shape)
        lstm_out=self.lstm(embeds)[0]
        print(lstm_out.shape)
        final=lstm_out[-1]
        y=self.decoder(final)
        return y

time: 9.98 ms


In [54]:
for idx, batch in enumerate(train_iter):
    if idx==0:
        print(batch)
        text, label = batch.url, batch.label
        print(text.shape, label.shape)
    else:
        break


[torchtext.data.batch.Batch of size 8]
	[.url]:[torch.LongTensor of size 32x8]
	[.label]:[torch.LongTensor of size 8]
torch.Size([32, 8]) torch.Size([8])
time: 9.32 s


In [55]:
model=LSTM()
model.train()
optimizer=optim.Adam(filter(lambda p:p.requires_grad,model.parameters()),lr=0.01)
crition=F.cross_entropy

for epoch,batch in enumerate(train_iter):
    optimizer.zero_grad()
    predicted=model(batch.url)
    loss=crition(predicted,batch.label)
    loss.backward()
    optimizer.step()
    print(loss.item())

torch.Size([23, 8, 128])
torch.Size([23, 8, 64])
0.6059950590133667
torch.Size([30, 8, 128])
torch.Size([30, 8, 64])
0.7442505955696106
torch.Size([23, 8, 128])
torch.Size([23, 8, 64])
0.671790599822998
torch.Size([21, 8, 128])
torch.Size([21, 8, 64])
0.8364723324775696
torch.Size([25, 8, 128])
torch.Size([25, 8, 64])
0.5900936126708984
torch.Size([30, 8, 128])
torch.Size([30, 8, 64])
0.623916745185852
torch.Size([25, 8, 128])
torch.Size([25, 8, 64])
0.658275842666626
torch.Size([27, 8, 128])
torch.Size([27, 8, 64])
0.6357868313789368
torch.Size([26, 8, 128])
torch.Size([26, 8, 64])
0.6163193583488464
torch.Size([28, 8, 128])
torch.Size([28, 8, 64])
0.8279600143432617
torch.Size([26, 8, 128])
torch.Size([26, 8, 64])
0.6172653436660767
torch.Size([30, 8, 128])
torch.Size([30, 8, 64])
0.7244459390640259
torch.Size([26, 8, 128])
torch.Size([26, 8, 64])
0.622336745262146
torch.Size([27, 8, 128])
torch.Size([27, 8, 64])
0.7078301906585693
torch.Size([23, 8, 128])
torch.Size([23, 8, 64])
0.6

KeyboardInterrupt: 

time: 6.69 s
