In [1]:
import torch
import torch.utils.data as Data
import torch as t
from torch import nn
import numpy as np
import pandas as pd
import jieba
import gensim
from gensim.models import Word2Vec, FastText
import re
import os
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm, tqdm_notebook
from sklearn.metrics import accuracy_score
import copy

from m import f1_for_car, BOW, BasicModule

In [2]:
embedding_dim = 300
USE_CUDA=True
EPOCH = 30           # 训练整批数据多少次
BATCH_SIZE = 128
LR = 0.002         # 学习率

In [3]:
# 以训练数据为例
data_path_dir = 'data'
data = pd.read_csv(os.path.join(data_path_dir,'cuishou_intent3.csv'),sep='\t')
data.columns = ['content','label']

data_tmp = data.copy(deep=True)

embedding_dim = 300
USE_CUDA=True
d_ = {}
for key, value in enumerate(set(data_tmp.label)):
    d_[value] = key
data_tmp['label'] = data_tmp['label'].apply(lambda x : d_.get(x))

y_train = np.array(data_tmp.label.tolist())
# 构造embedding字典
bow = BOW(data_tmp.content.apply(jieba.lcut).tolist(), min_count=1, maxlen=30) # 长度补齐或截断固定长度30

vocab_size = len(bow.word2idx)
word2vec = gensim.models.KeyedVectors.load_word2vec_format('data/ft_wv.txt')

embedding_matrix = np.zeros((vocab_size+1,300))
for key, value in bow.word2idx.items():
    if key in word2vec.vocab: # Word2Vec训练得到的的实例需要word2vec.wv.vocab
        embedding_matrix[value] = word2vec.get_vector(key)
    else:
        embedding_matrix[value] = [0] * embedding_dim

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 1.011 seconds.
Prefix dict has been built succesfully.
   Word Count: 100%|██████████| 53850/53850 [00:00<00:00, 423136.87it/s]
Doc To Number: 100%|██████████| 53850/53850 [00:00<00:00, 157872.15it/s]


In [4]:
# np.save('save/embedding_matrix',arr=embedding_matrix)

In [5]:
X_train = copy.deepcopy(bow.doc2num)
y_train = copy.deepcopy(y_train)

# 构建LSTM模型

In [6]:
class Config(object):
    '''
    并不是所有的配置都生效,实际运行中只根据需求获取自己需要的参数
    '''

    loss = 'multilabelloss'
    model='LSTMText' 
    num_classes = 21 # 类别
    embedding_dim = 300 # embedding大小
    linear_hidden_size = 1000 # 全连接层隐藏元数目
    kmax_pooling = 30 # k
    hidden_size = 128 #LSTM hidden size
    num_layers= 2 #LSTM layers
    inception_dim = 256 #inception的卷积核数
    
    # vocab_size = 11973 # num of chars
    vocab_size = vocab_size # num of words 
    content_seq_len = 30 #描述长度 word为100 char为200
    static = False
    use_pretrained_embedding = True
    embedding_path = 'save/embedding_matrix.npy'

opt = Config()

In [7]:
# 相当于把seq_len压缩成k个'词'
# dim共三个维度，这里取2即seq_len那个维度，100->k
def kmax_pooling(x, dim, k):
    index = x.topk(k, dim = dim)[1].sort(dim = dim)[0]
    return x.gather(dim, index)

class Embed_Layer(BasicModule):

    def __init__(self, embedding_matrix=None, vocab_size=None, embedding_dim=300):
        super(Embed_Layer, self).__init__()
        self.encoder = nn.Embedding(vocab_size+1,embedding_dim)
        if opt.use_pretrained_embedding:
#             self.encoder.weight.data.copy_(t.from_numpy(np.load(embedding_path))) # 方法一，加载np.save的npy文件
            self.encoder.weight.data.copy_(t.from_numpy(embedding_matrix)) # 方法二
    def forward(self, x):
        return self.encoder(x)

class LSTMText(BasicModule): 
    def __init__(self, opt):
        super(LSTMText, self).__init__()
        self.model_name = 'LSTMText'
        self.opt=opt

#         self.encoder = torch.nn.Embedding(self.opt.vocab_size+1, self.opt.embedding_dim)

        self.content_lstm =torch.nn.LSTM(input_size = self.opt.embedding_dim,\
                            hidden_size = self.opt.hidden_size,
                            num_layers = self.opt.num_layers,
                            bias = True,
                            batch_first = False,
                            dropout = 0.5, # dropout
                            bidirectional = True
                            )


    def forward(self, content):
        # torch.Size([128, 30, 150])
        if self.opt.static:
            title=title.detach()
            content=content.detach()
        
        '''
        lstm输入的时候需要转成(seq_len, batch, embedding_dim）这种维度（用permute转）<br>
        output，每个时刻的LSTM网络的最后一层的输出，维度（seq_len, batch, hidden_size * num_directions）|双向lstm所以输出的hidden_size维度要乘以2<br>
        lstm的输出为output, (hn, cn) 的元组<br>
        这里取第一个就是output(100,64,256)，第二个是元组其中的第一个hn就是最后时刻的隐层状态hn(4,64,128)
        这里的4就是(2层num_layers*双向)lstm得到
        '''
        content_out = self.content_lstm(content.permute(1,0,2))[0].permute(1,2,0)
        content_conv_out = kmax_pooling((content_out),2,self.opt.kmax_pooling)
        conv_out = content_conv_out
        return conv_out
    
class Dense_Layer(BasicModule):
    def __init__(self, opt=opt):
        super(Dense_Layer, self).__init__()
        self.opt = opt
        self.fc = torch.nn.Sequential(
            torch.nn.Linear(self.opt.kmax_pooling*(self.opt.hidden_size*2),self.opt.linear_hidden_size),
            torch.nn.Dropout(0.2), # dropout
            torch.nn.BatchNorm1d(self.opt.linear_hidden_size),
            torch.nn.ReLU(inplace=True),
            torch.nn.Linear(self.opt.linear_hidden_size, self.opt.num_classes),
            torch.nn.Softmax(dim=-1)
        )
    def forward(self, x):
        reshaped = x.view(x.size(0), -1)
        softmax = self.fc((reshaped))
        return softmax
    
class Net_main(BasicModule):
    def __init__(self, opt=opt):
        super(Net_main, self).__init__()
        self.embed_layer = Embed_Layer(embedding_matrix, vocab_size)
        self.lstmtext = LSTMText(opt)
        self.dense_layer = Dense_Layer(opt)
    def forward(self, x):
        content1 = self.embed_layer(x)
        content2 = self.lstmtext(content1)
        res = self.dense_layer(content2)
        return res

In [8]:
label_tensor = torch.from_numpy(np.array(y_train)).long()
content_tensor = torch.from_numpy(np.array(X_train)).long()

torch_dataset = Data.TensorDataset(content_tensor, label_tensor)
train_loader = Data.DataLoader(
        dataset=torch_dataset,      # torch TensorDataset format
        batch_size=BATCH_SIZE,      # mini batch size
        shuffle=True,               # random shuffle for training
        num_workers=8,              # subprocesses for loading data
    )

In [9]:
# model, optimizer, loss_func
# m = LSTMText(opt)
m = Net_main(opt)
optimizer = torch.optim.Adam(m.parameters(), lr=LR)   # optimize all lstm parameters;Adam比较好用
loss_func = torch.nn.CrossEntropyLoss()   # the target label is not one-hotted 适用于多分类
if USE_CUDA:
    m.cuda()
    loss_func.cuda()

In [10]:
it = 1
for epoch in tqdm_notebook(range(EPOCH)):
    for step, (content, b_y) in enumerate(train_loader):   # 分配 batch data, normalize x when iterate train_loader
        if USE_CUDA:
            content, b_y = content.cuda(), b_y.cuda()
        output = m(content)
        loss = loss_func(output, b_y)
        if it % 100 == 0:
            print('training loss: ', loss.cpu().data.numpy().tolist())
            print('train acc ', accuracy_score(b_y.cpu().data.numpy().tolist(), np.argmax(output.cpu().data.numpy().tolist(), axis=1)))
        optimizer.zero_grad()           # clear gradients for this training step
        loss.backward()                 # backpropagation, compute gradients
        optimizer.step()                # apply gradients
        it += 1

training loss:  2.246884822845459
train acc  0.8828125
training loss:  2.249523639678955
train acc  0.875
training loss:  2.256722927093506
train acc  0.8671875
training loss:  2.2097420692443848
train acc  0.9140625


Process Process-12:
Process Process-11:
Process Process-9:
Process Process-13:
Process Process-14:
Process Process-10:
Traceback (most recent call last):
  File "/usr/local/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/local/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/local/anaconda3/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/local/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/local/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/local/anaconda3/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kw

KeyboardInterrupt: 