In [1]:
import torch as t
import torch.utils.data as Data
from torch import nn
import numpy as np
import pandas as pd
import jieba
import gensim
from gensim.models import Word2Vec, FastText
import re
import os
from collections import Counter, OrderedDict
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm, tqdm_notebook
from sklearn.metrics import accuracy_score
import copy

from m import f1_for_car, BOW, BasicModule

In [2]:
embedding_dim = 300
USE_CUDA=True
EPOCH = 20           # 训练整批数据多少次
BATCH_SIZE = 128
LR = 0.002         # 学习率

# 1. 构造embedding字典

In [3]:
# 以训练数据为例
data_path_dir = 'data'
data = pd.read_csv(os.path.join(data_path_dir,'cuishou_intent3.csv'),sep='\t')
data.columns = ['content','label']

data_tmp = data.copy(deep=True)

# label数值化(编码)
d_ = {}
for key, value in enumerate(set(data_tmp.label)):
    d_[value] = key
data_tmp['label'] = data_tmp['label'].apply(lambda x : d_.get(x))

y_all = np.array(data_tmp.label.tolist())
# 构造embedding字典
bow = BOW(data_tmp.content.apply(jieba.lcut).tolist(), min_count=1, maxlen=30) # 长度补齐或截断固定长度30

vocab_size = len(bow.word2idx)
word2vec = gensim.models.KeyedVectors.load_word2vec_format('data/ft_wv.txt')

embedding_matrix = np.zeros((vocab_size+1,300))
for key, value in bow.word2idx.items():
    if key in word2vec.vocab: # Word2Vec训练得到的的实例需要word2vec.wv.vocab
        embedding_matrix[value] = word2vec.get_vector(key)
    else:
        embedding_matrix[value] = [0] * embedding_dim

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.976 seconds.
Prefix dict has been built succesfully.
   Word Count: 100%|██████████| 53850/53850 [00:00<00:00, 402615.50it/s]
Doc To Number: 100%|██████████| 53850/53850 [00:00<00:00, 156173.72it/s]


In [4]:
# np.save('save/embedding_matrix',arr=embedding_matrix)

# 2. 数据处理 train_test_split 装入tensor

In [5]:
X = copy.deepcopy(bow.doc2num)
y = copy.deepcopy(y_all)
skf = StratifiedKFold(n_splits=5,shuffle=True)
for train_idx, val_idx in skf.split(X,y):
    pass

X_train = X[train_idx]
y_train = y[train_idx]
X_val = X[val_idx]
y_val = y[val_idx]

# 数据处理成tensor
train_label_tensor = t.from_numpy(np.array(y_train)).long()
train_content_tensor = t.from_numpy(np.array(X_train)).long()

train_torch_dataset = Data.TensorDataset(train_content_tensor, train_label_tensor)
train_loader = Data.DataLoader(
        dataset=train_torch_dataset,      # torch TensorDataset format
        batch_size=BATCH_SIZE,      # mini batch size
        shuffle=True,               # random shuffle for training
        num_workers=8,              # subprocesses for loading data
    )

val_label_tensor = t.from_numpy(np.array(y_val)).long()
val_content_tensor = t.from_numpy(np.array(X_val)).long()

val_torch_dataset = Data.TensorDataset(val_content_tensor, val_label_tensor)
val_loader = Data.DataLoader(
        dataset=val_torch_dataset,      # torch TensorDataset format
        batch_size=BATCH_SIZE,      # mini batch size
        shuffle=True,               # random shuffle for training
        num_workers=8,              # subprocesses for loading data
    )

if USE_CUDA:
    val_content_tensor = val_content_tensor.cuda()
    val_label_tensor = val_label_tensor.cuda()

# 3. 构建Inception模型

In [6]:
# 配置文件
class Config(object):
    '''
    并不是所有的配置都生效,实际运行中只根据需求获取自己需要的参数
    '''

    loss = 'multilabelloss'
    model='LSTMText' 
    num_classes = 21 # 类别
    embedding_dim = 300 # embedding大小
    linear_hidden_size = 100 # 全连接层隐藏元数目
    kmax_pooling = 30 # k
    hidden_size = 128 # LSTM hidden size
    num_layers= 2 # LSTM layers
    inception_dim = 256 # inception的卷积核数
    
    # vocab_size = 11973 # num of chars
    vocab_size = vocab_size # num of words 
    content_seq_len = 30 # 描述长度 word为100 char为200
    static = False
    use_pretrained_embedding = True
    embedding_path = 'save/embedding_matrix.npy'

opt = Config()

In [7]:
class Embed_Layer(BasicModule):

    def __init__(self, embedding_matrix=None, opt=None):
        super(Embed_Layer, self).__init__()
        self.encoder = nn.Embedding(opt.vocab_size+1, opt.embedding_dim)
        if opt.use_pretrained_embedding:
#             self.encoder.weight.data.copy_(t.from_numpy(np.load(embedding_path))) # 方法一，加载np.save的npy文件
#             self.encoder.weight.data.copy_(t.from_numpy(embedding_matrix)) # 方法二
            self.encoder.weight = nn.Parameter(t.from_numpy(embedding_matrix).float(), requires_grad=False)
    def forward(self, x):
        return self.encoder(x)

class Ince(nn.Module):
    def __init__(self, cin, co, relu=True, norm=True):
        super(Ince, self).__init__()
        assert (co % 4 == 0)
        cos = [co // 4] * 4
        self.activa = nn.Sequential()
        if norm:
            self.activa.add_module('norm', nn.BatchNorm1d(co))
        if relu:
            self.activa.add_module('relu', nn.ReLU(True))
        self.branch1 = nn.Sequential(OrderedDict([
            ('conv1', nn.Conv1d(cin, cos[0], 1, stride=1)),
        ]))
        self.branch2 = nn.Sequential(OrderedDict([
            ('conv1', nn.Conv1d(cin, cos[1], 1)),
            ('norm1', nn.BatchNorm1d(cos[1])),
            ('relu1', nn.ReLU(inplace=True)),
            ('conv3', nn.Conv1d(cos[1], cos[1], 3, stride=1, padding=1)),
        ]))
        self.branch3 = nn.Sequential(OrderedDict([
            ('conv1', nn.Conv1d(cin, cos[2], 3, padding=1)),
            ('norm1', nn.BatchNorm1d(cos[2])),
            ('relu1', nn.ReLU(inplace=True)),
            ('conv3', nn.Conv1d(cos[2], cos[2], 5, stride=1, padding=2)),
        ]))
        self.branch4 = nn.Sequential(OrderedDict([
            # ('pool',nn.MaxPool1d(2)),
            ('conv3', nn.Conv1d(cin, cos[3], 3, stride=1, padding=1)),
        ]))

    def forward(self, x):
        branch1 = self.branch1(x)
        branch2 = self.branch2(x)
        branch3 = self.branch3(x)
        branch4 = self.branch4(x)
        result = self.activa(t.cat((branch1, branch2, branch3, branch4), 1))
        return result


class Dense_Layer(BasicModule):
    def __init__(self, opt=opt):
        super(Dense_Layer, self).__init__()
        self.opt = opt
        self.fc = nn.Sequential(
            nn.Linear(200, opt.linear_hidden_size),
            nn.BatchNorm1d(opt.linear_hidden_size),
            nn.ReLU(inplace=True),
            nn.Linear(100, opt.num_classes),
            nn.Softmax(dim=-1)
        )
    def forward(self, x):
        reshaped = x.view(x.size(0), -1)
        softmax = self.fc((reshaped))
        return softmax
    
class Net_Main(BasicModule):
    def __init__(self, opt=opt):
        super(Net_Main, self).__init__()
        self.embed_layer = Embed_Layer(embedding_matrix, opt)
        # Inception layer
        self.inception = nn.Sequential(
            Ince(opt.embedding_dim, 200),
            Ince(200, 200),
            nn.MaxPool1d(opt.content_seq_len)
        )
        self.dense_layer = Dense_Layer(opt)
    def forward(self, x):
        content1 = self.embed_layer(x) # content1.size() (batch_size, seq_len, embed_dim)
        content2 = self.inception(content1.permute(0,2,1))
        res = self.dense_layer(content2)
        return res

# 4. 跑模型

In [8]:
# model, optimizer, loss_func
m = Net_Main(opt)
optimizer = t.optim.Adam(m.parameters(), lr=LR)   # optimize all lstm parameters;Adam比较好用
loss_func = nn.CrossEntropyLoss()   # the target label is not one-hotted 适用于多分类
if USE_CUDA:
    m.cuda()
    loss_func.cuda()

In [9]:
it = 1
for epoch in tqdm_notebook(range(EPOCH)):
    for batch_id, (data, target) in enumerate(train_loader):
        if USE_CUDA:
            data, target = data.cuda(), target.cuda() # 数据载入GPU
        output = m(data)
        loss = loss_func(output, target)
#         if it % 200 == 0:
#             print('training loss: ', loss.cpu().data.numpy().tolist())
#             print('training acc: ', accuracy_score(target.cpu().data.numpy(), np.argmax(output.cpu().data.numpy(),axis=1)))
        optimizer.zero_grad()           # clear gradients for this training step
        loss.backward()                 # backpropagation, compute gradients
        optimizer.step()                # apply gradients
        it += 1
    val_output = m(val_content_tensor)
    print('val acc: ', accuracy_score(val_label_tensor.cpu().data.numpy(), np.argmax(val_output.cpu().data.numpy(),axis=1)))
    print('epoch {}....................................'.format(epoch))
    del val_output

val acc:  0.9520535216502509
epoch 0....................................
val acc:  0.9774205538004088
epoch 1....................................
val acc:  0.9842036796134548
epoch 2....................................
val acc:  0.9878275413491916
epoch 3....................................
val acc:  0.9843895186768259
epoch 4....................................
val acc:  0.9872700241590783
epoch 5....................................
val acc:  0.9891284147927895
epoch 6....................................


Process Process-64:
Process Process-63:
Process Process-61:
Traceback (most recent call last):
Process Process-59:
Process Process-58:
Process Process-62:
  File "/usr/local/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
Process Process-57:
Traceback (most recent call last):
Process Process-60:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/local/anaconda3/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
Traceback (most recent call last):
  File "/usr/local/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/local/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/local/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstr

KeyboardInterrupt: 