In [1]:
import torch.utils.data as Data
import torch as t
from torch.optim import Adam
from torch import nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import jieba
import os
import gensim
from gensim.models import Word2Vec, FastText
import re
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# tfidf or countvec for lr or svm
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm, tqdm_notebook
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
import copy

from m import f1_for_car, BOW, BasicModule
from rnn_revised import *

### 用胶囊网络看下效果

In [2]:
import torch
import torch as t
import torch.nn as nn
import torch.nn.functional as F

USE_CUDA = True
embedding_dim = 300
embedding_path = '../save/embedding_matrix.npy' # or False, not use pre-trained-matrix
use_pretrained_embedding = True
BATCH_SIZE = 128
gru_len = 128
Routings = 5
Num_capsule = 10
Dim_capsule = 16
dropout_p = 0.25
rate_drop_dense = 0.28
LR = 0.001
T_epsilon=1e-7
num_classes = 21


class Embed_Layer(nn.Module):

    def __init__(self, embedding_matrix=None, vocab_size=None, embedding_dim=300):
        super(Embed_Layer, self).__init__()
        self.encoder = nn.Embedding(vocab_size+1,embedding_dim)
        if use_pretrained_embedding:
            # self.encoder.weight.data.copy_(t.from_numpy(np.load(embedding_path))) # 方法一，加载np.save的npy文件
#             self.encoder.weight.data.copy_(t.from_numpy(embedding_matrix)) # 方法二
            self.encoder.weight = nn.Parameter(t.from_numpy(embedding_matrix).float(), requires_grad=False)
    def forward(self, x, dropout_p=0.25):
#         return nn.Dropout(p=dropout_p)(self.encoder(x))
        return self.encoder(x)

class GRU_Layer(nn.Module):

    def __init__(self):
        super(GRU_Layer, self).__init__()
        self.gru = nn.LSTM(input_size=300,
                         hidden_size=gru_len,
                         bidirectional=True)
#         self.gru = RNNHardSigmoid('GRU',input_size=300,
#                          hidden_size=gru_len,
#                          bidirectional=True)
    # 这步很关键，需要像keras一样用glorot_uniform和orthogonal_uniform初始化参数
    def init_weights(self):
        ih = (param.data for name, param in self.named_parameters() if 'weight_ih' in name)
        hh = (param.data for name, param in self.named_parameters() if 'weight_hh' in name)
        b = (param.data for name, param in self.named_parameters() if 'bias' in name)
        for t in ih:
            nn.init.xavier_uniform_(t)
        for t in hh:
            nn.init.orthogonal_(t)
        for t in b:
            nn.init.constant_(t, 0)
    
    def forward(self, x):
        return self.gru(x)

# core caps_layer with squash func
class Caps_Layer(nn.Module):
    def __init__(self, input_dim_capsule=gru_len*2, num_capsule=Num_capsule, dim_capsule=Dim_capsule, \
                 routings=Routings, kernel_size=(9, 1), share_weights=True,
                activation='default',**kwargs):
        super(Caps_Layer, self).__init__(**kwargs)

        self.num_capsule = num_capsule
        self.dim_capsule = dim_capsule
        self.routings = routings
        self.kernel_size = kernel_size # 暂时没用到
        self.share_weights = share_weights
        if activation == 'default':
            self.activation = self.squash
        else:
            self.activation = nn.ReLU(inplace=True)

        if self.share_weights:
            self.W = nn.Parameter(nn.init.xavier_normal_(torch.empty(1, input_dim_capsule, self.num_capsule * self.dim_capsule)))
        else:
            self.W = nn.Parameter(torch.randn(BATCH_SIZE, input_dim_capsule,self.num_capsule * self.dim_capsule)) #64即batch_size

    def forward(self, x):

        if self.share_weights:
            u_hat_vecs = torch.matmul(x, self.W)
        else:
            print('add later')

        batch_size = x.size(0)
        input_num_capsule = x.size(1)
        u_hat_vecs = u_hat_vecs.view((batch_size, input_num_capsule,
                                            self.num_capsule, self.dim_capsule))
        u_hat_vecs = u_hat_vecs.permute(0, 2, 1, 3) # 转成(batch_size,num_capsule,input_num_capsule,dim_capsule)
        b = torch.zeros_like(u_hat_vecs[:, :, :, 0]) # (batch_size,num_capsule,input_num_capsule)

        for i in range(self.routings):
            b = b.permute(0, 2, 1)
            c = F.softmax(b, dim=2)
            c = c.permute(0, 2, 1)
            b = b.permute(0, 2, 1)
            outputs = self.activation(torch.einsum('bij,bijk->bik', (c, u_hat_vecs))) # batch matrix multiplication
            # outputs shape (batch_size, num_capsule, dim_capsule)
            if i < self.routings - 1:
                b = torch.einsum('bik,bijk->bij', (outputs, u_hat_vecs)) # batch matrix multiplication
        return outputs # (batch_size, num_capsule, dim_capsule)

    # text version of squash, slight different from original one
    def squash(self, x, axis=-1):
        s_squared_norm  = (x ** 2).sum(axis, keepdim=True)
        scale = torch.sqrt(s_squared_norm + T_epsilon)
        return x / scale


class Dense_Layer(nn.Module):
    def __init__(self):
        super(Dense_Layer, self).__init__()
        self.fc = nn.Sequential(
            nn.Dropout(p=dropout_p,inplace=True),    
            nn.Linear(Num_capsule*Dim_capsule, num_classes), # num_capsule*dim_capsule -> num_classes
            nn.Softmax(dim=-1)
        )
    def forward(self, x):
        batch_size = x.size(0)
        x = x.view(batch_size, -1)
        return self.fc(x)

# capsule如果单纯做分类则不需要重构(reconstruction)
# 如果就用在分类里面，decoder用不到，不需要reconstruction

class Capsule_Main(nn.Module):
    def __init__(self, embedding_matrix=None, vocab_size=None):
        super(Capsule_Main, self).__init__()
        self.embed_layer = Embed_Layer(embedding_matrix, vocab_size)
        self.gru_layer = GRU_Layer()
        # 【重要】初始化GRU权重操作，这一步非常关键，acc上升到0.98，如果用默认的uniform初始化则acc一直在0.5左右
        self.gru_layer.init_weights()     
        self.caps_layer = Caps_Layer()
        self.dense_layer = Dense_Layer()

    def forward(self, content):
        content1 = self.embed_layer(content)
        content2, _ = self.gru_layer(content1) # 这个输出是个tuple，一个output(seq_len, batch_size, num_directions * hidden_size)，一个hn
        content3 = self.caps_layer(content2)
        output = self.dense_layer(content3)
        return output

In [3]:
# 以训练数据为例
data_path_dir = 'data'
data = pd.read_csv(os.path.join(data_path_dir,'cuishou_intent3.csv'),sep='\t')
data.columns = ['content','label']

data_tmp = data.copy(deep=True)

d_ = {}
for key, value in enumerate(set(data_tmp.label)):
    d_[value] = key
data_tmp['label'] = data_tmp['label'].apply(lambda x : d_.get(x))

y_all = np.array(data_tmp.label.tolist())
# 构造embedding字典
bow = BOW(data_tmp.content.apply(jieba.lcut).tolist(), min_count=1, maxlen=30) # 长度补齐或截断固定长度30

vocab_size = len(bow.word2idx)
word2vec = gensim.models.KeyedVectors.load_word2vec_format('data/ft_wv.txt')

embedding_matrix = np.zeros((vocab_size+1,300))
for key, value in bow.word2idx.items():
    if key in word2vec.vocab: # Word2Vec训练得到的的实例需要word2vec.wv.vocab
        embedding_matrix[value] = word2vec.get_vector(key)
    else:
        embedding_matrix[value] = [0] * embedding_dim

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.959 seconds.
Prefix dict has been built succesfully.
   Word Count: 100%|██████████| 53850/53850 [00:00<00:00, 429428.88it/s]
Doc To Number: 100%|██████████| 53850/53850 [00:00<00:00, 231839.39it/s]


In [4]:
# train_test_split
X = copy.deepcopy(bow.doc2num)
y = copy.deepcopy(y_all)
skf = StratifiedKFold(n_splits=5,shuffle=True)
for train_idx, val_idx in skf.split(X,y):
    pass

X_train = X[train_idx]
y_train = y[train_idx]
X_val = X[val_idx]
y_val = y[val_idx]

# 数据处理成tensor
BATCH_SIZE = 128
train_label_tensor = torch.from_numpy(np.array(y_train)).long()
train_content_tensor = torch.from_numpy(np.array(X_train)).long()

train_torch_dataset = Data.TensorDataset(train_content_tensor, train_label_tensor)
train_loader = Data.DataLoader(
        dataset=train_torch_dataset,      # torch TensorDataset format
        batch_size=BATCH_SIZE,      # mini batch size
        shuffle=True,               # random shuffle for training
        num_workers=8,              # subprocesses for loading data
    )

val_label_tensor = torch.from_numpy(np.array(y_val)).long()
val_content_tensor = torch.from_numpy(np.array(X_val)).long()

val_torch_dataset = Data.TensorDataset(val_content_tensor, val_label_tensor)
val_loader = Data.DataLoader(
        dataset=val_torch_dataset,      # torch TensorDataset format
        batch_size=BATCH_SIZE,      # mini batch size
        shuffle=True,               # random shuffle for training
        num_workers=8,              # subprocesses for loading data
    )

In [5]:
if USE_CUDA:
    val_content_tensor = val_content_tensor.cuda()
    val_label_tensor = val_label_tensor.cuda()

In [6]:
# 网络结构、损失函数、优化器初始化
capnet = Capsule_Main(embedding_matrix, vocab_size=vocab_size) # 加载预训练embedding matrix
loss_func = nn.CrossEntropyLoss()
if USE_CUDA:
    capnet = capnet.cuda() # 把搭建的网络载入GPU
    loss_func.cuda() # 把损失函数载入GPU
optimizer = Adam(capnet.parameters(),lr=LR) # 默认lr

In [7]:
it = 1
EPOCH = 30
for epoch in tqdm_notebook(range(EPOCH)):
    for batch_id, (data, target) in enumerate(train_loader):
        if USE_CUDA:
            data, target = data.cuda(), target.cuda() # 数据载入GPU
        output = capnet(data)
        loss = loss_func(output, target)
#         if it % 200 == 0:
#             print('training loss: ', loss.cpu().data.numpy().tolist())
#             print('training acc: ', accuracy_score(target.cpu().data.numpy(), np.argmax(output.cpu().data.numpy(),axis=1)))
        optimizer.zero_grad()           # clear gradients for this training step
        loss.backward()                 # backpropagation, compute gradients
        optimizer.step()                # apply gradients
        it += 1
    val_output = capnet(val_content_tensor)
    print('val acc: ', accuracy_score(val_label_tensor.cpu().data.numpy(), np.argmax(val_output.cpu().data.numpy(),axis=1)))
    print('epoch {}....................................'.format(epoch))
    del val_output

val acc:  0.9142352722542278
epoch 0....................................
val acc:  0.9284519606021185
epoch 1....................................
val acc:  0.9346775692250511
epoch 2....................................
val acc:  0.9416465341014681
epoch 3....................................
val acc:  0.9493588552313696
epoch 4....................................
val acc:  0.9475004645976585
epoch 5....................................
val acc:  0.9491730161679985
epoch 6....................................
val acc:  0.9525181193086787
epoch 7....................................
val acc:  0.963854302174317
epoch 8....................................
val acc:  0.9644118193644304
epoch 9....................................
val acc:  0.9619029920089203
epoch 10....................................
val acc:  0.9554915443226166
epoch 11....................................
val acc:  0.9643188998327449
epoch 12....................................
val acc:  0.9619959115406058
epoch 13............

Process Process-157:
Process Process-156:
Process Process-160:
Process Process-153:
Process Process-155:
Process Process-154:
Process Process-159:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/local/anaconda3/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
Traceback (most recent call last):
  File "/usr/local/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
Traceback (most recent call last):
Process Process-158:
Traceback (most recent call last):
  File "/usr/local/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/local/anaconda3/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/local/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/local/anaconda3/lib/python3.6/multiprocessi

KeyboardInterrupt: 