In [1]:
import torch
import torch.utils.data as Data
import torch as t
from torch import nn
import torch.distributed as dist
import torch.nn.functional as F
from torch.multiprocessing import Process
from torch.autograd import Variable
from torchvision import datasets, transforms

import numpy as np
import pandas as pd
import jieba
import gensim
from gensim.models import Word2Vec, FastText
import re
import os
from math import ceil
from random import Random
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm, tqdm_notebook
from sklearn.metrics import accuracy_score
import copy

from m import f1_for_car, BOW, BasicModule

In [2]:
embedding_dim = 300
USE_CUDA = False # 用CPU
EPOCH = 20           # 训练整批数据多少次
BATCH_SIZE = 128
LR = 0.002         # 学习率

## 1. 构造embedding字典

In [3]:
# 以训练数据为例
data_path_dir = 'data'
data = pd.read_csv(os.path.join(data_path_dir,'cuishou_intent3.csv'),sep='\t')
data.columns = ['content','label']

data_tmp = data.copy(deep=True)

d_ = {}
for key, value in enumerate(set(data_tmp.label)):
    d_[value] = key
data_tmp['label'] = data_tmp['label'].apply(lambda x : d_.get(x))

y_all = np.array(data_tmp.label.tolist())
# 构造embedding字典
bow = BOW(data_tmp.content.apply(jieba.lcut).tolist(), min_count=1, maxlen=30) # 长度补齐或截断固定长度30

vocab_size = len(bow.word2idx)
word2vec = gensim.models.KeyedVectors.load_word2vec_format('data/ft_wv.txt')

embedding_matrix = np.zeros((vocab_size+1,300))
for key, value in bow.word2idx.items():
    if key in word2vec.vocab: # Word2Vec训练得到的的实例需要word2vec.wv.vocab
        embedding_matrix[value] = word2vec.get_vector(key)
    else:
        embedding_matrix[value] = [0] * embedding_dim

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 1.017 seconds.
Prefix dict has been built succesfully.
   Word Count: 100%|██████████| 53850/53850 [00:00<00:00, 361791.65it/s]
Doc To Number: 100%|██████████| 53850/53850 [00:00<00:00, 195558.71it/s]


In [3]:
# np.save('save/embedding_matrix',arr=embedding_matrix)

# 2. 将数据进行partition，后面送入不同的进程中执行

In [51]:
X = copy.deepcopy(bow.doc2num)
y = copy.deepcopy(y_all)
skf = StratifiedKFold(n_splits=5,shuffle=True)
for train_idx, val_idx in skf.split(X,y):
    pass

X_train = X[train_idx]
y_train = y[train_idx]
X_val = X[val_idx]
y_val = y[val_idx]

# 数据处理成tensor
train_label_tensor = torch.from_numpy(np.array(y_train)).long()
train_content_tensor = torch.from_numpy(np.array(X_train)).long()
dataset = Data.TensorDataset(train_content_tensor, train_label_tensor)

# 验证集
val_label_tensor = torch.from_numpy(np.array(y_val)).long()
val_content_tensor = torch.from_numpy(np.array(X_val)).long()

In [52]:
# partition函数
class Partition(object):
    """ Dataset-like object, but only access a subset of it. """

    def __init__(self, data, index):
        self.data = data
        self.index = index

    def __len__(self):
        return len(self.index)

    def __getitem__(self, index):
        data_idx = self.index[index]
        return self.data[data_idx]

class DataPartitioner(object):
    """ Partitions a dataset into different chuncks. """

    def __init__(self, data, sizes=[0.7, 0.2, 0.1], seed=1234):
        self.data = data
        self.partitions = []
        rng = Random()
        rng.seed(seed)
        data_len = len(data)
        indexes = [x for x in range(0, data_len)]
        rng.shuffle(indexes)

        for frac in sizes:
            part_len = int(frac * data_len)
            self.partitions.append(indexes[0:part_len])
            indexes = indexes[part_len:]

    def use(self, partition):
        return Partition(self.data, self.partitions[partition])

def partition_dataset(train_data_all):
    """ Partitioning MNIST """
    dataset = train_data_all
    size = dist.get_world_size()
    bsz = int(128 / size)
    partition_sizes = [1.0 / size for _ in range(size)]
    partition = DataPartitioner(dataset, partition_sizes)
    partition = partition.use(dist.get_rank())
    train_set = torch.utils.data.DataLoader(
        partition, batch_size=bsz, shuffle=True)
    return train_set, bsz

# 3. 构建textCNN模型

In [59]:
# 配置文件
class Config(object):
    '''
    并不是所有的配置都生效,实际运行中只根据需求获取自己需要的参数
    '''

    loss = 'multilabelloss'
    model='TextCNN' 
    title_dim = 100 # 标题的卷积核数
    content_dim = 100 # 内容的卷积核数
    num_classes = 21 # 类别
    embedding_dim = 300 # embedding大小
    linear_hidden_size = 1000 # 全连接层隐藏元数目
    kmax_pooling = 2 # k
    hidden_size = 128 # LSTM hidden size
    num_layers=2 # LSTM layers
    inception_dim = 256 #i nception的卷积核数
    
    kernel_size = 3 # 单尺度卷积核
    kernel_sizes = [2,3,4] #多 尺度卷积核
    vocab_size = vocab_size# num of words 
    content_seq_len = 30 # 内容长度 word为50 char为100
    static = False
    use_pretrained_embedding = True
    embedding_path = 'save/embedding_matrix.npy'

opt = Config()

In [60]:
class Embed_Layer(BasicModule):

    def __init__(self, embedding_matrix=None, opt=None):
        super(Embed_Layer, self).__init__()
        self.encoder = nn.Embedding(opt.vocab_size+1, opt.embedding_dim)
        if opt.use_pretrained_embedding:
#             self.encoder.weight.data.copy_(t.from_numpy(np.load(embedding_path))) # 方法一，加载np.save的npy文件
            self.encoder.weight.data.copy_(t.from_numpy(embedding_matrix)) # 方法二
    def forward(self, x):
        return self.encoder(x)
    
kernel_sizes =  [1,2,3,4]
class MultiCNNTextBNDeep(BasicModule): 
    def __init__(self, opt):
        super(MultiCNNTextBNDeep, self).__init__()
        self.model_name = 'MultiCNNTextBNDeep'
        self.opt=opt
        content_convs = [nn.Sequential(
                                nn.Conv1d(in_channels = self.opt.embedding_dim,
                                        out_channels = self.opt.content_dim,
                                        kernel_size = kernel_size),
                                nn.BatchNorm1d(self.opt.content_dim),
                                nn.ReLU(inplace=True),

                                nn.Conv1d(in_channels = self.opt.content_dim,
                                        out_channels = self.opt.content_dim,
                                        kernel_size = kernel_size),
                                nn.BatchNorm1d(self.opt.content_dim),
                                nn.ReLU(inplace=True),
                                # maxpool1d kernel_size=50的意思就是对一句话里每50个单词取maxpool
                                nn.MaxPool1d(kernel_size = (self.opt.content_seq_len - kernel_size*2 + 2))
                            )
            for kernel_size in kernel_sizes]

        self.content_convs = nn.ModuleList(content_convs)
        
    def forward(self, content):
        if self.opt.static:
            content.detach()
        
        content_out = [content_conv(content.permute(0,2,1)) for content_conv in self.content_convs]
#         conv_out = t.cat((title_out+content_out),dim=1)
        # t.cat是对list进行拼接，这里对维度1进行拼接
        conv_out = t.cat(content_out,dim=1)
        return conv_out
    
class Dense_Layer(BasicModule):
    def __init__(self, opt=opt):
        super(Dense_Layer, self).__init__()
        self.opt = opt
        self.fc = nn.Sequential(
            nn.Linear(len(kernel_sizes)*self.opt.content_dim,self.opt.linear_hidden_size),
            nn.BatchNorm1d(self.opt.linear_hidden_size),
            nn.ReLU(inplace=True),
            nn.Linear(self.opt.linear_hidden_size,self.opt.num_classes),
            nn.Softmax(dim=-1)
        )
        
    def forward(self, x):
        reshaped = x.view(x.size(0), -1)
        softmax = self.fc((reshaped))
        return softmax

class Net_Main(BasicModule):
    def __init__(self, opt=opt):
        super(Net_Main, self).__init__()
        self.embed_layer = Embed_Layer(embedding_matrix, opt)
        self.multicnn = MultiCNNTextBNDeep(opt)
        self.dense_layer = Dense_Layer(opt)
    def forward(self, x):
        content1 = self.embed_layer(x)
        content2 = self.multicnn(content1)
        res = self.dense_layer(content2)
        return res

# 4. 多进程跑模型

In [61]:
def average_gradients(model):
    """ Gradient averaging. """
    size = float(dist.get_world_size())
    for param in model.parameters():
        dist.all_reduce(param.grad.data, op=dist.reduce_op.SUM, group=0)
        param.grad.data /= size

In [109]:
def run(rank, size):
    """ Distributed Synchronous SGD Example """
    torch.manual_seed(1200)
    train_loader, bsz = partition_dataset(dataset)
    model = Net_Main(opt)
    model = model
#    model = model.cuda(rank)
    # 指定优化函数和损失函数
    optimizer = torch.optim.Adam(model.parameters(), lr=LR)
    loss_func = torch.nn.CrossEntropyLoss()   # the target label is not one-hotted 适用于多分类

    num_batches = ceil(len(train_loader.dataset) / float(bsz))
    it = 1
    for epoch in range(EPOCH):
        epoch_loss = 0.0
        for batch_id, (data, target) in enumerate(train_loader):
            data, target = Variable(data), Variable(target)
            output = model(data)
            loss = loss_func(output, target)
            epoch_loss += loss.data.item()
            optimizer.zero_grad()           # clear gradients for this training step
            loss.backward()                 # backpropagation, compute gradients
            average_gradients(model)
            optimizer.step()                # apply gradients
            it += 1

        print('CPU ',
                  dist.get_rank(), ', epoch ', epoch, ', ',
                  'trian loss ', epoch_loss / num_batches)
        val_output = model(val_content_tensor)
        print('val acc: ', accuracy_score(val_label_tensor.cpu().data.numpy(), np.argmax(val_output.cpu().data.numpy(),axis=1)))
        print('epoch {}....................................'.format(epoch))
        del val_output

In [110]:
def init_processes(rank, size, fn, backend='tcp'):
    """ Initialize the distributed environment. """
    os.environ['MASTER_ADDR'] = '127.0.0.1'
    os.environ['MASTER_PORT'] = '29500'
    dist.init_process_group(backend, rank=rank, world_size=size)
    fn(rank, size)

In [None]:
if __name__ == "__main__":
    size = 2
    processes = []
    for rank in range(size):
        p = Process(target=init_processes, args=(rank, size, run))
        p.start()
        processes.append(p)

    for p in processes:
        p.join()

CPU  0 , epoch  0 ,  trian loss  2.197248056309867
CPU  1 , epoch  0 ,  trian loss  2.197548013766371
val acc:  0.9660843709347705
epoch 0....................................
val acc:  0.9660843709347705
epoch 0....................................
CPU  0 , epoch  1 ,  trian loss  2.1527170604343584
CPU  1 , epoch  1 ,  trian loss  2.154448325273191
val acc:  0.9730533358111875
epoch 1....................................
val acc:  0.9730533358111875
epoch 1....................................
CPU  1 , epoch  2 ,  trian loss  2.144392533542848
CPU  0 , epoch  2 ,  trian loss  2.1433684005227924
val acc:  0.9749117264448988
epoch 2....................................
val acc:  0.9749117264448988
epoch 2....................................
CPU  1 , epoch  3 ,  trian loss  2.14008997951134
CPU  0 , epoch  3 ,  trian loss  2.138311307578837
val acc:  0.9822523694480579
epoch 3....................................
val acc:  0.9822523694480579
epoch 3....................................
