In [2]:
import torch
import torch.utils.data as Data
import torch as t
from torch import nn
import numpy as np
import pandas as pd
import jieba
import gensim
from gensim.models import Word2Vec, FastText
import re
import os
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm, tqdm_notebook
from sklearn.metrics import accuracy_score
import copy

from m import f1_for_car, BOW, BasicModule

In [3]:
# Read the labelled training set and the public test set from the local data/ folder.
data, test = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train.csv', 'data/test_public.csv')
)

In [4]:
# Merge subject and sentiment into a single combined class label
# (30 classes according to the original note).
data['subject_1'] = data['subject'] + data['sentiment_value'].astype(str)

# Distinct combined classes, with missing values dropped.  The list is sorted so
# that the class -> integer id mapping is identical on every kernel restart;
# iterating a set of strings depends on hash randomization and would otherwise
# assign different ids from run to run.
subj_lst = sorted(data['subject_1'].dropna().unique())
subj_lst_dic = {value: key for key, value in enumerate(subj_lst)}

# Rows whose combined class is missing get no id (dict.get returns None).
data['label'] = data['subject_1'].apply(lambda x: subj_lst_dic.get(x))

# Keep only the text and its integer label; data_tmp is an independent working copy.
data = data[['content', 'label']].copy(deep=True)
data_tmp = data.copy(deep=True)

In [None]:
# Training hyper-parameters shared by the cells below.
embedding_dim = 300                    # width of each word vector
USE_CUDA = torch.cuda.is_available()   # use the GPU only when one is actually present
EPOCH = 30                             # number of full passes over the training data
BATCH_SIZE = 128                       # mini-batch size
LR = 0.002                             # learning rate

In [None]:
# Re-index the labels that are present to consecutive integer ids.
d_ = {value: key for key, value in enumerate(set(data_tmp.label))}
data_tmp['label'] = data_tmp['label'].apply(lambda x: d_.get(x))

y_train = np.array(data_tmp.label.tolist())

# Tokenize every document with jieba and build the vocabulary.
# Per the original note, BOW pads or truncates each document to a fixed length of 30.
bow = BOW(data_tmp.content.apply(jieba.lcut).tolist(), min_count=1, maxlen=30)

vocab_size = len(bow.word2idx)
word2vec = gensim.models.KeyedVectors.load_word2vec_format('data/ft_wv.txt')
assert word2vec.vector_size == embedding_dim, (
    'pretrained vectors have width %d but embedding_dim is %d'
    % (word2vec.vector_size, embedding_dim)
)

# One row per vocabulary index plus one extra row (presumably the padding
# index -- TODO confirm against BOW).  Rows start as zeros, so words without a
# pretrained vector simply keep their all-zero row.
embedding_matrix = np.zeros((vocab_size + 1, embedding_dim))
for key, value in bow.word2idx.items():
    # `key in word2vec` works for KeyedVectors in both gensim 3.x and 4.x;
    # the `.vocab` attribute used previously no longer exists in gensim 4.
    if key in word2vec:
        embedding_matrix[value] = word2vec.get_vector(key)

In [45]:
# Persist the embedding matrix so the model can load it from disk later.
# np.save does not create missing directories, so make sure save/ exists first.
os.makedirs('save', exist_ok=True)
np.save('save/embedding_matrix', arr=embedding_matrix)

In [46]:
# Independent copies of the per-document word-index sequences and of the labels,
# so later cells cannot modify the objects held by `bow` or the earlier y_train.
X_train, y_train = copy.deepcopy(bow.doc2num), copy.deepcopy(y_train)

## CNN conv1d

In [101]:
class Config(object):
    '''
    Shared hyper-parameter container for the text models.

    Not every setting takes effect: each model only reads the parameters it
    actually needs.
    '''

    loss = 'multilabelloss'
    model = 'LSTMText'
    title_dim = 100            # number of convolution filters for the title
    content_dim = 100          # number of convolution filters for the content
    num_classes = 30           # number of target classes
    embedding_dim = 300        # embedding width
    linear_hidden_size = 1000  # hidden units in the fully connected layer
    kmax_pooling = 2           # k for k-max pooling
    hidden_size = 128          # LSTM hidden size
    num_layers = 2             # LSTM layers
    inception_dim = 256        # number of inception convolution filters

    kernel_size = 3            # single-scale convolution kernel
    kernel_sizes = [2, 3, 4]   # multi-scale convolution kernels
    # vocab_size = 11973 # num of chars
    vocab_size = vocab_size    # number of words; read from the notebook-level vocab_size
    # Must equal the padded sequence length produced by BOW(maxlen=30).  The
    # pooling kernel derived from this value would otherwise be larger than the
    # convolution output and the forward pass would fail.
    content_seq_len = 30
    static = False
    embedding_path = 'save/embedding_matrix.npy'


opt = Config()

In [218]:
# Convolution kernel widths; one parallel branch is built per entry.
kernel_sizes = [1, 2, 3, 4]


class MultiCNNTextBNDeep(BasicModule):
    """Multi-scale, two-layer 1-D CNN text classifier with batch normalization.

    One branch is built for every width in the module-level ``kernel_sizes``.
    Each branch stacks Conv1d -> BatchNorm1d -> ReLU twice and then takes the
    maximum over the whole remaining sequence.  The branch outputs are
    concatenated and passed through a two-layer fully connected head.

    Args:
        opt: configuration object providing ``vocab_size``, ``embedding_dim``,
            ``content_dim``, ``linear_hidden_size``, ``num_classes``, ``static``
            and ``embedding_path``.
    """

    def __init__(self, opt):
        super(MultiCNNTextBNDeep, self).__init__()
        self.model_name = 'MultiCNNTextBNDeep'
        self.opt = opt
        self.encoder = nn.Embedding(self.opt.vocab_size + 1, opt.embedding_dim)

        content_convs = [
            nn.Sequential(
                nn.Conv1d(in_channels=self.opt.embedding_dim,
                          out_channels=self.opt.content_dim,
                          kernel_size=kernel_size),
                nn.BatchNorm1d(self.opt.content_dim),
                nn.ReLU(inplace=True),

                nn.Conv1d(in_channels=self.opt.content_dim,
                          out_channels=self.opt.content_dim,
                          kernel_size=kernel_size),
                nn.BatchNorm1d(self.opt.content_dim),
                nn.ReLU(inplace=True),
                # Global max over the sequence axis.  Adaptive pooling yields
                # one value per channel for any input length, so the model no
                # longer depends on opt.content_seq_len matching the real
                # padded length.
                nn.AdaptiveMaxPool1d(1),
            )
            for kernel_size in kernel_sizes
        ]

        self.content_convs = nn.ModuleList(content_convs)

        # The head ends in a plain Linear layer and emits raw logits:
        # CrossEntropyLoss applies log-softmax itself, so a Softmax layer here
        # would apply softmax twice and flatten the gradients.  Softmax has no
        # parameters, so state_dict keys are unchanged.
        self.fc = nn.Sequential(
            nn.Linear(len(kernel_sizes) * self.opt.content_dim, self.opt.linear_hidden_size),
            nn.BatchNorm1d(self.opt.linear_hidden_size),
            nn.ReLU(inplace=True),
            nn.Linear(self.opt.linear_hidden_size, self.opt.num_classes),
        )

        # Initialise the embedding table from the saved matrix when a path is configured.
        if opt.embedding_path:
            self.encoder.weight.data.copy_(t.from_numpy(np.load(self.opt.embedding_path)))

    def forward(self, content):
        """Compute class scores for a batch of word-index sequences.

        Args:
            content: integer tensor of word indices, shape (batch, seq_len).

        Returns:
            Tensor of shape (batch, num_classes) holding unnormalized logits.
            Apply ``softmax(dim=1)`` to obtain probabilities; ``argmax`` over
            the logits gives the same predicted class.
        """
        content = self.encoder(content)
        if self.opt.static:
            # detach() returns a new tensor; it must be re-assigned for the
            # embeddings to actually be excluded from back-propagation.
            content = content.detach()

        # Conv1d expects (batch, channels, seq_len), so move the embedding axis to dim 1.
        content = content.permute(0, 2, 1)
        content_out = [content_conv(content) for content_conv in self.content_convs]
        # Concatenate the branch outputs along the channel dimension.
        conv_out = t.cat(content_out, dim=1)
        reshaped = conv_out.view(conv_out.size(0), -1)
        logits = self.fc(reshaped)
        return logits

In [220]:
# Convert the word-index sequences and labels to tensors.
label_tensor = torch.from_numpy(np.array(y_train)).long()
content_tensor = torch.from_numpy(np.array(X_train)).long()

# Hold out a fixed, seeded share of the samples for validation.  The training
# loop reports validation loss/accuracy from content_val_tensor and
# label_val_tensor, which were not defined anywhere before this change.
VAL_FRACTION = 0.1
SPLIT_SEED = 42
shuffled_idx = torch.from_numpy(
    np.random.RandomState(SPLIT_SEED).permutation(len(label_tensor))
).long()
n_val = max(1, int(len(label_tensor) * VAL_FRACTION))
val_idx, train_idx = shuffled_idx[:n_val], shuffled_idx[n_val:]

content_val_tensor = content_tensor[val_idx]
label_val_tensor = label_tensor[val_idx]

torch_dataset = Data.TensorDataset(content_tensor[train_idx], label_tensor[train_idx])
train_loader = Data.DataLoader(
        dataset=torch_dataset,      # torch TensorDataset format
        batch_size=BATCH_SIZE,      # mini batch size
        shuffle=True,               # random shuffle for training
        num_workers=8,              # subprocesses for loading data
    )

# model, optimizer, loss_func
m = MultiCNNTextBNDeep(opt)
optimizer = torch.optim.Adam(m.parameters(), lr=LR)   # optimize all model parameters with Adam
loss_func = torch.nn.CrossEntropyLoss()   # takes integer class labels (not one-hot); multi-class
if USE_CUDA:
    m.cuda()
    loss_func.cuda()
    # Keep the validation tensors on the same device as the model.
    content_val_tensor = content_val_tensor.cuda()
    label_val_tensor = label_val_tensor.cuda()

CrossEntropyLoss()

In [221]:
# # val
# if USE_CUDA:
#     content_val_tensor = content_val_tensor.cuda()
#     label_val_tensor = label_val_tensor.cuda()

In [222]:
# Train for EPOCH passes over train_loader; every 50 iterations print the loss
# and accuracy on the current batch and on the validation tensors.
device = next(m.parameters()).device   # follow the model's device instead of forcing CUDA
it = 1
for epoch in tqdm_notebook(range(EPOCH)):
    for step, (content, b_y) in enumerate(train_loader):
        content, b_y = content.to(device), b_y.to(device)
        output = m(content)
        loss = loss_func(output, b_y)
        if it % 50 == 0:
            # Evaluate in eval mode without autograd so the BatchNorm running
            # statistics are not updated with validation data and no graph is
            # kept for the validation pass.
            m.eval()
            with torch.no_grad():
                val_output = m(content_val_tensor.to(device))
                val_loss = loss_func(val_output, label_val_tensor.to(device)).item()
            m.train()
            print('training loss: ', loss.item())
            print('val loss: ', val_loss)
            print('training acc: ', accuracy_score(b_y.cpu().numpy(), np.argmax(output.detach().cpu().numpy(), axis=1)))
            print('val acc: ', accuracy_score(label_val_tensor.cpu().numpy(), np.argmax(val_output.cpu().numpy(), axis=1)))
        optimizer.zero_grad()           # clear gradients for this training step
        loss.backward()                 # backpropagation, compute gradients
        optimizer.step()                # apply gradients
        it += 1