In [2]:
import torch
import torch.utils.data as Data
import torch as t
from torch import nn
import numpy as np
import pandas as pd
import jieba
import gensim
from gensim.models import Word2Vec, FastText
import re
import os
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm, tqdm_notebook
from sklearn.metrics import accuracy_score
import copy

from m import f1_for_car, BOW, BasicModule

In [3]:
# Read the training CSV and the public test CSV into DataFrames.
data, test = (pd.read_csv(csv_path) for csv_path in ('data/train.csv', 'data/test_public.csv'))

In [11]:
# Merge subject and sentiment into a single class label
# (30 classes according to the original author's note).
data['subject_1'] = data['subject'] + data['sentiment_value'].astype(str)

# Build the label -> id mapping from the *sorted* distinct labels so the ids
# are identical on every run. Iterating a set of strings depends on the
# per-process hash seed, which made the ids change between kernel sessions.
# Missing labels are dropped with dropna() instead of an identity test
# against np.nan, which only matches the np.nan singleton itself.
subj_lst = sorted(data['subject_1'].dropna().unique())
subj_lst_dic = {value: key for key, value in enumerate(subj_lst)}
data['label'] = data['subject_1'].apply(lambda x : subj_lst_dic.get(x))

# Keep only the text and its integer label; data_tmp is the working copy
# used by the later cells.
data = data[['content', 'label']].copy(deep=True)
data_tmp = data.copy(deep=True)

In [19]:
# subj_dict = {'价格':['价格','性价比','低价','降价','优惠','便宜','划算','不菲','实惠','贵','价差','单价','合算','合理','高昂','有钱任性','保值率','费用','同价位','评估价','最高配','最低配','前（钱）紧','8万'],\
#             '油耗':['油耗','高速','市区','公里','废油','不见得省','省油','个油','节油','机油','油号','费油','不省什么油'],\
#             '配置':['配置','导航','视野','倒车雷达','倒车影像','中控','后视镜','自动泊车','摄像头','前雷达','车载','音质','背光','简配','落锁','出风口'],\
#             '内饰':['内饰','氛围','单调','寒酸','用料','细致','设计感','异味','做工','简陋','粗糙','档次','不够整','劣质材料','防火材料'],\
#             '操控':['操控','控制','偏硬','不费劲','迟钝','底盘','操纵','减震','方向盘','尾排','加强件','刹车','灵活','韧性','漂移','手感差','变速箱','平衡性'],\
#             '空间':['空间','视野','舒服','容量','显小','钻进去','宽敞','宽大','轴距','车体'],\
#             '外观':['外观','杀马特','大气','前脸','外形','变色','漆面','油漆','车漆','眼缘','尾灯','帅气','镀铬','镀络','颜值','挺炫','屁股','新潮','里外不一','好看','颜色','寒冰银','蓝色','黑色','不耐脏','银色','红色','蓝棕','黄贴'],\
#             '动力':['动力','驱动','发动机','机油','散热','四驱','强劲','变速箱','飙车','爆缸','排量','尾排','爬坡','油门踩到底','怕烧机油'],\
#             '安全性':['安全','刹车','手刹','追尾','气囊','加速','扎实','防爆胎','被盗','防盗','失去抓地力'],\
#             '舒适性':['舒适','隔音','舒服','噪音','异响','吵','静音','风噪','都会响','出风口','安静','空调','气门','颈椎','累','制冷','恒温','声音','抖','座椅','视野','宽大','晕车','减震','腰疼','卡顿','坐姿','颠簸','气味','滴水','后备箱响']}

## 试一下用LSTM进行主题分类

In [None]:
embedding_dim = 300                    # width of the word vectors
# Use the GPU only when one is actually present, so the notebook also runs
# on CPU-only machines instead of failing at the first .cuda() call.
USE_CUDA = torch.cuda.is_available()
EPOCH = 30                             # number of passes over the training data
BATCH_SIZE = 128                       # mini-batch size
LR = 0.002                             # learning rate

### word_embedding

In [None]:
# Re-index the labels present in data_tmp to consecutive integers.
d_ = {label: idx for idx, label in enumerate(set(data_tmp.label))}
data_tmp['label'] = data_tmp['label'].apply(lambda x : d_.get(x))

y_train = np.array(data_tmp.label.tolist())

# Tokenise every document with jieba and build the vocabulary.
# Per the original author's note, maxlen=30 pads or truncates each document
# to a fixed length of 30 tokens (behaviour of the project-local BOW helper).
bow = BOW(data_tmp.content.apply(jieba.lcut).tolist(), min_count=1, maxlen=30)

vocab_size = len(bow.word2idx)
word2vec = gensim.models.KeyedVectors.load_word2vec_format('data/ft_wv.txt')

# One row per vocabulary index, plus one extra row (the model's Embedding
# layer is created with vocab_size + 1 rows). Rows of words that have no
# pretrained vector simply stay at their zero initialisation.
embedding_matrix = np.zeros((vocab_size + 1, embedding_dim))
for word, idx in bow.word2idx.items():
    # `word in word2vec` is supported by KeyedVectors in both gensim 3.x and
    # 4.x, whereas the `.vocab` attribute was removed in gensim 4.
    if word in word2vec:
        embedding_matrix[idx] = word2vec.get_vector(word)

In [32]:
# np.save does not create missing directories, so make sure 'save/' exists.
os.makedirs('save', exist_ok=True)
# np.save appends '.npy', producing 'save/embedding_matrix.npy'.
np.save('save/embedding_matrix', arr=embedding_matrix)

In [33]:
# Deep copies of the index-encoded documents (word -> index) and of the labels.
X_train, y_train = copy.deepcopy(bow.doc2num), copy.deepcopy(y_train)

### 构建LSTM模型

In [34]:
class Config(object):
    """Bag of hyper-parameters shared by the text models.

    Not every setting takes effect: each model only reads the attributes
    it actually needs.
    """

    # --- model / objective selection ---
    model = 'LSTMText'
    loss = 'multilabelloss'
    num_classes = 30             # number of target classes

    # --- embedding ---
    # vocab_size = 11973         # alternative: number of chars
    vocab_size = vocab_size      # number of words (taken from the module-level value)
    embedding_dim = 300          # embedding width
    embedding_path = 'save/embedding_matrix.npy'
    static = False
    content_seq_len = 100        # content length: 100 for words, 200 for chars

    # --- recurrent encoder ---
    hidden_size = 128            # LSTM hidden size
    num_layers = 2               # number of stacked LSTM layers
    kmax_pooling = 2             # k of the k-max pooling

    # --- other layers ---
    linear_hidden_size = 1000    # units in the fully connected hidden layer
    inception_dim = 256          # number of inception convolution kernels


opt = Config()

In [37]:
# Effectively compresses the sequence axis down to k "words".
# The model passes a 3-D tensor with dim=2 (the seq_len axis), e.g. 100 -> k.
def kmax_pooling(x, dim, k):
    """Keep the k largest entries of ``x`` along ``dim`` in their original order.

    Args:
        x: input tensor.
        dim: axis along which to pool.
        k: number of entries to keep along ``dim``.

    Returns:
        Tensor shaped like ``x`` except that ``dim`` has size ``k``.
    """
    _, top_positions = torch.topk(x, k, dim=dim)
    # Sort the selected positions so the kept values stay in sequence order
    # rather than in descending-value order.
    ordered_positions, _ = torch.sort(top_positions, dim=dim)
    return torch.gather(x, dim, ordered_positions)

class LSTMText(BasicModule):
    """Bidirectional LSTM text classifier with k-max pooling over time.

    Token ids are embedded, passed through a stacked bidirectional LSTM,
    reduced along the sequence axis to the ``opt.kmax_pooling`` largest
    activations per feature, and classified by a two-layer fully connected
    head.

    ``forward`` returns raw class scores (logits). ``CrossEntropyLoss``
    applies log-softmax itself, so the head must not end in a softmax; apply
    ``torch.softmax(logits, dim=1)`` when probabilities are needed.

    Args:
        opt: configuration object. Reads ``vocab_size``, ``embedding_dim``,
            ``hidden_size``, ``num_layers``, ``kmax_pooling``,
            ``linear_hidden_size``, ``num_classes``, ``embedding_path`` and
            ``static``.
    """

    def __init__(self, opt):
        super(LSTMText, self).__init__()
        self.model_name = 'LSTMText'
        self.opt = opt

        # One extra row on top of vocab_size, matching the saved embedding matrix.
        self.encoder = torch.nn.Embedding(self.opt.vocab_size + 1, self.opt.embedding_dim)

        self.content_lstm = torch.nn.LSTM(
            input_size=self.opt.embedding_dim,
            hidden_size=self.opt.hidden_size,
            num_layers=self.opt.num_layers,
            bias=True,
            batch_first=False,   # expects (seq_len, batch, embedding_dim)
            dropout=0.5,
            bidirectional=True,
        )

        # Bidirectional => 2 * hidden_size features, each keeping k values.
        pooled_features = self.opt.kmax_pooling * (self.opt.hidden_size * 2)
        self.fc = torch.nn.Sequential(
            torch.nn.Linear(pooled_features, self.opt.linear_hidden_size),
            torch.nn.Dropout(0.2),
            torch.nn.BatchNorm1d(self.opt.linear_hidden_size),
            torch.nn.ReLU(inplace=True),
            torch.nn.Linear(self.opt.linear_hidden_size, self.opt.num_classes),
        )

        if self.opt.embedding_path:
            # Load the pretrained vectors and freeze them (requires_grad=False).
            pretrained = t.from_numpy(np.load(self.opt.embedding_path)).float()
            self.encoder.weight = nn.Parameter(pretrained, requires_grad=False)

    def forward(self, content):
        """Compute class logits for a batch of index-encoded documents.

        Args:
            content: integer tensor of token ids, laid out as (batch, seq_len).

        Returns:
            Tensor of shape (batch, opt.num_classes) holding unnormalised scores.
        """
        embedded = self.encoder(content)          # (batch, seq_len, embedding_dim)
        if self.opt.static:
            embedded = embedded.detach()

        # The LSTM is not batch_first, so feed it (seq_len, batch, embedding_dim).
        # Its first return value holds the top layer's output at every time step,
        # shaped (seq_len, batch, 2 * hidden_size) because it is bidirectional;
        # the second is the (h_n, c_n) tuple, which is not used here.
        lstm_out, _ = self.content_lstm(embedded.permute(1, 0, 2))
        features = lstm_out.permute(1, 2, 0)      # (batch, 2 * hidden_size, seq_len)

        # Keep the k strongest time steps per feature, then flatten per sample.
        pooled = kmax_pooling(features, 2, self.opt.kmax_pooling)
        flattened = pooled.view(pooled.size(0), -1)
        return self.fc(flattened)

### 开始跑模型

In [39]:
# Convert the training arrays to int64 tensors.
content_tensor = torch.from_numpy(np.array(X_train)).long()
label_tensor = torch.from_numpy(np.array(y_train)).long()

# Shuffled mini-batch loader over (content, label) pairs, using 8 worker processes.
torch_dataset = Data.TensorDataset(content_tensor, label_tensor)
train_loader = Data.DataLoader(torch_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=8)

# If a validation set is needed, X_train can be split before this point.

# Model, optimiser and loss function.
m = LSTMText(opt)
optimizer = torch.optim.Adam(m.parameters(), lr=LR)
# Targets are integer class indices, not one-hot vectors (multi-class setting).
loss_func = torch.nn.CrossEntropyLoss()
if USE_CUDA:
    m.cuda()
    loss_func.cuda()

CrossEntropyLoss()

In [40]:
# # val
# if USE_CUDA:
#     content_val_tensor = content_val_tensor.cuda()
#     label_val_tensor = label_val_tensor.cuda()

In [41]:
# Validation data is optional: it is only used when an earlier cell defined
# both `content_val_tensor` and `label_val_tensor`. Without this guard the
# loop raised NameError at the first report (iteration 50).
val_inputs = globals().get('content_val_tensor')
val_targets = globals().get('label_val_tensor')
has_val = val_inputs is not None and val_targets is not None
if has_val and USE_CUDA:
    val_inputs, val_targets = val_inputs.cuda(), val_targets.cuda()

it = 1
for epoch in tqdm_notebook(range(EPOCH)):
    for step, (content, b_y) in enumerate(train_loader):
        # Move the batch to the GPU only when CUDA is in use.
        if USE_CUDA:
            content, b_y = content.cuda(), b_y.cuda()

        output = m(content)
        loss = loss_func(output, b_y)

        optimizer.zero_grad()           # clear gradients for this training step
        loss.backward()                 # backpropagation, compute gradients
        optimizer.step()                # apply gradients

        # Report every 50 iterations. Training metrics describe the batch
        # that was just used (forward pass taken before the update).
        if it % 50 == 0:
            if has_val:
                # Evaluate without dropout / batch-norm updates and without
                # building an autograd graph, then restore training mode.
                m.eval()
                with torch.no_grad():
                    val_output = m(val_inputs)
                    val_loss = loss_func(val_output, val_targets).item()
                m.train()

            print('training loss: ', loss.item())
            if has_val:
                print('val loss: ', val_loss)
            print('training acc: ', accuracy_score(b_y.cpu().numpy(), output.detach().cpu().numpy().argmax(axis=1)))
            if has_val:
                print('val acc: ', accuracy_score(val_targets.cpu().numpy(), val_output.cpu().numpy().argmax(axis=1)))
        it += 1