In [50]:
import torch
import torch.utils.data as Data
import torch as t
from torch import nn

In [2]:
import numpy as np
import pandas as pd
import jieba
import gensim
from gensim.models import Word2Vec, FastText
import re
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm, tqdm_notebook

from m import f1_for_car, BOW, BasicModule

In [3]:
data = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test_public.csv')

In [4]:
data['subject_1'] = data['subject'] + data['sentiment_value'].astype(str)
subj_lst = list(filter(lambda x : x is not np.nan, list(set(data.subject_1))))
subj_lst_dic = {value:key for key, value in enumerate(subj_lst)}
data['subject_1'] = data['subject_1'].apply(lambda x : subj_lst_dic.get(x))

In [5]:
skf = StratifiedKFold(n_splits=5,shuffle=True)
for train_idx, val_idx in skf.split(data, data.subject):
    break
    
df_train = data.iloc[train_idx]
df_val = data.iloc[val_idx]

# 试一下用CNN进行主题分类

In [223]:
df_train.head()

Unnamed: 0,content_id,content,subject,sentiment_value,sentiment_word,subject_1
0,vUXizsqexyZVRdFH,因为森林人即将换代，这套系统没必要装在一款即将换代的车型上，因为肯定会影响价格。,价格,0,影响,1
2,QmqJ2AvM5GplaRyz,斯柯达要说质量，似乎比大众要好一点，价格也低一些，用料完全一样。我听说过野帝，但没听说过你说...,价格,1,低,17
3,KMT1gFJiU4NWrVDn,这玩意都是给有钱任性又不懂车的土豪用的，这价格换一次我妹夫EP020可以换三锅了,价格,-1,有钱任性,12
4,nVIlGd5yMmc37t1o,17价格忒高，估计也就是14-15左右。,价格,-1,高,12
5,TVciHBPL5XmUxMEd,我开始就是荣放2.5 森林人2.5二选一 荣放主要是底盘质感不行 太硬 其次是...,价格,1,便宜,17


In [6]:
# 加载停用词词典
with open('data/stop_words_hagongda.txt') as f:
    stop_words = f.readlines()
    stop_words = [x.replace('\n','') for x in stop_words]

stop_words.append('')
    
def f1(x):
    tmp_l = jieba.cut(x)
    return [x.strip() for x in tmp_l if x.strip() not in stop_words]

In [41]:
# 将word建成index索引的方式，方便后面用embeddingm matrix
bow = BOW(data.content.apply(f1).tolist(), min_count=3, maxlen=50)

   Word Count: 100%|██████████| 9947/9947 [00:00<00:00, 201934.81it/s]
Doc To Number: 100%|██████████| 9947/9947 [00:00<00:00, 137312.46it/s]


In [42]:
word2vec = Word2Vec(data.content.apply(f1).tolist(),size=300,min_count=3)

word_embed_dict = {}
def get_word_embed_dict():
    for i in word2vec.wv.vocab:
        word_embed_dict[i] = word2vec.wv.get_vector(i).tolist()
#     word_embed_dict['UNK'] = [0]*300
    return word_embed_dict
word_embed_dict = get_word_embed_dict()

In [43]:
vocab_size = len(word_embed_dict)
embedding_matrix = np.zeros((vocab_size+1,300))

for key, value in bow.word2idx.items():
    embedding_matrix[value] = word_embed_dict.get(key)

In [44]:
embedding_matrix

array([[ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 7.54894853e-01, -6.20306373e-01,  4.13474776e-02, ...,
         3.79320860e-01, -7.17586100e-01, -6.60323679e-01],
       [ 8.14157128e-01, -2.04117373e-01,  3.04523617e-01, ...,
         2.26151064e-01, -6.42750382e-01, -7.18053758e-01],
       ...,
       [ 1.91839114e-02, -4.97836480e-03,  6.46228669e-03, ...,
        -3.93930328e-04, -1.89290550e-02, -1.02663375e-02],
       [ 9.69438441e-03, -3.16862273e-03,  1.91619981e-03, ...,
        -8.10113677e-04, -1.09908972e-02, -4.49970737e-03],
       [ 1.38809122e-02, -3.92513257e-03,  4.00957931e-03, ...,
        -1.74620154e-03, -1.59535334e-02, -6.97768107e-03]])

In [45]:
np.save('save/embedding_matrix',arr=embedding_matrix)

In [46]:
# word对应的index
df_train_ = bow.doc2num[train_idx]
df_val_ = bow.doc2num[val_idx]

## CNN conv1d

In [101]:
class Config(object):
    '''
    并不是所有的配置都生效,实际运行中只根据需求获取自己需要的参数
    '''

    loss = 'multilabelloss'
    model='LSTMText' 
    title_dim = 100 # 标题的卷积核数
    content_dim = 100 #内容的卷积核数
    num_classes = 30 # 类别
    embedding_dim = 300 # embedding大小
    linear_hidden_size = 1000 # 全连接层隐藏元数目
    kmax_pooling = 2 # k
    hidden_size = 128 #LSTM hidden size
    num_layers=2 #LSTM layers
    inception_dim = 256 #inception的卷积核数
    
    kernel_size = 3 #单尺度卷积核
    kernel_sizes = [2,3,4] #多尺度卷积核
    # vocab_size = 11973 # num of chars
    vocab_size = 6597# num of words 
    content_seq_len = 50 #内容长度 word为50 char为100
    static = False
    embedding_path = 'save/embedding_matrix.npy'

opt = Config()

In [218]:
kernel_sizes =  [1,2,3,4]
class MultiCNNTextBNDeep(BasicModule): 
    def __init__(self, opt ):
        super(MultiCNNTextBNDeep, self).__init__()
        self.model_name = 'MultiCNNTextBNDeep'
        self.opt=opt
        self.encoder = nn.Embedding(self.opt.vocab_size,opt.embedding_dim)

        content_convs = [nn.Sequential(
                                nn.Conv1d(in_channels = self.opt.embedding_dim,
                                        out_channels = self.opt.content_dim,
                                        kernel_size = kernel_size),
                                nn.BatchNorm1d(self.opt.content_dim),
                                nn.ReLU(inplace=True),

                                nn.Conv1d(in_channels = self.opt.content_dim,
                                        out_channels = self.opt.content_dim,
                                        kernel_size = kernel_size),
                                nn.BatchNorm1d(self.opt.content_dim),
                                nn.ReLU(inplace=True),
                                # maxpool1d kernel_size=50的意思就是对一句话里每50个单词取maxpool
                                nn.MaxPool1d(kernel_size = (self.opt.content_seq_len - kernel_size*2 + 2))
                            )
            for kernel_size in kernel_sizes]

        self.content_convs = nn.ModuleList(content_convs)

        self.fc = nn.Sequential(
            nn.Linear(len(kernel_sizes)*self.opt.content_dim,self.opt.linear_hidden_size),
            nn.BatchNorm1d(self.opt.linear_hidden_size),
            nn.ReLU(inplace=True),
            nn.Linear(self.opt.linear_hidden_size,self.opt.num_classes),
            nn.Softmax()
        )
        

        if opt.embedding_path:
            self.encoder.weight.data.copy_(t.from_numpy(np.load(self.opt.embedding_path)))

    def forward(self, content):
        content = self.encoder(content)
        if self.opt.static:
            content.detach()
        
        
        content_out = [content_conv(content.permute(0,2,1)) for content_conv in self.content_convs]
#         conv_out = t.cat((title_out+content_out),dim=1)
        # t.cat是对list进行拼接，这里对维度1进行拼接
        conv_out = t.cat(content_out,dim=1)
        reshaped = conv_out.view(conv_out.size(0), -1)
        softmax = self.fc((reshaped))
        return softmax

In [219]:
EPOCH = 8           # 训练整批数据多少次,  我只训练了3次
BATCH_SIZE = 64
LR = 0.002         # 学习率

m = MultiCNNTextBNDeep(opt)
if torch.cuda.is_available():
    m.cuda()

In [220]:
# 数据处理成tensor
label_tensor = torch.from_numpy(np.array(df_train.subject_1)).long()
content_tensor = torch.from_numpy(np.array(df_train_)).long()
#val
content_val_tensor = torch.from_numpy(np.array(df_val_)).long()
label_val_tensor = torch.from_numpy(np.array(df_val.subject_1)).long()

torch_dataset = Data.TensorDataset(content_tensor, label_tensor)
train_loader = Data.DataLoader(
        dataset=torch_dataset,      # torch TensorDataset format
        batch_size=BATCH_SIZE,      # mini batch size
        shuffle=True,               # random shuffle for training
        num_workers=8,              # subprocesses for loading data
    )

# optimizer, loss_func
optimizer = torch.optim.Adam(m.parameters(), lr=LR)   # optimize all lstm parameters;Adam比较好用
loss_func = torch.nn.CrossEntropyLoss()   # the target label is not one-hotted 适用于多分类
loss_func.cuda()

CrossEntropyLoss()

In [221]:
content_val_tensor = content_val_tensor.cuda()
label_val_tensor = label_val_tensor.cuda()

In [222]:
it = 1
for epoch in tqdm_notebook(range(EPOCH)):
    for step, (content, b_y) in enumerate(train_loader):   # 分配 batch data, normalize x when iterate train_loader
        content, b_y = content.cuda(), b_y.cuda()
        output = m(content)
        loss = loss_func(output, b_y)
        if it % 50 == 0:
            print('training loss: ', loss.cpu().data.numpy().tolist())
            print('val loss: ', loss_func(m(content_val_tensor), label_val_tensor).cpu().data.numpy().tolist())
        optimizer.zero_grad()           # clear gradients for this training step
        loss.backward()                 # backpropagation, compute gradients
        optimizer.step()                # apply gradients
        it += 1

  input = module(input)


training loss:  3.111666202545166
val loss:  3.082869291305542
training loss:  3.124356269836426
val loss:  3.030306577682495
training loss:  3.0201728343963623
val loss:  3.0152130126953125
training loss:  2.919506549835205
val loss:  3.0144176483154297
training loss:  3.066793918609619
val loss:  2.999502420425415
training loss:  3.004420757293701
val loss:  2.9991745948791504
training loss:  3.004478693008423
val loss:  3.0036096572875977
training loss:  2.9099767208099365
val loss:  2.99699330329895
training loss:  3.009580612182617
val loss:  2.992452383041382
training loss:  3.1874115467071533
val loss:  3.0018603801727295
training loss:  3.0316457748413086
val loss:  2.9846084117889404
training loss:  2.975032329559326
val loss:  2.993483066558838
training loss:  2.8965225219726562
val loss:  2.9855575561523438
training loss:  3.0033328533172607
val loss:  2.975074052810669
training loss:  2.8534862995147705
val loss:  2.9710640907287598
training loss:  2.9624392986297607
val lo