In [1]:
import torch
import torch.utils.data as Data

In [2]:
import numpy as np
import pandas as pd
import jieba
import gensim
from gensim.models import Word2Vec, FastText
import re
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm, tqdm_notebook

from m import f1_for_car, BOW, BasicModule

In [3]:
data = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test_public.csv')

In [5]:
subj_lst = list(filter(lambda x : x is not np.nan, list(set(data.subject))))
subj_lst_dic = {value:key for key, value in enumerate(subj_lst)}
data['subject_1'] = data['subject'].apply(lambda x : subj_lst_dic.get(x))

In [11]:
data['subject_1'] = data['subject'] + data['sentiment_value'].astype(str)
subj_lst = list(filter(lambda x : x is not np.nan, list(set(data.subject_1))))
subj_lst_dic = {value:key for key, value in enumerate(subj_lst)}
data['subject_1'] = data['subject_1'].apply(lambda x : subj_lst_dic.get(x))

## 将训练数据分成训练集和验证集(根据主题进行分层抽样)

In [17]:
skf = StratifiedKFold(n_splits=5,shuffle=True)
for train_idx, val_idx in skf.split(data, data.subject):
    break

In [18]:
df_train = data.iloc[train_idx]
df_val = data.iloc[val_idx]

In [19]:
subj_dict = {'价格':['价格','性价比','低价','降价','优惠','便宜','划算','不菲','实惠','贵','价差','单价','合算','合理','高昂','有钱任性','保值率','费用','同价位','评估价','最高配','最低配','前（钱）紧','8万'],\
            '油耗':['油耗','高速','市区','公里','废油','不见得省','省油','个油','节油','机油','油号','费油','不省什么油'],\
            '配置':['配置','导航','视野','倒车雷达','倒车影像','中控','后视镜','自动泊车','摄像头','前雷达','车载','音质','背光','简配','落锁','出风口'],\
            '内饰':['内饰','氛围','单调','寒酸','用料','细致','设计感','异味','做工','简陋','粗糙','档次','不够整','劣质材料','防火材料'],\
            '操控':['操控','控制','偏硬','不费劲','迟钝','底盘','操纵','减震','方向盘','尾排','加强件','刹车','灵活','韧性','漂移','手感差','变速箱','平衡性'],\
            '空间':['空间','视野','舒服','容量','显小','钻进去','宽敞','宽大','轴距','车体'],\
            '外观':['外观','杀马特','大气','前脸','外形','变色','漆面','油漆','车漆','眼缘','尾灯','帅气','镀铬','镀络','颜值','挺炫','屁股','新潮','里外不一','好看','颜色','寒冰银','蓝色','黑色','不耐脏','银色','红色','蓝棕','黄贴'],\
            '动力':['动力','驱动','发动机','机油','散热','四驱','强劲','变速箱','飙车','爆缸','排量','尾排','爬坡','油门踩到底','怕烧机油'],\
            '安全性':['安全','刹车','手刹','追尾','气囊','加速','扎实','防爆胎','被盗','防盗','失去抓地力'],\
            '舒适性':['舒适','隔音','舒服','噪音','异响','吵','静音','风噪','都会响','出风口','安静','空调','气门','颈椎','累','制冷','恒温','声音','抖','座椅','视野','宽大','晕车','减震','腰疼','卡顿','坐姿','颠簸','气味','滴水','后备箱响']}

In [20]:
cont_id_dict = {}
for key, value in subj_dict.items():
    cont_id_dict[key] = []
    for val in value:
        cont_id_dict[key] += df_val[df_val.content.apply(lambda x : val in x)].content_id.tolist()

In [21]:
cont_id_dict2 = {}
for key, value in cont_id_dict.items():
    for val in value:
        if val in cont_id_dict2:
            cont_id_dict2[val].append(key)
        else:
            cont_id_dict2[val] = [key]

In [22]:
def filter_subj1(x):
    tmp = Counter(x)
    tmp2 = list(filter(lambda y : y[1]>1, tmp.items()))
    if len(tmp2) == 0:
        return x
    return [y[0] for y in tmp2]

# 整成dataframe
tmp = pd.Series(cont_id_dict2)
tmp2 = pd.DataFrame()
tmp2['content_id'] = tmp.index
tmp2['subject_rule'] = tmp.tolist()
# tmp2['subject'] = tmp2['subject'].apply(filter_subj1)
# tmp2['subject_rule'] = tmp2['subject_rule'].apply(lambda x : list(set(x)))
tmp2['subject_rule'] = tmp2['subject_rule'].apply(lambda x : list(x))

In [23]:
df_val = df_val.merge(tmp2,on='content_id',how='left')
df_val.subject_rule.fillna('N',inplace=True)
df_val['subject_rule'] = df_val['subject_rule'].apply(lambda x : ','.join(x))

In [24]:
def filter_subj1(x):
    x = x.split(',')
    tmp = Counter(x)
    tmp2 = list(filter(lambda y : y[1]>1, tmp.items()))
    if len(tmp2) == 0:
        return ','.join(x)
    return ','.join([y[0] for y in tmp2])

In [25]:
c = 0
for i, j in zip(df_val['subject'],df_val['subject_rule'].apply(filter_subj1)):
    if i in j:
        c += 1
c

1620

## 试一下用LSTM进行主题分类

### word_embedding

In [26]:
stop_mark = ['，','。','！',',','!','？','?','',' ']
def f1(x):
    tmp_l = jieba.cut(x)
    return [x.strip() for x in tmp_l if x.strip() not in stop_mark]

word2vec = Word2Vec(data.content.apply(f1).tolist(),size=300,min_count=3)

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.976 seconds.
Prefix dict has been built succesfully.


In [28]:
# 将word建成index索引的方式，方便后面用embeddingm matrix
bow = BOW(data.content.apply(f1).tolist(), min_count=3, maxlen=80)

   Word Count: 100%|██████████| 9947/9947 [00:00<00:00, 153908.48it/s]
Doc To Number: 100%|██████████| 9947/9947 [00:00<00:00, 87790.45it/s]


In [47]:
word_embed_dict = {}
def get_word_embed_dict():
    for i in word2vec.wv.vocab:
        word_embed_dict[i] = word2vec.wv.get_vector(i).tolist()
#     word_embed_dict['UNK'] = [0]*300
    return word_embed_dict
word_embed_dict = get_word_embed_dict()

In [49]:
vocab_size = len(word_embed_dict)
embedding_matrix = np.zeros((vocab_size+1,300))

for key, value in bow.word2idx.items():
    embedding_matrix[value] = word_embed_dict.get(key)

In [50]:
embedding_matrix

array([[ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 1.17498301e-01,  4.33762133e-01, -2.32221127e-01, ...,
        -3.37091684e-01, -2.42774144e-01, -6.83337003e-02],
       [ 7.20790505e-01,  1.36479223e+00, -5.62304854e-01, ...,
        -7.32061684e-01,  1.20418325e-01, -2.84205794e-01],
       ...,
       [ 2.44777999e-03,  2.71027833e-02, -1.48612587e-02, ...,
        -2.33698953e-02, -1.55870700e-02, -6.94512716e-03],
       [ 3.41599551e-03,  2.16386374e-02, -6.68662181e-03, ...,
        -1.76747274e-02, -8.45736265e-03, -7.47158891e-03],
       [ 3.34755937e-03,  1.41907008e-02, -1.84325725e-02, ...,
        -1.95526294e-02, -1.10311620e-02, -1.24784117e-03]])

In [32]:
np.save('save/embedding_matrix',arr=embedding_matrix)
# a = np.load('save/embedding_matrix.npy')

In [33]:
# word对应的index
df_train_ = bow.doc2num[train_idx]
df_val_ = bow.doc2num[val_idx]

### 构建LSTM模型

In [34]:
class Config(object):
    '''
    并不是所有的配置都生效,实际运行中只根据需求获取自己需要的参数
    '''

    loss = 'multilabelloss'
    model='LSTMText' 
    num_classes = 30 # 类别
    embedding_dim = 300 # embedding大小
    linear_hidden_size = 1000 # 全连接层隐藏元数目
    kmax_pooling = 2 # k
    hidden_size = 128 #LSTM hidden size
    num_layers=2 #LSTM layers
    inception_dim = 256 #inception的卷积核数
    
    # vocab_size = 11973 # num of chars
    vocab_size = 6894 # num of words 
    content_seq_len = 100 #描述长度 word为100 char为200
    static = False
    embedding_path = 'save/embedding_matrix.npy'

opt = Config()

In [37]:
# 相当于把seq_len压缩成k个'词'
# dim共三个维度，这里取2即seq_len那个维度，100->k
def kmax_pooling(x, dim, k):
    index = x.topk(k, dim = dim)[1].sort(dim = dim)[0]
    return x.gather(dim, index)

class LSTMText(BasicModule): 
    def __init__(self, opt):
        super(LSTMText, self).__init__()
        self.model_name = 'LSTMText'
        self.opt=opt

        kernel_size = self.opt.kernel_size
        self.encoder = torch.nn.Embedding(self.opt.vocab_size,self.opt.embedding_dim)

        self.content_lstm =torch.nn.LSTM(input_size = self.opt.embedding_dim,\
                            hidden_size = self.opt.hidden_size,
                            num_layers = self.opt.num_layers,
                            bias = True,
                            batch_first = False,
                            dropout = 0.5, # dropout
                            bidirectional = True
                            )

        self.fc = torch.nn.Sequential(
            torch.nn.Linear(self.opt.kmax_pooling*(self.opt.hidden_size*2),self.opt.linear_hidden_size),
            torch.nn.Dropout(0.2), # dropout
            torch.nn.BatchNorm1d(self.opt.linear_hidden_size),
            torch.nn.ReLU(inplace=True),
            torch.nn.Linear(self.opt.linear_hidden_size, self.opt.num_classes),
            torch.nn.Softmax()
        )

        if self.opt.embedding_path:
            self.encoder.weight.data.copy_(t.from_numpy(np.load(self.opt.embedding_path)))
 
    def forward(self, content):
        content = self.encoder(content)
        # torch.Size([64, 100, 150])
        if self.opt.static:
            title=title.detach()
            content=content.detach()
        
        '''
        lstm输入的时候需要转成(seq_len, batch, embedding_dim）这种维度（用permute转）<br>
        output，每个时刻的LSTM网络的最后一层的输出，维度（seq_len, batch, hidden_size * num_directions）|双向lstm所以输出的hidden_size维度要乘以2<br>
        lstm的输出为output, (hn, cn) 的元组<br>
        这里取第一个就是output(100,64,256)，第二个是元组其中的第一个hn就是最后时刻的隐层状态hn(4,64,128)
        这里的4就是(2层num_layers*双向)lstm得到
        '''
        content_out = self.content_lstm(content.permute(1,0,2))[0].permute(1,2,0)
        #torch.Size([64, 256, 100])
        content_conv_out = kmax_pooling((content_out),2,self.opt.kmax_pooling)
        conv_out = content_conv_out
        reshaped = conv_out.view(conv_out.size(0), -1)
        softmax = self.fc((reshaped))
        return softmax

### 开始跑模型

In [38]:
EPOCH = 8# 训练整批数据多少次,  我训练了5次
BATCH_SIZE = 64
LR = 0.002         # 学习率

m = LSTMText(opt)
if torch.cuda.is_available():
    m.cuda()

In [39]:
# 数据处理成tensor
label_tensor = torch.from_numpy(np.array(df_train.subject_1)).long()
content_tensor = torch.from_numpy(np.array(df_train_)).long()
#val
content_val_tensor = torch.from_numpy(np.array(df_val_)).long()
label_val_tensor = torch.from_numpy(np.array(df_val.subject_1)).long()

torch_dataset = Data.TensorDataset(content_tensor, label_tensor)
train_loader = Data.DataLoader(
        dataset=torch_dataset,      # torch TensorDataset format
        batch_size=BATCH_SIZE,      # mini batch size
        shuffle=True,               # random shuffle for training
        num_workers=8,              # subprocesses for loading data
    )

# optimizer, loss_func
optimizer = torch.optim.Adam(m.parameters(), lr=LR)   # optimize all lstm parameters;Adam比较好用
loss_func = torch.nn.CrossEntropyLoss()   # the target label is not one-hotted 适用于多分类
loss_func.cuda()

CrossEntropyLoss()

In [40]:
content_val_tensor = content_val_tensor.cuda()
label_val_tensor = label_val_tensor.cuda()

In [41]:
it = 1
for epoch in tqdm_notebook(range(EPOCH)):
    for step, (content, b_y) in enumerate(train_loader):   # 分配 batch data, normalize x when iterate train_loader
        content, b_y = content.cuda(), b_y.cuda()
        output = m(content)
        loss = loss_func(output, b_y)
        if it % 50 == 0:
            print('training loss: ', loss.cpu().data.numpy().tolist())
            print('val loss: ', loss_func(m(content_val_tensor), label_val_tensor).cpu().data.numpy().tolist())
        optimizer.zero_grad()           # clear gradients for this training step
        loss.backward()                 # backpropagation, compute gradients
        optimizer.step()                # apply gradients
        it += 1

  input = module(input)


training loss:  3.1673879623413086
val loss:  3.18984317779541
training loss:  3.2834389209747314
val loss:  3.1693873405456543
training loss:  3.0643393993377686
val loss:  3.112532615661621
training loss:  3.0616390705108643
val loss:  3.0707297325134277
training loss:  3.021479845046997
val loss:  3.042405128479004
training loss:  3.0143232345581055
val loss:  3.033583402633667
training loss:  2.9115610122680664
val loss:  3.0212764739990234
training loss:  3.135138511657715
val loss:  3.025049924850464
training loss:  3.040111780166626
val loss:  3.0259411334991455
training loss:  3.022106647491455
val loss:  3.019199848175049
training loss:  3.107740879058838
val loss:  3.0234880447387695
training loss:  2.9572179317474365
val loss:  3.0143158435821533
training loss:  2.910447120666504
val loss:  3.0149362087249756
training loss:  3.0686566829681396
val loss:  3.008749485015869
training loss:  2.9876296520233154
val loss:  3.0189781188964844
training loss:  3.0338194370269775
val 