In [None]:
import pandas as pd 
import numpy as np 
import jieba
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
import numpy as np
jieba.setLogLevel('WARN')

In [None]:
#数据预处理
class DataPreprocess():
    """Text and label preprocessing for the text classifier.

    Holds the fitted tokenizer, the label vocabulary and the settings
    (``word_len``, ``sentence_len``) used at training time so that the same
    settings can be re-applied at prediction time.
    """

    def __init__(self, tokenizer=None,
                 label_set=None):
        """
        :param tokenizer: an already fitted tokenizer, or None to fit one later
                          with ``train_tokenizer``
        :param label_set: an existing array of labels, or None to build one later
                          with ``creat_label_set``
        """
        self.tokenizer = tokenizer
        self.num_words = None
        self.label_set = label_set
        self.sentence_len = None
        self.word_len = None

    def cut_texts(self, texts=None, word_len=1):
        """
        Segment every text into words with ``jieba.lcut``.

        :param texts: iterable of raw text strings
        :param word_len: minimum word length to keep; values <= 1 keep every token
        :return: list of token lists, one per input text
        """
        if word_len > 1:
            texts_cut = [[word for word in jieba.lcut(text) if len(word) >= word_len]
                         for text in texts]
        else:
            texts_cut = [jieba.lcut(one_text) for one_text in texts]

        # Remembered so that the same filter can be re-applied when predicting.
        self.word_len = word_len

        return texts_cut

    def train_tokenizer(self,
                        texts_cut=None,
                        num_words=2000):
        """
        Fit the word -> index dictionary on segmented texts.

        :param texts_cut: list of token lists (output of ``cut_texts``)
        :param num_words: number of most frequent words to keep in the dictionary
        :return: None; the result is stored on ``self.tokenizer`` and ``self.num_words``
        """
        tokenizer = Tokenizer(num_words=num_words)
        tokenizer.fit_on_texts(texts=texts_cut)
        # The stored size is capped at len(word_index) + 1 when fewer distinct
        # words than requested were seen.
        num_words = min(num_words, len(tokenizer.word_index) + 1)
        self.tokenizer = tokenizer
        self.num_words = num_words

    def text2seq(self,
                 texts_cut,
                 sentence_len=30):
        """
        Convert segmented texts to fixed-length index sequences, suitable as
        input for an embedding layer.

        :param texts_cut: list of token lists (output of ``cut_texts``)
        :param sentence_len: length every sequence is padded / truncated to
        :return: padded sequences as returned by ``pad_sequences``
        """
        texts_seq = self.tokenizer.texts_to_sequences(texts=texts_cut)

        # Both padding and truncation happen at the end of the sequence.
        texts_pad_seq = pad_sequences(texts_seq,
                                      maxlen=sentence_len,
                                      padding='post',
                                      truncating='post')
        # Remembered so that prediction uses the training-time length.
        self.sentence_len = sentence_len
        return texts_pad_seq

    def creat_label_set(self, labels):
        '''
        Collect the distinct labels; their positions define the one-hot columns.

        :param labels: iterable of per-sample label collections, e.g. [['a'], ['b', 'c']]
        :return: None; the result is stored on ``self.label_set`` as a numpy array
        '''
        unique_labels = set()
        for sample_labels in labels:
            unique_labels.update(sample_labels)

        label_array = np.array(list(unique_labels))
        # Set iteration order is arbitrary (and differs between interpreter runs
        # for strings), so sort to make the one-hot column order reproducible.
        # Object arrays and non 1-D arrays are left untouched because they may
        # not be sortable element-wise.
        if label_array.ndim == 1 and label_array.dtype != object:
            label_array.sort()

        self.label_set = label_array

    def creat_label(self, label):
        '''
        Build the one-hot (multi-hot) vector for one sample.

        :param label: the labels of one sample
        :return: float array of len(self.label_set) with 1 at the positions of
                 the labels contained in ``label`` and 0 elsewhere
        eg. creat_label(label=data_valid_accusations[12])
        '''
        label_set = self.label_set
        label_zero = np.zeros(len(label_set))
        # np.isin is the supported replacement for the deprecated np.in1d.
        label_zero[np.isin(label_set, label)] = 1
        return label_zero

    def creat_labels(self, labels=None):
        '''
        Apply ``creat_label`` to every sample and stack the results.

        :param labels: iterable of per-sample label collections
        :return: 2-D array with one one-hot row per sample
        '''
        labels_one_hot = [self.creat_label(label) for label in labels]

        return np.array(labels_one_hot)

In [None]:
#CNN模型
from keras.models import Model
from keras.layers import Dense, Embedding, Input
from keras.layers import Conv1D, GlobalMaxPool1D, Dropout


def CNN(input_dim,
        input_length,
        vec_size,
        output_shape,
        output_type='multiple'):
    '''
    Build and compile a text CNN: Embedding -> Conv1D -> GlobalMaxPool1D -> Dense.
    The filter count, hidden size and dropout rate are fixed in the code below.
    :param input_dim: size of the vocabulary (the embedding table gets one extra row)
    :param input_length: length of the input sequences
    :param vec_size: dimension of the dense embedding
    :param output_shape: number of output units; targets should be one-hot encoded
    :param output_type: 'multiple' (sigmoid + binary cross-entropy) or
                        'single' (softmax + categorical cross-entropy)
    :return: compiled keras model
    :raises ValueError: if output_type is neither 'multiple' nor 'single'
    '''
    tokens_in = Input(shape=[input_length])
    embedded = Embedding(input_dim=input_dim + 1,
                         input_length=input_length,
                         output_dim=vec_size)(tokens_in)
    features = Conv1D(filters=128,
                      kernel_size=[3],
                      strides=1,
                      padding='same',
                      activation='relu')(embedded)
    features = GlobalMaxPool1D()(features)
    features = Dense(500, activation='relu')(features)
    features = Dropout(0.1)(features)

    # Only the output activation and the loss differ between the two modes.
    if output_type == 'multiple':
        head_activation, loss_name = 'sigmoid', 'binary_crossentropy'
    elif output_type == 'single':
        head_activation, loss_name = 'softmax', 'categorical_crossentropy'
    else:
        raise ValueError('output_type should be multiple or single')

    scores = Dense(output_shape, activation=head_activation)(features)
    model = Model(inputs=tokens_in, outputs=scores)
    model.compile(loss=loss_name,
                  optimizer='adam',
                  metrics=['acc'])
    return model


if __name__ == '__main__':
    # Smoke test: build a tiny multi-label model and print its layer summary.
    model = CNN(
        input_dim=10,
        input_length=10,
        vec_size=10,
        output_shape=10,
        output_type='multiple',
    )
    model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 10)]              0         
                                                                 
 embedding (Embedding)       (None, 10, 10)            110       
                                                                 
 conv1d (Conv1D)             (None, 10, 128)           3968      
                                                                 
 global_max_pooling1d (Globa  (None, 128)              0         
 lMaxPooling1D)                                                  
                                                                 
 dense (Dense)               (None, 500)               64500     
                                                                 
 dropout (Dropout)           (None, 500)               0         
                                                             

In [None]:
import numpy as np


class TextClassification():
    """End-to-end text classifier: preprocessing (``DataPreprocess``) plus a
    Keras model (``CNN`` unless a model is supplied to ``fit``).

    Typical order of calls: ``get_preprocess`` -> ``fit`` -> ``predict`` ->
    one of the ``label2*`` helpers to turn scores back into labels.
    """

    def __init__(self):
        self.preprocess = None
        self.model = None

    def get_preprocess(self, texts, labels, word_len=1, num_words=2000, sentence_len=30):
        """
        Fit the preprocessing on the training data and encode it.

        :param texts: iterable of raw training texts
        :param labels: iterable of per-sample label collections
        :param word_len: minimum word length kept after segmentation
        :param num_words: dictionary size passed to the tokenizer
        :param sentence_len: fixed sequence length after padding / truncation
        :return: (padded index sequences, one-hot label array)
        """
        preprocess = DataPreprocess()

        # Texts: segment, fit the dictionary, encode.
        texts_cut = preprocess.cut_texts(texts, word_len)
        preprocess.train_tokenizer(texts_cut, num_words)
        texts_seq = preprocess.text2seq(texts_cut, sentence_len)

        # Labels: build the label vocabulary, then one-hot encode.
        preprocess.creat_label_set(labels)
        labels = preprocess.creat_labels(labels)
        self.preprocess = preprocess

        return texts_seq, labels

    def fit(self, texts_seq, texts_labels, output_type, epochs, batch_size, model=None):
        """
        Train the network.

        :param texts_seq: encoded texts (first return value of ``get_preprocess``)
        :param texts_labels: one-hot labels (second return value of ``get_preprocess``)
        :param output_type: 'multiple' or 'single', forwarded to ``CNN``
        :param epochs: number of training epochs
        :param batch_size: training batch size
        :param model: optional ready-made model; when None a ``CNN`` is built from
                      the fitted preprocessing
        :raises RuntimeError: if no model is given and ``get_preprocess`` has not run
        """
        if model is None:
            preprocess = self.preprocess
            if preprocess is None:
                raise RuntimeError('call get_preprocess() before fit() when no model is given')
            model = CNN(preprocess.num_words,
                        preprocess.sentence_len,
                        128,
                        len(preprocess.label_set),
                        output_type)
        model.fit(texts_seq,
                  texts_labels,
                  epochs=epochs,
                  batch_size=batch_size)
        self.model = model

    def predict(self, texts):
        """
        Score raw texts with the trained model, re-using the training-time
        segmentation and sequence-length settings.

        :param texts: iterable of raw text strings
        :return: whatever ``self.model.predict`` returns for the encoded texts
        :raises RuntimeError: if preprocessing or model are not available yet
        """
        preprocess = self.preprocess
        if preprocess is None or self.model is None:
            raise RuntimeError('call get_preprocess() and fit() before predict()')

        texts_cut = preprocess.cut_texts(texts, preprocess.word_len)
        texts_seq = preprocess.text2seq(texts_cut, preprocess.sentence_len)

        return self.model.predict(texts_seq)

    def label2toptag(self, predictions, labelset):
        """
        For every prediction row return the label(s) with the highest score
        (several labels when the maximum is tied).

        :param predictions: iterable of score rows (arrays or plain lists)
        :param labelset: labels aligned with the score columns (array or list)
        :return: list of label lists, one per row
        """
        label_array = np.asarray(labelset)
        labels = []
        for prediction in predictions:
            scores = np.asarray(prediction)
            labels.append(label_array[scores == scores.max()].tolist())
        return labels

    def label2half(self, predictions, labelset):
        """
        For every prediction row return the labels whose score exceeds 0.5
        (possibly none).

        :param predictions: iterable of score rows (arrays or plain lists)
        :param labelset: labels aligned with the score columns (array or list)
        :return: list of label lists, one per row
        """
        label_array = np.asarray(labelset)
        labels = []
        for prediction in predictions:
            scores = np.asarray(prediction)
            labels.append(label_array[scores > 0.5].tolist())
        return labels

    def label2tag(self, predictions, labelset):
        """
        Labels scoring above 0.5 for each row, falling back to the top-scoring
        label(s) when no score exceeds 0.5.

        :param predictions: sequence of score rows (arrays or plain lists)
        :param labelset: labels aligned with the score columns (array or list)
        :return: list of non-empty label lists, one per row
        """
        top_labels = self.label2toptag(predictions, labelset)
        confident_labels = self.label2half(predictions, labelset)
        # An empty "above 0.5" list is falsy, so `or` picks the top label(s).
        return [confident or top
                for confident, top in zip(confident_labels, top_labels)]

## 属性分析
### 以下使用的是 1 万条汽车 VOC 数据

In [None]:
import pandas as pd
# Load the training CSV and discard the five unnamed columns it contains.
df3 = pd.read_csv("train_data.csv").drop(
    columns=['Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7'],
)

In [None]:
# Show the frame as it stands after the unnamed columns were dropped.
df3

In [None]:
import tensorflow as tf
import pickle
import numpy as np
from sklearn.model_selection import train_test_split

x_info = df3['content']
# Every label is wrapped in a list because the preprocessing treats each
# sample's labels as a collection (it calls set() on every entry).
y_info = [[i] for i in df3['label']]
data_type = 'single'

x_train, x_test, y_train, y_test = train_test_split(x_info, y_info, test_size=0.2, random_state=1)

##### Training #####

clf = TextClassification()
texts_seq, texts_labels = clf.get_preprocess(x_train, y_train,
                                             word_len=1,
                                             num_words=2000,
                                             sentence_len=50)
clf.fit(texts_seq=texts_seq,
        texts_labels=texts_labels,
        output_type=data_type,
        epochs=10,
        batch_size=64,
        model=None)

# Persist the whole pipeline (preprocessing + network) in one file.
# NOTE(review): pickling a Keras model depends on the installed Keras/TensorFlow
# version; if this breaks, save the network with model.save() and pickle only
# the preprocessing. Only unpickle files you created yourself.
with open('./%s.pkl' % data_type, 'wb') as f:
    pickle.dump(clf, f)

##### Prediction #####

# Reload the pipeline that was just saved.
with open('./%s.pkl' % data_type, 'rb') as f:
    clf = pickle.load(f)
y_predict = clf.predict(x_test)
# Map every score row to its highest-scoring label, keeping the [[label], ...]
# shape used for y_test.
y_predict = [[clf.preprocess.label_set[i.argmax()]] for i in y_predict]
# Accuracy as a plain scalar: flatten both (n, 1) label lists and average the
# element-wise matches. Summing the 2-D comparison instead yields a
# one-element array.
score = float(np.mean(np.ravel(y_predict) == np.ravel(y_test)))
print(score)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10




[0.76396058]


In [None]:
from sklearn.metrics import classification_report, confusion_matrix
# classification_report expects (y_true, y_pred); with the arguments swapped,
# precision and recall are exchanged and "support" counts predictions instead
# of true samples.
print(classification_report(y_test, y_predict))

              precision    recall  f1-score   support

          价格       0.86      0.80      0.83       302
          内饰       0.78      0.69      0.73        89
          动力       0.83      0.79      0.81       596
          外观       0.71      0.75      0.73       102
         安全性       0.65      0.69      0.67       105
          操控       0.72      0.62      0.67       246
          油耗       0.80      0.85      0.82       254
          空间       0.67      0.65      0.66        91
         舒适性       0.63      0.76      0.69       168
          配置       0.71      0.83      0.77       178

    accuracy                           0.76      2131
   macro avg       0.74      0.74      0.74      2131
weighted avg       0.77      0.76      0.76      2131

