In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [2]:
with open('E:/百度/train+cnn.txt', encoding='utf8') as file:
    line_list1 = [k.strip() for k in file.readlines()]
    #读取每行
    train_label_list = [k.split()[0] for k in line_list1]
    #将标签依次取出
    train_content_list = [k.split(maxsplit=1)[1] for k in line_list1]
    #将内容依次取出,此处注意split()选择最大分割次数为1,否则句子被打断.
with open('E:/百度/test+cnn.txt', encoding='utf8') as file:
    line_list2 = [k.strip() for k in file.readlines()]
    test_label_list = [k.split()[0] for k in line_list2]
    test_content_list = [k.split(maxsplit=1)[1] for k in line_list2]
with open('E:/百度/val+cnn.txt', encoding='utf8') as file:
    line_list3 = [k.strip() for k in file.readlines()]
    val_label_list = [k.split()[0] for k in line_list3]
    val_content_list = [k.split(maxsplit=1)[1] for k in line_list3]

In [3]:
from keras.utils import plot_model
from keras.layers import Embedding
import gensim
from gensim.models.word2vec import Word2Vec,LineSentence,Text8Corpus

Using TensorFlow backend.


In [4]:
model = Word2Vec.load('cont.model')

In [5]:
word_vectors = model.wv

In [6]:
EMBEDDING_DIM = 128 # 词向量空间维度
num_classes=10 # 类型数量
maxlen=600 # 文本长度
max_words=10000 # 词汇表数量

In [7]:
embeddings_index = {}
for word, vocab_obj in model.wv.vocab.items():
    if int(vocab_obj.index) < max_words:
        embeddings_index[word] = word_vectors[word]
del model, word_vectors # 删掉gensim模型释放内存
print('Found %s word vectors.' % len(embeddings_index))

Found 10000 word vectors.


In [8]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

In [9]:
tokenizer = Tokenizer(num_words=max_words) # 传入我们词向量的字典
content_list=train_content_list+test_content_list+val_content_list
tokenizer.fit_on_texts(content_list) # 传入我们的训练数据，得到训练数据中出现的词的字典
sequences1 = tokenizer.texts_to_sequences(train_content_list) # 根据训练数据中出现的词的字典，将训练数据转换为sequences
sequences2 = tokenizer.texts_to_sequences(test_content_list) # 根据训练数据中出现的词的字典，将训练数据转换为sequences
sequences3 = tokenizer.texts_to_sequences(val_content_list) # 根据训练数据中出现的词的字典，将训练数据转换为sequences
word_index = tokenizer.word_index
print('Found %s unique tokens2.' % len(word_index))


Found 383707 unique tokens2.


In [10]:
from sklearn.preprocessing import LabelEncoder
label=LabelEncoder()
train_Y = to_categorical(label.fit_transform(train_label_list),num_classes=num_classes)
test_Y = to_categorical(label.fit_transform(test_label_list),num_classes=num_classes)
val_Y = to_categorical(label.fit_transform(val_label_list),num_classes=num_classes)

In [11]:
train_X = pad_sequences(sequences1,maxlen=maxlen,truncating='post',padding='post') # 和原作者的不同
test_X = pad_sequences(sequences2,maxlen=maxlen,truncating='post',padding='post') # 和原作者不同 
val_X = pad_sequences(sequences3,maxlen=maxlen,truncating='post',padding='post') # 和原作者不同 

In [12]:
from keras.models import Sequential 
from keras import layers

In [13]:
embedding_matrix = np.zeros((max_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= max_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # 文本数据中的词在词向量字典中没有，向量为取0；如果有则取词向量中该词的向量
        embedding_matrix[i] = embedding_vector

In [19]:
model = Sequential()
# 将预训练好的词向量加载如embedding layer
# 我们设置 trainable = False，代表词向量不作为参数进行更新
embedding_layer = model.add(layers.Embedding(max_words,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=maxlen,
                            trainable=False))
con=model.add(layers.Conv1D(filters=256,kernel_size=8))
maxpool=model.add(layers.MaxPooling1D(593))
flatten=model.add(layers.Flatten())
dense1=model.add(layers.Dense(128))  # 128是输出层的维度
dropout=model.add(layers.Dropout(0.2))  # 控制需要断开的神经元比例，此处应该为0.2
bn=model.add(layers.normalization.BatchNormalization(axis=1))  # 规范层加速
active1=model.add(layers.core.Activation('relu'))  # 激活函数为relu
dense2=model.add(layers.Dense(10)) #最后的输出层
predict_y=model.add(layers.core.Activation('softmax'))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 600, 128)          1280000   
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 593, 256)          262400    
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 1, 256)            0         
_________________________________________________________________
flatten_4 (Flatten)          (None, 256)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 128)               32896     
_________________________________________________________________
dropout_4 (Dropout)          (None, 128)               0         
_________________________________________________________________
batch_normalization_4 (Batch (None, 128)               512       
__________

In [20]:
from keras import optimizers
optimizer = optimizers.Adam(lr=0.001)
model.compile(optimizer=optimizer,loss='categorical_crossentropy',metrics=['categorical_accuracy'] )

In [21]:
import random
for i in range(5000):
    train_index = random.sample(list(range(len(train_Y))),k=64)
    X = train_X[train_index]
    Y = train_Y[train_index]
    model.fit(X,Y,verbose=0)
    step = i + 1 
    if step % 100 == 0:
        val_index = random.sample(list(range(len(val_Y))), k=100)
        x = val_X[val_index]
        y = val_Y[val_index]
        loss_value, accuracy_value = model.evaluate(x,y,verbose=0)
        print('step:%d loss:%.4f accuracy:%.4f' %(step, loss_value, accuracy_value))

step:100 loss:0.2306 accuracy:0.9200
step:200 loss:0.1763 accuracy:0.9300
step:300 loss:0.1097 accuracy:0.9600
step:400 loss:0.1031 accuracy:0.9700
step:500 loss:0.0786 accuracy:0.9800
step:600 loss:0.0945 accuracy:0.9700
step:700 loss:0.0637 accuracy:0.9800
step:800 loss:0.1440 accuracy:0.9500
step:900 loss:0.0485 accuracy:0.9800
step:1000 loss:0.0525 accuracy:0.9800
step:1100 loss:0.2993 accuracy:0.8800
step:1200 loss:0.1810 accuracy:0.9100
step:1300 loss:0.6527 accuracy:0.8000
step:1400 loss:0.0792 accuracy:0.9700
step:1500 loss:0.2569 accuracy:0.9400
step:1600 loss:0.2446 accuracy:0.9200
step:1700 loss:0.0918 accuracy:0.9700
step:1800 loss:0.0419 accuracy:0.9900
step:1900 loss:0.1250 accuracy:0.9800
step:2000 loss:0.1434 accuracy:0.9600
step:2100 loss:0.2688 accuracy:0.9200
step:2200 loss:0.0993 accuracy:0.9800
step:2300 loss:0.0320 accuracy:0.9900
step:2400 loss:0.0575 accuracy:0.9800
step:2500 loss:0.1194 accuracy:0.9600
step:2600 loss:0.0796 accuracy:0.9700
step:2700 loss:0.2312

In [22]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import mglearn
import matplotlib.pyplot as plt
def predictAll(test_X, batch_size=100):
    predict_value_list = []
    for i in range(0, len(test_X), batch_size):
        X = test_X[i: i + batch_size]
        predict_value = model.predict(X)
        predict_value_list.extend(predict_value)
    return np.array(predict_value_list)

Y = predictAll(test_X)
y = np.argmax(Y, axis=1)
predict_label_list = label.inverse_transform(y)
score_image = mglearn.tools.heatmap(confusion_matrix(test_label_list, predict_label_list),
                                    xlabel='Predicted label',ylabel='True label',xticklabels=label.classes_,
                                    yticklabels=label.classes_,cmap=plt.cm.gray_r,fmt='%d')
plt.rcParams['font.sans-serif']=['SimHei']
plt.rcParams['axes.unicode_minus'] = False
plt.title('混淆矩阵')
plt.gca().invert_yaxis()

In [23]:
from sklearn.metrics import precision_recall_fscore_support

def eval_model(y_true, y_pred, labels):
    # 计算每个分类的Precision, Recall, f1, support
    p, r, f1, s = precision_recall_fscore_support(y_true, y_pred)
    # 计算总体的平均Precision, Recall, f1, support
    tot_p = np.average(p, weights=s)
    tot_r = np.average(r, weights=s)
    tot_f1 = np.average(f1, weights=s)
    tot_s = np.sum(s)
    res1 = pd.DataFrame({
        u'Label': labels,
        u'Precision': p,
        u'Recall': r,
        u'F1': f1,
        u'Support': s
    })
    res2 = pd.DataFrame({
        u'Label': ['总体'],
        u'Precision': [tot_p],
        u'Recall': [tot_r],
        u'F1': [tot_f1],
        u'Support': [tot_s]
    })
    res2.index = [999]
    res = pd.concat([res1, res2])
    return res[['Label', 'Precision', 'Recall', 'F1', 'Support']]

eval_model(test_label_list, predict_label_list, label.classes_)

Unnamed: 0,Label,Precision,Recall,F1,Support
0,体育,0.996016,1.0,0.998004,1000
1,娱乐,0.996917,0.97,0.983274,1000
2,家居,0.990453,0.83,0.903156,1000
3,房产,0.960976,0.985,0.97284,1000
4,教育,0.967675,0.928,0.947422,1000
5,时尚,0.913761,0.996,0.95311,1000
6,时政,0.931262,0.989,0.959263,1000
7,游戏,0.955727,0.993,0.974007,1000
8,科技,0.992813,0.967,0.979737,1000
9,财经,0.954633,0.989,0.971513,1000
