In [None]:
# 查看当前挂载的数据集目录, 该目录下的变更重启环境后会自动还原
# View dataset directory. 
# This directory will be recovered automatically after resetting environment. 
!ls /home/aistudio/data

In [None]:
# 查看工作区文件, 该目录下的变更将会持久保存. 请及时清理不必要的文件, 避免加载过慢.
# View personal work directory. 
# All changes under this directory will be kept even after reset. 
# Please clean unnecessary files in time to speed up environment loading. 
!ls /home/aistudio/work

In [None]:
# 如果需要进行持久化安装, 需要使用持久化路径, 如下方代码示例:
# If a persistence installation is required, 
# you need to use the persistence path as the following: 
!mkdir /home/aistudio/external-libraries
!pip install beautifulsoup4 -t /home/aistudio/external-libraries

In [None]:
# 同时添加如下代码, 这样每次环境(kernel)启动的时候只要运行下方代码即可: 
# Also add the following code, 
# so that every time the environment (kernel) starts, 
# just run the following code: 
import sys 
sys.path.append('/home/aistudio/external-libraries')

# 简介
## paddlepaddle论文复现
- https://aistudio.baidu.com/aistudio/competition/detail/106
### 复现论文： 字符级CNN文本分类
- 论文名称： Character-level Convolutional Networks for Text Classification
- 数据集： AG News Dbpedia Yelp Binary classification Yelp Fine-grained classification
- 验收标准： 
	1. 复现论文中“Lg. Conv. Th.”模型（见Table 4）（参考论文和实现链接） 
    
   		- 	https://arxiv.org/pdf/1509.01626v3.pdf
    	-   https://github.com/gaussic/text-classification-cnn-rnn
      
	2. Amazon Review Full测试集error rate=40.45%，Yahoo! Answers测试集error rate=28.80%



# 处理输入数据

In [None]:
! head -n 1 work/data/cnews.train.txt

体育	马晓旭意外受伤让国奥警惕 无奈大雨格外青睐殷家军记者傅亚雨沈阳报道 来到沈阳，国奥队依然没有摆脱雨水的困扰。7月31日下午6点，国奥队的日常训练再度受到大雨的干扰，无奈之下队员们只慢跑了25分钟就草草收场。31日上午10点，国奥队在奥体中心外场训练的时候，天就是阴沉沉的，气象预报显示当天下午沈阳就有大雨，但幸好队伍上午的训练并没有受到任何干扰。下午6点，当球队抵达训练场时，大雨已经下了几个小时，而且丝毫没有停下来的意思。抱着试一试的态度，球队开始了当天下午的例行训练，25分钟过去了，天气没有任何转好的迹象，为了保护球员们，国奥队决定中止当天的训练，全队立即返回酒店。在雨中训练对足球队来说并不是什么稀罕事，但在奥运会即将开始之前，全队变得“娇贵”了。在沈阳最后一周的训练，国奥队首先要保证现有的球员不再出现意外的伤病情况以免影响正式比赛，因此这一阶段控制训练受伤、控制感冒等疾病的出现被队伍放在了相当重要的位置。而抵达沈阳之后，中后卫冯萧霆就一直没有训练，冯萧霆是7月27日在长春患上了感冒，因此也没有参加29日跟塞尔维亚的热身赛。队伍介绍说，冯萧霆并没有出现发烧症状，但为了安全起见，这两天还是让他静养休息，等感冒彻底好了之后再恢复训练。由于有了冯萧霆这个例子，因此国奥队对雨中训练就显得特别谨慎，主要是担心球员们受凉而引发感冒，造成非战斗减员。而女足队员马晓旭在热身赛中受伤导致无缘奥运的前科，也让在沈阳的国奥队现在格外警惕，“训练中不断嘱咐队员们要注意动作，我们可不能再出这样的事情了。”一位工作人员表示。从长春到沈阳，雨水一路伴随着国奥队，“也邪了，我们走到哪儿雨就下到哪儿，在长春几次训练都被大雨给搅和了，没想到来沈阳又碰到这种事情。”一位国奥球员也对雨水的“青睐”有些不解。


In [None]:
! head  work/data/cnews.vocab.txt

<PAD>
，
的
。
一
是
在
0
有
不


In [None]:
! wc -l work/data/cnews.vocab.txt

4999 work/data/cnews.vocab.txt


In [None]:
import sys
from collections import Counter

import numpy as np

In [None]:
# 将词转化成id
def read_vocab(vocab_dir):
    """读取词汇表"""
    # words = open_file(vocab_dir).read().strip().split('\n')
    with open(vocab_dir) as fp:
        # 如果是py2 则每个值都转化为unicode
        words = [_.strip() for _ in fp.readlines()]
    word_to_id = dict(zip(words, range(len(words))))
    return words, word_to_id

words, word_dict = read_vocab('work/data/cnews.vocab.txt')

print(list(word_dict.items())[:20])

[('<PAD>', 0), ('，', 1), ('的', 2), ('。', 3), ('一', 4), ('是', 5), ('在', 6), ('0', 7), ('有', 8), ('不', 9), ('了', 10), ('中', 11), ('1', 12), ('人', 13), ('大', 14), ('、', 15), ('国', 16), ('', 3903), ('2', 18), ('这', 19)]


In [None]:
# 生成批次数据

def batch_iter(x, y, batch_size=64):
    """生成批次数据"""
    data_len = len(x)
    num_batch = int((data_len - 1) / batch_size) + 1

    indices = np.random.permutation(np.arange(data_len))
    x_shuffle = x[indices]
    y_shuffle = y[indices]

    for i in range(num_batch):
        start_id = i * batch_size
        end_id = min((i + 1) * batch_size, data_len)
        yield x_shuffle[start_id:end_id], y_shuffle[start_id:end_id]

In [None]:
# 编码label

def read_category():
    """读取分类目录，固定"""
    categories = ['体育', '财经', '房产', '家居', '教育', '科技', '时尚', '时政', '游戏', '娱乐']

    categories = [x for x in categories]

    cat_to_id = dict(zip(categories, range(len(categories))))

    return categories, cat_to_id

categories, cat_dict = read_category()
print(list(cat_dict.items()))

[('体育', 0), ('财经', 1), ('房产', 2), ('家居', 3), ('教育', 4), ('科技', 5), ('时尚', 6), ('时政', 7), ('游戏', 8), ('娱乐', 9)]


In [None]:
# 编码输入输出文件

# 朝前面补充0
def pad(arr, l):
    if len(arr) > l:
        return arr[:l]
    else:
        return [0 for _ in range(l - len(arr))] + arr

# one hot 编码
def one_hot(pos, l):
    res = [0 for _ in range (l)]
    res[pos] = 1
    return res

def read_file(filename):
    """读取文件数据"""
    contents, labels = [], []
    with open(filename) as f:
        for line in f:
            try:
                label, content = line.strip().split('\t')
                if content:
                    contents.append(list(content))
                    labels.append(label)
            except:
                pass
    return contents, labels


def process_file(filename, word_to_id, cat_to_id, max_length=600):
    """将文件转换为id表示"""
    contents, labels = read_file(filename)

    data_id, label_id = [], []
    for i in range(len(contents)):
        data_id.append([word_to_id[x] for x in contents[i] if x in word_to_id])
        label_id.append(cat_to_id[labels[i]])
        # if i == 1000:
        #     break

    # 使用keras提供的pad_sequences来将文本pad为固定长度
    # print(data_id[:3])
    # print(label_id[:3])
    # x_pad = pad(data_id, max_length)
    x_pad = [pad(x, max_length) for x in data_id]
    # x_pad = map(data_id, pad(l = max_length))
    y_pad = [one_hot(x, len(cat_to_id)) for x in label_id]
    # y_pad = map(label_id, one_hot(l = len(cat_to_id)))
    # y_pad = one_hot(label_id, len(cat_to_id))  # 将标签转换为one-hot表示
    return x_pad, y_pad


train_x, train_y = process_file('work/data/cnews.train.txt', word_dict, cat_dict )


In [None]:
print(train_y[9999])

[0, 0, 0, 0, 0, 0, 0, 0, 0, 1]


# 模型构造
### 原始模型代码
```python
%tensorflow_version 1.x
import tensorflow as tf
print(tf.__version__)



class TCNNConfig(object):
    """CNN配置参数"""

    embedding_dim = 64  # 词向量维度
    seq_length = 600  # 序列长度
    num_classes = 10  # 类别数
    num_filters = 256  # 卷积核数目
    kernel_size = 5  # 卷积核尺寸
    vocab_size = 5000  # 词汇表达小

    hidden_dim = 128  # 全连接层神经元

    dropout_keep_prob = 0.5  # dropout保留比例
    learning_rate = 1e-3  # 学习率

    batch_size = 64  # 每批训练大小
    num_epochs = 10  # 总迭代轮次

    print_per_batch = 100  # 每多少轮输出一次结果
    save_per_batch = 10  # 每多少轮存入tensorboard


class TextCNN(object):
    """文本分类，CNN模型"""

    def __init__(self, config):
        self.config = config

        # 三个待输入的数据
        self.input_x = tf.placeholder(tf.int32, [None, self.config.seq_length], name='input_x')
        self.input_y = tf.placeholder(tf.float32, [None, self.config.num_classes], name='input_y')
        self.keep_prob = tf.placeholder(tf.float32, name='keep_prob')

        self.cnn()

    def cnn(self):
        """CNN模型"""
        # 词向量映射
        with tf.device('/cpu:0'):
            embedding = tf.get_variable('embedding', [self.config.vocab_size, self.config.embedding_dim])
            embedding_inputs = tf.nn.embedding_lookup(embedding, self.input_x)

        with tf.name_scope("cnn"):
            # CNN layer
            conv = tf.layers.conv1d(embedding_inputs, self.config.num_filters, self.config.kernel_size, name='conv')
            # global max pooling layer
            gmp = tf.reduce_max(conv, reduction_indices=[1], name='gmp')

        with tf.name_scope("score"):
            # 全连接层，后面接dropout以及relu激活
            fc = tf.layers.dense(gmp, self.config.hidden_dim, name='fc1')
            fc = tf.contrib.layers.dropout(fc, self.keep_prob)
            fc = tf.nn.relu(fc)

            # 分类器
            self.logits = tf.layers.dense(fc, self.config.num_classes, name='fc2')
            self.y_pred_cls = tf.argmax(tf.nn.softmax(self.logits), 1)  # 预测类别

        with tf.name_scope("optimize"):
            # 损失函数，交叉熵
            cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=self.input_y)
            self.loss = tf.reduce_mean(cross_entropy)
            # 优化器
            self.optim = tf.train.AdamOptimizer(learning_rate=self.config.learning_rate).minimize(self.loss)

        with tf.name_scope("accuracy"):
            # 准确率
            correct_pred = tf.equal(tf.argmax(self.input_y, 1), self.y_pred_cls)
            self.acc = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
```

### 原始模型结构

---------
|Variables: name (type shape) |[size]|
|-|-|
|embedding:0| (float32_ref 5000x64) [320000, bytes: 1280000]
|conv/kernel:0| (float32_ref 5x64x256) [81920, bytes: 327680]
|conv/bias:0| (float32_ref 256) [256, bytes: 1024]
|fc1/kernel:0| (float32_ref 256x128) [32768, bytes: 131072]
|fc1/bias:0| (float32_ref 128) [128, bytes: 512]
|fc2/kernel:0| (float32_ref 128x10) [1280, bytes: 5120]
|fc2/bias:0| (float32_ref 10) [10, bytes: 40]
|Total size of variables: |436362
|Total bytes of variables: |1745448


In [None]:
# 新建模型
# 定义 SimpleNet 网络结构
import paddle
from paddle.nn import Conv2D, MaxPool2D, Linear,Conv1D,max_pool1d
import paddle.nn.functional as F
import paddle.fluid as fluid


#config
class TCNNConfig(object):
    """CNN配置参数"""

    embedding_dim = 64  # 词向量维度
    seq_length = 600  # 序列长度
    num_classes = 10  # 类别数
    num_filters = 256  # 卷积核数目
    kernel_size = 5  # 卷积核尺寸
    vocab_size = 5000  # 词汇表达小

    hidden_dim = 128  # 全连接层神经元

    dropout_keep_prob = 0.5  # dropout保留比例
    learning_rate = 1e-3  # 学习率

    batch_size = 64  # 每批训练大小
    num_epochs = 10  # 总迭代轮次

    print_per_batch = 100  # 每多少轮输出一次结果
    save_per_batch = 10  # 每多少轮存入tensorboard



# 多层卷积神经网络实现
class CNN(paddle.nn.Layer):
     def __init__(self, config):
         super(CNN, self).__init__()
         self.config = config

         #embedding
         self.embedding = fluid.layers.embedding(input = inputs, size = [self.config.vocab_size, self.config.embedding_dim])
         # 定义卷积层，输出特征通道out_channels设置为20，卷积核的大小kernel_size为5，卷积步长stride=1，padding=2
         self.conv1 = Conv1D(in_channels= config.embedding_dim, out_channels= config.num_filters, kernel_size= config.kernel_size)
         # 定义池化层，池化核的大小kernel_size为2，池化步长为2
         self.max_pool1 = max_pool1d(kernel_size=64, stride=0)
         # 定义卷积层，输出特征通道out_channels设置为20，卷积核的大小kernel_size为5，卷积步长stride=1，padding=2
         self.conv2 = Conv2D(in_channels=20, out_channels=20, kernel_size=5, stride=1, padding=2)
         # 定义池化层，池化核的大小kernel_size为2，池化步长为2
         self.max_pool2 = MaxPool2D(kernel_size=2, stride=2)
         # 定义一层全连接层，输出维度是1
         self.fc = Linear(in_features=980, out_features=1)
         
    # 定义网络前向计算过程，卷积后紧接着使用池化层，最后使用全连接层计算最终输出
    # 卷积层激活函数使用Relu，全连接层不使用激活函数
     def forward(self, inputs):
         x = self.conv1(inputs)
         x = F.relu(x)
         x = self.max_pool1(x)
         x = self.conv2(x)
         x = F.relu(x)
         x = self.max_pool2(x)
         x = paddle.reshape(x, [x.shape[0], -1])
         x = self.fc(x)
         return x

In [None]:
!git clone https://github.com/beiyouwuyanzu/text_classification_cnn.git

Cloning into 'text_classification_cnn'...
remote: Enumerating objects: 6, done.[K
remote: Counting objects: 100% (6/6), done.[K
remote: Compressing objects: 100% (3/3), done.[K
remote: Total 6 (delta 0), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (6/6), done.
Checking connectivity... done.


In [23]:
!cd text_classification_cnn/ && git push origin HEAD:refs/for/main

fatal: unable to access 'https://github.com/beiyouwuyanzu/text_classification_cnn.git/': gnutls_handshake() failed: Error in the pull function.


请点击[此处](https://ai.baidu.com/docs#/AIStudio_Project_Notebook/a38e5576)查看本环境基本用法.  <br>
Please click [here ](https://ai.baidu.com/docs#/AIStudio_Project_Notebook/a38e5576) for more detailed instructions. 