In [1]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split

# Build balanced train/dev/test splits from the raw case data.
#
# Steps: keep the first accusation of every record, restrict the data to the
# ten most frequent accusations, keep the first SAMPLES_PER_CLASS rows of each
# class, then split with stratification and write the three CSV files.
SEED = 42                 # fixed seed so the saved splits are reproducible across runs
SAMPLES_PER_CLASS = 3900  # rows kept per accusation class
TEST_FRACTION = 2 / 13    # share of the balanced data held out as the test set
DEV_FRACTION = 1 / 11     # share of the remaining rows held out as the dev set

print('>>> 读取数据...')
data = pd.read_csv('../data/data.csv')
print('数据维度：', data.shape)

print('>>> 数据构建...')
# Multi-label records are ';'-separated; only the first accusation is kept.
# The vectorized .str accessor leaves missing values as NaN instead of raising,
# and those rows are then dropped by the isin() filter below.
data['accusation'] = data['accusation'].str.split(';').str[0]
# value_counts() already orders by descending frequency.
top10 = data['accusation'].value_counts().index[:10].to_list()
data = data.loc[data['accusation'].isin(top10)]

print('>>> 数据探索性分析...')
# Every row now carries exactly one accusation, so the column can be counted directly.
counter = Counter(data['accusation'])
print('类别：', counter.keys())
print('各类别样本数：\n', dict(counter))
print('最少类别样本数：', min(counter.values()))

print('>>> 数据集划分...')
columns = ['accusation', 'fact']
data = data[columns]
# NOTE(review): head() keeps the FIRST rows of each class in file order, not a
# random sample - confirm that the source file is not ordered in a biased way.
per_class = [data.loc[data['accusation'] == acc].head(SAMPLES_PER_CLASS) for acc in counter]
data = pd.concat(per_class, axis=0)
train, test = train_test_split(
    data, test_size=TEST_FRACTION, stratify=data['accusation'], random_state=SEED)
train, dev = train_test_split(
    train, test_size=DEV_FRACTION, stratify=train['accusation'], random_state=SEED)
print('训练集维度：', train.shape)
print('验证集维度：', dev.shape)
print('测试集维度：', test.shape)
train.to_csv('../data/train.csv', index=False)
test.to_csv('../data/test.csv', index=False)
dev.to_csv('../data/dev.csv', index=False)
# >>> 读取数据...
# 数据维度： (357454, 5)
# >>> 数据构建...
# >>> 数据探索性分析...
# 各类别样本数：
#  {'危险驾驶': 76527, '交通肇事': 36709, '盗窃': 80206, '信用卡诈骗': 5153, '容留他人吸毒': 13720, '故意伤害': 42809, '抢劫': 6128, '寻衅滋事': 7791, '走私、贩卖、运输、制造毒品': 27157, '诈骗': 11193}
# 最少类别样本数： 5153
# >>> 数据集划分...
# 训练集维度： (30000, 2)
# 验证集维度： (3000, 2)
# 测试集维度： (6000, 2)

>>> 读取数据...
数据维度： (357454, 5)
>>> 数据构建...
>>> 数据探索性分析...
类别： dict_keys(['危险驾驶', '交通肇事', '盗窃', '信用卡诈骗', '容留他人吸毒', '故意伤害', '抢劫', '寻衅滋事', '走私、贩卖、运输、制造毒品', '诈骗'])
各类别样本数：
 {'危险驾驶': 76527, '交通肇事': 36709, '盗窃': 80206, '信用卡诈骗': 5153, '容留他人吸毒': 13720, '故意伤害': 42809, '抢劫': 6128, '寻衅滋事': 7791, '走私、贩卖、运输、制造毒品': 27157, '诈骗': 11193}
最少类别样本数： 5153
>>> 数据集划分...
训练集维度： (30000, 2)
验证集维度： (3000, 2)
测试集维度： (6000, 2)


In [1]:
# dl_eda.py

import pandas as pd

# Load the train/test CSV splits from the data directory.
print('>>> 读取数据...')
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

# Median number of characters per `fact` text in the training split.
fact_lengths = train['fact'].map(len)
wordslen_median = fact_lengths.median()
print('字符数中位数：', wordslen_median)

In [6]:
# rnn_model.py

import tensorflow as tf

class TRNNConfig:
    '''Hyper-parameters for the RNN text classifier.'''

    # --- 1. Model parameters ---
    # Vocabulary size.
    vocab_size = 5000

    # Word-embedding dimension.
    embedding_dim = 64
    # Sequence length.
    seq_length = 300

    # Hidden-layer dimension.
    hidden_dim = 128
    # Recurrent unit type.
    rnn = 'gru'
    # Number of hidden layers.
    num_layers = 2

    # Number of classes.
    # NOTE(review): the preprocessing cell keeps only 10 accusation classes -
    # confirm that 30 is the intended value here.
    num_classes = 30

    # --- 2. Training parameters ---
    # Dropout keep probability (cnn: 0.5).
    dropout_keep_prob = 0.8
    # Learning rate.
    learning_rate = 0.001

    # Training batch size.
    batch_size = 128
    # Number of passes over the full dataset.
    num_epochs = 10

    # Report results every this many rounds.
    print_per_batch = 100
    # Save results every this many rounds.
    save_per_batch = 10
    
class TextRNN(object):
    '''RNN text classification model (TensorFlow 1.x graph mode).

    Constructing an instance creates the input placeholders and builds the
    whole graph: embedding lookup -> stacked recurrent cells wrapped in
    dropout -> dense + dropout + ReLU -> logits, plus the loss, the Adam
    training op and the accuracy tensor.

    Args:
        config: object exposing the attributes read here: ``seq_length``,
            ``num_classes``, ``vocab_size``, ``embedding_dim``, ``hidden_dim``,
            ``rnn``, ``num_layers`` and ``learning_rate``.

    Attributes:
        input_x: int32 placeholder of shape [None, seq_length].
        input_y: float32 placeholder of shape [None, num_classes].
        keep_prob: float32 placeholder with the dropout keep probability.
        logits: output of the final dense layer.
        y_pred_cls: argmax of the softmax over ``logits``.
        loss: mean softmax cross-entropy.
        optim: Adam op minimizing ``loss``.
        acc: mean of the per-example correctness indicator.
    '''

    def __init__(self, config):
        self.config = config
        self.input_x = tf.placeholder(tf.int32, [None, self.config.seq_length], name='input_x')
        self.input_y = tf.placeholder(tf.float32, [None, self.config.num_classes], name='input_y')
        self.keep_prob = tf.placeholder(tf.float32, name='keep_prob')
        self.rnn()

    def rnn(self):
        '''Build the graph and attach the resulting tensors/ops to ``self``.'''

        def lstm_cell():
            # LSTM cell.
            return tf.contrib.rnn.BasicLSTMCell(self.config.hidden_dim, state_is_tuple=True)

        def gru_cell():
            # GRU cell.
            return tf.contrib.rnn.GRUCell(self.config.hidden_dim)

        def dropout():
            # One recurrent cell wrapped in output dropout.
            # The cell type comes from the config; `self.rnn` is this method
            # itself, so comparing it with a string would never match.
            # Any value other than 'lstm' selects the GRU cell.
            if self.config.rnn == 'lstm':
                cell = lstm_cell()
            else:
                cell = gru_cell()
            return tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=self.keep_prob)

        # The embedding table and its lookup are pinned to the CPU.
        with tf.device('/cpu:0'):
            embedding = tf.get_variable('embedding', [self.config.vocab_size, self.config.embedding_dim])
            embedding_inputs = tf.nn.embedding_lookup(embedding, self.input_x)

        with tf.name_scope('rnn'):
            # A fresh cell object is created for every layer.
            cells = [dropout() for _ in range(self.config.num_layers)]
            rnn_cell = tf.contrib.rnn.MultiRNNCell(cells, state_is_tuple=True)

            _output, _ = tf.nn.dynamic_rnn(cell=rnn_cell, inputs=embedding_inputs, dtype=tf.float32)
            # Keep only the last position along axis 1 as the sequence summary.
            last = _output[:, -1, :]

        with tf.name_scope('score'):
            fc = tf.layers.dense(last, self.config.hidden_dim, name='fc1')
            fc = tf.contrib.layers.dropout(fc, self.keep_prob)
            fc = tf.nn.relu(fc)

            self.logits = tf.layers.dense(fc, self.config.num_classes, name='fc2')
            self.y_pred_cls = tf.argmax(tf.nn.softmax(self.logits), 1)

        with tf.name_scope('optimize'):
            cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=self.input_y)
            self.loss = tf.reduce_mean(cross_entropy)
            self.optim = tf.train.AdamOptimizer(learning_rate=self.config.learning_rate).minimize(self.loss)

        with tf.name_scope('accuracy'):
            correct_pred = tf.equal(tf.argmax(self.input_y, 1), self.y_pred_cls)
            self.acc = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [44]:
from collections import Counter

# Relative locations of the train split, the test split and the vocabulary file.
train_path, test_path, vocab_path = (
    '../data/train.csv',
    '../data/test.csv',
    '../data/vocab.txt',
)

def build_vocab(train_path, text_col, vocab_path, vocab_size=5000):
    '''Build a character-level vocabulary file from a CSV column.

    Counts every character of every text in ``text_col`` and writes the
    ``vocab_size`` most frequent characters to ``vocab_path``, one per line,
    most frequent first.

    Args:
        train_path: path of the CSV file to read.
        text_col: name of the column holding the texts.
        vocab_path: path of the vocabulary file to write (UTF-8).
        vocab_size: maximum number of characters to keep.

    Returns:
        None. The vocabulary is written to ``vocab_path``; an input without
        any text produces an empty file.
    '''
    df = pd.read_csv(train_path)

    # Update the counter text by text instead of first materializing one list
    # of all characters. Missing values are skipped rather than crashing.
    counter = Counter()
    for text in df[text_col].dropna().astype(str):
        counter.update(text)

    # NOTE(review): a line-break character inside a text would become a token
    # and break the one-token-per-line file format - confirm inputs are single-line.
    words = [word for word, _ in counter.most_common(vocab_size)]

    # Explicit encoding: the platform default is not guaranteed to handle
    # non-ASCII text.
    with open(vocab_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(words))

def word_encode():
    '''Placeholder for text encoding; not implemented yet (returns None).'''
    return None

def label_encode():
    '''Placeholder for label encoding; not implemented yet (returns None).'''
    return None

def process_file(data_path, text_col, label_col, vocab_path):
    '''Load a dataset split and the vocabulary file (work in progress).

    Args:
        data_path: path of the CSV file holding the split.
        text_col: name of the text column.
        label_col: name of the label column.
        vocab_path: path of the vocabulary file (UTF-8, one token per line).

    Returns:
        list[str]: the vocabulary lines exactly as produced by ``readlines()``,
        i.e. each entry except possibly the last keeps its trailing newline.
    '''
    df = pd.read_csv(data_path)
    # TODO: texts/labels are loaded but not encoded yet; the encoding step
    # is still to be written.
    texts, labels = df[text_col].to_list(), df[label_col].to_list()
    # Explicit encoding: the platform default is not guaranteed to decode
    # non-ASCII tokens.
    with open(vocab_path, 'r', encoding='utf-8') as f:
        vocab = f.readlines()
    return vocab
    
    


# Vocabulary construction call, left disabled.
# build_vocab(train_path, 'fact', vocab_path, 5000)
process_file(
    data_path=train_path,
    text_col='fact',
    label_col='accusation',
    vocab_path=vocab_path,
)

['，\n',
 '某\n',
 '人\n',
 '1\n',
 '被\n',
 '0\n',
 '2\n',
 '。\n',
 '、\n',
 '的\n',
 '告\n',
 '×\n',
 '证\n',
 '定\n',
 '民\n',
 '公\n',
 '事\n',
 '年\n',
 '月\n',
 '车\n',
 '3\n',
 '在\n',
 '经\n',
 '5\n',
 '机\n',
 '日\n',
 '市\n',
 '为\n',
 '中\n',
 '行\n',
 '4\n',
 '后\n',
 '时\n',
 '实\n',
 '6\n',
 '害\n',
 '法\n',
 '述\n',
 '元\n',
 '认\n',
 '关\n',
 '驶\n',
 '上\n',
 '案\n',
 '检\n',
 '以\n',
 '处\n',
 '鉴\n',
 '区\n',
 '安\n',
 '其\n',
 '院\n',
 '指\n',
 '路\n',
 '7\n',
 '控\n',
 '一\n',
 '8\n',
 '‘\n',
 '查\n',
 '发\n',
 '诉\n',
 '9\n',
 '许\n',
 '刑\n',
 '伤\n',
 '驾\n',
 '等\n',
 '书\n',
 '交\n',
 '场\n',
 '明\n',
 '据\n',
 '现\n',
 '县\n',
 '并\n',
 '甲\n',
 '了\n',
 '理\n',
 '通\n',
 '供\n',
 '至\n',
 '和\n',
 '获\n',
 '号\n',
 '（\n',
 '）\n',
 '张\n',
 '：\n',
 '王\n',
 '故\n',
 '有\n',
 '察\n',
 '及\n',
 '毒\n',
 '过\n',
 '李\n',
 '当\n',
 '犯\n',
 '陈\n',
 '到\n',
 '警\n',
 '价\n',
 '币\n',
 '将\n',
 '酒\n',
 '分\n',
 '成\n',
 '与\n',
 '本\n',
 'X\n',
 '.\n',
 '出\n',
 '内\n',
 '任\n',
 '部\n',
 '共\n',
 '向\n',
 '品\n',
 '大\n',
 '审\n',
 '；\n',
 '盗\n',
 '物\n',
 '罪\n',


In [41]:
# Scratch check: pair keys with values via zip and materialize the pairs as a dict.
dict(zip([1,2,3], ['a', 'b', 'c']))

{1: 'a', 2: 'b', 3: 'c'}

In [42]:
# Scratch check: a bare zip is not materialized, so the cell shows the zip object's repr.
zip([1,2,3], ['a', 'b', 'c'])

<zip at 0x1cdcb4e88>

In [6]:
import tensorflow as tf
import numpy as np

# Demo of tf.nn.embedding_lookup.

# Placeholder that receives the index sequences (7 ids per row).
input_ids = tf.placeholder(dtype=tf.int32, shape=[None, 7])

# Alternative lookup table: a 5x5 identity matrix held in a tf.Variable.
# embedding = tf.Variable(np.identity(5, dtype=np.int32))

# Fixed 5x3 lookup table used here; row i holds [i + 0.1, i + 0.2, i + 0.3].
embedding = np.asarray([[0.1, 0.2, 0.3], [1.1, 1.2, 1.3], [2.1, 2.2, 2.3], [3.1, 3.2, 3.3], [4.1, 4.2, 4.3]])

# For every id in input_ids, fetch the matching row of embedding.
input_embedding = tf.nn.embedding_lookup(embedding, input_ids)

# A context-managed Session is closed when the block exits, so re-running the
# cell does not leave an extra open session behind.
with tf.Session() as sess:
    # No-op while `embedding` is a NumPy array; required if the tf.Variable
    # alternative above is enabled.
    sess.run(tf.global_variables_initializer())
    # print(embedding.eval())
    print(sess.run(input_embedding, feed_dict={input_ids: [[1, 2, 3, 0, 3, 2, 1], [1,3,0,3,2,2,2]]}))


[[[1.1 1.2 1.3]
  [2.1 2.2 2.3]
  [3.1 3.2 3.3]
  [0.1 0.2 0.3]
  [3.1 3.2 3.3]
  [2.1 2.2 2.3]
  [1.1 1.2 1.3]]

 [[1.1 1.2 1.3]
  [3.1 3.2 3.3]
  [0.1 0.2 0.3]
  [3.1 3.2 3.3]
  [2.1 2.2 2.3]
  [2.1 2.2 2.3]
  [2.1 2.2 2.3]]]


