In [17]:
import random
import jieba
import pandas as pd
import numpy as np

In [2]:
# Read the stop-word file as a one-column table (tab-separated, quoting
# disabled) and keep only that column's underlying array of values.
stopwords = pd.read_csv(
    r"E:\DeepLearning\jupyter_code\dataset\corpus\03_project\stopwords.txt",
    sep="\t",
    names=["stopword"],
    index_col=False,
    quoting=3,  # numeric value of csv.QUOTE_NONE
    encoding="utf-8",
)["stopword"].values

In [6]:
# Load the four corpus CSV files, one DataFrame per file.
laogong_df = pd.read_csv(
    r"E:\DeepLearning\jupyter_code\dataset\corpus\03_project\beilaogongda.csv",
    sep=",",
    encoding="utf-8",
)
laopo_df = pd.read_csv(
    r"E:\DeepLearning\jupyter_code\dataset\corpus\03_project\beilaopoda.csv",
    sep=",",
    encoding="utf-8",
)
erzi_df = pd.read_csv(
    r"E:\DeepLearning\jupyter_code\dataset\corpus\03_project\beierzida.csv",
    sep=",",
    encoding="utf-8",
)
nver_df = pd.read_csv(
    r"E:\DeepLearning\jupyter_code\dataset\corpus\03_project\beinverda.csv",
    sep=",",
    encoding="utf-8",
)

In [7]:
# Drop every row that contains a NaN, modifying each corpus frame in place.
for _frame in (laogong_df, laopo_df, erzi_df, nver_df):
    _frame.dropna(inplace=True)

In [8]:
# Pull the `segment` column of each frame out as a plain Python list.
laogong, laopo, erzi, nver = (
    frame["segment"].values.tolist()
    for frame in (laogong_df, laopo_df, erzi_df, nver_df)
)

In [10]:
# 分词和去掉停用词

## 定义分词和打标签函数preprocess_text
def preprocess_text(content_lines, sentences, category, stop_words=None):
    """Tokenize lines with jieba, filter the tokens and append labelled text.

    For every line, tokens that are purely numeric, whitespace-only, shorter
    than two characters, or listed as stop words are dropped. The surviving
    tokens are joined with single spaces and appended to ``sentences`` as a
    ``(text, category)`` tuple. A line whose tokens are all filtered out is
    still appended, with an empty string as its text.

    Args:
        content_lines: Iterable of raw text lines to tokenize.
        sentences: List extended in place with ``(text, category)`` tuples.
        category: Label stored alongside every processed line.
        stop_words: Optional iterable of words to discard. When ``None`` the
            module-level ``stopwords`` collection is used.

    Returns:
        None. Results are delivered through ``sentences``.

    Lines whose processing raises an exception are printed and skipped, so a
    single malformed entry does not abort the whole run.
    """
    # Build a hash set once: membership tests against the raw stop-word
    # array would rescan the whole array for every single token.
    stop_set = set(stopwords if stop_words is None else stop_words)

    for line in content_lines:
        try:
            kept = [
                token
                for token in jieba.lcut(line)
                if not str(token).isdigit()   # drop pure-digit tokens
                and token.strip()             # drop whitespace-only tokens
                and len(token) > 1            # drop single-character tokens
                and token not in stop_set     # drop stop words
            ]
            sentences.append((" ".join(kept), category))  # attach the label
        except Exception:
            # Best effort: show the offending line and move on.
            print(line)
            continue

# Run the preprocessing over every corpus; the position of a corpus in the
# tuple (0..3) is used as its class label.
sentences = []
for label, corpus in enumerate((laogong, laopo, erzi, nver)):
    preprocess_text(corpus, sentences, label)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\ADMINI~1\AppData\Local\Temp\jieba.cache
Loading model cost 1.337 seconds.
Prefix dict has been built succesfully.


In [11]:
# Shuffle the samples before the features and labels are extracted.
# `sentences` was filled one category after another, and the later split is
# positional, so the order must be mixed. A dedicated, seeded generator keeps
# the resulting order (and therefore the split) identical on every
# Restart & Run All, without touching the global `random` state.
SHUFFLE_SEED = 42
random.Random(SHUFFLE_SEED).shuffle(sentences)

# Print the first 10 (text, label) pairs as a quick sanity check.
for text, label in sentences[:10]:
    print(text, label)

# Parallel lists of all texts and their labels.
all_texts = [text for text, _ in sentences]
all_labels = [label for _, label in sentences]

报警 老婆 椅子 民警 到场 1
报警 老婆 民警 到场 1
家中 银行卡 不见 儿子 拿称 儿子 民警 到场 2
报警 儿子 民警 到场 通知 卫生局 民警 到场 确认 不用 自行 通知 2
报警 人称 老公 持械 民警 到场 0
报警 老婆 棍子 民警 到场 携带 防护 装备 1
报警 人称 女儿 手持 铁棍 无人 民警 到场 3
丈夫 赶出来 人伤 无需 救护 0
家暴 报警 妹妹 老公 妹妹 轻微伤 不明 民警 到场 携带 防护 装备 0
报警 人称 老公 持械 无需 救护车 携带 防护 装备 民警 到场 0


In [13]:
# 使用LSTM对数据进行分类
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, Flatten, Dropout
from keras.layers import LSTM, Embedding, GRU
from keras.models import Sequential

In [20]:
# Constants shared with the model cells below.
MAX_SEQENCE_LENGTH = 100   # length every sequence is padded/truncated to
EMBEDDING_DIM = 200        # size of each word-embedding vector
VALIDATION_SPLIT = 0.16    # fraction of the samples used for validation
TEST_SPLIT = 0.2           # fraction of the samples used for testing

# Fit the vocabulary on the full corpus and report its size.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_texts)
word_index = tokenizer.word_index
print("Found %s unique tokens." % len(word_index))

# Texts -> integer id sequences -> fixed-length matrix; labels -> one-hot.
sequences = tokenizer.texts_to_sequences(all_texts)
data = pad_sequences(sequences, maxlen=MAX_SEQENCE_LENGTH)
labels = to_categorical(np.asarray(all_labels))
print("data shape:", data.shape)
print("labels shape:", labels.shape)

Found 480 unique tokens.
data shape: (1674, 100)
labels shape: (1674, 4)


In [21]:
# Positional split: rows [0, p1) train, [p1, p2) validation, [p2, end) test.
n_samples = len(data)
p1 = int(n_samples * (1 - VALIDATION_SPLIT - TEST_SPLIT))
p2 = int(n_samples * (1 - TEST_SPLIT))

x_train, y_train = data[:p1], labels[:p1]      # training set
x_val, y_val = data[p1:p2], labels[p1:p2]      # validation set
x_test, y_test = data[p2:], labels[p2:]        # test set


In [23]:
# LSTM classifier: embedding -> LSTM -> dropout -> dense -> softmax output.
model = Sequential([
    Embedding(len(word_index) + 1, EMBEDDING_DIM, input_length=MAX_SEQENCE_LENGTH),
    LSTM(200, dropout=0.2, recurrent_dropout=0.2),
    Dropout(0.2),
    Dense(64, activation="relu"),
    Dense(labels.shape[1], activation="softmax"),  # one unit per class
])
model.summary()

# Compile the model.
model.compile(optimizer="rmsprop", loss="categorical_crossentropy", metrics=["acc"])
print(model.metrics_names)

# Train, persist the trained model, then report the metrics on the test set.
model.fit(x_train, y_train, batch_size=128, epochs=10, validation_data=(x_val, y_val))
model.save("lstm.h5")
print(model.evaluate(x_test, y_test))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 100, 200)          96200     
_________________________________________________________________
lstm_2 (LSTM)                (None, 200)               320800    
_________________________________________________________________
dropout_2 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 64)                12864     
_________________________________________________________________
dense_4 (Dense)              (None, 4)                 260       
Total params: 430,124
Trainable params: 430,124
Non-trainable params: 0
_________________________________________________________________
['loss', 'acc']
Train on 1071 samples, validate on 268 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7

In [26]:
# GRU classifier: embedding -> GRU -> dropout -> dense -> softmax output.
model = Sequential([
    Embedding(len(word_index) + 1, EMBEDDING_DIM, input_length=MAX_SEQENCE_LENGTH),
    GRU(200, dropout=0.2, recurrent_dropout=0.2),
    Dropout(0.2),
    Dense(64, activation="relu"),
    Dense(labels.shape[1], activation="softmax"),  # one unit per class
])
model.summary()

# Compile the model.
model.compile(optimizer="rmsprop", loss="categorical_crossentropy", metrics=["acc"])
print(model.metrics_names)

# Train, persist the trained model, then report the metrics on the test set.
model.fit(x_train, y_train, batch_size=128, epochs=10, validation_data=(x_val, y_val))
model.save("gru.h5")
print(model.evaluate(x_test, y_test))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 100, 200)          96200     
_________________________________________________________________
gru_1 (GRU)                  (None, 200)               240600    
_________________________________________________________________
dropout_3 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 64)                12864     
_________________________________________________________________
dense_6 (Dense)              (None, 4)                 260       
Total params: 349,924
Trainable params: 349,924
Non-trainable params: 0
_________________________________________________________________
['loss', 'acc']
Train on 1071 samples, validate on 268 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7