<a href="https://colab.research.google.com/github/biabiubong/bobingtest/blob/master/Untitled3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import json
import numpy as np
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
import jieba

jieba.setLogLevel('WARN')

class data_transform():
    def __init__(self):
        self.data_path = None
        self.data = None
        self.texts_cut = None
        self.tokenizer = None
        self.label_set = {}
        self.extraction = {}
        self.tokenizer_fact = None

    def read_data(self, path=None):
        '''
        读取json文件,必须readlines，否则中间有格式会报错
        :param path: 文件路径
        :return:json数据
        eg. data_valid = data_transform.read_data(path='./data/data_valid.json')
        '''
        self.data_path = path
        f = open(path, 'r', encoding='utf8')
        data_raw = f.readlines()
        data = []
        for num, data_one in enumerate(data_raw):
            try:
                data.append(json.loads(data_one))
            except Exception as e:
                print('1')
        self.data = data

    def extract_data(self, name='accusation'):
        '''
        提取需要的信息，以字典形式存储
        :param name: 提取内容
        :return: 事实描述,罪名,相关法条
        eg. data_valid_accusations = data_transform.extract_data(name='accusation')
        '''
        data = self.data
        if name == 'fact':
            extraction = list(map(lambda x: x['fact'], data))
        elif name in ['accusation', 'relevant_articles']:
            extraction = list(map(lambda x: x['meta'][name], data))
        elif name == 'imprisonment':
            extraction = []
            for i in data:
                if i['meta']['term_of_imprisonment']['death_penalty']:
                    extraction.append([500])
                elif i['meta']['term_of_imprisonment']['life_imprisonment']:
                    extraction.append([400])
                else:
                    extraction.append([i['meta']['term_of_imprisonment']['imprisonment']])
        self.extraction.update({name: extraction})

    def cut_texts(self, texts=None, need_cut=True, word_len=1, texts_cut_savepath=None):
        '''
        文本分词剔除停用词
        :param texts:文本列表
        :param need_cut:是否需要分词
        :param word_len:保留词语长度
        :param texts_cut_savepath:保存路径
        :return:
        '''
        if need_cut:
            if word_len > 1:
                texts_cut = [[word for word in jieba.lcut(one_text) if len(word) >= word_len] for one_text in texts]
            else:
                texts_cut = [jieba.lcut(one_text) for one_text in texts]
        else:
            if word_len > 1:
                texts_cut = [[word for word in one_text if len(word) >= word_len] for one_text in texts]
            else:
                texts_cut = texts

        if texts_cut_savepath is not None:
            with open(texts_cut_savepath, 'w') as f:
                json.dump(texts_cut, f)
        return texts_cut

    def text2seq(self, texts_cut=None, tokenizer_fact=None, num_words=2000, maxlen=30):
        '''
        文本转序列，训练集过大全部转换会内存溢出，每次放5000个样本
        :param texts_cut: 分词后的文本列表
        :param tokenizer:转换字典
        :param num_words:字典词数量
        :param maxlen:保留长度
        :return:向量列表
        eg. ata_transform.text2seq(texts_cut=train_fact_cut,num_words=2000, maxlen=500)
        '''
        texts_cut_len = len(texts_cut)

        if tokenizer_fact is None:
            tokenizer_fact = Tokenizer(num_words=num_words)
            if texts_cut_len > 10000:
                print('文本过多，分批转换')
            n = 0
            # 分批训练
            while n < texts_cut_len:
                tokenizer_fact.fit_on_texts(texts=texts_cut[n:n + 10000])
                n += 10000
                if n < texts_cut_len:
                    print('tokenizer finish fit samples')
                else:
                    print('tokenizer finish fit samples')
            self.tokenizer_fact = tokenizer_fact

        # 全部转为数字序列
        fact_seq = tokenizer_fact.texts_to_sequences(texts=texts_cut)
        print('finish texts to sequences')

        # 内存不够，删除
        del texts_cut

        n = 0
        fact_pad_seq = []
        # 分批执行pad_sequences
        while n < texts_cut_len:
            fact_pad_seq += list(pad_sequences(fact_seq[n:n + 10000], maxlen=maxlen,
                                               padding='post', value=0, dtype='int'))
            n += 10000
            if n < texts_cut_len:
                print('finish pad_sequences samples')
            else:
                print('finish pad_sequences samples')
        self.fact_pad_seq = fact_pad_seq

    def creat_label_set(self, name):
        '''
        获取标签集合，用于one-hot
        :param name: 待创建集合的标签名称
        :return:
        '''
        if name == 'accusation':
            name_f = 'accu'
        elif name == 'relevant_articles':
            name_f = 'law'
        with open('/content/accu.txt', encoding='utf-8') as f:
            label_set = f.readlines()
        label_set = [i[:-1] for i in label_set]
        self.label_set.update({name: np.array(label_set)})

    def creat_label(self, label, label_set):
        '''
        构建标签one-hot
        :param label: 原始标签
        :param label_set: 标签集合
        :return: 标签one-hot
        eg. creat_label(label=data_valid_accusations[12], label_set=accusations_set)
        '''
        label_str = [str(i) for i in label]
        label_zero = np.zeros(len(label_set))
        label_zero[np.in1d(label_set, label_str)] = 1
        return label_zero

    def creat_labels(self, label_set=None, labels=None, name='accusation'):
        '''
        调用creat_label遍历标签列表生成one-hot二维数组
        :param label_set: 标签集合,数组
        :param labels: 标签数据，二维列表，没有则调用extract_data函数提取
        :param name:
        :return:
        '''
        if label_set is None:
            label_set = self.label_set[name]
        if labels is None:
            labels = self.extraction[name]
        labels_one_hot = list(map(lambda x: self.creat_label(label=x, label_set=label_set), labels))
        return labels_one_hot

In [4]:
import json
import pickle
import jieba
import numpy as np
# -*- coding: utf-8 -*-
jieba.setLogLevel('WARN')

num_words = 40000
maxlen = 400
########################################################################################
# big数据集处理
data_transform_big = data_transform()

# 读取json文件,1710857行
data_transform_big.read_data(path='/content/data_test.json')

# 提取需要信息
data_transform_big.extract_data(name='fact')
# big_fact = data_transform_big.extraction['fact']

# 分词并保存原始分词结果，词语长度后期可以再改
for i in range(18):
    texts=data_transform_big.extraction['fact'][i*100000:(i*100000 + 100000)]
    big_fact_cut = data_transform_big.cut_texts(texts=texts, word_len=1,
                                                need_cut=True)
    with open('/content/datadeal/data_cut/big_fact_cut_%d_%d.pkl' % (i*100000, i*100000 + 100000), mode='wb') as f:
        pickle.dump(big_fact_cut, f)
    print('finish big_fact_cut_%d_%d' % (i*100000, i*100000 + 100000))

for i in range(18):
    print('start big_fact_cut_%d_%d' % (i*100000, i*100000 + 100000))
    with open('/content/datadeal/data_cut/big_fact_cut_%d_%d.pkl' % (i*100000, i*100000 + 100000), mode='rb') as f:
        big_fact_cut = pickle.load(f)
    data_transform_big = data_transform()
    big_fact_cut_new = data_transform_big.cut_texts(texts=big_fact_cut,
                                                    word_len=2,
                                                    need_cut=False)
    with open('/content/datadeal/data_cut/big_fact_cut_%d_%d_new.pkl' % (i*100000, i*100000 + 100000), mode='wb') as f:
        pickle.dump(big_fact_cut_new, f)
    print('finish big_fact_cut_%d_%d' % (i*100000, i*100000 + 100000))

1
finish big_fact_cut_0_100000
finish big_fact_cut_100000_200000
finish big_fact_cut_200000_300000
finish big_fact_cut_300000_400000
finish big_fact_cut_400000_500000
finish big_fact_cut_500000_600000
finish big_fact_cut_600000_700000
finish big_fact_cut_700000_800000
finish big_fact_cut_800000_900000
finish big_fact_cut_900000_1000000
finish big_fact_cut_1000000_1100000
finish big_fact_cut_1100000_1200000
finish big_fact_cut_1200000_1300000
finish big_fact_cut_1300000_1400000
finish big_fact_cut_1400000_1500000
finish big_fact_cut_1500000_1600000
finish big_fact_cut_1600000_1700000
finish big_fact_cut_1700000_1800000
start big_fact_cut_0_100000
finish big_fact_cut_0_100000
start big_fact_cut_100000_200000
finish big_fact_cut_100000_200000
start big_fact_cut_200000_300000
finish big_fact_cut_200000_300000
start big_fact_cut_300000_400000
finish big_fact_cut_300000_400000
start big_fact_cut_400000_500000
finish big_fact_cut_400000_500000
start big_fact_cut_500000_600000
finish big_fact_

In [6]:
import json
import pickle
import jieba
import numpy as np
# -*- coding: utf-8 -*-
jieba.setLogLevel('WARN')

num_words = 40000
maxlen = 400
########################################################################################
# big数据集处理
data_transform_big = data_transform()

# 读取json文件,1710857行
data_transform_big.read_data(path='/content/data_test.json')

# 创建数据one-hot标签
data_transform_big.extract_data(name='accusation')
big_accusations = data_transform_big.extraction['accusation']
data_transform_big.creat_label_set(name='accusation')
big_labels = data_transform_big.creat_labels(name='accusation')
np.save('/content/datadeal/labels/_accusation.npy', big_labels)

# big数据集处理
data_transform_big = data_transform()

# 读取json文件,1710857行
data_transform_big.read_data(path='/content/data_test.json')
data_transform_big.extract_data(name='relevant_articles')
big_relevant_articless = data_transform_big.extraction['relevant_articles']
data_transform_big.creat_label_set(name='relevant_articles')
big_labels = data_transform_big.creat_labels(name='relevant_articles')
np.save('/content/datadeal/labels/big_labels_relevant_articles.npy', big_labels)

# big数据集处理
data_transform_big = data_transform()

# 读取json文件,1710857行
data_transform_big.read_data(path='/content/data_test.json')

# 创建刑期连续变量
data_transform_big.extract_data(name='imprisonment')
big_imprisonments = data_transform_big.extraction['imprisonment']
np.save('/content/datadeal/labels/big_labels_imprisonments.npy', big_imprisonments)

# big数据集处理
data_transform_big = data_transform()

# 读取json文件,1710857行
data_transform_big.read_data(path='/content/data_test.json')

# 创建刑期离散变量
data_transform_big.extract_data(name='imprisonment')
big_imprisonments = data_transform_big.extraction['imprisonment']
data_transform_big.creat_label_set(name='imprisonment')
big_labels = data_transform_big.creat_labels(name='imprisonment')
np.save('/content/datadeal/labels/big_labels_imprisonments_discrete.npy', big_labels)

1
1
1
1


In [10]:
import pickle
import jieba
import json
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

jieba.setLogLevel('WARN')

num_words = 80000
maxlen = 400

tokenizer_fact = Tokenizer(num_words=num_words)

for i in range(18):
    print('start big_fact_cut_%d_%d' % (i * 100000, i * 100000 + 100000))
    with open('/content/datadeal/data_cut/big_fact_cut_%d_%d_new.pkl' % (i * 100000, i * 100000 + 100000), mode='rb') as f:
        big_fact_cut = pickle.load(f)
    texts_cut_len = len(big_fact_cut)
    n = 0
    # 分批训练
    while n < texts_cut_len:
        tokenizer_fact.fit_on_texts(texts=big_fact_cut[n:n + 10000])
        n += 10000
        if n < texts_cut_len:
            print('tokenizer finish fit %d samples' % n)
        else:
            print('tokenizer finish fit %d samples' % texts_cut_len)
    print('finish big_fact_cut_%d_%d' % (i * 100000, i * 100000 + 100000))

with open('/content/model/tokenizer_fact_%d.pkl' % (num_words), mode='wb') as f:
    pickle.dump(tokenizer_fact, f)

with open('/content/model/tokenizer_fact_%d.pkl' % (num_words), mode='rb') as f:
    tokenizer_fact=pickle.load(f)

# texts_to_sequences
for i in range(18):
    print('start big_fact_cut_%d_%d' % (i * 100000, i * 100000 + 100000))
    with open('/content/datadeal/data_cut/big_fact_cut_%d_%d_new.pkl' % (i * 100000, i * 100000 + 100000), mode='rb') as f:
        big_fact_cut = pickle.load(f)
    # 分批执行 texts_to_sequences
    big_fact_seq = tokenizer_fact.texts_to_sequences(texts=big_fact_cut)
    with open('/content/datadeal/fact_seq/fact_seq_%d_%d.pkl' % (i * 100000, i * 100000 + 100000), mode='wb') as f:
        pickle.dump(big_fact_seq, f)
    print('finish big_fact_cut_%d_%d' % (i * 100000, i * 100000 + 100000))

# pad_sequences
for i in range(18):
    print('start big_fact_cut_%d_%d' % (i * 100000, i * 100000 + 100000))
    with open('/content/datadeal/fact_seq/fact_seq_%d_%d.pkl' % (i * 100000, i * 100000 + 100000), mode='rb') as f:
        big_fact_seq = pickle.load(f)
    texts_cut_len = len(big_fact_seq)
    n = 0
    fact_pad_seq = []
    # 分批执行pad_sequences
    while n < texts_cut_len:
        fact_pad_seq += list(pad_sequences(big_fact_seq[n:n + 20000], maxlen=maxlen,
                                           padding='post', value=0, dtype='int'))
        n += 20000
        if n < texts_cut_len:
            print('finish pad_sequences %d samples' % n)
        else:
            print('finish pad_sequences %d samples' % texts_cut_len)
    with open('/content/datadeal/fact_pad_seq/fact_pad_seq_%d_%d_%d.pkl' % (maxlen, i * 100000, i * 100000 + 100000),
              mode='wb') as f:
        pickle.dump(fact_pad_seq, f)

# 汇总pad_sequences,5G,16G内存够用
maxlen = 400
num_words = 40000
fact_pad_seq = []
for i in range(18):
    print('start big_fact_cut_%d_%d' % (i * 100000, i * 100000 + 100000))
    with open('/content/datadeal/fact_pad_seq/fact_pad_seq_%d_%d_%d.pkl' % (maxlen, i * 100000, i * 100000 + 100000),
              mode='rb') as f:
        fact_pad_seq += pickle.load(f)
fact_pad_seq = np.array(fact_pad_seq)
np.save('/content/datadeal/fact_pad_seq/big_fact_pad_seq_%d_%d.npy' % (num_words, maxlen), fact_pad_seq)

start big_fact_cut_0_100000
tokenizer finish fit 6344 samples
finish big_fact_cut_0_100000
start big_fact_cut_100000_200000
finish big_fact_cut_100000_200000
start big_fact_cut_200000_300000
finish big_fact_cut_200000_300000
start big_fact_cut_300000_400000
finish big_fact_cut_300000_400000
start big_fact_cut_400000_500000
finish big_fact_cut_400000_500000
start big_fact_cut_500000_600000
finish big_fact_cut_500000_600000
start big_fact_cut_600000_700000
finish big_fact_cut_600000_700000
start big_fact_cut_700000_800000
finish big_fact_cut_700000_800000
start big_fact_cut_800000_900000
finish big_fact_cut_800000_900000
start big_fact_cut_900000_1000000
finish big_fact_cut_900000_1000000
start big_fact_cut_1000000_1100000
finish big_fact_cut_1000000_1100000
start big_fact_cut_1100000_1200000
finish big_fact_cut_1100000_1200000
start big_fact_cut_1200000_1300000
finish big_fact_cut_1200000_1300000
start big_fact_cut_1300000_1400000
finish big_fact_cut_1300000_1400000
start big_fact_cut_1

In [5]:
import numpy as np
from sklearn.metrics import f1_score


def predict2half(predictions):
    return np.where(predictions > 0.5, 1.0, 0.0)


def predict2top(predictions):
    one_hots = []
    for prediction in predictions:
        one_hot = np.where(prediction == prediction.max(), 1.0, 0.0)
        one_hots.append(one_hot)
    return np.array(one_hots)


def predict2both(predictions):
    one_hots = []
    for prediction in predictions:
        one_hot = np.where(prediction > 0.5, 1.0, 0.0)
        if one_hot.sum() == 0:
            one_hot = np.where(prediction == prediction.max(), 1.0, 0.0)
        one_hots.append(one_hot)
    return np.array(one_hots)


def f1_avg(y_pred, y_true):
    '''
    mission 1&2
    :param y_pred:
    :param y_true:
    :return:
    '''
    f1_micro = f1_score(y_pred=y_pred, y_true=y_true, pos_label=1, average='micro')
    f1_macro = f1_score(y_pred=y_pred, y_true=y_true, pos_label=1, average='macro')
    return (f1_micro + f1_macro) / 2


def distance_score(y_true, y_pred):
    '''
    mission 3
    :param y_true:
    :param y_pred:
    :return:
    '''
    result = 0
    n = len(y_true)
    for i in range(n):
        v = np.abs(np.log10(y_true[i][0] + 1) - np.log10(y_pred[i][0] + 1))
        if y_true[i][0] == 500:
            if y_pred[i][0] > 400:
                result += 1 / n
        elif y_true[i][0] == 400:
            if y_pred[i][0] <= 400 and y_pred[i][0] > 300:
                result += 1 / n
        else:
            if v <= 0.2:
                result += 1 / n
            elif v <= 0.4:
                result += 0.8 / n
            elif v <= 0.6:
                result += 0.6 / n
            elif v <= 0.8:
                result += 0.4 / n
            elif v <= 1.0:
                result += 0.2 / n
            else:
                pass
    return result


if __name__ == '__main__':
    print(f1_avg(y_pred=np.array([[0, 1], [1, 0]]),
                 y_true=np.array([[0, 1], [1, 1]])))

from keras.layers import Conv1D, BatchNormalization, Activation, GlobalMaxPool1D


def textcnn_one(word_vec=None, kernel_size=1, filters=512):
    x = word_vec
    x = Conv1D(filters=filters, kernel_size=[kernel_size], strides=1, padding='same')(x)
    x = BatchNormalization()(x)
    x = Activation(activation='relu')(x)
    x = Conv1D(filters=filters, kernel_size=[kernel_size], strides=1, padding='same')(x)
    x = BatchNormalization()(x)
    x = Activation(activation='relu')(x)
    x = GlobalMaxPool1D()(x)

    return x
if __name__ == '__main__':
    from keras.layers import Dense, Embedding, Input, Dropout
    from keras.layers import BatchNormalization, Concatenate
    from keras.models import Model
    from keras.utils import plot_model

    filters=256
    data_input = Input(shape=[400])
    word_vec = Embedding(input_dim=40000 + 1,
                         input_length=400,
                         output_dim=512,
                         mask_zero=False,
                         name='Embedding')(data_input)

    x1 = textcnn_one(word_vec=word_vec, kernel_size=1, filters=filters)
    x2 = textcnn_one(word_vec=word_vec, kernel_size=2, filters=filters)
    x3 = textcnn_one(word_vec=word_vec, kernel_size=3, filters=filters)
    x4 = textcnn_one(word_vec=word_vec, kernel_size=4, filters=filters)
    x5 = textcnn_one(word_vec=word_vec, kernel_size=5, filters=filters)

    x = Concatenate(axis=1)([x1, x2, x3, x4, x5])
    x = BatchNormalization()(x)
    x = Dense(500, activation="relu")(x)
    x = Dense(202, activation="sigmoid")(x)
    model = Model(inputs=data_input, outputs=x)
    plot_model(model, './textcnn.png', show_shapes=True)

0.8166666666666667


In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from keras.models import Model
from keras.layers import Dense, Embedding, Input,Dropout
from keras.layers import BatchNormalization, Concatenate
import pandas as pd
import time
from keras.models import load_model
#训练text_cnn

print('start', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))
print('accusation')
data_transform_big = data_transform()
num_words = 80000
maxlen = 400
filters = 256
print('num_words = 40000, maxlen = 400')

# fact数据集
fact = np.load('/content/datadeal/fact_pad_seq/big_fact_pad_seq_40000_400.npy')
fact_train, fact_test = train_test_split(fact, test_size=0.05, random_state=1)
del fact

# 标签数据集
labels = np.load('/content/datadeal/labels/_accusation.npy')
labels_train, labels_test = train_test_split(labels, test_size=0.05, random_state=1)
del labels
set_accusation = data_transform_big.creat_label_set(name='accusation')
# set_accusation = np.load('/content/datadeal/set/set_accusation.npy')

data_input = Input(shape=[maxlen])
word_vec = Embedding(input_dim=num_words + 1,
                     input_length=maxlen,
                     output_dim=512,
                     mask_zero=False,
                     name='Embedding')(data_input)

x1 = textcnn_one(word_vec=word_vec, kernel_size=1, filters=filters)
x2 = textcnn_one(word_vec=word_vec, kernel_size=2, filters=filters)
x3 = textcnn_one(word_vec=word_vec, kernel_size=3, filters=filters)
x4 = textcnn_one(word_vec=word_vec, kernel_size=4, filters=filters)
x5 = textcnn_one(word_vec=word_vec, kernel_size=5, filters=filters)

x = Concatenate(axis=1)([x1, x2, x3, x4, x5])
x = BatchNormalization()(x)
x = Dense(1000, activation="relu")(x)
x = Dropout(0.2)(x)
x = Dense(labels_train.shape[1], activation="sigmoid")(x)
model = Model(inputs=data_input, outputs=x)
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

n_start = 1
n_end = 21
score_list1 = []
score_list2 = []
for i in range(n_start, n_end):
    model.fit(x=fact_train, y=labels_train, batch_size=512, epochs=1, verbose=1)

    model.save('/content/model/%d_%d/accusation/TextCNN_%d_epochs_%d.h5' % (num_words, maxlen, filters, i))

    y = model.predict(fact_test[:])
    y1 = predict2top(y)
    y2 = predict2half(y)
    y3 = predict2both(y)

    print('%s accu:' % i)
    # 只取最高置信度的准确率
    s1 = [(labels_test[i] == y1[i]).min() for i in range(len(y1))]
    print(sum(s1) / len(s1))
    # 只取置信度大于0.5的准确率
    s2 = [(labels_test[i] == y2[i]).min() for i in range(len(y1))]
    print(sum(s2) / len(s2))
    # 结合前两个
    s3 = [(labels_test[i] == y3[i]).min() for i in range(len(y1))]
    print(sum(s3) / len(s3))

    print('%s f1:' % i)
    # 只取最高置信度的准确率
    s4 = f1_avg(y_pred=y1, y_true=labels_test)
    print(s4)
    # 只取置信度大于0.5的准确率
    s5 = f1_avg(y_pred=y2, y_true=labels_test)
    print(s5)
    # 结合前两个
    s6 = f1_avg(y_pred=y3, y_true=labels_test)
    print(s6)

    score_list1.append([i,
                        sum(s1) / len(s1),
                        sum(s2) / len(s2),
                        sum(s3) / len(s3)])
    score_list2.append([i, s4, s5, s6])

print(pd.DataFrame(score_list1))
print(pd.DataFrame(score_list2))
print('end', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))
print('#####################\n')
# nohup python model_CNN_accusation.py 2>&1 &

start 2024-04-21 08:03:59
accusation
num_words = 40000, maxlen = 400
 1/12 [=>............................] - ETA: 18:41 - loss: 0.8402 - accuracy: 0.0059

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from keras.models import Model
from keras.layers import Input, Embedding, Conv1D, GlobalMaxPool1D
from keras.layers import Dense, Dropout, BatchNormalization
from keras.layers import GRU, MaxPooling1D, Bidirectional
import pandas as pd
import time
from keras.models import load_model
#训练纯cnn


print('start', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))
print('accusation')

num_words = 80000
maxlen = 400
kernel_size = 3
DIM = 512
batch_size = 256

print('num_words = 40000, maxlen = 400 ')

# fact数据集
fact = np.load('/content/datadeal/fact_pad_seq/big_fact_pad_seq_40000_400.npy')
fact_train, fact_test = train_test_split(fact, test_size=0.05, random_state=1)
del fact

# 标签数据集
labels = np.load('/content/datadeal/labels/_accusation.npy')
labels_train, labels_test = train_test_split(labels, test_size=0.05, random_state=1)
del labels

# 数据增强
maxcount = 10000
num = 100
# index_add_accusation = np.load('')
# fact_train = np.concatenate([fact_train, fact_train[index_add_accusation]], axis=0)
# labels_train = np.concatenate([labels_train, labels_train[index_add_accusation]], axis=0)

data_input = Input(shape=[fact_train.shape[1]])
word_vec = Embedding(input_dim=num_words + 1,
                     input_length=maxlen,
                     output_dim=DIM,
                     mask_zero=0,
                     name='Embedding')(data_input)
x = word_vec
x = Conv1D(filters=512, kernel_size=[kernel_size], strides=1, padding='same', activation='relu')(x)
x = GlobalMaxPool1D()(x)
x = BatchNormalization()(x)
x = Dense(1000, activation="relu")(x)
x = Dropout(0.2)(x)
x = Dense(labels_train.shape[1], activation="sigmoid")(x)
model = Model(inputs=data_input, outputs=x)
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
# model.summary()
n_start = 1
n_end = 21
score_list1 = []
score_list2 = []

for i in range(n_start, n_end):
    model.fit(x=fact_train, y=labels_train, batch_size=batch_size, epochs=1, verbose=1)

    model.save('./model/%d_%d/accusation/CNN_epochs_%d.h5' % (num_words, maxlen, i))

    y = model.predict(fact_test[:])
    y1 = predict2top(y)
    y2 = predict2half(y)
    y3 = predict2both(y)

    print('%s accu:' % i)
    # 只取最高置信度的准确率
    s1 = [(labels_test[i] == y1[i]).min() for i in range(len(y1))]
    print(sum(s1) / len(s1))
    # 只取置信度大于0.5的准确率
    s2 = [(labels_test[i] == y2[i]).min() for i in range(len(y1))]
    print(sum(s2) / len(s2))
    # 结合前两个
    s3 = [(labels_test[i] == y3[i]).min() for i in range(len(y1))]
    print(sum(s3) / len(s3))

    print('%s f1:' % i)
    # 只取最高置信度的准确率
    s4 = f1_avg(y_pred=y1, y_true=labels_test)
    print(s4)
    # 只取置信度大于0.5的准确率
    s5 = f1_avg(y_pred=y2, y_true=labels_test)
    print(s5)
    # 结合前两个
    s6 = f1_avg(y_pred=y3, y_true=labels_test)
    print(s6)

    score_list1.append([i,
                        sum(s1) / len(s1),
                        sum(s2) / len(s2),
                        sum(s3) / len(s3)])
    score_list2.append([i, s4, s5, s6])
    print(pd.DataFrame(score_list1))
    print(pd.DataFrame(score_list2))

print('end', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))
print('#####################\n')

start 2024-04-21 08:07:05
accusation
num_words = 40000, maxlen = 400 
 3/24 [==>...........................] - ETA: 4:51 - loss: 0.6306 - accuracy: 0.0195

In [7]:
from keras.layers import *
from keras.models import *
from keras.utils import plot_model


def attention(input=None, depth=None):
    attention = Dense(1, activation='tanh')(input)
    attention = Flatten()(attention)
    attention = Activation('softmax')(attention)
    attention = RepeatVector(depth)(attention)
    attention = Permute([2, 1], name='attention_vec')(attention)
    attention_mul = Multiply(name='attention_mul')([input, attention])
    return attention_mul


if __name__ == '__main__':
    data_input = Input(shape=[400])
    word_vec = Embedding(input_dim=40000 + 1,
                         input_length=400,
                         output_dim=512,
                         mask_zero=False,
                         name='Embedding')(data_input)
    x = word_vec
    x = Conv1D(filters=512, kernel_size=[3], strides=1, padding='same', activation='relu')(x)
    x = attention(input=x, depth=512)
    x = GlobalMaxPool1D()(x)
    x = BatchNormalization()(x)
    x = Dense(500, activation="relu")(x)
    x = Dense(202, activation="sigmoid")(x)
    model = Model(inputs=data_input, outputs=x)
    plot_model(model, './attention.png', show_shapes=True)

In [9]:
#cnn_attention训练
import numpy as np
from sklearn.model_selection import train_test_split
from keras.models import Model
from keras.layers import Input, Embedding, Conv1D, GlobalMaxPool1D
from keras.layers import Dense, Dropout, BatchNormalization
from keras.layers import GRU, MaxPooling1D, Bidirectional
import pandas as pd
import time
from keras.models import load_model

data_transform_big = data_transform()
print('start', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))
print('accusation')
set_accusation = data_transform_big.creat_label_set(name='accusation')
num_words = 80000
maxlen = 400

print('num_words = 40000, maxlen = 400')

# fact数据集
fact = np.load('/content/datadeal/fact_pad_seq/big_fact_pad_seq_40000_400.npy')
fact_train, fact_test = train_test_split(fact, test_size=0.05, random_state=1)
del fact

# 标签数据集
labels = np.load('/content/datadeal/labels/_accusation.npy')
labels_train, labels_test = train_test_split(labels, test_size=0.05, random_state=1)
del labels



data_input = Input(shape=[maxlen])
word_vec = Embedding(input_dim=num_words + 1,
                     input_length=maxlen,
                     output_dim=512,
                     mask_zero=0,
                     name='Embedding')(data_input)
x = word_vec
x = Conv1D(filters=512, kernel_size=[3], strides=1, padding='same', activation='relu')(x)
x = attention(input=x, depth=512)
x = GlobalMaxPool1D()(x)
x = BatchNormalization()(x)
x = Dense(1000, activation="relu")(x)
x = Dropout(0.2)(x)
x = Dense(labels_train.shape[1], activation="sigmoid")(x)
model = Model(inputs=data_input, outputs=x)
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

n_start = 1
n_end = 21
score_list1 = []
score_list2 = []
for i in range(n_start, n_end):
    model.fit(x=fact_train, y=labels_train, batch_size=512, epochs=1, verbose=1)

    model.save('/content/model/CNN_attention_epochs_%d.h5' % (num_words, maxlen, i))

    y = model.predict(fact_test[:])
    y1 = predict2top(y)
    y2 = predict2half(y)
    y3 = predict2both(y)

    print('%s accu:' % i)
    # 只取最高置信度的准确率
    s1 = [(labels_test[i] == y1[i]).min() for i in range(len(y1))]
    print(sum(s1) / len(s1))
    # 只取置信度大于0.5的准确率
    s2 = [(labels_test[i] == y2[i]).min() for i in range(len(y1))]
    print(sum(s2) / len(s2))
    # 结合前两个
    s3 = [(labels_test[i] == y3[i]).min() for i in range(len(y1))]
    print(sum(s3) / len(s3))

    print('%s f1:' % i)
    # 只取最高置信度的准确率
    s4 = f1_avg(y_pred=y1, y_true=labels_test)
    print(s4)
    # 只取置信度大于0.5的准确率
    s5 = f1_avg(y_pred=y2, y_true=labels_test)
    print(s5)
    # 结合前两个
    s6 = f1_avg(y_pred=y3, y_true=labels_test)
    print(s6)

    score_list1.append([i,
                        sum(s1) / len(s1),
                        sum(s2) / len(s2),
                        sum(s3) / len(s3)])
    score_list2.append([i, s4, s5, s6])
    print(pd.DataFrame(score_list1))
    print(pd.DataFrame(score_list2))

print('end', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))

start 2024-04-21 08:12:39
accusation
num_words = 40000, maxlen = 400


KeyboardInterrupt: 

In [12]:
from keras.models import Model
from keras.layers import Dense, Input, Embedding
from keras.layers import GlobalMaxPool1D, Dropout, Conv1D, BatchNormalization, Activation, Add
from keras.utils import plot_model


def block(x, kernel_size):
    x_Conv_1 = Conv1D(filters=512, kernel_size=[kernel_size], strides=1, padding='same')(x)
    x_Conv_1 = Activation(activation='relu')(x_Conv_1)
    x_Conv_2 = Conv1D(filters=512, kernel_size=[kernel_size], strides=1, padding='same')(x_Conv_1)
    x_Conv_2 = Add()([x, x_Conv_2])
    x = Activation(activation='relu')(x_Conv_2)
    return x


if __name__ == '__main__':
    num_words = 80000
    maxlen = 400
    kernel_size = 3
    DIM = 512
    batch_size = 256

    data_input = Input(shape=[maxlen])
    word_vec = Embedding(input_dim=num_words + 1,
                         input_length=maxlen,
                         output_dim=DIM,
                         mask_zero=0,
                         name='Embedding')(data_input)
    block1 = block(x=word_vec, kernel_size=3)
    block2 = block(x=block1, kernel_size=3)
    x = GlobalMaxPool1D()(block2)
    x = BatchNormalization()(x)
    x = Dense(1000, activation="relu")(x)
    x = Dropout(0.2)(x)
    x = Dense(202, activation="sigmoid")(x)
    model = Model(inputs=data_input, outputs=x)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    plot_model(model, './resnet.png', show_shapes=True)

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from keras.models import Model
from keras.layers import Input, Embedding, Conv1D, GlobalMaxPool1D
from keras.layers import Dense, Dropout, BatchNormalization
from keras.layers import GRU, MaxPooling1D, Bidirectional
import pandas as pd
import time
# from resnet import block
# from evaluate import predict2both, predict2half, predict2top, f1_avg
from keras.models import load_model
#resmodel训练
print('start', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))
print('accusation')

num_words = 80000
maxlen = 400
kernel_size = 3
DIM = 512
batch_size = 512

print('num_words = 40000, maxlen = 400')

# fact数据集
fact = np.load('/content/datadeal/fact_pad_seq/big_fact_pad_seq_40000_400.npy')
fact_train, fact_test = train_test_split(fact, test_size=0.05, random_state=1)
del fact

# 标签数据集
labels = np.load('/content/datadeal/labels/_accusation.npy')
labels_train, labels_test = train_test_split(labels, test_size=0.05, random_state=1)
del labels

# 数据增强
maxcount = 60000
num = 10
# index_add_accusation = np.load('./data_deal/index_add_accusation_%d_%d.npy' % (maxcount, num))
# fact_train = np.concatenate([fact_train, fact_train[index_add_accusation]], axis=0)
# labels_train = np.concatenate([labels_train, labels_train[index_add_accusation]], axis=0)

data_input = Input(shape=[maxlen])
word_vec = Embedding(input_dim=num_words + 1,
                     input_length=maxlen,
                     output_dim=DIM,
                     mask_zero=0,
                     name='Embedding')(data_input)
block1 = block(x=word_vec, kernel_size=kernel_size)
block2 = block(x=block1, kernel_size=kernel_size)
x = GlobalMaxPool1D()(block2)
x = BatchNormalization()(x)
x = Dense(1000, activation="relu")(x)
x = Dropout(0.2)(x)
x = Dense(labels_train.shape[1], activation="sigmoid")(x)
model = Model(inputs=data_input, outputs=x)
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

n_start = 6
n_end = 13
score_list1 = []
score_list2 = []
for i in range(n_start, n_end):
    model.fit(x=fact_train, y=labels_train, batch_size=batch_size, epochs=1, verbose=2)

    model.save('/content/model/RES_epochs_%d.h5' % (num_words, maxlen, i))

    y = model.predict(fact_test[:])
    y1 = predict2top(y)
    y2 = predict2half(y)
    y3 = predict2both(y)

    print('%s accu:' % i)
    # 只取最高置信度的准确率
    s1 = [(labels_test[i] == y1[i]).min() for i in range(len(y1))]
    print(sum(s1) / len(s1))
    # 只取置信度大于0.5的准确率
    s2 = [(labels_test[i] == y2[i]).min() for i in range(len(y1))]
    print(sum(s2) / len(s2))
    # 结合前两个
    s3 = [(labels_test[i] == y3[i]).min() for i in range(len(y1))]
    print(sum(s3) / len(s3))

    print('%s f1:' % i)
    # 只取最高置信度的准确率
    s4 = f1_avg(y_pred=y1, y_true=labels_test)
    print(s4)
    # 只取置信度大于0.5的准确率
    s5 = f1_avg(y_pred=y2, y_true=labels_test)
    print(s5)
    # 结合前两个
    s6 = f1_avg(y_pred=y3, y_true=labels_test)
    print(s6)

    score_list1.append([i,
                        sum(s1) / len(s1),
                        sum(s2) / len(s2),
                        sum(s3) / len(s3)])
    score_list2.append([i, s4, s5, s6])

print(pd.DataFrame(score_list1))
print(pd.DataFrame(score_list2))
print('end', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))
print('#####################\n')
# nohup python model_CNN_accusation.py 2>&1 &

start 2024-04-21 08:16:04
accusation
num_words = 40000, maxlen = 400


In [None]:
import pickle
from keras.models import load_model
import numpy as np
import os

localpath = os.path.dirname(__file__)


class Predictor:
  #改一下路径
    def __init__(self, num_words=80000, maxlen=400,
                 accusation_path=localpath + '/model/accusation/DIM_256_CNN_BN_gpool_epochs_19.h5',
                 relevant_articles_path=localpath + '/model/relevant_articles/DIM_512_RES_BN_gpool_bs_256_epochs_18.h5',
                 imprisonments_path=localpath + '/model/imprisonments/DIM_512_CNN_gpool_BN_epochs_10.h5',
                 tokenizer_path=localpath + '/model/tokenizer_fact_80000.pkl'):
        self.num_words = num_words
        self.maxlen = maxlen
        self.accusation_path = accusation_path
        self.relevant_articles_path = relevant_articles_path
        self.batch_size = 512
        self.content_transform = data_transform()
        self.tokenizer_path = tokenizer_path
        self.model1 = load_model(accusation_path)
        self.model2 = load_model(relevant_articles_path)
        self.model3 = load_model(imprisonments_path)

    def predict(self, content):
        num_words = self.num_words
        maxlen = self.maxlen
        content_transform = self.content_transform
        tokenizer_path = self.tokenizer_path
        # 分词
        content_cut = content_transform.cut_texts(texts=content, word_len=2)
        with open(tokenizer_path, mode='rb') as f:
            tokenizer_fact = pickle.load(f)
        content_transform.text2seq(texts_cut=content_cut, tokenizer_fact=tokenizer_fact,
                                   num_words=num_words, maxlen=maxlen)
        content_fact_pad_seq = np.array(content_transform.fact_pad_seq)

        model1 = self.model1
        accusation = model1.predict(content_fact_pad_seq)
        model2 = self.model2
        relevant_articles = model2.predict(content_fact_pad_seq)
        model3 = self.model3
        imprisonments = model3.predict(content_fact_pad_seq)

        def transform(x):
            n = len(x)
            x_return = np.arange(1, n + 1)[x > 0.5].tolist()
            if len(x_return) == 0:
                x_return = np.arange(1, n + 1)[x == x.max()].tolist()
            return x_return

        result = []
        for i in range(0, len(content)):
            if imprisonments[i][0] > 400:
                imprisonment = -2
            elif imprisonments[i][0] > 300:
                imprisonment = -1
            else:
                imprisonment = int(np.round(imprisonments[i][0], 0))

            result.append({
                "accusation": transform(accusation[i]),
                "articles": transform(relevant_articles[i]),
                "imprisonment": imprisonment
            })
        return result


if __name__ == '__main__':
    content = ['我爱北京天安门', '收款方哪家口碑北京开始，数据库备份围绕健康上网电费']
    predictor = Predictor()
    m = predictor.predict(content)
    print(m)