<a href="https://colab.research.google.com/github/ZhouNLP/tcnlp/blob/master/lstm_model/lstm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

此模型根据其他选手公开的模型修改而来，原模型见：https://github.com/LogicJake/tianchi_nlp/blob/master/model_lstm_5fold.ipynb

原模型测试集A f1 95.63，调整后的模型测试集A 96.68，测试集B 96.74

如果在 Colab 里面运行，需自行上传除 .ipynb 以外的其他项目文件（例如预训练词向量 word2vec.txt，本 notebook 不会自动下载它）。

Keras==2.3.1

tensorflow==1.15.1

In [None]:
# Download the three competition archives: training set, test set A and test set B.
!wget https://tianchi-competition.oss-cn-hangzhou.aliyuncs.com/531810/train_set.csv.zip
!wget https://tianchi-competition.oss-cn-hangzhou.aliyuncs.com/531810/test_a.csv.zip
!wget https://tianchi-competition.oss-cn-hangzhou.aliyuncs.com/531810/test_b.csv.zip
# Extract them; the data-loading cell reads train_set.csv / test_a.csv / test_b.csv
# from the current directory.
!unzip train_set.csv.zip
!unzip test_a.csv.zip
!unzip test_b.csv.zip

In [None]:
# Pseudo-label file; the data-loading cell reads it as `test_a_fake_label.csv`.
!wget https://drive.deepnlp.workers.dev/test_a_fake_label.csv  # pseudo-label file

In [None]:
%tensorflow_version 1.x

In [None]:
import os
import numpy as np
import pandas as pd
import warnings
from gensim.models import KeyedVectors
from tqdm import tqdm
import random
import tensorflow as tf
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold, StratifiedKFold
# from keras_self_attention import SeqSelfAttention, SeqWeightedAttention

from keras import backend as K
from keras.preprocessing import text, sequence
from keras import Model
from keras.layers import Conv1D, Embedding, Input, Bidirectional, CuDNNLSTM, Dense, Concatenate, Masking, LSTM, SpatialDropout1D
from keras.layers import BatchNormalization, Dropout, Activation
from keras.layers import GlobalMaxPool1D, GlobalAveragePooling1D, GlobalAvgPool1D, GlobalMaxPooling1D
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, Callback
from keras.utils import to_categorical
from keras_radam import RAdam
from keras_lookahead import Lookahead
# from keras import regularizers
# os.environ["CUDA_VISIBLE_DEVICES"] = '1'
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

warnings.filterwarnings('ignore')

In [None]:
!pip install keras-rectified-adam
!pip install keras-lookahead

In [None]:
def fix_seed(seed):
    """Seed every random number generator this notebook has in scope.

    Args:
        seed: Integer seed applied to Python's ``random`` module, NumPy's
            global generator and TensorFlow (via ``tf.set_random_seed``).
    """
    # `random` is imported by the notebook; leaving it unseeded would make any
    # code path that draws from it non-reproducible.
    random.seed(seed)
    np.random.seed(seed)
    tf.set_random_seed(seed)

# Global seed; also passed as `random_state` to the StratifiedKFold splits
# in the training and prediction cells.
seed = 2020
fix_seed(seed)

In [None]:
# Load the training set and both test sets (tab-separated files).
df_train = pd.read_csv('train_set.csv', sep='\t')
df_test = pd.read_csv('test_a.csv', sep='\t')
df_test_b = pd.read_csv('test_b.csv', sep='\t')
# Pseudo-label file. When training from scratch, remove this line and the
# related pseudo-label code in the cells below.
fake_label_a = pd.read_csv('test_a_fake_label.csv', sep='\t')

# Stack train -> test A -> test B with a fresh 0..N-1 index. A single
# pd.concat replaces the chained DataFrame.append calls, which were removed in
# pandas 2.0, and avoids building an intermediate frame.
df_data = pd.concat([df_train, df_test, df_test_b], ignore_index=True)
print(df_data.shape, fake_label_a.shape)

In [None]:
# Passed as `num_words` to the Keras Tokenizer.
max_words_num = None
# Length every token sequence is padded / truncated to.
seq_len = 2000
# Number of columns of the embedding matrix; the word2vec file is expected to
# hold vectors of this width.
embedding_dim = 200
# DataFrame column that holds the text to tokenize.
col = 'text'

In [None]:
print('Generate seqs')
# Directory that caches the tokenized sequences between runs.
os.makedirs('seqs', exist_ok=True)
# These two cache names embed the vocabulary cap and the sequence length.
seq_path = 'seqs/seqs_{}_{}.npy'.format(max_words_num, seq_len)
word_index_path = 'seqs/word_index_{}_{}.npy'.format(max_words_num, seq_len)
# NOTE(review): unlike the two paths above, this name does not encode
# `max_words_num` / `seq_len`, so a pseudo-label array cached under different
# settings would be picked up unchanged.
fake_path = 'seqs/fake_label.npy'

In [None]:
# Long documents keep their first and last seq_len/2 tokens. (Original author's
# note: keeping 3000 tokens scores slightly better but trains much slower.)

def _head_tail_truncate(doc, max_len):
    """Return `doc` unchanged if it fits, else its head and tail joined to `max_len` tokens."""
    if len(doc) <= max_len:
        return doc
    head = max_len // 2
    tail = max_len - head
    return doc[:head] + doc[len(doc) - tail:]


def _texts_to_padded_seqs(tokenizer, texts, max_len):
    """Tokenize `texts`, head/tail-truncate each document and pad to `max_len`."""
    token_ids = tokenizer.texts_to_sequences(texts)
    clipped = [_head_tail_truncate(doc, max_len) for doc in token_ids]
    return sequence.pad_sequences(clipped, maxlen=max_len,
                                  padding='post', truncating='pre')


# Rebuild when ANY cache file is missing. The previous check ignored
# `fake_path`, although the cached branch below loads it unconditionally.
cache_paths = (seq_path, word_index_path, fake_path)
if not all(os.path.exists(path) for path in cache_paths):
    tokenizer = text.Tokenizer(num_words=max_words_num, lower=False, filters='')
    tokenizer.fit_on_texts(df_data[col].values.tolist())
    seqs = _texts_to_padded_seqs(tokenizer, df_data[col].values.tolist(), seq_len)
    word_index = tokenizer.word_index
    # Pseudo-label handling; can be removed when training from scratch.
    fake_seqs = _texts_to_padded_seqs(tokenizer, fake_label_a[col].values.tolist(), seq_len)

    np.save(fake_path, fake_seqs)
    np.save(seq_path, seqs)
    np.save(word_index_path, word_index)

else:
    fake_seqs = np.load(fake_path)
    seqs = np.load(seq_path)
    # The word index was saved as a pickled dict inside a 0-d object array.
    word_index = np.load(word_index_path, allow_pickle=True).item()
    
# Pre-trained 200-dimensional word2vec vectors, prepared ahead of time.
embedding_path = 'word2vec.txt'
model = KeyedVectors.load_word2vec_format(embedding_path)

# One row per token id; rows for tokens without a pre-trained vector stay zero.
embedding = np.zeros((len(word_index) + 1, embedding_dim))
for word, i in tqdm(word_index.items()):
    if word in model:
        embedding[i] = model[word]

# Rescale the whole matrix by its overall standard deviation.
embedding /= np.std(embedding)

In [None]:
"""# 模型训练"""

# Output folders for checkpoints, submissions and saved probabilities.
for out_dir in ('model', 'sub', 'prob'):
    os.makedirs(out_dir, exist_ok=True)

# Rows that carry a label are the training samples.
all_index = df_data[df_data['label'].notnull()].index.tolist()
# df_data stacks test A and then test B after the training rows, so the
# trailing len(df_test_b) unlabelled rows are exactly test set B. The size is
# derived from the loaded file instead of the hard-coded 50000.
unlabelled_index = df_data[df_data['label'].isnull()].index.tolist()
test_index = unlabelled_index[len(unlabelled_index) - len(df_test_b):]

In [None]:
# Every layer has an explicit name because the weights were originally meant to
# be restored later with load_weights(model_path, by_name=True).

def build_model(emb, seq_len):
    """Build and compile the bidirectional-LSTM text classifier.

    Args:
        emb: 2-D embedding matrix; its shape sizes the embedding layer and its
            values are used as that layer's initial weights.
        seq_len: Length of the integer token sequences fed to the model.

    Returns:
        A compiled Keras ``Model`` whose output is a 14-way softmax.
    """
    vocab_size = emb.shape[0]
    vector_dim = emb.shape[1]

    # Frozen here; depending on the word vectors, trainable=True may work better.
    emb_layer = Embedding(input_dim=vocab_size,
                          output_dim=vector_dim,
                          weights=[emb],
                          input_length=seq_len,
                          name='emb_word2vec',
                          trainable=False)

    seq = Input(shape=(seq_len, ), name='seq_input')
    embedded = SpatialDropout1D(rate=0.2, name='drop_out1')(emb_layer(seq))

    encoder = Bidirectional(CuDNNLSTM(200, return_sequences=True, name='lstm'), name='bi_layer')
    hidden_states = encoder(embedded)

    # Average- and max-pool over the time axis, then concatenate both summaries.
    # (A SeqWeightedAttention pooling variant was tried and left disabled.)
    pooled = [
        GlobalAveragePooling1D(name='avg')(hidden_states),
        GlobalMaxPooling1D(name='max')(hidden_states),
    ]
    features = Concatenate(name='concat')(pooled)

    # Dense -> BatchNorm -> ReLU -> Dropout head, one step per line.
    hidden = Dense(1024, name='dense_1')(features)
    hidden = BatchNormalization(name='bn')(hidden)
    hidden = Activation(activation='relu', name='acti')(hidden)
    hidden = Dropout(0.2, name='drop_2')(hidden)
    out = Dense(14, activation='softmax', name='dense_2')(hidden)

    classifier = Model(inputs=seq, outputs=out)
    classifier.compile(loss='sparse_categorical_crossentropy',
                       optimizer=Lookahead(RAdam()),
                       metrics=['accuracy'])
    return classifier

In [None]:
class Evaluator(Callback):
    """Keras callback that scores macro-F1 on a held-out set after each epoch.

    The score is stored in the epoch ``logs`` dict under ``'val_f1'`` so that
    callbacks receiving the same dict afterwards (the checkpoint, LR-schedule
    and early-stopping callbacks in the training loop) can monitor it.
    """

    def __init__(self, validation_data):
        """
        Args:
            validation_data: Pair ``(x_val, y_val)`` of model inputs and the
                matching integer class labels.
        """
        super().__init__()
        self.best_val_f1 = 0.
        self.x_val = validation_data[0]
        self.y_val = validation_data[1]

    def evaluate(self):
        """Return the macro-averaged F1 of the current model on the held-out set."""
        y_pred = self.model.predict(self.x_val).argmax(axis=1)
        return f1_score(self.y_val, y_pred, average='macro')

    def on_epoch_end(self, epoch, logs=None):
        """Compute ``val_f1``, track the best value and publish it in ``logs``."""
        val_f1 = self.evaluate()
        if val_f1 > self.best_val_f1:
            self.best_val_f1 = val_f1
        # `logs` defaults to None; only write into it when a dict was supplied,
        # otherwise the item assignment would raise TypeError.
        if logs is not None:
            logs['val_f1'] = val_f1
        print(f'val_f1: {val_f1:.5f}, best_val_f1: {self.best_val_f1:.5f}')

# Batch size used by model.fit and by the prediction cell.
bs = 256
# Log key watched by the checkpoint, early-stopping and LR-schedule callbacks;
# it is the key the Evaluator callback writes into `logs`.
monitor = 'val_f1'

In [None]:
# Pseudo-labelled samples are added to each fold's TRAINING split only, never to
# the validation split. (Original author's note: whether this is the better
# choice was not compared experimentally.)

# kfold.split() yields positions within `all_index`; map them back to row ids
# explicitly instead of relying on the labelled rows happening to be rows 0..N-1.
labelled_rows = np.asarray(all_index)
# Loop-invariant label arrays, hoisted out of the fold loop.
label = df_data['label'].values
fake_label = fake_label_a['label'].values

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
for fold_id, (train_index, val_index) in enumerate(kfold.split(all_index, df_data.loc[all_index, 'label'])):
    train_rows = labelled_rows[train_index]
    val_rows = labelled_rows[val_index]

    train_x = np.vstack((seqs[train_rows], fake_seqs))
    val_x = seqs[val_rows]
    train_y = np.hstack((label[train_rows], fake_label))
    val_y = label[val_rows]

    model_path = 'model/lstm_{}.h5'.format(fold_id)
    checkpoint = ModelCheckpoint(model_path, monitor=monitor, verbose=1, save_best_only=True, mode='max', save_weights_only=True)
    earlystopping = EarlyStopping(monitor=monitor, patience=5, verbose=1, mode='max')
    reduce_lr = ReduceLROnPlateau(monitor=monitor, factor=0.5, patience=2, mode='max', verbose=1)

    model = build_model(embedding, seq_len)
    # Warm-start from this fold's earlier checkpoint when one exists. On a fresh
    # run there is no file yet, so train from the initial weights rather than
    # crash inside load_weights.
    if os.path.exists(model_path):
        model.load_weights(model_path, by_name=True)
    # Evaluator comes first in the list so that 'val_f1' is already in `logs`
    # when the monitoring callbacks run.
    model.fit(train_x, train_y, batch_size=bs, epochs=30,
              validation_data=(val_x, val_y),
              callbacks=[Evaluator(validation_data=(val_x, val_y)), checkpoint, reduce_lr, earlystopping], verbose=1, shuffle=True)

In [None]:
"""# 模型预测"""

# Out-of-fold class probabilities: one row per labelled sample, 14 classes.
oof_pred = np.zeros((len(all_index), 14))
# Accumulator for the fold-averaged class probabilities of the test rows.
test_pred = np.zeros((len(test_index), 14))

In [None]:
# Same splitter settings as in training, so every checkpoint is scored on the
# validation rows it never trained on.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
n_folds = kfold.get_n_splits()

# kfold.split() yields positions within `all_index`; map them to row ids of `seqs`.
labelled_rows = np.asarray(all_index)
# The test inputs are identical for every fold, so slice them once.
test_x = seqs[test_index]
# Reset the accumulator so re-running this cell does not add the folds twice.
test_pred[:] = 0

for fold_id, (train_index, val_index) in enumerate(kfold.split(all_index, df_data.loc[all_index, 'label'])):
    model = build_model(embedding, seq_len)
    model_path = 'model/lstm_{}.h5'.format(fold_id)
    model.load_weights(model_path)

    # oof_pred is indexed by position within `all_index`, which is what
    # val_index holds.
    val_x = seqs[labelled_rows[val_index]]
    prob = model.predict(val_x, batch_size=bs, verbose=1)
    oof_pred[val_index] = prob

    prob = model.predict(test_x, batch_size=bs, verbose=1)
    # Keep this fold's raw test probabilities on disk.
    pd.DataFrame(prob).to_csv('sub/lstm_{}.csv'.format(fold_id), index=False, sep=',', header=False)
    # Average over the folds, using the splitter's own fold count.
    test_pred += prob / n_folds

In [None]:
# Macro-F1 of the out-of-fold predictions over all labelled rows.
# A single .loc selection plus .copy() gives an independent frame, so adding
# the 'predict' column never writes into a view of df_data.
df_oof = df_data.loc[all_index, ['label']].copy()
df_oof['predict'] = np.argmax(oof_pred, axis=1)
f1score = f1_score(df_oof['label'], df_oof['predict'], average='macro')
print(f1score)

In [None]:
# Persist the fold-averaged test probabilities and the out-of-fold
# probabilities; the out-of-fold macro-F1 is embedded in both file names.
np.save('prob/sub_5fold_lstm_{}.npy'.format(f1score), test_pred)
np.save('prob/oof_5fold_lstm_{}.npy'.format(f1score), oof_pred)

In [None]:
# Submission file: arg-max class of the fold-averaged test probabilities,
# written as a single 'label' column.
sub = pd.DataFrame({'label': np.argmax(test_pred, axis=1)})
sub.to_csv('sub/lstm_{}.csv'.format(f1score), index=False)