# 데이터 분할
### 82년생 김지영, 공작, 남산의 부장들 1점대 댓글 중 21% + 라벨링한 합본 이용

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the merged, labelled comment corpus.
# (The original literal had a misplaced quote, which was a syntax error.)
data = pd.read_csv('데이터/전체 train.csv')
# Keep only the text column and the four label columns used for training.
data = data[['text','혐오','성별','이념/지역','인간존엄성']]

# 8:2 train/test split with a fixed seed so the split is reproducible.
train, test = train_test_split(data, test_size=0.2, random_state=777)

# NOTE: both files are written as cp949, so readers must use the same encoding.
train.to_csv('데이터/성능확인 train.csv', encoding='cp949')
test.to_csv('데이터/성능확인 test.csv', encoding='cp949')

# Model

In [None]:
from tensorflow import keras
from tensorflow.keras import layers
from AttentionLayer import *
from keras.layers import BatchNormalization, Dropout



class AMCNN:
    """Builder for an attention-augmented, multi-kernel CNN binary classifier.

    ``build`` wires the following graph: token ids -> embedding ->
    bidirectional LSTM -> ``AttentionLayer`` -> stacked with the raw
    embeddings -> several ``Conv2D`` + max-pool branches -> gated mix with
    the average-pooled embeddings -> single sigmoid unit.
    """

    def __init__(self, maxlen, embed_dim, words_count, filter_size, channel, mask_prob=0.7, att_reg=0.0001):
        """
        :param maxlen: Max length of sequence
        :param embed_dim: Embedding size of word embedding layer
        :param words_count:  Word count of Tokenizer
        :param filter_size:  Number of output filters of every CNN branch
        :param channel: Number of Attention Layer Channels
        :param mask_prob: Masking proportion of Attention Layer (only applied while training)
        :param att_reg: L2 regularizer term of Attention Layer
        """
        self.maxlen = maxlen
        self.words_count = words_count
        self.embed_dim = embed_dim
        self.filter_size = filter_size
        self.channel = channel
        self.att_reg = att_reg
        self.mask_prob = mask_prob
        # One CNN branch per kernel height 1..(embed_dim // filter_size).
        num_filter = embed_dim // filter_size
        self.num_filters = list(range(1, num_filter + 1))

    def build(self, emb_trainable=True, pre_emb=True, emb_weight=None):
        """Create the Keras model, store it on ``self.model`` and return it.

        :param emb_trainable: Define trainable of Embedding Layer (only used when ``pre_emb`` is True)
        :param pre_emb: Whether to use pre-trained embedding weights
        :param emb_weight: Pre-trained embedding weights; required when ``pre_emb`` is True
        :return: the constructed ``keras.Model``
        :raises ValueError: if ``pre_emb`` is True without ``emb_weight``, or if the
            concatenated CNN output width differs from ``embed_dim``
        """
        if pre_emb and emb_weight is None:
            raise ValueError("emb_weight must be provided when pre_emb=True")

        # The highway step multiplies an embed_dim-wide gate with the
        # concatenated pooled CNN features element-wise, so both widths must
        # match. Fail early with a readable message instead of a shape error
        # deep inside Keras.
        pooled_width = self.filter_size * len(self.num_filters)
        if pooled_width != self.embed_dim:
            raise ValueError(
                "filter_size * (embed_dim // filter_size) must equal embed_dim "
                "(got %d vs %d)" % (pooled_width, self.embed_dim)
            )

        # Imported here because `tf` is not imported explicitly anywhere in
        # this file (it was only reachable through a star import).
        import tensorflow as tf

        inputs = layers.Input(shape=(self.maxlen,))
        # Large negative value on padded positions (token id 0), zero elsewhere;
        # handed to the attention layer as its second call argument.
        pad_k = tf.expand_dims(tf.cast((inputs == 0), dtype=tf.float32) * -99999, axis=2)

        if pre_emb:
            emb_layer = layers.Embedding(self.words_count + 1, self.embed_dim, trainable=emb_trainable,
                                         weights=[emb_weight])
        else:
            # Randomly initialised embeddings are always trainable.
            emb_layer = layers.Embedding(self.words_count + 1, self.embed_dim, trainable=True)
        inputs_emb = emb_layer(inputs)

        # Bi-LSTM summary of the sequence; both directions are averaged so the
        # feature width stays at embed_dim.
        lstm_layer = layers.LSTM(self.embed_dim, return_sequences=True)
        bi_lstm = layers.Bidirectional(lstm_layer, merge_mode="ave")(inputs_emb)

        # The fourth argument used to be a hard-coded 0.0001, silently ignoring
        # the att_reg given to __init__ (presumably the layer's regularisation
        # strength -- confirm against AttentionLayer's signature).
        C_features, self.scalar_att, self.vector_att = AttentionLayer(
            self.embed_dim, self.embed_dim, self.channel, self.att_reg, self.mask_prob
        )(bi_lstm, pad_k)
        # Stack the raw embeddings as one more slice along axis 3.
        inputs_emb2 = tf.expand_dims(inputs_emb, axis=3)
        C_features = tf.concat([inputs_emb2, C_features], axis=3)

        # Kim-CNN: one Conv2D per kernel height, each max-pooled over time.
        pools = []
        for kernel_height in self.num_filters:
            cnn_layers = layers.Conv2D(self.filter_size, kernel_size=(kernel_height, self.embed_dim), activation="relu")
            cnn_out = cnn_layers(C_features)
            cnn_out = layers.BatchNormalization()(cnn_out)  # batch normalization
            max_pools = layers.MaxPool2D(pool_size=(self.maxlen - kernel_height + 1, 1))(cnn_out)
            max_pools = layers.Flatten()(max_pools)
            pools.append(max_pools)
        concated = layers.concatenate(pools)  # width: filter_size * number of branches

        # Highway-style gate between pooled CNN features and averaged embeddings.
        gap_input_emb = layers.GlobalAvgPool1D()(inputs_emb)  # global average pooling over time
        trans_ = layers.Dense(self.embed_dim, activation="relu", use_bias=True)(gap_input_emb)
        carry_ = 1 - trans_
        gap_ = layers.Multiply()([trans_, gap_input_emb])
        concated_ = layers.Multiply()([carry_, concated])
        concated_ = layers.Dropout(0.6)(concated_)  # dropout on the CNN path only
        concated_ = layers.Add()([concated_, gap_])
        outputs = layers.Dense(1, activation="sigmoid")(concated_)

        self.model = keras.Model(inputs=inputs, outputs=outputs)
        return self.model

    def load_weights(self, path):
        """Load weights from ``path`` into the model created by ``build``."""
        self.model.load_weights(path)
        print("Load Weights Compelete!")

# Train
### 혐오 컬럼

In [None]:
from Token import Token
from gensim.models import word2vec
from tensorflow import keras
from tensorflow.keras.callbacks import ModelCheckpoint, LearningRateScheduler, ReduceLROnPlateau
from sklearn.metrics import f1_score,recall_score,precision_score,accuracy_score
from sklearn.model_selection import train_test_split
import pickle
from Metric import *
from tensorflow import keras
import numpy as np
import argparse
from tensorflow.keras import backend as K
import os
import easydict
from imblearn.over_sampling import SMOTE
%matplotlib inline
import matplotlib.pyplot as plt
from tensorflow.keras.optimizers import Adam


# Training configuration read by main(); EasyDict exposes each key as an attribute.
args = easydict.EasyDict(
    epochs=10,                # total epochs; main() spends int(epochs * 0.1) of them on warm-up
    train_steps=None,         # forwarded as steps_per_epoch / validation_steps
    batch_size=64,
    max_length=100,           # sequence length used for tokenising and padding
    lr_rate=0.001,            # main-stage learning rate (warm-up uses a tenth of it)
    lr_decay=0.9,             # ReduceLROnPlateau factor
    patience=5,               # ReduceLROnPlateau patience
    save_period=1,            # currently not read by main()
    att_reg=0.0001,           # passed to AMCNN as att_reg
    channel=2,                # passed to AMCNN as channel
    weight_save_path="Weight",
    train_data="데이터/성능확인 train.csv",
    document="text",          # name of the text column
    label='혐오',              # name of the label column
)



def _read_training_table(path):
    """Load the table at *path* with the pandas reader matching its extension."""
    if ".xlsx" in path and ".csv" not in path:
        return pd.read_excel(path)
    reader = pd.read_csv if ".csv" in path else pd.read_table
    try:
        return reader(path)
    except UnicodeDecodeError:
        # The data-split step of this file writes its CSVs as cp949, which the
        # default UTF-8 decoder rejects; retry with that encoding.
        return reader(path, encoding="cp949")


def _fill_pretrained_rows(weights, index_word, keyed_vectors):
    """Copy the pre-trained vector of every known word into its row of *weights*.

    Rows of words missing from *keyed_vectors* are left untouched, so they
    keep whatever initial values *weights* already holds.
    """
    for idx in range(1, len(index_word) + 1):
        word = index_word[idx]
        try:
            vector = keyed_vectors[word]
        except KeyError:
            # Out-of-vocabulary word: keep the initial row.
            continue
        weights[idx] = vector


def main():
    """Train the AMCNN binary classifier described by the module-level ``args``.

    Steps: read the training table, tokenise the text column, fit and pickle
    a Keras tokenizer, build the embedding matrix from the pre-trained
    Word2Vec model, oversample with SMOTE, hold out 10% for validation,
    train (warm-up stage followed by the main stage) and plot the
    loss/accuracy curves of the main stage.
    """
    # Imported here because `tf` is not imported explicitly anywhere in this
    # file (it was only reachable through a star import).
    import tensorflow as tf

    # Let TensorFlow allocate GPU memory on demand when a GPU is present.
    physical_devices = tf.config.experimental.list_physical_devices('GPU')
    if physical_devices:
        tf.config.experimental.set_memory_growth(physical_devices[0], True)

    # Unpack configuration
    train_data_path = args.train_data
    max_len = args.max_length
    epoch = args.epochs
    batch_size = args.batch_size
    att_reg = args.att_reg
    lr_decay = args.lr_decay
    lr_rate = args.lr_rate
    warmup_lr_rate = lr_rate*0.1
    patience = args.patience
    weight_save_path = args.weight_save_path
    document = args.document
    label = args.label
    channel = args.channel
    steps_per_epoch = args.train_steps
    warmup_epochs = int(epoch*0.1)
    embed_dim = 500  # vector size of the pre-trained Word2Vec model loaded below

    # Directory that receives the tokenizer pickle and the weight files.
    os.makedirs(weight_save_path, exist_ok=True)

    # Read Data
    train_data = _read_training_table(train_data_path)

    # Make Tokenizer Token
    tk = Token("Tokenizer", max_len)
    train_data["Token"] = train_data[document].apply(tk.make_token_ori)

    # Using Keras Tokenizer
    k_tokenizer = keras.preprocessing.text.Tokenizer(filters='')
    k_tokenizer.fit_on_texts(train_data["Token"].values.tolist())
    words_count = len(k_tokenizer.word_counts)
    print("Save Keras tokenizer for validate in %s"%(weight_save_path))
    with open(os.path.join(weight_save_path,"keras_tokenizer.pkl"), "wb") as f:
        pickle.dump(k_tokenizer, f)

    # Load Pre-trained embedding Word2Vec
    w2v_model = word2vec.Word2Vec.load("w2v_pretrain_emb/w2v_20M_500.model")
    # Random initialisation; rows of words known to Word2Vec are overwritten.
    # Previously an unknown word received the vector of the preceding word
    # (or raised NameError when it was the first word) instead of keeping its
    # random row.
    init_weight = np.random.uniform(size=(words_count + 1, embed_dim), low=-1, high=1)
    _fill_pretrained_rows(init_weight, k_tokenizer.index_word, w2v_model.wv)

    #  K_tokenizer Sequence
    sequences = k_tokenizer.texts_to_sequences(train_data['Token'])
    x_train = keras.preprocessing.sequence.pad_sequences(sequences, maxlen=max_len)
    y_train = train_data[label].values

    # SMOTE oversampling of the minority class.
    # `fit_sample` was renamed to `fit_resample` in imbalanced-learn and later
    # removed; prefer the new name and fall back for old installations.
    # NOTE(review): SMOTE runs on padded token-id sequences and before the
    # validation split, so synthetic rows can land in the validation set --
    # confirm this is intended.
    smote = SMOTE(random_state=0)
    resample = getattr(smote, "fit_resample", None) or smote.fit_sample
    x_train_over, y_train_over = resample(x_train, y_train)

    # Hold out 10% of the oversampled data for validation.
    x_train2, x_val, y_train2, y_val = train_test_split(x_train_over, y_train_over, test_size=0.1, random_state=0)

    # Build simple binary model
    tf.keras.backend.clear_session()
    amcnn = AMCNN(maxlen=max_len,
                  embed_dim=embed_dim,
                  words_count=words_count,
                  filter_size=50,
                  channel=channel,
                  mask_prob=0.5,
                  att_reg=att_reg)
    model = amcnn.build(emb_trainable=False, emb_weight=init_weight)

    # `lr` is a deprecated alias; `learning_rate` is the supported argument.
    adam = Adam(clipnorm=5.0, learning_rate=warmup_lr_rate)
    model.compile(optimizer=adam, loss="binary_crossentropy",
                  metrics=['accuracy', k_precision, k_recall, k_f1score])
    checkpoint_path = os.path.join(weight_save_path,"model-{epoch:04d}.h5")

    # Save weights whenever the validation loss improves.
    checkpoint = ModelCheckpoint(checkpoint_path, monitor='val_loss',
                                 verbose=1, save_best_only=True, save_weights_only=True)

    # Reduce the learning rate when the training loss stops improving.
    lr_reducer = ReduceLROnPlateau(monitor='loss',
                                   factor=lr_decay,
                                   cooldown=0,
                                   patience=patience,
                                   min_lr=lr_rate*0.01,
                                   verbose=1)

    # Warm-up stage: a tenth of the epochs at a tenth of the learning rate,
    # without validation data.
    print("===========Warm up %d Epoch Stage==========="%(warmup_epochs))
    model.fit(x_train2, y_train2, epochs=warmup_epochs, callbacks=[checkpoint, lr_reducer],
              steps_per_epoch=steps_per_epoch, batch_size=batch_size, verbose=2)

    # Main stage: switch to the full learning rate and validate every epoch.
    print("============Main %d Epoch Stage============="%(epoch-warmup_epochs))
    K.set_value(model.optimizer.learning_rate, lr_rate)
    hist = model.fit(x_train2, y_train2, epochs=epoch-warmup_epochs, callbacks=[checkpoint, lr_reducer],
                     steps_per_epoch=steps_per_epoch, validation_steps=steps_per_epoch,
                     batch_size=batch_size, validation_data=(x_val, y_val), verbose=2)

    # Visualisation: loss on the left axis, accuracy on the right axis.
    fig, loss_ax = plt.subplots()

    acc_ax = loss_ax.twinx()

    loss_ax.plot(hist.history['loss'], 'y', label='train loss')
    loss_ax.plot(hist.history['val_loss'], 'r', label='val loss')

    acc_ax.plot(hist.history['accuracy'], 'b', label='train acc')
    acc_ax.plot(hist.history['val_accuracy'], 'g', label='val acc')

    loss_ax.set_xlabel('epoch')
    loss_ax.set_ylabel('loss')
    acc_ax.set_ylabel('accuray')

    loss_ax.legend(loc='upper left')
    acc_ax.legend(loc='lower left')

    print("Complete Training Model")
    print("Check Model Weight file in %s"%(weight_save_path))

# Start training only when this code runs as the main module (script or
# notebook cell), not when it is imported.
if __name__ == "__main__":
    main()