# Data and Pre-trained Model Preparation

In [None]:
###Import Library###
import pandas as pd
import codecs, gc, re
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from keras_bert import load_trained_model_from_checkpoint, Tokenizer, get_custom_objects
from keras.metrics import top_k_categorical_accuracy
from keras.layers import *
from keras.layers import Input
from keras.callbacks import *
from keras.models import Model, load_model
import keras.backend as K
from keras.optimizers import Adam
from keras.utils import to_categorical
import os
import keras

In [None]:
###Set file root###
# Working directory; the data and submission paths used later are joined onto it.
CWD = os.getcwd()
print(CWD)
# Fixed token length: process_data truncates/pads every sample to this size and
# the BERT model is loaded with seq_len=SEQ_LEN.
SEQ_LEN = 500

## Load Pre-trained Model

In [None]:
# Pre-trained checkpoint files (model config, weights, vocabulary), given as
# paths relative to the working directory.
config_path = 'Scibert/bert_config.json'
checkpoint_path = 'Scibert/bert_model.ckpt'
vocab_path = 'Scibert/vocab.txt'

In [None]:
###Set up dictionary###
# Map every vocabulary token (one per line of the vocab file) to an integer id.
token_dict = {}
with codecs.open(vocab_path, 'r', 'utf8') as reader :
    for line in reader :
        token = line.strip()
        # The id is the dictionary size at insertion time, so ids follow line
        # order; a repeated token would be re-assigned the current size.
        token_dict[token] = len(token_dict)
# Vocabulary size (shown as the cell output).
len(token_dict)

In [None]:
# keras_bert tokenizer built from token_dict; process_data calls its encode().
tokenizer = Tokenizer(token_dict)

## Load and Preprocess Data

In [None]:
###Load data###
def _read_all_as_str(relative_path):
    """Read the CSV at ``relative_path`` under CWD and cast every column to str."""
    return pd.read_csv(os.path.join(CWD, relative_path)).astype(str)


# Labelled splits.
train_df = _read_all_as_str('data/trainset.csv')
val_df = _read_all_as_str('data/validset.csv')
# Unlabelled splits used for the public and private submissions.
test_df = _read_all_as_str('data/testset.csv')
private_df = _read_all_as_str('data/task2_private_testset.csv')

In [None]:
###Transform label data to onehot encoding###
def label_to_onehot(labels):
    """Encode a whitespace-separated label string as a 4-element 0/1 list.

        Args:
            labels (string): label names separated by whitespace; each name
                must be one of THEORETICAL, ENGINEERING, EMPIRICAL, OTHERS.
        Return:
            outputs (list of int): position k is 1 when the k-th label
                (in the order listed above) occurs in ``labels``, else 0.
        Raises:
            KeyError: if ``labels`` contains an unknown label name.
    """
    position_of = {'THEORETICAL': 0, 'ENGINEERING':1, 'EMPIRICAL':2, 'OTHERS':3}
    # Resolve every name first so an unknown label still raises KeyError.
    active = {position_of[name] for name in labels.split()}
    return [1 if slot in active else 0 for slot in range(4)]

###Preprocess data###
def process_data(data, test=0):
    """Tokenize every abstract in ``data`` into fixed-length model inputs.

    Each abstract is split on ``'$$$'``; every piece is encoded with the
    module-level ``tokenizer`` and the resulting id / segment lists are
    concatenated, then cut or zero-padded to ``SEQ_LEN`` entries.

    Args:
        data: table with an 'Abstract' column (and a 'Task 2' column when
            ``test == 0``), read as ``data[column][i]`` for
            ``i in range(len(data))``.
        test: 0 to also build one-hot labels from 'Task 2'; any other value
            returns the inputs only.

    Returns:
        ``[token_ids, segment_ids]`` as two numpy arrays, followed by the
        list of one-hot labels when ``test == 0``.
    """
    token_rows, segment_rows, onehot_labels = [], [], []
    with_labels = test == 0
    print(len(data))
    for row in range(len(data)):
        token_ids, segment_ids = [], []
        for piece in data['Abstract'][row].split('$$$'):
            ids, segments = tokenizer.encode(piece)
            token_ids.extend(ids)
            segment_ids.extend(segments)
        if with_labels:
            onehot_labels.append(label_to_onehot(data['Task 2'][row]))
        # A negative count yields an empty list, so over-long samples are only
        # truncated while short ones are zero-padded up to SEQ_LEN.
        padding = [0] * (SEQ_LEN - len(token_ids))
        token_rows.append(token_ids[:SEQ_LEN] + padding)
        segment_rows.append(segment_ids[:SEQ_LEN] + padding)
    features = [np.asarray(token_rows), np.asarray(segment_rows)]
    if with_labels:
        return features, onehot_labels
    return features

###User-defined F1-score callback###
class IntervalEvaluation(Callback):
    """Keras callback that prints the micro-averaged F1 score after each epoch.

    Args:
        validation_data: ``(X_val, y_val)`` pair. ``X_val[0]`` and ``X_val[1]``
            are passed to ``model.predict`` as the two model inputs and
            ``y_val`` holds the 0/1 reference labels.
        interval: stored on the instance; the score is currently computed on
            every epoch regardless of this value.
    """

    def __init__(self, validation_data=(), interval=10):
        # Initialise the Callback base class itself. The previous
        # ``super(Callback, self)`` started the lookup *after* Callback and
        # therefore skipped Callback.__init__ entirely.
        super(IntervalEvaluation, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the validation inputs and print the micro F1 score.

        Args:
            epoch: zero-based epoch index supplied by Keras (printed as
                ``epoch + 1``).
            logs: metrics dictionary supplied by Keras; not used here.
        """
        # Round the sigmoid outputs to 0/1 so they can be compared with the
        # multi-hot reference labels.
        scores = self.model.predict([self.X_val[0], self.X_val[1]])
        y_pred = (np.asarray(scores)).round().astype(int)
        score = f1_score(self.y_val, y_pred, average='micro')
        print("f1_score - epoch: {:d} - score: {:.6f}".format(epoch+1, score))

In [None]:
# Labelled splits: process_data returns ([token_ids, segment_ids], labels).
x_train, y_train = process_data(train_df)
x_val, y_val = process_data(val_df)
# Unlabelled splits (test flag set to 1): only [token_ids, segment_ids] is returned.
test_data = process_data(test_df, 1)
private_data = process_data(private_df, 1)

In [None]:
# Convert the lists returned by process_data into numpy arrays. For the input
# pairs this stacks the two arrays, so index 0 selects the token ids and
# index 1 the segment ids (as used by the fit/predict calls).
x_train = np.asarray(x_train)
y_train = np.asarray(y_train)
x_val = np.asarray(x_val)
y_val = np.asarray(y_val)
test_data = np.asarray(test_data)
private_data = np.asarray(private_data)

In [None]:
# Callback that prints the micro F1 score on the validation split after each epoch.
get_f1 = IntervalEvaluation(validation_data=(x_val, y_val)) 

# Modify the model

In [None]:
# Build the pre-trained model from the config/checkpoint files with a fixed
# input length of SEQ_LEN.
# NOTE(review): training=True presumably keeps keras_bert's pre-training heads
# (which would explain why only the first two inputs are used when the
# classifier is built) and trainable=True presumably unfreezes the weights for
# fine-tuning — confirm against the keras_bert documentation.
bert_model = load_trained_model_from_checkpoint(
        config_path,
        checkpoint_path,
        training=True,
        trainable=True,
        seq_len=SEQ_LEN,
        )

In [None]:
# Use only the first two inputs of the pre-trained model; the fit/predict
# calls feed them with the token-id and segment-id arrays.
inputs = bert_model.inputs[:2]

# Average the output of the 4th encoder block over the sequence.
frontout = bert_model.get_layer('Encoder-4-FeedForward-Norm').output
dense1 = GlobalAveragePooling1D()(frontout)

# Average the output of the 12th encoder block over the sequence.
modelout = bert_model.get_layer('Encoder-12-FeedForward-Norm').output
dense2 = GlobalAveragePooling1D()(modelout)

# Combine both pooled representations with an element-wise maximum.
denseout = keras.layers.Maximum()([dense2, dense1])

denseout = Dropout(0.1)(denseout)

# One sigmoid unit per label, so several labels can be active at once.
outputs = Dense(4, activation='sigmoid')(denseout)

model = Model(inputs, outputs)

# summary() prints the table itself and returns None; wrapping it in print()
# only added a stray "None" line to the output.
model.summary()

# Fine-tuning the model

In [None]:
# Binary cross-entropy over the 4 sigmoid outputs: every label is trained as
# its own yes/no decision. Adam starts with a learning rate of 1e-5.
model.compile(loss='binary_crossentropy', optimizer=Adam(lr=1e-5), metrics=['binary_accuracy'])

In [None]:
# Write the model to save/weights.h5 only when the monitored quantity improves
# (save_best_only=True; the monitor is left at ModelCheckpoint's default).
checkpointer = ModelCheckpoint(filepath='save/weights.h5', verbose=1, save_best_only=True)

In [None]:
###1e-5
# First fine-tuning epoch, using the learning rate set when the model was compiled.
history = model.fit(
    [x_train[0], x_train[1]],
    y_train,
    batch_size=4,
    epochs=1,
    validation_data=([x_val[0], x_val[1]], y_val),
    callbacks=[get_f1, checkpointer],
)

In [None]:
###1e-6
# Lower the optimizer's learning rate to the value named in this cell's header.
# fit() on its own keeps whatever rate the optimizer already has, so without
# this line the epoch would still run at the rate set at compile time.
K.set_value(model.optimizer.lr, 1e-6)
history = model.fit(
    [x_train[0], x_train[1]],
    y_train,
    batch_size=4,
    epochs=1,
    validation_data=([x_val[0], x_val[1]], y_val),
    callbacks=[get_f1, checkpointer],
)

In [None]:
###1e-7
# Lower the optimizer's learning rate to the value named in this cell's header.
# fit() on its own keeps whatever rate the optimizer already has.
K.set_value(model.optimizer.lr, 1e-7)
history = model.fit(
    [x_train[0], x_train[1]],
    y_train,
    batch_size=4,
    epochs=1,
    validation_data=([x_val[0], x_val[1]], y_val),
    callbacks=[get_f1, checkpointer],
)

In [None]:
##1e-7
# One more epoch at the learning rate named in this cell's header. The rate is
# set explicitly so the cell does not depend on which cell ran before it.
K.set_value(model.optimizer.lr, 1e-7)
history = model.fit(
    [x_train[0], x_train[1]],
    y_train,
    batch_size=4,
    epochs=1,
    validation_data=([x_val[0], x_val[1]], y_val),
    callbacks=[get_f1, checkpointer],
)

# Load and Save model

In [None]:
#model.save('save/weights.h5')

In [None]:
# Reload a saved model; get_custom_objects() is passed so keras_bert's custom
# layers can be resolved while loading.
# NOTE(review): this reads 'save/weights_best.h5', whereas the ModelCheckpoint
# in this notebook writes 'save/weights.h5' — confirm where the "_best" file
# comes from (renamed by hand?).
model = load_model('save/weights_best.h5', custom_objects=get_custom_objects())

# Predict the answer

In [None]:
###Public dataset testing###
# Sigmoid scores for the public test set (token ids and segment ids as inputs).
ans_pred = model.predict([test_data[0], test_data[1]])

In [None]:
###Private dataset testing###
# Sigmoid scores for the private test set (token ids and segment ids as inputs).
privans_pred = model.predict([private_data[0], private_data[1]])

In [None]:
###Validation dataset testing###
# Sigmoid scores for the validation split (token ids and segment ids as inputs).
valpred = model.predict([x_val[0], x_val[1]])

In [None]:
###Function to modify the threshold of sigmoid output###
def mod_round(data, threshold=0.5):
    """Binarise the first four scores of every row against ``threshold``.

    Args:
        data: sequence of rows, each with at least four numeric scores.
        threshold: a score becomes 1 only when it is strictly greater than
            this value; otherwise it becomes 0.

    Returns:
        Integer numpy array with one 4-element 0/1 row per input row.
    """
    binarised = [
        [1 if data[row][col] > threshold else 0 for col in range(4)]
        for row in range(len(data))
    ]
    return np.asarray(binarised).astype(int)

###Fix the unreasonable prediction###
def fix_ans(data):
    """Repair label rows that are empty or that mix OTHERS with another label.

    ``data`` is modified in place:
      * a row with no label set becomes ``[0, 0, 0, 1]`` (OTHERS only);
      * a row with OTHERS plus any of the first three labels has OTHERS cleared.

    Args:
        data: mutable sequence of 4-element 0/1 rows.

    Returns:
        Tuple ``(data, wrong1, wrong2, others)``: the same ``data`` object,
        the number of empty rows fixed, the number of conflicting rows fixed,
        and the number of rows whose OTHERS flag is set after the fixes.
    """
    empty_fixed = 0
    conflict_fixed = 0
    others_count = 0
    for idx in range(len(data)):
        if sum(data[idx]) == 0:
            empty_fixed += 1
            data[idx] = [0, 0, 0, 1]
        # Fetch the row only now: the assignment above may have replaced it.
        row = data[idx]
        if row[3] == 1 and sum(row[0:3]) > 0:
            conflict_fixed += 1
            row[3] = 0
        elif row[3] == 1:
            others_count += 1
    return data, empty_fixed, conflict_fixed, others_count

###Calculate the F1-score when testing with validation data###
def cal_score(data, y_val):
    """Compute micro-averaged precision, recall and F1 for 0/1 label rows.

    Args:
        data: predicted rows; each ``data[i]`` must support element-wise
            multiplication and ``.sum()`` (e.g. a numpy array).
        y_val: reference rows, indexed in parallel with ``data``.

    Returns:
        Tuple ``(precision, recall, f1)`` of floats. A ratio whose denominator
        is zero (nothing predicted, nothing labelled, or empty input) is
        reported as 0.0 instead of producing NaN or a ZeroDivisionError.
    """
    true_positive = 0
    all_predicted = 0
    all_labelled = 0
    for i in range(len(data)):
        true_positive += (data[i] * y_val[i]).sum()
        all_predicted += data[i].sum()
        all_labelled += y_val[i].sum()
    precision = float(true_positive / all_predicted) if all_predicted else 0.0
    recall = float(true_positive / all_labelled) if all_labelled else 0.0
    total = precision + recall
    f_score = 2 * precision * recall / total if total else 0.0
    return precision, recall, f_score

In [None]:
###Find the outlier of testing data
# Collect the positions of private-set abstracts that are shorter than 500
# characters and are either shorter than 30 characters or contain a
# "withdraw"/"withdrawn" fragment. Each position is recorded (and its 1-based
# number printed) once, even when several fragments match.
WITHDRAWN_TOKENS = ('withdrawn', 'withdraw')
outlier = []
for i in range(len(private_df)):
    abstract = private_df['Abstract'][i]
    if len(abstract) >= 500:
        continue
    if len(abstract) < 30:
        is_outlier = True
    else:
        # NOTE(review): the '.' in the pattern is unescaped, so the split
        # happens on *any* character followed by whitespace (and swallows that
        # character). The matching below relies on this, so the pattern is
        # kept as-is and only written as a raw string to avoid the
        # invalid-escape warning for '\s'.
        fragments = (
            fragment
            for sents in abstract.split('$$$')
            for fragment in re.split(r'.\s', sents)
        )
        # A fragment matches either exactly or after dropping its last
        # character (e.g. a trailing full stop).
        is_outlier = any(
            fragment in WITHDRAWN_TOKENS or fragment[0:-1] in WITHDRAWN_TOKENS
            for fragment in fragments
        )
    if is_outlier:
        print(i+1)
        outlier.append(i)
print(outlier)

In [None]:
# Binarise the private-set scores: a label is set when its score exceeds 0.419.
# NOTE(review): 0.419 was presumably picked from the validation threshold
# sweep further down — confirm.
pred = mod_round(privans_pred,0.419)

In [None]:
# fix_ans returns the repaired predictions plus three counters: rows that had
# no label, rows where OTHERS conflicted with another label, and rows that end
# up labelled OTHERS.
pred, wrong1, wrong2, others = fix_ans(pred)
# `index2` exists only as a local inside fix_ans and is never returned, so
# printing it here raised a NameError; it is no longer printed.
print(pred, wrong1, wrong2, others)

In [None]:
###Fix the outlier of test data###
for index in outlier : 
    pred[index] = [0,0,0,1]

In [None]:
###To calculate f1-score of validation dataset###
# `pred` holds the private-set predictions at this point, which have no
# relation to the validation labels. Score the validation predictions instead,
# processed the same way (same threshold, same fix-up) as the private set.
val_pred, _, _, _ = fix_ans(mod_round(valpred, 0.419))
print(cal_score(val_pred, y_val))

In [None]:
###List the f1-score with different threshold on validation data###
# Sweep the decision threshold from 0.400 to 0.499 in steps of 0.001 and record
# [threshold, precision, recall, F1] on the validation split.
# Uses its own variable for the binarised predictions so the private-set
# `pred` that is written to the submission file is not overwritten.
table = []
for step in range(400, 500):
    threshold = step / 1000
    # fix_ans returns four values; only the repaired predictions are needed.
    val_pred, _, _, _ = fix_ans(mod_round(valpred, threshold))
    precision, recall, FS = cal_score(val_pred, y_val)
    table.append([threshold, precision, recall, FS])

In [None]:
# Show the collected [threshold, precision, recall, F1] rows as the cell output.
table

## Generate Submit File

In [None]:
def SubmitGenerator(prediction, sampleFile, public=True, filename='prediction.csv'):
    """Write a submission CSV shaped like the sample submission file.

    The output has the sample's ``order_id`` column followed by the four label
    columns. When the sample has more rows than ``prediction``, the missing
    rows are filled with zeros.

    Args:
        prediction (numpy array): 2-D array whose first four columns are the
            THEORETICAL, ENGINEERING, EMPIRICAL and OTHERS predictions.
        sampleFile (str): path of the sample submission CSV; it must contain
            an ``order_id`` column.
        public (boolean): True places the predictions first and the zero
            filler after them; False places the zero filler first.
        filename (str): path of the CSV file to write.
    """
    template = pd.read_csv(sampleFile)
    filler = [0] * (len(template) - prediction.shape[0])
    columns = {'order_id': list(template.order_id.values)}
    label_names = ('THEORETICAL', 'ENGINEERING', 'EMPIRICAL', 'OTHERS')
    for position, label in enumerate(label_names):
        values = list(prediction[:, position])
        columns[label] = values + filler if public else filler + values
    pd.DataFrame.from_dict(columns).to_csv(filename, index=False)

In [None]:
# Write the private-set submission: public=False puts the zero filler rows
# before the predictions.
SubmitGenerator(
    prediction=pred,
    sampleFile=os.path.join(CWD,'data/task2_sample_submission.csv'),
    public=False,
    filename=os.path.join(CWD,'task2_submission_0102.csv'),
)