## Import Library

In [1]:
import re
import nltk
import string
import pickle
import warnings
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.metrics import classification_report, confusion_matrix

from tensorflow import keras
from tensorflow.keras import layers, models, initializers, regularizers, constraints, optimizers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, Conv1D, Embedding, Dropout, GlobalMaxPool1D, SpatialDropout1D, BatchNormalization, Bidirectional, LSTM, GlobalMaxPooling1D, MaxPooling1D, Flatten
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing import text, sequence

from tqdm import tqdm

from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize 
from nltk.stem import WordNetLemmatizer 

nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

warnings.simplefilter(action="ignore", category=FutureWarning)

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


## Utility Method

### Utility

In [2]:
def label_encoding(label):
    if(label == 'fact-based'):
        return 0
    elif(label == 'feeling-based'):
        return 1
    else:
        return 2

### Preprocessing

Tahap preprocessing akan mengubah kalimat menjadi format yang lebih sederhana dan general. Beberapa preprocessing yang dilakukan adalah membuat lowercase, menyaring karakter, dan melakukan lemmatisasi.

In [3]:
lemmatizer = WordNetLemmatizer()
    
def nltk_tag_to_wordnet_tag(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:          
        return None
    
def lemmatize_sentence(sentence):

    nltk_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))  
    wordnet_tagged = map(lambda x: (x[0], nltk_tag_to_wordnet_tag(x[1])), nltk_tagged)
    
    lemmatized_sentence = []
    for word, tag in wordnet_tagged:
        if tag is None:
            lemmatized_sentence.append(word)
        else:        
            lemmatized_sentence.append(lemmatizer.lemmatize(word, tag))
    return " ".join(lemmatized_sentence)

def sentence_pos_tag(sentence):
    text = word_tokenize(sentence)
    pos_tag = nltk.pos_tag(text)
    pos_tag_res = ''
    for i in range(len(pos_tag)):
        pos_tag_res += pos_tag[i][1]
        pos_tag_res += ' ' if i != len(sentence)-1 else '' 
    return pos_tag_res

def preprocessing(text):
    text = text.lower()
    text = text.strip()
    text = re.sub(r" \d+ ", " ", text)
    text = text.translate(str.maketrans("", "", string.punctuation))
    text = re.sub(r"[^a-z ]", "", text)
    text = re.sub(r"  ", " ", text)
    text = lemmatize_sentence(text)
    return text

## Preparation

### Load Preprocessed Data

Untuk mempercepat dan mengurangi beban komputasi

In [4]:
file = open("../input/quote-response/quotexresponseprocessed.pkl", "rb")
df = pickle.load(file)
file.close()

### Prepare Data

Proses data yang dibutuhkan untuk training. Data quote adalah kalimat yang berisikan pernyataan yang mengangkat diskusi, dan data response merupakan argumen terhadap pernyataan yang diberikan. Data pos tag dari kalimat response menjadi salah satu fitur eksperimen.

In [5]:
df = df[df['emotion_fact'] != 'unsure']
df = df.loc[:, ['presented_quote', 'presented_response', 'emotion_fact', 'presented_response_tag']]

In [6]:
# df['presented_quote'] = df['presented_quote'].apply(lambda x: preprocessing(x))
# df['presented_response'] = df['presented_response'].apply(lambda x: preprocessing(x))
df['emotion_fact'] = df['emotion_fact'].apply(lambda x: label_encoding(x))

In [7]:
df

Unnamed: 0,presented_quote,presented_response,emotion_fact,presented_response_tag
0,i get a good idea however they do tend to stay...,by your own admission you havent hang out with...,1,IN PRP$ JJ NN PRP VBP VB RP IN NN IN DT NN CC ...
2,one of the big argument against gun control be...,not quite to be more correct regard government...,0,RB RB TO VB RBR JJ JJ NN CC VB DT NN TO VB NN ...
4,there be some incedents that be beyond your co...,well yes,1,RB RB
6,legality do not matter religous implication do...,exact to the point amp beautiful,1,NN TO DT NN NN NN
8,once again you seem to support the killing of ...,base on the idea that people be dispensible pa...,1,NN IN DT NN WDT NNS VB JJ RB IN PRP VBP PRP$ V...
...,...,...,...,...
9977,the id movement form of id state that there be...,that of course be the logical fallacy know as ...,0,DT IN NN VB DT JJ NN VBP IN JJ NN DT NN PRP VB...
9978,for me it would therefore have make no differe...,it logically follow from the moral foundation ...,1,PRP RB VBP IN DT JJ NN VBN RP CC PRP VBP TO JJ...
9979,good thing this argument have never be doneoh ...,and teen sex doesnt by the very nature of its ...,1,CC JJ NN NN IN DT JJ NN IN PRP$ NN NN IN NN DT...
9980,i know one thing anything that happen politica...,wasnt sinjin crow about his plan to take the f...,1,NN NN NN IN PRP$ NN TO VB DT NN NN CC NN TO DT...


### Split Training Data

In [8]:
from sklearn.model_selection import train_test_split

X = df.loc[:, ['presented_quote', 'presented_response', 'presented_response_tag']]
y = df.loc[:, ['emotion_fact']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=20)

In [9]:
X_train_quotes = X_train['presented_response'].values
X_train_responses = X_train['presented_response'].values
X_train_responses_tag = X_train['presented_response_tag'].values

X_test_quotes = X_test['presented_response'].values
X_test_responses = X_test['presented_response'].values
X_test_responses_tag = X_test['presented_response_tag'].values

y_train = y_train['emotion_fact'].values
y_test = y_test['emotion_fact'].values

X_train_text = X_train_quotes + X_train_responses

## Training BERT

In [10]:
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py

In [11]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
import tensorflow_hub as hub

import tokenization

In [12]:
def bert_encode(quotes, responses, tokenizer, max_len=160):
    all_tokens = []
    all_masks = []
    all_segments = []
    
    for i in range(len(quotes)):
        quote = tokenizer.tokenize(quotes[i])
        response = tokenizer.tokenize(responses[i])
            
        quote = quote[:75]
        response = response[:75]
        
        input_sequence = ["[CLS]"] + quote + ["[SEP]"] + response
        pad_len = max_len - len(input_sequence)
        
        tokens = tokenizer.convert_tokens_to_ids(input_sequence)
        tokens += [0] * pad_len
        pad_masks = [1] * len(input_sequence) + [0] * pad_len
        segment_ids = [0] * max_len
        
        all_tokens.append(tokens)
        all_masks.append(pad_masks)
        all_segments.append(segment_ids)
    
    return np.array(all_tokens), np.array(all_masks), np.array(all_segments)

In [13]:
def build_model(bert_layer, max_len=512):
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    clf_output = sequence_output[:, 0, :]
    
    out = Dense(1, activation='sigmoid')(clf_output)

    model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    model.compile(Adam(learning_rate=5.95e-6), loss='binary_crossentropy', metrics=['accuracy'])
    
    return model

In [14]:
def train_model(train_input, train_labels):
      
    model_BERT = build_model(bert_layer, max_len=160)
    model_BERT.summary()
    
    checkpoint = ModelCheckpoint('model_BERT.h5', monitor='val_loss', save_best_only=True)

    model_BERT.fit(
        train_input, train_labels,
        validation_split = 0.15,
        epochs = 3,
        callbacks=[checkpoint],
        batch_size = 16
    )
    
    return model_BERT

In [15]:
module_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1"
bert_layer = hub.KerasLayer(module_url, trainable=True)

vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

2021-11-16 15:10:43.561549: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-16 15:10:43.685493: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-16 15:10:43.686850: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-16 15:10:43.691087: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

## Eksperimen Fitur

### Statement (Quote) dan Argument (Response)

In [16]:
train_input = bert_encode(X_train_quotes, X_train_responses, tokenizer, max_len=160)
test_input = bert_encode(X_test_quotes, X_test_responses, tokenizer, max_len=160)
train_labels = y_train

In [17]:
model_BERT = train_model(train_input, train_labels)

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 160)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 160)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 160)]        0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        [(None, 1024), (None 335141889   input_word_ids[0][0]             
                                                                 input_mask[0][0]             

2021-11-16 15:11:27.128943: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 2/3
Epoch 3/3


In [18]:
from sklearn.metrics import classification_report

test_pred_BERT = model_BERT.predict(test_input)
test_pred_BERT_int = test_pred_BERT.round().astype('int')

pred = test_pred_BERT_int.reshape([test_pred_BERT_int.shape[0]])
print(classification_report(y_test, pred))


              precision    recall  f1-score   support

           0       0.81      0.81      0.81       524
           1       0.72      0.71      0.72       354

    accuracy                           0.77       878
   macro avg       0.76      0.76      0.76       878
weighted avg       0.77      0.77      0.77       878



### Argument (Response) dan TAG Argument (Response)

In [19]:
train_input = bert_encode(X_train_responses, X_train_responses_tag, tokenizer, max_len=160)
test_input = bert_encode(X_test_responses, X_test_responses_tag, tokenizer, max_len=160)
train_labels = y_train

In [20]:
model_BERT = train_model(train_input, train_labels)

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 160)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 160)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 160)]        0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        [(None, 1024), (None 335141889   input_word_ids[0][0]             
                                                                 input_mask[0][0]           

In [21]:
from sklearn.metrics import classification_report

test_pred_BERT = model_BERT.predict(test_input)
test_pred_BERT_int = test_pred_BERT.round().astype('int')

pred = test_pred_BERT_int.reshape([test_pred_BERT_int.shape[0]])
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.81      0.83      0.82       524
           1       0.74      0.71      0.73       354

    accuracy                           0.78       878
   macro avg       0.78      0.77      0.77       878
weighted avg       0.78      0.78      0.78       878

