In [1]:
# Fetch the BERT `tokenization` helper module that is imported further down.
# NOTE(review): the URL tracks the `master` branch, so the file may change or
# move over time; consider pinning to a tag or commit for reproducibility.
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py

# Prediction Code

## Fact-Feeling Classification

In [2]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
import tensorflow_hub as hub
import numpy as np

import re
import nltk
import string
import pickle
import warnings
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.metrics import classification_report, confusion_matrix

from tensorflow import keras
from tensorflow.keras import layers, models, initializers, regularizers, constraints, optimizers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, Conv1D, Embedding, Dropout, GlobalMaxPool1D, SpatialDropout1D, BatchNormalization, Bidirectional, LSTM, GlobalMaxPooling1D, MaxPooling1D, Flatten
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing import text, sequence

from tqdm import tqdm

from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize 
from nltk.stem import WordNetLemmatizer 

# Download the NLTK resources for the WordNet lemmatizer and the POS tagger.
# NOTE(review): word_tokenize is also used below, but no tokenizer data is
# downloaded here -- presumably it is pre-installed in this environment; confirm.
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

# Silence FutureWarning messages for the rest of the notebook.
warnings.simplefilter(action="ignore", category=FutureWarning)

import tokenization

def bert_encode(quotes, responses, tokenizer, max_len=160):
    """Encode (quote, response) text pairs into fixed-length BERT input arrays.

    Each pair is laid out as ``[CLS] quote [SEP] response``, with each side
    capped at 75 tokens, then zero-padded on the right up to ``max_len``.

    Args:
        quotes: Indexable sequence of strings (first text of each pair).
        responses: Indexable sequence of strings, indexed in step with
            ``quotes``.
        tokenizer: Object providing ``tokenize(str)`` and
            ``convert_tokens_to_ids(list)``.
        max_len: Length of every encoded row.

    Returns:
        Tuple ``(token_ids, masks, segment_ids)`` of numpy arrays with one row
        per entry of ``quotes``. ``masks`` is 1 on real tokens and 0 on
        padding; ``segment_ids`` is all zeros.

    NOTE(review): no trailing ``[SEP]`` is added and both texts share segment
    id 0. That is kept as-is because the saved weights were presumably trained
    with this exact encoding -- confirm before changing.
    """
    part_limit = 75  # per-text token cap, applied before the pair is joined

    all_tokens = []
    all_masks = []
    all_segments = []

    for i in range(len(quotes)):
        quote = tokenizer.tokenize(quotes[i])[:part_limit]
        response = tokenizer.tokenize(responses[i])[:part_limit]

        input_sequence = ["[CLS]"] + quote + ["[SEP]"] + response
        # Without this cut a sequence longer than max_len yields a negative
        # pad_len, so ids/mask would be longer than segment_ids and the three
        # arrays would no longer line up.
        input_sequence = input_sequence[:max_len]
        pad_len = max_len - len(input_sequence)

        tokens = tokenizer.convert_tokens_to_ids(input_sequence)
        tokens += [0] * pad_len
        pad_masks = [1] * len(input_sequence) + [0] * pad_len
        segment_ids = [0] * max_len

        all_tokens.append(tokens)
        all_masks.append(pad_masks)
        all_segments.append(segment_ids)

    return np.array(all_tokens), np.array(all_masks), np.array(all_segments)


def build_model(bert_layer, max_len=512):
    """Wrap ``bert_layer`` in a compiled single-output sigmoid classifier.

    Args:
        bert_layer: Callable layer that accepts the three int32 inputs
            (word ids, mask, segment ids) as a list and returns two outputs;
            only the second output is used here.
        max_len: Sequence length of each of the three inputs.

    Returns:
        A compiled Keras ``Model`` mapping the three inputs to one sigmoid
        unit, using Adam (learning rate 5.95e-6), binary cross-entropy loss
        and the accuracy metric.
    """
    bert_inputs = [
        Input(shape=(max_len,), dtype=tf.int32, name=input_name)
        for input_name in ("input_word_ids", "input_mask", "segment_ids")
    ]

    _unused_first_output, token_states = bert_layer(bert_inputs)
    # Keep only position 0 along the sequence axis
    # (presumably the [CLS] token representation -- confirm).
    first_position_state = token_states[:, 0, :]

    probability = Dense(1, activation='sigmoid')(first_position_state)

    classifier = Model(inputs=bert_inputs, outputs=probability)
    classifier.compile(
        Adam(learning_rate=5.95e-6),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return classifier


# Shared WordNet lemmatizer instance used by lemmatize_sentence.
lemmatizer = WordNetLemmatizer()
    
def nltk_tag_to_wordnet_tag(nltk_tag):
    """Map an NLTK POS tag to the matching WordNet POS constant.

    The tag is matched on its leading letter: ``J`` -> adjective,
    ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb.

    Returns:
        The corresponding ``wordnet`` constant, or ``None`` when the tag
        starts with none of those letters.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if nltk_tag.startswith(prefix):
            return wordnet_pos
    return None
    
def lemmatize_sentence(sentence):
    """Lemmatize every token of ``sentence`` and re-join with single spaces.

    The sentence is tokenized and POS-tagged with NLTK. Tokens whose tag maps
    to a WordNet part of speech are lemmatized with that part of speech; all
    other tokens are kept unchanged.

    Returns:
        The processed tokens joined by a single space.
    """
    lemmas = []
    for token, penn_tag in nltk.pos_tag(nltk.word_tokenize(sentence)):
        wordnet_pos = nltk_tag_to_wordnet_tag(penn_tag)
        if wordnet_pos is not None:
            token = lemmatizer.lemmatize(token, wordnet_pos)
        lemmas.append(token)
    return " ".join(lemmas)

def sentence_pos_tag(sentence):
    """Return the POS tags of ``sentence`` as one space-separated string.

    The sentence is tokenized with ``word_tokenize`` and tagged with
    ``nltk.pos_tag``; only the tags are kept, in token order.

    The previous implementation decided whether to append a separator by
    comparing the token index with ``len(sentence) - 1`` (the character count
    of the input) rather than with the number of tags, so the result almost
    always ended in a stray trailing space. Joining the tags removes that.

    Returns:
        A string such as ``"DT NN VBZ"``; the empty string when the sentence
        produces no tokens.
    """
    tagged_tokens = nltk.pos_tag(word_tokenize(sentence))
    return ' '.join(tag for _token, tag in tagged_tokens)

def preprocessing(text):
    """Normalize raw text and return its lemmatized form.

    Steps, in order: lowercase, strip surrounding whitespace, replace
    space-delimited digit runs with one space, delete ASCII punctuation,
    delete every character other than ``a``-``z`` and space, replace each pair
    of spaces with one space (single pass), then lemmatize via
    ``lemmatize_sentence``.
    """
    cleaned = text.lower().strip()
    cleaned = re.sub(r" \d+ ", " ", cleaned)

    punctuation_remover = str.maketrans("", "", string.punctuation)
    cleaned = re.sub(r"[^a-z ]", "", cleaned.translate(punctuation_remover))

    return lemmatize_sentence(re.sub(r"  ", " ", cleaned))

def label_encoding(label):
    """Encode a fact/feeling label as an integer.

    Returns 0 for ``'fact-based'``, 1 for ``'feeling-based'`` and 2 for any
    other value.
    """
    for code, known_label in enumerate(('fact-based', 'feeling-based')):
        if label == known_label:
            return code
    return 2

def fact_label_encoding(label):
    """Encode a fact/feeling label with ``'fact-based'`` as the positive class.

    Returns 0 for ``'feeling-based'``, 1 for ``'fact-based'`` and 2 for any
    other value. Note that this is the reverse of ``label_encoding`` for the
    two known labels.
    """
    if label == 'feeling-based':
        return 0
    if label == 'fact-based':
        return 1
    return 2

def agreement_label_encoding(label):
    """Encode an agreement label as an integer.

    Returns 0 for ``'disagreement'``, 1 for ``'agreement'`` and 2 for any
    other value.
    """
    return 0 if label == 'disagreement' else (1 if label == 'agreement' else 2)

def agreement_label_decoding(label):
    """Decode an integer class index into its agreement label.

    Returns ``'disagreement'`` for 0, ``'agreement'`` for 1 and ``'unsure'``
    for any other value.
    """
    for code, name in enumerate(('disagreement', 'agreement')):
        if label == code:
            return name
    return 'unsure'

# Load the BERT encoder from TensorFlow Hub as a Keras layer.
# NOTE(review): trainable=True is set although this notebook only runs
# predictions; presumably it mirrors the training configuration -- confirm.
module_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1"
bert_layer = hub.KerasLayer(module_url, trainable=True)

# Build the tokenizer from the vocabulary file and casing flag exposed by the hub module.
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

# Rebuild the classifier with max_len=160 (the same length emotion_predict
# passes to bert_encode) and load the saved weights into it.
model_BERT = build_model(bert_layer, max_len=160)
model_BERT.load_weights('../input/bert-result/model_BERT.h5')



[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


2021-11-16 22:04:58.257617: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-16 22:04:58.376731: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-16 22:04:58.377440: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-16 22:04:58.378723: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

In [3]:
def decode_prediction(pred):
    """Translate integer fact/feeling predictions into label strings.

    Args:
        pred: Sequence of rows whose first element is the predicted class
            (0 -> ``'fact-based'``, anything else -> ``'feeling-based'``).
            Any further elements of a row are carried over unchanged.

    Returns:
        A new numpy array of the decoded rows.

    The input is no longer modified in place: the previous version overwrote
    ``pred[i][0]`` on the caller's object, which also made it fail when given
    an integer ndarray (a string cannot be stored into an int array).
    """
    decoded_rows = [
        ['fact-based' if row[0] == 0 else 'feeling-based', *row[1:]]
        for row in pred
    ]
    return np.array(decoded_rows)

def emotion_predict(quote, responses):
    """Predict the fact/feeling class of each response.

    Every response is preprocessed and paired with the string of its POS
    tags; the pairs are encoded with ``bert_encode`` and scored by
    ``model_BERT``.

    Args:
        quote: Accepted for interface compatibility.
            NOTE(review): the model input is built only from the responses
            and their POS tags; the quote was preprocessed and collected in
            the original code but never passed to the model. That dead work
            has been removed -- confirm the quote is really not meant to be
            part of the input.
        responses: Sequence of response strings. It is not modified (the
            previous version overwrote its elements with their preprocessed
            form).

    Returns:
        Integer numpy array of rounded model outputs, one row per response.
    """
    if len(responses) == 0:
        # Nothing to score; mirror the (n, 1) shape of the single-unit model output.
        return np.empty((0, 1), dtype='int')

    cleaned_responses = [preprocessing(response) for response in responses]
    response_tags = [sentence_pos_tag(cleaned) for cleaned in cleaned_responses]

    inputs = bert_encode(cleaned_responses, response_tags, tokenizer, max_len=160)

    return model_BERT.predict(inputs).round().astype('int')

In [4]:
# Smoke check: classify three sample responses to one quote as fact- or feeling-based.
quote = 'self confidence is very needed for every one'
responses = ['every one have their choice', 
             'according to book written by mister, there is one example with clearly explanation', 
             'no, i very bad in confidence']

decode_prediction(emotion_predict(quote, responses).tolist())

2021-11-16 22:05:30.616352: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


array([['feeling-based'],
       ['fact-based'],
       ['feeling-based']], dtype='<U13')

## Agreement-Disagreement Classification (Pipeline)

In [5]:
def agree_predict(quote, response):
    """Classify whether ``response`` agrees or disagrees with ``quote``.

    Pipeline: predict the fact/feeling class of the response with
    ``emotion_predict``, then feed the tokenized quote, the tokenized
    response and the encoded class to the saved agreement model.

    Args:
        quote: The original statement (string).
        response: The reply to classify (string).

    Returns:
        The label produced by ``agreement_label_decoding`` for the arg-max of
        the model output.

    Side effects:
        Prints the predicted fact/feeling label. The model and tokenizer are
        loaded from disk on every call.
    """
    emotion = decode_prediction(emotion_predict(quote, [response]).tolist())[0][0]
    print(emotion)

    model = keras.models.load_model('../input/agreedisagree/double_biGRU_with_emotion_model.h5')

    # NOTE(security): pickle.load can execute arbitrary code; only load
    # tokenizer files from a trusted source.
    # The context manager closes the file even if unpickling raises.
    with open('../input/agreedisagree/tokenizer.pkl', "rb") as tokenizer_file:
        tokenizer = pickle.load(tokenizer_file)

    quote_seq = tokenizer.texts_to_sequences(np.array([preprocessing(quote)]))
    response_seq = tokenizer.texts_to_sequences(np.array([preprocessing(response)]))
    quote_seq = sequence.pad_sequences(quote_seq, maxlen=1000)
    response_seq = sequence.pad_sequences(response_seq, maxlen=1000)

    emotion_code = np.array([fact_label_encoding(emotion)])

    probabilities = model.predict(
        {'quote': quote_seq, 'response': response_seq, 'emotion': emotion_code}
    )
    return agreement_label_decoding(np.argmax(probabilities))

# Testing

## Fact-Feeling Test

In [6]:
# Fact/feeling test: the same quote and three responses as the earlier smoke check.
quote = 'self confidence is very needed for every one'
responses = ['every one have their choice', 
             'according to book written by mister, there is one example with clearly explanation', 
             'no, i very bad in confidence']

decode_prediction(emotion_predict(quote, responses).tolist())

array([['feeling-based'],
       ['fact-based'],
       ['feeling-based']], dtype='<U13')

## Agreement-Disagreement Test

In [7]:
# Agreement test: run the full pipeline on one quote/response pair.
quote = "I am against suicide because you are basically not only harming yourself, but everyone else around you. Let’s not mention it is a cowards way out. I also have a religious but I CAN explain that reason."
response = "So true man people only harm the people they love by dying. I am not religious and religiously and non-religiously suicide is wrong."

agree_predict(quote, response)

feeling-based


2021-11-16 22:05:38.471958: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8005


'agreement'