In [1]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import bert
import matplotlib.pyplot as plt

In [2]:
device = '/GPU:0' if len(tf.config.list_physical_devices('GPU')) > 0 else '/CPU:0'
device

'/GPU:0'

# Load Data

In [3]:
# load prediction data
directory = 'data/'
dataset_path = os.path.join(directory, 'prediction_dataset.csv')
dataset_df = pd.read_csv(dataset_path)
print("Shape:", dataset_df.shape)

Shape: (52836, 2)


In [4]:
# take the text data to be predicted
text_df = dataset_df['judul_dokumen']
text_df.head()

0    atur kepala badan duduk keluarga rencana nasio...
1    atur menteri uang nomor pmk tatacara hitung ba...
2    atur menteri uang nomor pmk ubah ata atur ment...
3    atur menteri uang nomor pmk kembali bea masuk ...
4    atur menteri uang nomor pmk alokasi kurang bay...
Name: judul_dokumen, dtype: object

# Transform

In [5]:
# create tokenizer instance
bert_model = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3'
bert_layer = hub.KerasLayer(bert_model, trainable=False)
BertTokenizer = bert.bert_tokenization.FullTokenizer
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = BertTokenizer(vocab_file, do_lower_case)

In [6]:
max_seq_length = 128

In [7]:
# See BERT paper: https://arxiv.org/pdf/1810.04805.pdf
# And BERT implementation convert_single_example() at https://github.com/google-research/bert/blob/master/run_classifier.py

def get_masks(tokens, max_seq_length):
    """Mask for padding"""
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    return [1]*len(tokens) + [0] * (max_seq_length - len(tokens))


def get_segments(tokens, max_seq_length):
    """Segments: 0 for the first sequence, 1 for the second"""
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    segments = []
    current_segment_id = 0
    for token in tokens:
        segments.append(current_segment_id)
        if token == "[SEP]":
            current_segment_id = 1
    return segments + [0] * (max_seq_length - len(tokens))


def get_ids(tokens, tokenizer, max_seq_length):
    """Token ids from Tokenizer vocab"""
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_ids = token_ids + [0] * (max_seq_length-len(token_ids))
    return input_ids

In [8]:
def build_bert_input(df):
    input_word_ids, input_mask, input_type_ids = [], [], []
    for sequence in df:
        sequence_tokens = tokenizer.tokenize(sequence)
        sequence_tokens = ["[CLS]"] + sequence_tokens + ["[SEP]"]
        
        if len(sequence_tokens) > max_seq_length:
            sequence_tokens = sequence_tokens[:max_seq_length]

        sequence_ids = get_ids(sequence_tokens, tokenizer, max_seq_length)
        input_word_ids.append(sequence_ids)

        sequence_mask = get_masks(sequence_tokens, max_seq_length)
        input_mask.append(sequence_mask)

        sequence_segments = get_segments(sequence_tokens, max_seq_length)
        input_type_ids.append(sequence_segments)

    transformed_seq = dict(
        input_word_ids= tf.convert_to_tensor(np.asarray(input_word_ids).astype('int32'), dtype=tf.int32),
        input_mask= tf.convert_to_tensor(np.asarray(input_mask).astype('int32'), dtype=tf.int32),
        input_type_ids= tf.convert_to_tensor(np.asarray(input_type_ids).astype('int32'), dtype=tf.int32)
    )
    
    return transformed_seq

In [9]:
batch_size = 16
transformed_seq = build_bert_input(text_df)
ds = tf.data.Dataset.from_tensor_slices(transformed_seq)
ds = ds.shuffle(text_df.shape[0] // 4).batch(batch_size)

# Inference

## Load Model

In [10]:
# load keras saved model
saved_model_path = 'bert_kaidah_model.h5'
model = tf.keras.models.load_model(saved_model_path, custom_objects={'KerasLayer':hub.KerasLayer})
model.summary()

Model: "bert_model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_1 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
bert_embedding (KerasLayer)     {'default': (None, 7 109482241   input_2[0][0]                    
                                                                 input_3[0][0]           

## Predict

In [11]:
# predict from text
predictions = model.predict(ds, batch_size=batch_size, verbose=1)



# Prediction Label

In [12]:
class_path = os.path.join(directory, 'label.csv')
class_df = pd.read_csv(class_path)

In [19]:
seed = 44388    
index = predictions[seed].argmax()
print("Title:", dataset_df['judul_dokumen'][seed])
print("Predicted label:", class_df['class'][index], '(', index+1, ')')

Title: putus menteri ppn kepala bappena nomor kep m ppn hk angkat anggota forum masyarakat statistik masa kerja tahun
Predicted label: Jabatan Fungsional ( 17 )


In [15]:
saved_df = pd.DataFrame()
saved_df['id'] = dataset_df.index + 1
saved_df['subject_id'] = saved_df['id'].apply(lambda x: predictions[x-1].argmax() + 1)
saved_df['subject'] = saved_df['subject_id'].apply(lambda x: class_df['class'][x-1])
saved_df.head()

Unnamed: 0,id,subject_id,subject
0,1,54,Lain-lain
1,2,54,Lain-lain
2,3,54,Lain-lain
3,4,54,Lain-lain
4,5,54,Lain-lain


# Save Data

In [16]:
# save to directory
if not os.path.exists(directory):
    os.mkdir(directory)

saved_data_path = os.path.join(directory, 'predicted_jdihn.csv')
saved_df.to_csv(saved_data_path, index=False)