In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import os
from transformers import BertTokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Choose the TensorFlow device string: the first GPU when one is visible, otherwise the CPU.
# NOTE(review): no cell visible in this notebook passes `device` to tf.device(); confirm it is still needed.
device = '/GPU:0' if tf.config.list_physical_devices('GPU') else '/CPU:0'
device

'/GPU:0'

# Load Data

In [3]:
# Locate the prediction CSV under the data directory and read it into a DataFrame.
directory = 'data/'
dataset_path = os.path.join(directory, 'prediction_dataset.csv')
dataset_df = pd.read_csv(filepath_or_buffer=dataset_path)
# Report (rows, columns) as a quick sanity check on the load.
print("Shape:", dataset_df.shape)

Shape: (55374, 24)


In [4]:
# Select the 'judul_dokumen' column as the text to classify.
# Despite the `_df` suffix this is a pandas Series, not a DataFrame.
text_df = dataset_df.loc[:, 'judul_dokumen']
text_df.head(5)

0    atur kepala badan duduk keluarga rencana nasio...
1    atur menteri uang nomor pmk tatacara hitung ba...
2    atur menteri uang nomor pmk ubah ata atur ment...
3    atur menteri uang nomor pmk kembali bea masuk ...
4    atur menteri uang nomor pmk alokasi kurang bay...
Name: judul_dokumen, dtype: object

# Transform

## Tokenize

In [5]:
# Create the IndoBERT tokenizer from the "indobenchmark/indobert-base-p1" checkpoint.
# NOTE(review): presumably the same tokenizer the model was trained with — confirm,
# since the embedding layer's vocabulary must match these token ids.
tokenizer = BertTokenizer.from_pretrained("indobenchmark/indobert-base-p1")

In [6]:
# Encode every title into a list of token ids; the bound method is passed directly to apply().
text_sequences = text_df.apply(tokenizer.encode)
# Show one sample (row label 12345) before and after encoding.
print("Real sentence:\n", text_df.loc[12345])
print("Tokenized sentence:\n", text_sequences.loc[12345])

Real sentence:
 atur menteri hubung nomor pm tahun tata cara aju tuju rencana investasi usaha umum lembaga selenggara layan navigasi terbang indonesia
Tokenized sentence:
 [2, 7194, 2690, 23789, 1288, 2419, 262, 2816, 354, 2116, 30360, 23641, 2596, 3077, 1062, 752, 2178, 157, 1005, 85, 933, 5, 14296, 5109, 300, 3]


## Padding

In [7]:
# Bring every token-id sequence to a fixed length of 100 (the model's input width).
# The Keras defaults are spelled out: padding values are prepended, and sequences longer
# than max_length lose their leading tokens.
# NOTE(review): confirm this matches the preprocessing used when the model was trained.
max_length = 100
padded_sequences = pad_sequences(
    text_sequences,
    maxlen=max_length,
    padding='pre',
    truncating='pre',
)
print("Shape:", padded_sequences[12345].shape)

Shape: (100,)


# Model

## Load Model

In [8]:
# Load the saved Keras model from the .h5 file in the working directory
# and print its layer summary.
# NOTE(review): only predict() is called in the visible cells; if nothing else needs a
# compiled model, load_model(..., compile=False) could be considered — confirm first.
saved_model_path = 'simple_kaidah_model.h5'
model = tf.keras.models.load_model(saved_model_path)
model.summary()

Model: "kaidah_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           [(None, 100)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 100, 64)           1953344   
_________________________________________________________________
global_max_pooling1d (Global (None, 64)                0         
_________________________________________________________________
output (Dense)               (None, 54)                3510      
Total params: 1,956,854
Trainable params: 1,956,854
Non-trainable params: 0
_________________________________________________________________


## Inference

In [9]:
# Run the model over every padded sequence.
# Per the model summary the output layer has 54 units, so `predictions` holds
# one row of 54 class scores per input row.
predictions = model.predict(padded_sequences)

In [10]:
# Inspect the class scores of row 55373 (the final row, given the 55374-row dataset loaded earlier).
# NOTE(review): the index is hard-coded; it will raise IndexError if the dataset shrinks.
predictions[55373]

array([0.12235361, 0.11918838, 0.13036887, 0.11352316, 0.11331199,
       0.14487611, 0.11495935, 0.10588241, 0.10497704, 0.09917248,
       0.13381623, 0.12376384, 0.10592593, 0.10646056, 0.1451548 ,
       0.12206271, 0.11736595, 0.11643667, 0.09956431, 0.09232514,
       0.11091688, 0.14629649, 0.11876089, 0.12659644, 0.11393417,
       0.1308963 , 0.1530765 , 0.09712965, 0.11423308, 0.11072848,
       0.09749048, 0.09977307, 0.13663243, 0.10286006, 0.1289975 ,
       0.13436511, 0.12042274, 0.10650082, 0.10002356, 0.13518551,
       0.12282439, 0.12096839, 0.16307774, 0.10923144, 0.12998466,
       0.11269809, 0.10365465, 0.11710158, 0.11226106, 0.10180741,
       0.08857999, 0.14915283, 0.11853752, 0.8789374 ], dtype=float32)

0.8957185 0.89517206 0.8943941

In [11]:
# Print "<row index> <predicted class>" for every row whose highest-scoring class is
# not class 53 (the last of the model's 54 outputs).
# argmax is taken once for the whole batch instead of twice per row inside a Python loop,
# and only the mismatching rows are iterated.
predicted_classes = predictions.argmax(axis=1)
for index in np.flatnonzero(predicted_classes != 53):
    print(index, predicted_classes[index])