In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
from transformers import BertTokenizer

In [2]:
# choose the first GPU when TensorFlow can see one, otherwise fall back to the CPU
visible_gpus = tf.config.list_physical_devices('GPU')
device = '/GPU:0' if visible_gpus else '/CPU:0'
device

'/GPU:0'

# Load Data

In [3]:
# read the CSV that holds the documents to be classified
directory = 'data/'
dataset_path = os.path.join(directory, 'prediction_dataset.csv')
dataset_df = pd.read_csv(filepath_or_buffer=dataset_path)

# report (rows, columns) as a quick sanity check on the load
dataset_shape = dataset_df.shape
print("Shape:", dataset_shape)

Shape: (55374, 20)


In [4]:
# isolate the document-title column; this is the text that gets classified
text_df = dataset_df.loc[:, 'judul_dokumen']
text_df.head(n=5)

0    atur kepala badan duduk keluarga rencana nasio...
1    atur menteri uang nomor pmk tatacara hitung ba...
2    atur menteri uang nomor pmk ubah ata atur ment...
3    atur menteri uang nomor pmk kembali bea masuk ...
4    atur menteri uang nomor pmk alokasi kurang bay...
Name: judul_dokumen, dtype: object

# Transform

## Tokenize

In [5]:
# create indobert tokenizer instance
# BertTokenizer is imported from `transformers` in the top import cell; without
# that import this cell raises NameError on a fresh kernel.
TOKENIZER_NAME = "indobenchmark/indobert-base-p1"
tokenizer = BertTokenizer.from_pretrained(TOKENIZER_NAME)

In [6]:
# run the tokenizer over every title to get one list of token ids per row
text_sequences = text_df.apply(tokenizer.encode)

# show one title next to its encoding as a sanity check
sample_row = 12345
print("Real sentence:\n", text_df[sample_row])
print("Tokenized sentence:\n", text_sequences[sample_row])

Real sentence:
 atur menteri hubung nomor pm tahun tata cara aju tuju rencana investasi usaha umum lembaga selenggara layan navigasi terbang indonesia
Tokenized sentence:
 [2, 7194, 2690, 23789, 1288, 2419, 262, 2816, 354, 2116, 30360, 23641, 2596, 3077, 1062, 752, 2178, 157, 1005, 85, 933, 5, 14296, 5109, 300, 3]


## Padding

In [7]:
# pad the text into the same length
# max_length matches the model's input width of 100 shown by model.summary().
# pad_sequences is reached through the already-imported `tf` namespace (the bare
# name was never imported) and is called with its default padding/truncating settings.
max_length = 100
padded_sequences = tf.keras.preprocessing.sequence.pad_sequences(text_sequences, maxlen=max_length)
print("Shape:", padded_sequences[12345].shape)

Shape: (100,)


# Inference

## Load Model

In [8]:
# restore the trained Keras model from its HDF5 file, then print its architecture
saved_model_path = 'simple_kaidah_model.h5'
model = tf.keras.models.load_model(filepath=saved_model_path)
model.summary()

Model: "kaidah_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           [(None, 100)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 100, 64)           1953344   
_________________________________________________________________
global_max_pooling1d (Global (None, 64)                0         
_________________________________________________________________
output (Dense)               (None, 54)                3510      
Total params: 1,956,854
Trainable params: 1,956,854
Non-trainable params: 0
_________________________________________________________________


## Predict

In [9]:
# score every padded title with the loaded model (one row of class scores per title)
predictions = model.predict(x=padded_sequences)

# Prediction Label

In [None]:
# read the label lookup table that sits next to the dataset
class_path = os.path.join(directory, 'label.csv')
class_df = pd.read_csv(filepath_or_buffer=class_path)

# list the label names the model can predict
available_labels = class_df['label'].values
print("Available labels:", available_labels)

In [None]:
# peek at prediction result for one arbitrary title
sample_index = 54321  # a row position in the dataset, not a random seed
predicted_position = predictions[sample_index].argmax()
print("Title:", text_df[sample_index])
# class_df[<int>] would look up a *column* named by that integer and raise KeyError;
# read the `label` column at the predicted row position instead.
# NOTE(review): assumes row order in label.csv matches the model's output order - confirm.
print("Predicted label:", class_df['label'].iloc[predicted_position])

In [11]:
# list every row whose top-scoring class is something other than class 53
predicted_classes = predictions.argmax(axis=1)
for row_number, predicted_class in enumerate(predicted_classes):
    if predicted_class != 53:
        print(row_number, predicted_class)

In [None]:
# Attach the predicted subject id to every row of the dataset.
# model.predict returns a NumPy array, which has no .apply, so the lookup is
# vectorised: take the top-scoring class position per row, then read the `id`
# column of class_df at those positions.
predicted_positions = predictions.argmax(axis=1)
# .to_numpy() drops class_df's index so ids are assigned by row order rather
# than being re-aligned against dataset_df's index.
dataset_df['subject_id'] = class_df['id'].iloc[predicted_positions].to_numpy()

# Save Data

In [None]:
# save to directory
# makedirs(exist_ok=True) creates the folder (including any missing parents) and
# does nothing if it already exists, which avoids the check-then-create race.
os.makedirs(directory, exist_ok=True)

# write the dataset with its new subject_id column, without the row index
saved_data_path = os.path.join(directory, 'predicted_jdihn.csv')
dataset_df.to_csv(saved_data_path, index=False)