In [1]:
import tensorflow as tf
import os
import pandas as pd
import bert_tf_hub
import tensorflow_hub as hub

In [2]:
device = '/GPU:0' if len(tf.config.list_physical_devices('GPU')) > 0 else '/CPU:0'
device

'/GPU:0'

# Load Data

In [3]:
# load prediction data
directory = 'data/'
dataset_path = os.path.join(directory, 'prediction_dataset.csv')
dataset_df = pd.read_csv(dataset_path)
print("Shape:", dataset_df.shape)

Shape: (52836, 2)


In [4]:
# take the text data to be predicted
sequences = dataset_df['judul_dokumen']
sequences.head()

0    atur kepala badan duduk keluarga rencana nasio...
1    atur menteri uang nomor pmk tatacara hitung ba...
2    atur menteri uang nomor pmk ubah ata atur ment...
3    atur menteri uang nomor pmk kembali bea masuk ...
4    atur menteri uang nomor pmk alokasi kurang bay...
Name: judul_dokumen, dtype: object

# Transform

In [5]:
# create tokenizer instance
bert_model = 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-128_A-2/2'
tokenizer = bert_tf_hub.create_tokenizer(bert_model)

In [6]:
# convert sequences into bert inputs
max_seq_length = 512
transformed_seq = bert_tf_hub.build_inputs(sequences, tokenizer, max_seq_length)

In [7]:
# convert to tf dataset
batch_size = 64
ds = tf.data.Dataset.from_tensor_slices(transformed_seq)
ds = ds.shuffle(sequences.shape[0] // 4).batch(batch_size)

# Inference

## Load Model

In [8]:
# load keras saved model
saved_model_path = 'bert_kaidah_model.h5'
model = tf.keras.models.load_model(saved_model_path, custom_objects={'KerasLayer':hub.KerasLayer})
model.summary()

Model: "bert_model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_1 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
bert_embedding (KerasLayer)     {'default': (None, 1 4782465     input_2[0][0]                    
                                                                 input_3[0][0]           

## Predict

In [9]:
# predict from text
predictions = model.predict(ds, batch_size=batch_size, verbose=1)



# Prediction Label

In [10]:
# load class csv
class_path = os.path.join(directory, 'label.csv')
class_df = pd.read_csv(class_path)

In [11]:
# peek at prediction result
seed = 44388    
index = predictions[seed].argmax()
print("Title:", dataset_df['judul_dokumen'][seed])
print("Predicted label:", class_df['class'][index], '(', index+1, ')')

Title: putus menteri ppn kepala bappena nomor kep m ppn hk angkat anggota forum masyarakat statistik masa kerja tahun
Predicted label: Lain-lain ( 54 )


In [12]:
# create a new df with prediction result and document's id
saved_df = pd.DataFrame()
saved_df['id'] = dataset_df.index + 1
saved_df['subject_id'] = saved_df['id'].apply(lambda x: predictions[x-1].argmax() + 1)
saved_df['subject'] = saved_df['subject_id'].apply(lambda x: class_df['class'][x-1])
saved_df.head()

Unnamed: 0,id,subject_id,subject
0,1,4,Kementerian Keuangan
1,2,54,Lain-lain
2,3,54,Lain-lain
3,4,54,Lain-lain
4,5,54,Lain-lain


# Save Data

In [13]:
# save to directory
if not os.path.exists(directory):
    os.mkdir(directory)

saved_data_path = os.path.join(directory, 'predicted_jdihn.csv')
saved_df.to_csv(saved_data_path, index=False)