In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

In [None]:
# Read the "pidana umum" dataset from its CSV file into a DataFrame.
df_pidana_umum = pd.read_csv(
    filepath_or_buffer="../data/dataset_csv/dataset_pidana_umum.csv",
)

In [None]:
# info() prints the column/dtype/non-null summary to stdout and returns None,
# so call it as a statement instead of packing it into a tuple.
df_pidana_umum.info()

# Keep head() as the cell's last expression so the notebook renders it as a
# rich table rather than the plain-text repr of a (DataFrame, None) tuple.
df_pidana_umum.head()

In [None]:
# Text columns that are merged into the model input, and the label column.
features = ['riwayat_perkara', 'riwayat_tuntutan', 'riwayat_dakwaan', 'fakta']
target = 'sub_klasifikasi'

# Drop rows that are missing any feature or the target.
# dropna() returns a new DataFrame (it does not modify in place), so the
# result must be assigned back or the rows with missing values are kept.
# reset_index() returns a fresh frame with a contiguous 0..n-1 index, which
# also makes the column assignment below operate on an independent object.
df_pidana_umum = (
    df_pidana_umum
    .dropna(subset=features + [target])
    .reset_index(drop=True)
)

# Combine textual features into a single space-separated column.
# fillna('') is kept as a defensive no-op after the dropna above, and
# astype(str) guards ' '.join against any non-string cell values.
df_pidana_umum['combined_text'] = (
    df_pidana_umum[features]
    .fillna('')
    .astype(str)
    .agg(' '.join, axis=1)
)

In [None]:
# Model input is the merged text column; model output is the raw label column.
X = df_pidana_umum['combined_text']
y = df_pidana_umum[target]

# Learn the label -> integer mapping, then apply it to every row.
# The fitted encoder is kept so class ids can be mapped back to labels
# (label_encoder.classes_ is also used to size the output layer).
label_encoder = LabelEncoder()
label_encoder.fit(y)
y_encoded = label_encoder.transform(y)

print(y_encoded)

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle (and therefore the split) reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Text tokenization and padding.
# The vocabulary is fitted on the training texts only; words outside it
# are mapped to the "<OOV>" token when texts are converted to sequences.
tokenizer = Tokenizer(oov_token="<OOV>", num_words=10000)
tokenizer.fit_on_texts(X_train)

# Every sequence is brought to this fixed length before entering the model.
max_length = 100

# Training split: texts -> integer id sequences -> fixed-length matrix,
# with padding added after the tokens (padding="post").
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_train_padded = pad_sequences(
    X_train_sequences, padding="post", maxlen=max_length
)

# Test split: same transformation, reusing the tokenizer fitted above.
X_test_sequences = tokenizer.texts_to_sequences(X_test)
X_test_padded = pad_sequences(
    X_test_sequences, padding="post", maxlen=max_length
)

In [None]:
# Build the TensorFlow model one layer at a time.
model = tf.keras.Sequential()

# Token ids -> 64-dimensional vectors; 10000 matches the tokenizer's num_words.
model.add(tf.keras.layers.Embedding(10000, 64, input_length=max_length))

# Two stacked bidirectional LSTMs: the first returns the full sequence so the
# second can consume it; the second returns only its final output.
model.add(
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True))
)
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)))

# Dense head with dropout for regularisation.
model.add(tf.keras.layers.Dense(32, activation="relu"))
model.add(tf.keras.layers.Dropout(0.5))

# One softmax output unit per class known to the label encoder.
model.add(
    tf.keras.layers.Dense(len(label_encoder.classes_), activation="softmax")
)

In [None]:
# The "sparse" cross-entropy variant takes integer class ids as targets,
# which is what the label encoder produces (no one-hot encoding needed).
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# Train the model.
# NOTE(review): the held-out test split doubles as the validation set here,
# so the reported validation metrics are not an independent test estimate.
history = model.fit(
    x=X_train_padded,
    y=y_train,
    batch_size=32,
    epochs=10,
    validation_data=(X_test_padded, y_test),
)

In [None]:
# Persist the trained model to disk (HDF5 file, per the .h5 extension).
# NOTE(review): only the model is written here; the fitted tokenizer and
# label encoder are not saved by this cell.
model.save(
    filepath="../models/category_classification_model.h5",
)