### Import Library

In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, LSTM, Dense, Dropout, BatchNormalization

### Membaca File CSV dan Menampilkan Data Awal

In [3]:
df_pidana_umum = pd.read_csv("../data/dataset_csv/dataset_pidana_umum.csv")

df_pidana_umum.head()

Unnamed: 0,amar,amar_lainnya,id,klasifikasi,lama_hukuman,lembaga_peradilan,provinsi,status,sub_klasifikasi,url,...,identitas,riwayat_penahanan,riwayat_perkara,riwayat_tuntutan,riwayat_dakwaan,fakta,amar_putusan,penutup,fakta_hukum,pertimbangan_hukum
0,pidana,hukum,00035681c8d944203f25d2e8215ae2bf,pidana-umum,210,pn-kudus,jateng,berkekuatan-hukum-tetap,pemalsuan,https://putusan3.mahkamahagung.go.id/direktori...,...,nama lengkap eny sulistiyaningsih binti mashad...,terdakwa ditahan dengan jenis tahanan rutan se...,pengadilan negeri tersebut\nsetelah membaca be...,setelah mendengar tuntutan requsitoir penuntut...,menimbang bahwa terdakwa diajukan di persidang...,menimbang bahwa selanjutnya untuk membuktikan ...,mengadili 1 menyatakan terdakwa eny sulistiyan...,demikian diputuskan dalam rapat permusyawarata...,,
1,pidana,hukum,000399ce26773e18695ce14f519cb9e6,pidana-umum,720,pn-demak,jateng,berkekuatan-hukum-tetap,pencurian,https://putusan3.mahkamahagung.go.id/direktori...,...,nama lengkap ali maftuhin bin nur salim tempat...,terdakwa ditahan di rumah tahanan negara berda...,pengadilan negeri tersebut\nsetelah membaca\np...,setelah mendengar surat tuntutan pidana requis...,menimbang bahwa terdakwa didakwa oleh penuntut...,menimbang bahwa untuk menguatkan dakwaan terse...,mengadili 1 menyatakan terdakwa ali maftuhin b...,demikianlah diputuskan dalam rapat permusyawar...,menimbang bahwa berdasarkan keterangan saksi s...,
2,pidana,jatuh-pidana-oleh-karena-itu-kepada-dakwa-ir-b...,0006582ad67cd9bd1ddf4261a09bf382,pidana-umum,120,pn-kediri,jatim,berkekuatan-hukum-tetap,kejahatan-terhadap-keamanan-negara,https://putusan3.mahkamahagung.go.id/direktori...,...,nama lengkap ir bambang sasongko bin r soewarn...,terdakwa tidak ditahan,terdakwa didampingi oleh penasehat hukumnya ya...,telah mendengar pembacaan tuntutan pidana oleh...,menimbang bahwa terdakwa diajukan di persidang...,menimbang bahwa selanjutnya dipersidangan tela...,mengadili\n1 menyatakan terdakwa ir bambang sa...,demikian diputuskan dalam rapat musyawarah maj...,,
3,pidana,hukum,00122b1be15a10ad474bb3b7ec0dea73,pidana-umum,90,pn-sampang,jatim,berkekuatan-hukum-tetap,penghinaan,https://putusan3.mahkamahagung.go.id/direktori...,...,nama lengkap ahmad al pak saki tempat lahir sa...,,terdakwa dipersidangan tidak didampingi oleh p...,telah mendengar tuntutan pidana dari penuntut ...,menimbang bahwa berdasarkan catatan penuntut u...,menimbang bahwa dalam persidangan telah dideng...,mengadili 1 menyatakan terdakwa ahmad al pak s...,demikian diputuskan pada hari senin tanggal 5 ...,menimbang bahwa berdasarkan keterangan saksi s...,
4,pidana,hukum,00136d1554e18c63256deac42aad0c58,pidana-umum,210,pn-cirebon,jabar,berkekuatan-hukum-tetap,pencurian,https://putusan3.mahkamahagung.go.id/direktori...,...,1 nama lengkap muhamad rizki als rizki bin edi...,terdakwa ditangkap pada tanggal juli 2019\nter...,terdakwa tidak didampingi penasihat hukum\npen...,setelah mendengar pembacaan tuntutan pidana ya...,menimbang bahwa terdakwa diajukan ke persidang...,menimbang bahwa untuk membuktikan dakwaannya p...,mengadili\n1 menyatakan terdakwa muhamad rizki...,demikian diputuskan dalam sidang permusyawarat...,menimbang bahwa berdasarkan alat bukti dan bar...,menimbang bahwa selanjutnya majelis hakim akan...


In [4]:
df_pidana_umum.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10729 entries, 0 to 10728
Data columns (total 21 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   amar                10729 non-null  object
 1   amar_lainnya        10729 non-null  object
 2   id                  10729 non-null  object
 3   klasifikasi         10729 non-null  object
 4   lama_hukuman        10729 non-null  int64 
 5   lembaga_peradilan   10729 non-null  object
 6   provinsi            10729 non-null  object
 7   status              10729 non-null  object
 8   sub_klasifikasi     10729 non-null  object
 9   url                 10729 non-null  object
 10  kepala_putusan      10727 non-null  object
 11  identitas           9481 non-null   object
 12  riwayat_penahanan   8830 non-null   object
 13  riwayat_perkara     10630 non-null  object
 14  riwayat_tuntutan    10228 non-null  object
 15  riwayat_dakwaan     10577 non-null  object
 16  fakta               10

In [5]:
# Select relevant features and target
features = ['riwayat_perkara', 'riwayat_tuntutan', 'riwayat_dakwaan', 'fakta']
target = 'sub_klasifikasi'  # Target column for "pasal yang berlaku"

# Drop rows with missing target or all features empty
dataset_cleaned = df_pidana_umum.dropna(subset=features + [target])

# Combine textual features into a single column
dataset_cleaned['combined_text'] = dataset_cleaned[features].fillna('').agg(' '.join, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_cleaned['combined_text'] = dataset_cleaned[features].fillna('').agg(' '.join, axis=1)


In [6]:
# Prepare input (X) and output (y)
X = dataset_cleaned['combined_text']
y = dataset_cleaned[target]


# Encode the target (pasal) into integer classes
pasal_classes = y.unique()
class_mapping = {label: idx for idx, label in enumerate(pasal_classes)}
y_encoded = y.map(class_mapping)

In [7]:
# Tokenization and padding
max_words = 20000  # Increase the vocabulary size
max_len = 300      # Increase the sequence length

tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X)
X_seq = tokenizer.texts_to_sequences(X)
X_padded = pad_sequences(X_seq, maxlen=max_len, padding='post', truncating='post')

# Convert the target to categorical (one-hot encoding)
y_categorical = to_categorical(y_encoded)

In [8]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_padded, y_categorical, test_size=0.2, random_state=42)

In [9]:
# Build the Sequential model
model = Sequential([
    Embedding(input_dim=max_words, output_dim=128, input_length=max_len),
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    MaxPooling1D(pool_size=2),
    Dropout(0.5),
    BatchNormalization(),

    Conv1D(filters=64, kernel_size=5, activation='relu'),
    MaxPooling1D(pool_size=2),
    Dropout(0.5),
    BatchNormalization(),

    LSTM(128, return_sequences=True),
    Dropout(0.5),
    LSTM(64, return_sequences=False),

    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(len(pasal_classes), activation='softmax')  # Output layer for multi-class classification
])

model.summary()



In [10]:
# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [11]:
# Train Model
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=64)

Epoch 1/10
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m139s[0m 930ms/step - accuracy: 0.1779 - loss: 2.8221 - val_accuracy: 0.1741 - val_loss: 2.5944
Epoch 2/10
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m96s[0m 777ms/step - accuracy: 0.3939 - loss: 1.9799 - val_accuracy: 0.1431 - val_loss: 2.8106
Epoch 3/10
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 681ms/step - accuracy: 0.4552 - loss: 1.6616 - val_accuracy: 0.4284 - val_loss: 1.6624
Epoch 4/10
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m87s[0m 704ms/step - accuracy: 0.5111 - loss: 1.4502 - val_accuracy: 0.5061 - val_loss: 1.4094
Epoch 5/10
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 639ms/step - accuracy: 0.5666 - loss: 1.3257 - val_accuracy: 0.5523 - val_loss: 1.2870
Epoch 6/10
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 660ms/step - accuracy: 0.5978 - loss: 1.1950 - val_accuracy: 0.5614 - val_loss: 1.2720
Epoch 7/1

In [12]:
# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.2%}")

[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 109ms/step - accuracy: 0.6291 - loss: 1.2424
Test Accuracy: 62.23%


In [13]:
# Predict on new cases
def predict_pasal(input_text):
    input_seq = tokenizer.texts_to_sequences([input_text])
    input_padded = pad_sequences(input_seq, maxlen=max_len, padding='post', truncating='post')
    prediction = model.predict(input_padded)
    predicted_class = pasal_classes[prediction.argmax()]
    return predicted_class

In [17]:
# Example usage
sample_case = "Kasus pencurian"
predicted_pasal = predict_pasal(sample_case)
print(f"Predicted Pasal: {predicted_pasal}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step
Predicted Pasal: pemalsuan
