In [None]:
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.preprocessing import LabelEncoder
from transformers import TFAutoModel, AutoTokenizer, TFAutoModelForSequenceClassification
from tensorflow.keras.layers import Input, Dense, Layer
from tensorflow.keras.models import Model

In [None]:
# Read the dataset CSV (path is relative to the notebook's working directory).
csv_path = "../data/dataset_csv/dataset_pidana_umum.csv"
df = pd.read_csv(csv_path)

# Show the first rows as the cell output.
df.head()

In [None]:
# 1. Clean first, so the texts and labels selected below stay aligned.
#    Rows missing either the text or the category are dropped, and the text
#    column is coerced to str because the tokenizer only accepts strings.
df = (
    df.dropna(subset=['riwayat_dakwaan', 'sub_klasifikasi'])
      .assign(riwayat_dakwaan=lambda frame: frame['riwayat_dakwaan'].astype(str))
)

# 2. Select features and labels from the cleaned frame.
features = df['riwayat_dakwaan']  # text used for prediction
labels = df['sub_klasifikasi']    # crime category
texts = features.tolist()

# 3. Tokenize every text to a fixed length of 200 tokens.
#    The tokenizer is created here so this cell does not depend on a later cell.
tokenizer = AutoTokenizer.from_pretrained("indobenchmark/indobert-base-p2")
encodings = tokenizer(
    texts,
    max_length=200,
    padding="max_length",
    truncation=True,
    return_tensors="tf"
)

# 4. Encode the string labels as integer class ids.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

# 5. Hold out 20% of the rows for testing. Row indices are split (not the
#    tensors themselves) so the same indices select both encodings and labels.
row_indices = np.arange(len(texts))
train_idx, test_idx = train_test_split(row_indices, test_size=0.2, random_state=42)

train_encodings = {name: tf.gather(tensor, train_idx) for name, tensor in encodings.items()}
test_encodings = {name: tf.gather(tensor, test_idx) for name, tensor in encodings.items()}
y_train = encoded_labels[train_idx]
y_test = encoded_labels[test_idx]

In [None]:
# Checkpoint identifier and the TF model loaded from it.
model_name = "indobenchmark/indobert-base-p2"
pretrained_bert = TFAutoModel.from_pretrained(model_name)


class BERTLayer(Layer):
    """Keras layer that wraps the module-level ``pretrained_bert`` model.

    When called with a pair ``[input_ids, attention_mask]`` it runs the wrapped
    model and returns the hidden state at sequence position 0 (the CLS token).
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # The layer does not load weights itself; it reuses the shared global model.
        self.bert = pretrained_bert

    def call(self, inputs):
        token_ids, mask = inputs
        bert_outputs = self.bert(input_ids=token_ids, attention_mask=mask)
        hidden_states = bert_outputs.last_hidden_state
        # Keep only the first position of every sequence.
        return hidden_states[:, 0, :]


In [None]:
# 3. Functional API model
# Both inputs have length 200, matching the max_length used when tokenizing.
input_ids = Input(shape=(200,), dtype=tf.int32, name="input_ids")
attention_mask = Input(shape=(200,), dtype=tf.int32, name="attention_mask")

# CLS representation produced by the wrapped BERT model.
cls_token = BERTLayer()([input_ids, attention_mask])

# Classification head. The output layer needs one softmax unit per category:
# the model is compiled with sparse_categorical_crossentropy and predictions
# are decoded with argmax over axis 1, neither of which works with a single
# sigmoid unit.
dense_layer = Dense(128, activation='relu')(cls_token)
output = Dense(len(label_encoder.classes_), activation='softmax')(dense_layer)

# Assemble the model and print its layer summary.
model = Model(inputs=[input_ids, attention_mask], outputs=output)
model.summary()

In [None]:
# Configure training: Adam with learning rate 2e-5, sparse categorical
# cross-entropy as the loss, and accuracy as the reported metric.
model.compile(
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
)

In [None]:
# Train for 5 epochs in batches of 16, validating on the test encodings.
train_inputs = {
    "input_ids": train_encodings['input_ids'],
    "attention_mask": train_encodings['attention_mask'],
}
validation_inputs = {
    "input_ids": test_encodings['input_ids'],
    "attention_mask": test_encodings['attention_mask'],
}

history = model.fit(
    x=train_inputs,
    y=y_train,
    validation_data=(validation_inputs, y_test),
    epochs=5,
    batch_size=16,
)

In [None]:
# Score the model on the test encodings and report the accuracy.
test_inputs = {
    "input_ids": test_encodings['input_ids'],
    "attention_mask": test_encodings['attention_mask'],
}
loss, accuracy = model.evaluate(x=test_inputs, y=y_test)
print(f"Accuracy: {accuracy}")

In [None]:
# Predict on the first five test examples.
sample_inputs = {
    "input_ids": test_encodings['input_ids'][:5],
    "attention_mask": test_encodings['attention_mask'][:5],
}
predictions = model.predict(sample_inputs)

# Turn the highest-scoring class index of each row back into its label.
predicted_indices = np.argmax(predictions, axis=1)
predicted_classes = label_encoder.inverse_transform(predicted_indices)
print("Predicted Categories:", predicted_classes)

In [None]:
# Tokenize a single sentence with the same settings used for the dataset.
example_text = "Menimbang, bahwa terdakwa..."
tokenized_example = tokenizer(
    example_text,
    return_tensors="tf",
    truncation=True,
    padding="max_length",
    max_length=200,
)

# Predict for the example and decode the top class index into its label.
example_inputs = {
    "input_ids": tokenized_example["input_ids"],
    "attention_mask": tokenized_example["attention_mask"],
}
example_prediction = model.predict(example_inputs)
example_index = np.argmax(example_prediction, axis=1)
predicted_class = label_encoder.inverse_transform(example_index)
print("Predicted Category for Example:", predicted_class)

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Layer
from tensorflow.keras.models import Model
from transformers import TFAutoModel, AutoTokenizer
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Load the tokenizer and the TF model from the same checkpoint identifier.
model_name = "indobenchmark/indobert-base-p2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
pretrained_bert = TFAutoModel.from_pretrained(model_name)


class BERTLayer(Layer):
    """Custom Keras layer around the module-level ``pretrained_bert`` model.

    ``call`` expects a two-element input ``(input_ids, attention_mask)`` and
    returns the last hidden state at position 0 of each sequence (CLS token).
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.bert = pretrained_bert  # shared global model, not a fresh copy

    def call(self, inputs):
        ids, mask = inputs
        last_hidden = self.bert(input_ids=ids, attention_mask=mask).last_hidden_state
        return last_hidden[:, 0, :]  # CLS token

# Dataset preprocessing
# Placeholder samples (replace with your own dataset).
texts = [
    "Menimbang, bahwa terdakwa...",
    "Hakim memutuskan bahwa...",
    "Saksi memberikan keterangan...",
]
labels = ["hukuman", "putusan", "kesaksian"]

# Tokenize the samples, padded/truncated to 200 tokens.
encodings = tokenizer(
    texts,
    return_tensors="tf",
    truncation=True,
    padding="max_length",
    max_length=200,
)

# Map the string labels to integer class ids.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

# "Manual split" example: train and test refer to the very same objects here,
# so the evaluation below is measured on the training data.
train_encodings = test_encodings = encodings
y_train = y_test = encoded_labels

# Model building: two int32 inputs of length 200 feed the BERT wrapper.
input_ids = Input(name="input_ids", shape=(200,), dtype=tf.int32)
attention_mask = Input(name="attention_mask", shape=(200,), dtype=tf.int32)

# CLS vector -> 128-unit ReLU layer -> softmax over the encoded classes.
cls_token = BERTLayer()([input_ids, attention_mask])
dense_layer = Dense(128, activation='relu')(cls_token)
class_count = len(label_encoder.classes_)
output = Dense(class_count, activation='softmax')(dense_layer)

model = Model(inputs=[input_ids, attention_mask], outputs=output)
model.summary()

# Compile with Adam (lr 2e-5), sparse categorical cross-entropy and accuracy.
model.compile(
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
)

# Train for 5 epochs (batch size 16), validating on the test encodings.
train_inputs = {
    "input_ids": train_encodings['input_ids'],
    "attention_mask": train_encodings['attention_mask'],
}
test_inputs = {
    "input_ids": test_encodings['input_ids'],
    "attention_mask": test_encodings['attention_mask'],
}

history = model.fit(
    x=train_inputs,
    y=y_train,
    validation_data=(test_inputs, y_test),
    epochs=5,
    batch_size=16,
)

# Evaluate on the test encodings and report the accuracy.
loss, accuracy = model.evaluate(x=test_inputs, y=y_test)
print(f"Accuracy: {accuracy}")

# Predict on (at most) the first five test examples.
sample_inputs = {
    "input_ids": test_encodings['input_ids'][:5],
    "attention_mask": test_encodings['attention_mask'][:5],
}
predictions = model.predict(sample_inputs)

# Decode: highest-scoring class index per row -> original label.
predicted_indices = np.argmax(predictions, axis=1)
predicted_classes = label_encoder.inverse_transform(predicted_indices)
print("Predicted Categories:", predicted_classes)

# Prediction for a single new sentence, tokenized like the training data.
example_text = "Menimbang, bahwa terdakwa..."
tokenized_example = tokenizer(
    example_text,
    return_tensors="tf",
    truncation=True,
    padding="max_length",
    max_length=200,
)

example_inputs = {
    "input_ids": tokenized_example["input_ids"],
    "attention_mask": tokenized_example["attention_mask"],
}
example_prediction = model.predict(example_inputs)
example_index = np.argmax(example_prediction, axis=1)
predicted_class = label_encoder.inverse_transform(example_index)
print("Predicted Category for Example:", predicted_class)


In [None]:
import torch
from torch.utils.data import DataLoader, Dataset, random_split
import torch.nn as nn
import torch.optim as optim
from transformers import BertTokenizer, BertModel
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

# **1. Load Dataset**
dataset_path = '../outputtesting4.csv'
data = pd.read_csv(dataset_path)

# **2. Inspect the columns**
# 'PASAL' is used as the input text and 'ISI_PASAL' as the label.
print(data.head())
texts, labels = data['PASAL'].values, data['ISI_PASAL'].values

# **3. Report text/label counts and missing values (before cleaning)**
print("Jumlah Teks:", len(texts))
print("Jumlah Labels:", len(labels))
print("Missing values:", data.isnull().sum())

# **4. Drop rows with missing values, then re-extract both columns**
data = data.dropna(subset=['PASAL', 'ISI_PASAL'])
texts, labels = data['PASAL'].values, data['ISI_PASAL'].values

# **5. Encode labels as integer class ids**
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

# **6. Load the IndoBERT tokenizer and encoder from the same checkpoint**
tokenizer = BertTokenizer.from_pretrained("indobenchmark/indobert-base-p2")
bert_model = BertModel.from_pretrained("indobenchmark/indobert-base-p2")

# **7. Tokenization settings**
# Every sample is padded/truncated to this many tokens.
max_length = 200

# Dataset class
class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on access.

    Args:
        texts: Indexable collection of input texts.
        labels: Indexable collection of integer class ids, aligned with ``texts``.
        tokenizer: Callable accepting ``(text, max_length=, truncation=,
            padding=, return_tensors=)`` and returning a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors that have a
            leading batch dimension of 1.
        max_length: Length every sample is padded/truncated to.

    Each item is a dict with ``'input_ids'``, ``'attention_mask'`` (batch
    dimension removed) and ``'label'`` (a ``torch.long`` scalar tensor).
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Coerce to str: a column that pandas parsed as numeric (for example
        # article numbers) would otherwise reach the tokenizer as a non-string
        # and make it raise. Missing values are expected to be dropped upstream.
        text = str(self.texts[idx])
        label = self.labels[idx]

        encodings = self.tokenizer(
            text,
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
            return_tensors="pt"
        )
        return {
            # squeeze(0) removes the batch dimension added by return_tensors="pt"
            'input_ids': encodings['input_ids'].squeeze(0),
            'attention_mask': encodings['attention_mask'].squeeze(0),
            'label': torch.tensor(label, dtype=torch.long)
        }

# Hold out 20% of the samples for testing (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    encoded_labels,
    test_size=0.2,
    random_state=42,
)
train_dataset = TextDataset(X_train, y_train, tokenizer, max_length)
test_dataset = TextDataset(X_test, y_test, tokenizer, max_length)

# Batches of 16; only the training loader shuffles.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16)
test_loader = DataLoader(test_dataset, batch_size=16)

# **8. Define Model**
class BertClassifier(nn.Module):
    """Encoder plus a linear classification head.

    Args:
        bert_model: Encoder module exposing ``config.hidden_size`` and
            returning an object with ``last_hidden_state`` when called with
            ``input_ids`` and ``attention_mask``.
        num_classes: Number of output logits.
    """

    def __init__(self, bert_model, num_classes):
        super().__init__()
        self.bert = bert_model
        self.dropout = nn.Dropout(0.1)
        hidden_size = self.bert.config.hidden_size
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return one row of ``num_classes`` logits per input sequence."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Position 0 of the last hidden state is the CLS token.
        first_token = encoded.last_hidden_state[:, 0, :]
        return self.fc(self.dropout(first_token))

# Build the classifier with one output per encoded label class.
num_classes = len(label_encoder.classes_)
model = BertClassifier(bert_model, num_classes)

# **9. Define Training Components**
# Cross-entropy loss and AdamW (lr 5e-5) over all model parameters.
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=5e-5)

# Use the GPU when one is available, otherwise fall back to the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
model.to(device)

# **10. Training Function**
def train_model(model, train_loader, criterion, optimizer, device, epochs=5):
    """Train ``model`` on ``train_loader`` and print per-epoch metrics.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns class logits.
        train_loader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'label'`` tensors.
        criterion: Loss called as ``criterion(logits, labels)``.
        optimizer: Optimizer over the model's parameters.
        device: Device the batch tensors are moved to.
        epochs: Number of passes over ``train_loader``.

    Returns:
        list[dict]: One entry per epoch with keys ``"loss"`` (sum of the batch
        losses, the same value that is printed) and ``"accuracy"``. Accuracy
        is ``nan`` for an epoch that saw no samples.
    """
    model.train()
    history = []
    for epoch in range(epochs):
        total_loss = 0.0
        correct = 0
        total = 0

        for batch in train_loader:
            optimizer.zero_grad()

            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            _, predicted = torch.max(outputs, dim=1)
            correct += (predicted == labels).sum().item()
            total += labels.size(0)

        # An empty loader would otherwise raise ZeroDivisionError here.
        accuracy = correct / total if total else float("nan")
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss:.4f}, Accuracy: {accuracy:.4f}")
        history.append({"loss": total_loss, "accuracy": accuracy})

    return history

# Train the model on the training loader, using train_model's default epoch count.
train_model(model, train_loader, criterion, optimizer, device)

# **11. Evaluation Function**
def evaluate_model(model, test_loader, device):
    """Compute, print and return the accuracy of ``model`` on ``test_loader``.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns class logits.
        test_loader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'label'`` tensors.
        device: Device the batch tensors are moved to.

    Returns:
        float: Fraction of samples whose top-scoring class equals the label,
        or ``nan`` when ``test_loader`` yields no samples.
    """
    model.eval()
    correct = 0
    total = 0
    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask)
            _, predicted = torch.max(outputs, dim=1)
            correct += (predicted == labels).sum().item()
            total += labels.size(0)

    # An empty loader would otherwise raise ZeroDivisionError here.
    accuracy = correct / total if total else float("nan")
    print(f"Test Accuracy: {accuracy:.4f}")
    return accuracy

# Evaluate the model: prints the accuracy measured on the held-out test loader.
evaluate_model(model, test_loader, device)

# **12. Save Model**
# Only the state_dict (parameter tensors) is written, not the module object;
# the path is relative to the current working directory.
torch.save(model.state_dict(), "indobert_classification_model.pth")

# **13. Test Prediction**
def predict_text(model, tokenizer, text, label_encoder, device, max_length):
    """Classify a single text and return its decoded label.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns class logits; it is switched to eval mode.
        tokenizer: Callable producing ``'input_ids'`` and ``'attention_mask'``
            tensors for ``text``.
        text: The text to classify.
        label_encoder: Object whose ``inverse_transform`` maps a list of class
            indices back to labels.
        device: Device the input tensors are moved to.
        max_length: Length the text is padded/truncated to.

    Returns:
        The label corresponding to the highest-scoring class.
    """
    model.eval()
    encoded = tokenizer(
        text,
        return_tensors="pt",
        padding='max_length',
        truncation=True,
        max_length=max_length,
    )
    ids = encoded['input_ids'].to(device)
    mask = encoded['attention_mask'].to(device)

    # Inference only, so gradient tracking is disabled.
    with torch.no_grad():
        logits = model(ids, mask)
        best_index = torch.argmax(logits, dim=1).item()

    decoded = label_encoder.inverse_transform([best_index])
    return decoded[0]

# Example Prediction



In [None]:
# Classify a single example string with the trained model and print the decoded label.
example_text = "senjata"
predicted_class = predict_text(model, tokenizer, example_text, label_encoder, device, max_length)
print(f"Predicted Class: {predicted_class}")