In [2]:
import os
import numpy as np
import pandas as pd
from datasets import load_dataset, Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
import evaluate


  from .autonotebook import tqdm as notebook_tqdm


In [5]:
from datasets import load_dataset

# Path ke dataset
conll_file = r"D:\Semester7\NLP\ner_proyek\ner_dataset_convertedV3.conll"

# Load dataset CoNLL ke HuggingFace Datasets
raw_datasets = load_dataset("text", data_files={"train": conll_file})

# Cek contoh baris pertama
print(raw_datasets["train"][0])

Generating train split: 74925 examples [00:00, 1895039.72 examples/s]

{'text': '[CLS]\tO'}





In [6]:
# Parsing CoNLL ke format HuggingFace
def parse_conll(path):
    sentences, labels = [], []
    tokens, tags = [], []

    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:  # separator kalimat
                if tokens:
                    sentences.append(tokens)
                    labels.append(tags)
                    tokens, tags = [], []
                continue
            token, label = line.split("\t")
            tokens.append(token)
            tags.append(label)
    return sentences, labels

# Parse file CoNLL
sentences, labels = parse_conll(conll_file)

print("Contoh kalimat:", sentences[0])
print("Contoh label:", labels[0])


Contoh kalimat: ['[CLS]', 'sumpah', 'guy', '##ss', 'rusak', 'fasilitas', 'kita', '##a', 'pakai', 'rambu', 'lintas', 'km', '##ren', 'keli', '##st', 'rusak', '##k', 'wo', '##i', 'lampu', 'redup', 'bn', '##get', '##t', 'bahaya', '##a', 'kalo', 'berant', '##a', 'jalan', 'raya', 'nii', 'kondusif', 'orang', 'sibuk', 'yak', '##an', '[SEP]']
Contoh label: ['O', 'O', 'O', 'O', 'B-PROB', 'O', 'O', 'O', 'O', 'B-INFRA', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-INFRA', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-INFRA', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [7]:
from datasets import Dataset

def parse_conll(path):
    data = []
    tokens, tags = [], []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                if tokens:
                    data.append({"tokens": tokens, "ner_tags": tags})
                    tokens, tags = [], []
                continue
            token, label = line.split("\t")
            tokens.append(token)
            tags.append(label)
    if tokens:  # sisa terakhir
        data.append({"tokens": tokens, "ner_tags": tags})
    return data

# Parse file
dataset_list = parse_conll(conll_file)

# Konversi ke HuggingFace Dataset
hf_dataset = Dataset.from_list(dataset_list)

print(hf_dataset[0])


{'tokens': ['[CLS]', 'sumpah', 'guy', '##ss', 'rusak', 'fasilitas', 'kita', '##a', 'pakai', 'rambu', 'lintas', 'km', '##ren', 'keli', '##st', 'rusak', '##k', 'wo', '##i', 'lampu', 'redup', 'bn', '##get', '##t', 'bahaya', '##a', 'kalo', 'berant', '##a', 'jalan', 'raya', 'nii', 'kondusif', 'orang', 'sibuk', 'yak', '##an', '[SEP]'], 'ner_tags': ['O', 'O', 'O', 'O', 'B-PROB', 'O', 'O', 'O', 'O', 'B-INFRA', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-INFRA', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-INFRA', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O']}


In [8]:
# Buat HuggingFace DatasetDict
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict

# Split train/val/test (80/10/10)
train_sents, test_sents, train_labels, test_labels = train_test_split(
    sentences, labels, test_size=0.2, random_state=42
)
val_sents, test_sents, val_labels, test_labels = train_test_split(
    test_sents, test_labels, test_size=0.5, random_state=42
)

# Buat HuggingFace Dataset
train_dataset = Dataset.from_dict({"tokens": train_sents, "ner_tags": train_labels})
val_dataset   = Dataset.from_dict({"tokens": val_sents, "ner_tags": val_labels})
test_dataset  = Dataset.from_dict({"tokens": test_sents, "ner_tags": test_labels})

datasets = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset,
    "test": test_dataset
})

print(datasets)
print(datasets["train"][0])



DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 2617
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 327
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 328
    })
})
{'tokens': ['[CLS]', 'user', 'rusak', 'indonesia', 'man', 'banjir', 'hutan', 'parah', 'gundul', 'orang', 'biang', 'ker', '##ok', 'buat', 'sempat', 'adil', 'pakai', 'hukum', 'jalan', '[SEP]'], 'ner_tags': ['O', 'O', 'B-PROB', 'O', 'O', 'B-PROB', 'O', 'B-SEV', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-INFRA', 'O']}


In [9]:
# Encode Labels
# Ambil semua label unik
unique_tags = set(tag for tags in labels for tag in tags)
label_list = sorted(list(unique_tags))

# Mapping label ↔ id
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for label, i in label2id.items()}

print("Label mapping:", label2id)


Label mapping: {'B-DESC': 0, 'B-INFRA': 1, 'B-LOC': 2, 'B-PROB': 3, 'B-SEV': 4, 'B-TIME': 5, 'I-DESC': 6, 'I-INFRA': 7, 'I-LOC': 8, 'I-PROB': 9, 'O': 10}


In [10]:
# Tokenisasi & Align Labels

from transformers import AutoTokenizer

# Load tokenizer IndoBERT
model_checkpoint = "taufiqdp/indonesian-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Fungsi untuk align labels ke subword tokens
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        padding="max_length",
        max_length=128
    )
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word = None
        for word_id in word_ids:
            if word_id is None:
                label_ids.append(-100)  # ignore loss
            elif word_id != previous_word:
                label_ids.append(label2id[label[word_id]])
            else:
                # Token lanjutan → I-XXX kalau awalnya B-XXX
                current_label = label2id[label[word_id]]
                label_ids.append(current_label)
            previous_word = word_id
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Apply preprocessing
tokenized_datasets = datasets.map(tokenize_and_align_labels, batched=True)

print(tokenized_datasets["train"][0])


Map: 100%|██████████| 2617/2617 [00:00<00:00, 2738.03 examples/s]
Map: 100%|██████████| 327/327 [00:00<00:00, 3767.27 examples/s]
Map: 100%|██████████| 328/328 [00:00<00:00, 3005.48 examples/s]

{'tokens': ['[CLS]', 'user', 'rusak', 'indonesia', 'man', 'banjir', 'hutan', 'parah', 'gundul', 'orang', 'biang', 'ker', '##ok', 'buat', 'sempat', 'adil', 'pakai', 'hukum', 'jalan', '[SEP]'], 'ner_tags': ['O', 'O', 'B-PROB', 'O', 'O', 'B-PROB', 'O', 'B-SEV', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-INFRA', 'O'], 'input_ids': [3, 3, 12287, 5236, 1718, 1781, 3726, 3283, 5695, 24247, 1646, 21117, 1678, 7, 7, 2623, 3815, 2941, 6187, 7563, 2156, 2050, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0




In [12]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True  # penting untuk ganti classification head
)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at taufiqdp/indonesian-sentiment and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([11]) in the model instantiated
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([11, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
# Metrics Evaluator
import evaluate
import numpy as np

seqeval = evaluate.load("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_labels = [[id2label[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [id2label[p] for (p, l) in zip(pred, label) if l != -100]
        for pred, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }


In [21]:
import transformers
print(transformers.__version__)


4.56.2


In [23]:
# training
from transformers import TrainingArguments, Trainer

batch_size = 16
args = TrainingArguments(
    output_dir="ner-indobert",
    do_train=True,
    do_eval=True,
    learning_rate=3e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    save_total_limit=2
)


trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()


  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.


Step,Training Loss
50,0.8229
100,0.702
150,0.6871
200,0.4921
250,0.3892
300,0.3025
350,0.2177
400,0.1911
450,0.1778
500,0.1567




TrainOutput(global_step=820, training_loss=0.3008243946040549, metrics={'train_runtime': 9552.747, 'train_samples_per_second': 1.37, 'train_steps_per_second': 0.086, 'total_flos': 854836566539520.0, 'train_loss': 0.3008243946040549, 'epoch': 5.0})

In [25]:
import numpy as np
import evaluate

seqeval = evaluate.load("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_labels = [[id2label[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [id2label[p] for (p, l) in zip(pred, label) if l != -100]
        for pred, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)

    # 🔹 laporan detail per entitas
    print("📊 Laporan Per Entitas:")
    print(results)

    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }


In [26]:
# evaluation
print("Evaluation...")
# Evaluation di test set (final evaluation)
metrics = trainer.evaluate(tokenized_datasets["test"])
print(metrics)




Evaluation...
{'eval_loss': 0.13564752042293549, 'eval_precision': 0.8805409466566492, 'eval_recall': 0.915625, 'eval_f1': 0.8977403293757181, 'eval_accuracy': 0.9570147407483764, 'eval_runtime': 59.898, 'eval_samples_per_second': 5.476, 'eval_steps_per_second': 0.351, 'epoch': 5.0}


In [27]:
from seqeval.metrics import classification_report
import numpy as np

# ambil prediksi dari trainer
predictions, labels, _ = trainer.predict(tokenized_datasets["test"])
predictions = np.argmax(predictions, axis=2)

# konversi id -> label (hilangkan -100)
true_labels = [[id2label[l] for l in label if l != -100] for label in labels]
true_predictions = [
    [id2label[p] for (p, l) in zip(pred, label) if l != -100]
    for pred, label in zip(predictions, labels)
]

# laporan per entitas
print(classification_report(true_labels, true_predictions, digits=4))


              precision    recall  f1-score   support

        DESC     0.8043    0.9024    0.8506        41
       INFRA     0.9284    0.9618    0.9448       445
         LOC     0.6767    0.7344    0.7044       305
        PROB     0.9879    1.0000    0.9939       407
         SEV     0.9524    0.8000    0.8696        25
        TIME     0.9333    0.9825    0.9573        57

   micro avg     0.8805    0.9156    0.8977      1280
   macro avg     0.8805    0.8969    0.8868      1280
weighted avg     0.8841    0.9156    0.8992      1280



Output pertama (classification report per entitas):

* **DESC, INFRA, PROB, TIME** punya skor F1 tinggi (≥0.85), artinya model sangat baik mendeteksi entitas ini.
* **LOC** lebih rendah (F1 ≈0.70), berarti model cukup kesulitan mengenali lokasi dengan benar.
* **SEV** hanya 25 data (support kecil), sehingga meskipun precision tinggi (0.95), recall rendah (0.80) membuat model kadang gagal mendeteksi semua entitas SEV.
* **Rata-rata** (micro/macro/weighted) menunjukkan kinerja keseluruhan stabil (F1 ≈0.89), meski ada variasi antar-entitas.

Output kedua (evaluasi global dari HuggingFace Trainer):

* **Eval loss rendah (0.1356)** → model fit dengan baik, tidak overfit.
* **Precision 0.88, Recall 0.91, F1 0.89** → konsisten dengan classification report.
* **Accuracy 0.957** → secara token-level, model mengklasifikasikan label dengan benar pada ±96% token.

Ringkasnya:
Model NER bekerja sangat baik untuk sebagian besar entitas, terutama PROB, TIME, INFRA. Tantangan terbesar ada di **LOC** (mungkin karena variasi nama lokasi tinggi) dan **SEV** (karena data sedikit). Secara keseluruhan, model sudah **stabil dan handal (F1 global ~0.90, akurasi ~96%)**.

Mau saya bikinkan juga saran **perbaikan spesifik untuk LOC dan SEV**?


In [28]:
# Save Model + Tokenizer + Trainer State
save_dir = "./ner_indobert_model"

# Simpan model + tokenizer (untuk inference / deployment)
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

# Simpan juga trainer state (untuk resume training nanti)
trainer.save_model(save_dir)

print(f"Model, tokenizer, dan trainer state saved to {save_dir}")


Model, tokenizer, dan trainer state saved to ./ner_indobert_model


In [31]:
from transformers import pipeline

# Load model & tokenizer dari folder simpanan
inference_dir = "./ner_indobert_model"

ner_pipeline = pipeline(
    "token-classification",
    model=inference_dir,
    tokenizer=inference_dir,
    aggregation_strategy="simple"  # gabungkan multi-token entity (contoh: "lampu jalan" jadi 1 entitas)
)

# Contoh inference pada laporan masyarakat
example_text = "Jalan Sudirman di Jakarta mengalami kerusakan parah karena aspal jalan berlubang sejak minggu lalu, kondisi ini sangat membahayakan karena jalan menjadi licin dan gelap di malam hari."

predictions = ner_pipeline(example_text)

print("🔹 Input:", example_text)
print("🔹 Output NER:")
for pred in predictions:
    print(pred)


Device set to use cpu
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


🔹 Input: Jalan Sudirman di Jakarta mengalami kerusakan parah karena aspal jalan berlubang sejak minggu lalu, kondisi ini sangat membahayakan karena jalan menjadi licin dan gelap di malam hari.
🔹 Output NER:
{'entity_group': 'INFRA', 'score': np.float32(0.9971186), 'word': 'jalan', 'start': 0, 'end': 5}
{'entity_group': 'LOC', 'score': np.float32(0.7175946), 'word': 'sudirman', 'start': 6, 'end': 14}
{'entity_group': 'LOC', 'score': np.float32(0.70571965), 'word': 'di', 'start': 15, 'end': 17}
{'entity_group': 'LOC', 'score': np.float32(0.92562366), 'word': 'jakarta', 'start': 18, 'end': 25}
{'entity_group': 'PROB', 'score': np.float32(0.74550533), 'word': 'mengalami', 'start': 26, 'end': 35}
{'entity_group': 'PROB', 'score': np.float32(0.9138829), 'word': 'kerusakan', 'start': 36, 'end': 45}
{'entity_group': 'SEV', 'score': np.float32(0.8121005), 'word': 'parah', 'start': 46, 'end': 51}
{'entity_group': 'INFRA', 'score': np.float32(0.9969313), 'word': 'aspal', 'start': 59, 'end': 64}
{

In [32]:
from transformers import pipeline

# Load model & tokenizer dari folder simpanan
inference_dir = "./ner_indobert_model"

ner_pipeline = pipeline(
    "token-classification",
    model=inference_dir,
    tokenizer=inference_dir,
    aggregation_strategy="simple"  
)

# Contoh inference pada laporan masyarakat
example_text = "Di Jalan Ahmad Yani, Bandung, sebuah pohon besar tumbang menimpa trotoar dan lampu jalan pada tadi malam, menyebabkan kondisi berbahaya karena area menjadi gelap dan sulit dilalui, dengan tingkat kerusakan cukup parah."

predictions = ner_pipeline(example_text)

print("🔹 Input:", example_text)
print("🔹 Output NER:")
for pred in predictions:
    print(pred)


Device set to use cpu
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


🔹 Input: Di Jalan Ahmad Yani, Bandung, sebuah pohon besar tumbang menimpa trotoar dan lampu jalan pada tadi malam, menyebabkan kondisi berbahaya karena area menjadi gelap dan sulit dilalui, dengan tingkat kerusakan cukup parah.
🔹 Output NER:
{'entity_group': 'INFRA', 'score': np.float32(0.98606145), 'word': 'jalan', 'start': 3, 'end': 8}
{'entity_group': 'LOC', 'score': np.float32(0.86693376), 'word': 'bandung', 'start': 21, 'end': 28}
{'entity_group': 'INFRA', 'score': np.float32(0.9874736), 'word': 'sebuah', 'start': 30, 'end': 36}
{'entity_group': 'INFRA', 'score': np.float32(0.9867684), 'word': 'pohon', 'start': 37, 'end': 42}
{'entity_group': 'DESC', 'score': np.float32(0.6130531), 'word': 'besar', 'start': 43, 'end': 48}
{'entity_group': 'PROB', 'score': np.float32(0.975183), 'word': 'tumbang', 'start': 49, 'end': 56}
{'entity_group': 'SEV', 'score': np.float32(0.3671959), 'word': 'menimpa', 'start': 57, 'end': 64}
{'entity_group': 'INFRA', 'score': np.float32(0.98611313), 'word'

In [33]:
from transformers import pipeline

# Load model & tokenizer dari folder simpanan
inference_dir = "./ner_indobert_model"

ner_pipeline = pipeline(
    "token-classification",
    model=inference_dir,
    tokenizer=inference_dir,
    aggregation_strategy="simple"  
)

# Contoh inference pada laporan masyarakat
example_text = "ada pohon tumabang di depan spbu Gatot Subroto pagi hari"

predictions = ner_pipeline(example_text)

print("🔹 Input:", example_text)
print("🔹 Output NER:")
for pred in predictions:
    print(pred)


Device set to use cpu
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


🔹 Input: ada pohon tumabang di depan spbu Gatot Subroto pagi hari
🔹 Output NER:
{'entity_group': 'INFRA', 'score': np.float32(0.9918521), 'word': 'pohon', 'start': 4, 'end': 9}
{'entity_group': 'LOC', 'score': np.float32(0.88788754), 'word': 'depan', 'start': 22, 'end': 27}
{'entity_group': 'LOC', 'score': np.float32(0.63920444), 'word': 'spbu gatot subroto', 'start': 28, 'end': 46}
{'entity_group': 'TIME', 'score': np.float32(0.7546842), 'word': 'pagi', 'start': 47, 'end': 51}
{'entity_group': 'TIME', 'score': np.float32(0.9845931), 'word': 'hari', 'start': 52, 'end': 56}
