In [1]:
import requests
import pandas as pd
import numpy as np

In [2]:
# Raw-file URL of every dataset split, keyed by split name. All splits live
# in the same repository folder and differ only in the file name.
_IDNER_RAW_ROOT = "https://raw.githubusercontent.com/khairunnisaor/idner-news-2k/main"
file_urls = {
    split: f"{_IDNER_RAW_ROOT}/{split}.txt" for split in ("dev", "test", "train")
}

def todf(fileurl, timeout=30):
    """Download a token-per-line NER file and return it as a token-level DataFrame.

    Each non-blank line is expected to hold three whitespace-separated fields
    (word, POS, tag); blank lines separate sentences. Lines that do not have
    exactly three fields are skipped.

    Parameters
    ----------
    fileurl : str
        URL of the raw text file to download.
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up (default 30).

    Returns
    -------
    pandas.DataFrame
        One row per token with columns ``sentence_id``, ``words`` and
        ``labels``. ``sentence_id`` starts at 1 and increases by exactly one
        for each sentence that contributed at least one row. The POS field is
        discarded.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status, instead of silently
        parsing the error page into an empty or garbage frame.
    """
    response = requests.get(fileurl, timeout=timeout)
    response.raise_for_status()

    rows = []
    sentence_id = 1
    sentence_has_rows = False  # True once the current sentence produced a row

    for line in response.text.split("\n"):
        line = line.strip()
        if not line:
            # A blank line closes the current sentence. Only advance the ID
            # when that sentence actually produced rows, so leading, trailing
            # or repeated blank lines do not leave gaps in the numbering.
            if sentence_has_rows:
                sentence_id += 1
                sentence_has_rows = False
            continue
        parts = line.split()  # word, POS, tag
        if len(parts) == 3:
            word, _pos, tag = parts
            rows.append([sentence_id, word, tag])
            sentence_has_rows = True
    return pd.DataFrame(rows, columns=["sentence_id", "words", "labels"])

# Download and parse the three splits; the split names are listed in the same
# order as the target variables they are unpacked into.
df_train, df_dev, df_test = (
    todf(file_urls[split]) for split in ("train", "dev", "test")
)


In [3]:
# Preview the parsed training split: one row per token with its sentence_id,
# word and label.
df_train

Unnamed: 0,sentence_id,words,labels
0,1,Berikut,O
1,1,adalah,O
2,1,tujuh,O
3,1,kota,O
4,1,di,O
...,...,...,...
30243,1464,kita,O
30244,1464,",",O
30245,1464,ucap,O
30246,1464,Fadli,B-PER


In [4]:
# Preview the parsed dev split: one row per token with its sentence_id, word
# and label.
df_dev

Unnamed: 0,sentence_id,words,labels
0,1,Meski,O
1,1,demikian,O
2,1,",",O
3,1,ia,O
4,1,menegaskan,O
...,...,...,...
7858,367,sebesar,O
7859,367,50,O
7860,367,miliar,O
7861,367,francs,O


In [5]:
# Preview the parsed test split: one row per token with its sentence_id, word
# and label.
df_test

Unnamed: 0,sentence_id,words,labels
0,1,Presiden,O
1,1,terpilih,O
2,1,Joko,B-PER
3,1,Widodo,I-PER
4,1,mengungkapkan,O
...,...,...,...
10583,509,keluarga,O
10584,509,-lah,O
10585,509,",",O
10586,509,ujarnya,O


In [6]:
!pip install simpletransformers

Collecting simpletransformers
  Downloading simpletransformers-0.70.1-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.4/42.4 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Collecting seqeval (from simpletransformers)
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tensorboardx (from simpletransformers)
  Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting streamlit (from simpletransformers)
  Downloading streamlit-1.43.0-py2.py3-none-any.whl.metadata (8.9 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit->simpletransformers)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading simpletransformers-0.70.1-py3-none-any.whl (316 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.3/316

In [7]:
from simpletransformers.ner import NERModel, NERArgs
import pandas as pd
import logging
import sys

In [8]:
# Keep the "transformers" logger at WARNING while the root logger is
# configured at INFO, so only warnings and errors come through from it.
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)
logging.basicConfig(level=logging.INFO)

In [9]:
# Tag set: the outside tag "O" followed by a B-/I- pair for each entity type.
labels = ['O'] + [
    f"{prefix}-{entity}"
    for entity in ('PER', 'LOC', 'ORG')
    for prefix in ('B', 'I')
]

In [10]:
# Training configuration for the NER model. Every option not listed here
# keeps its NERArgs default.
model_args = NERArgs(
    num_train_epochs=5,
    train_batch_size=32,
    evaluate_during_training=True,
    output_dir='/output/ner/distilbert-base-indonesian',
    best_model_dir='/output/ner/distilbert-base-indonesian/best_model',
    overwrite_output_dir=True,
    fp16=False,
    labels_list=labels,
    do_lower_case=True,
)

In [11]:
# Show the full NERArgs repr to inspect the effective settings, including the
# defaults that were not overridden.
model_args


NERArgs(adafactor_beta1=None, adafactor_clip_threshold=1.0, adafactor_decay_rate=-0.8, adafactor_eps=(1e-30, 0.001), adafactor_relative_step=True, adafactor_scale_parameter=True, adafactor_warmup_init=True, adam_betas=(0.9, 0.999), adam_epsilon=1e-08, best_model_dir='/output/ner/distilbert-base-indonesian/best_model', cache_dir='cache_dir/', config={}, cosine_schedule_num_cycles=0.5, custom_layer_parameters=[], custom_parameter_groups=[], dataloader_num_workers=0, do_lower_case=True, dynamic_quantize=False, early_stopping_consider_epochs=False, early_stopping_delta=0, early_stopping_metric='eval_loss', early_stopping_metric_minimize=True, early_stopping_patience=3, encoding=None, eval_batch_size=100, evaluate_during_training=True, evaluate_during_training_silent=True, evaluate_during_training_steps=2000, evaluate_during_training_verbose=False, evaluate_each_epoch=True, fp16=False, gradient_accumulation_steps=1, learning_rate=4e-05, local_rank=-1, logging_steps=50, loss_type=None, loss_

In [12]:
# Build the NER model from the "cahya/distilbert-base-indonesian" checkpoint.
# Note that, despite the variable name, the model type is DistilBERT.
model_bert_base = NERModel(
    model_type="distilbert",
    model_name="cahya/distilbert-base-indonesian",
    labels=labels,
    args=model_args,
    use_cuda=False,
)

config.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/273M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at cahya/distilbert-base-indonesian and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/62.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/230k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Error during conversion: ChunkedEncodingError(ProtocolError('Response ended prematurely'))


model.safetensors:   0%|          | 0.00/273M [00:00<?, ?B/s]

In [13]:
# Fine-tune on the training split, passing the dev split as evaluation data
# (model_args enables evaluate_during_training). The displayed return value is
# a tuple: the final global step and a dict of metrics recorded per evaluation.
model_bert_base.train_model(df_train, eval_data=df_dev)

  0%|          | 0/3 [00:00<?, ?it/s]

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Running Epoch 1 of 5:   0%|          | 0/46 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/4 [00:00<?, ?it/s]

Running Epoch 2 of 5:   0%|          | 0/46 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/4 [00:00<?, ?it/s]

Running Epoch 3 of 5:   0%|          | 0/46 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/4 [00:00<?, ?it/s]

Running Epoch 4 of 5:   0%|          | 0/46 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/4 [00:00<?, ?it/s]

Running Epoch 5 of 5:   0%|          | 0/46 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/4 [00:00<?, ?it/s]

(230,
 defaultdict(list,
             {'global_step': [46, 92, 138, 184, 230],
              'train_loss': [0.051575955003499985,
               0.010419116355478764,
               0.006902304943650961,
               0.014593607746064663,
               0.012653488665819168],
              'eval_loss': [0.09958097152411938,
               0.06395912077277899,
               0.05796801019459963,
               0.05983888264745474,
               0.061799717135727406],
              'precision': [0.7369337979094077,
               0.8624338624338624,
               0.8442211055276382,
               0.8416666666666667,
               0.8386023294509152],
              'recall': [0.7526690391459074,
               0.8701067615658363,
               0.896797153024911,
               0.8985765124555161,
               0.896797153024911],
              'f1_score': [0.744718309859155,
               0.8662533215234721,
               0.8697152717860225,
               0.8691910499139414,
  

In [14]:
# Evaluate on the dev split. The results get dev-specific names so that the
# test-set evaluation, which assigns to `result`, `model_outputs` and
# `preds_list`, does not silently overwrite them before they are inspected.
dev_result, dev_model_outputs, dev_preds_list = model_bert_base.eval_model(df_dev)
dev_result


  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/4 [00:00<?, ?it/s]

In [15]:
# Evaluate on the held-out test split; `result` holds the metrics dict that is
# displayed in the following cell.
result, model_outputs, preds_list = model_bert_base.eval_model(df_test)


  0%|          | 0/2 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/6 [00:00<?, ?it/s]

In [16]:
# Test-split metrics: eval_loss, precision, recall and f1_score.
result

{'eval_loss': 0.041997781954705715,
 'precision': 0.9161041465766635,
 'recall': 0.9322865554465162,
 'f1_score': 0.924124513618677}

In [17]:
# Two sample Indonesian sentences used for a quick qualitative check of the
# trained tagger.
texts = [
    (
        "Gubernur Bank Indonesia Agus Martowardojo bersama jajaran deputi "
        "Gubernur Bank Indonesia menggelar konferensi pers usai Rapat Dewan "
        "Gubernur di Bank Indonesia, Jakarta, Kamis (17/5/2015)"
    ),
    "Selama 24 jam puncak Mahameru di Malang kebanjiran pendaki dari Wina",
]

In [18]:
# Tag the sample sentences. `predictions` holds, for each sentence, a list of
# single-entry {word: label} dicts. As the output shows, trailing punctuation
# stays attached to its word (e.g. "Jakarta,").
predictions, raw_outputs = model_bert_base.predict(texts)
predictions

  0%|          | 0/1 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

[[{'Gubernur': 'O'},
  {'Bank': 'B-ORG'},
  {'Indonesia': 'I-ORG'},
  {'Agus': 'B-PER'},
  {'Martowardojo': 'I-PER'},
  {'bersama': 'O'},
  {'jajaran': 'O'},
  {'deputi': 'O'},
  {'Gubernur': 'O'},
  {'Bank': 'B-ORG'},
  {'Indonesia': 'I-ORG'},
  {'menggelar': 'O'},
  {'konferensi': 'O'},
  {'pers': 'O'},
  {'usai': 'O'},
  {'Rapat': 'O'},
  {'Dewan': 'O'},
  {'Gubernur': 'O'},
  {'di': 'O'},
  {'Bank': 'B-LOC'},
  {'Indonesia,': 'I-LOC'},
  {'Jakarta,': 'B-LOC'},
  {'Kamis': 'O'},
  {'(17/5/2015)': 'O'}],
 [{'Selama': 'O'},
  {'24': 'O'},
  {'jam': 'O'},
  {'puncak': 'O'},
  {'Mahameru': 'B-LOC'},
  {'di': 'O'},
  {'Malang': 'B-LOC'},
  {'kebanjiran': 'O'},
  {'pendaki': 'O'},
  {'dari': 'O'},
  {'Wina': 'B-LOC'}]]

In [19]:
# Export the fine-tuned model, its tokenizer and its config into a single
# directory. One constant replaces the repeated literals so all three
# artifacts are guaranteed to land in the same place.
NER_EXPORT_DIR = 'ner'
model_bert_base.model.save_pretrained(NER_EXPORT_DIR)
model_bert_base.tokenizer.save_pretrained(NER_EXPORT_DIR)
model_bert_base.config.save_pretrained(NER_EXPORT_DIR)

In [20]:
# Bundle the exported `ner` directory (recursively) into ner.zip.
!zip -r ner.zip ner

  adding: ner/ (stored 0%)
  adding: ner/vocab.txt (deflated 51%)
  adding: ner/tokenizer_config.json (deflated 74%)
  adding: ner/model.safetensors (deflated 7%)
  adding: ner/special_tokens_map.json (deflated 42%)
  adding: ner/config.json (deflated 50%)


In [21]:
from IPython.display import FileLink

# Render a link to the archive created in the previous cell.
FileLink('ner.zip')