In [1]:
import spacy
from spacy.pipeline import EntityRuler
import pickle
from spacy.util import minibatch, compounding
from spacy.training import Example
from pathlib import Path
import random

In [2]:
def load_patterns_from_file(file_path, label):
    """Build phrase patterns from a text file holding one phrase per line.

    Args:
        file_path: Path of the UTF-8 text file to read.
        label: Entity label attached to every pattern produced.

    Returns:
        A list of ``{"label": label, "pattern": phrase}`` dicts, one per
        non-blank line and in file order. Surrounding whitespace is stripped
        from each phrase. If the file does not exist a message is printed and
        an empty list is returned.
    """
    patterns = []
    try:
        # "utf-8-sig" reads plain UTF-8 as before but also drops a leading
        # byte-order mark; with plain "utf-8" the BOM would stay glued to the
        # first phrase (str.strip() does not remove it) and break that pattern.
        with open(file_path, 'r', encoding='utf-8-sig') as file:
            for line in file:
                word = line.strip()  # drop surrounding spaces and the newline
                if word:  # skip blank lines
                    patterns.append({"label": label, "pattern": word})
    except FileNotFoundError:
        # Best effort: a missing word list yields no patterns instead of an error.
        print(f"File {file_path} not found.")
    return patterns

In [3]:
# Load the pickled training set; later cells iterate it as (text, annotations) pairs.
# NOTE: unpickling can execute arbitrary code, so only load a dataset.pickle
# that comes from a trusted source.
with Path('dataset.pickle').open('rb') as file:
    training_data = pickle.load(file)

In [4]:
# Create a blank spaCy pipeline for Indonesian (language code "id")
nlp_model = spacy.blank("id")


In [5]:
# Add the NER component to the pipeline and keep a handle to it for label registration
ner = nlp_model.add_pipe('ner')

In [6]:
# Register every entity label that occurs in the training annotations with the
# NER component.
for _, ann in training_data:
    # `or ()` covers annotation dicts with no "entities" key (or a None value),
    # which would otherwise raise TypeError when iterated.
    for entity in ann.get("entities") or ():
        # Index 2 of an entity holds its label (presumably spaCy's
        # (start, end, label) offset format).
        ner.add_label(entity[2])

In [7]:
# Load the rule patterns from the word-list files, one file per entity label
person_patterns, organization_patterns, location_patterns = (
    load_patterns_from_file(source_file, entity_label)
    for source_file, entity_label in (
        ("person.txt", "PERSON"),
        ("organization.txt", "ORGANIZATION"),
        ("location.txt", "LOCATION"),
    )
)

In [8]:
# Build an EntityRuler bound to nlp_model with overwrite_ents enabled.
# NOTE(review): this instance is constructed directly rather than returned by
# nlp_model.add_pipe(); in spaCy v3 add_pipe() creates its own component, so
# this object is presumably not the ruler the pipeline runs — confirm.
ruler = EntityRuler(nlp_model, overwrite_ents=True)

In [9]:
# Feed the three pattern lists to the ruler: persons, then organizations, then locations
for pattern_group in (person_patterns, organization_patterns, location_patterns):
    ruler.add_patterns(pattern_group)

In [10]:
# Register the EntityRuler in the pipeline before "ner" and load the patterns
# into the component that add_pipe() returns. In spaCy v3, add_pipe() builds a
# fresh component from the factory name, so patterns added to a separately
# constructed EntityRuler never reach the pipeline, and settings such as
# overwrite_ents must be passed through `config`.
# Caveat: nlp.initialize() / begin_training() calls EntityRuler.initialize(),
# which clears the patterns of every enabled ruler; keep this pipe disabled
# while initializing, or re-add the patterns afterwards.
ruler = nlp_model.add_pipe(
    'entity_ruler',
    name="entity_ruler",
    before="ner",
    config={"overwrite_ents": True},
)
ruler.add_patterns(person_patterns + organization_patterns + location_patterns)
ruler

<spacy.pipeline.entityruler.EntityRuler at 0x7a18857eb680>

In [11]:
# Training configuration: every pipe whose name is not listed in pipe_exceptions
# is collected in unaffected_pipes so it can be switched off while training.
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
unaffected_pipes = [
    pipe_name
    for pipe_name in nlp_model.pipe_names
    if pipe_name not in pipe_exceptions
]

In [12]:
# Initialize only the trainable NER component.
# - begin_training() is the deprecated spaCy v3 alias of initialize().
# - initialize() runs on every *enabled* pipe, and EntityRuler.initialize()
#   clears the ruler's patterns, so rule-based pipes are disabled meanwhile.
with nlp_model.select_pipes(enable="ner"):
    optimizer = nlp_model.initialize()
optimizer

<thinc.optimizers.Optimizer at 0x7a18955fefc0>

In [13]:
# Train the model: 30 passes over the data with every pipe in unaffected_pipes
# disabled for the duration of the loop.
with nlp_model.disable_pipes(*unaffected_pipes):
    for epoch in range(30):
        # Reshuffle in place so each pass sees the samples in a new order
        random.shuffle(training_data)
        losses = {}
        # Batch size schedule: starts at 4 and compounds by 1.001 up to 32
        batch_sizes = compounding(4.0, 32.0, 1.001)
        for batch in minibatch(training_data, size=batch_sizes):
            # Pair a freshly tokenized Doc with its gold annotations
            examples = [
                Example.from_dict(nlp_model.make_doc(text), annotations)
                for text, annotations in batch
            ]
            nlp_model.update(examples, drop=0.5, losses=losses)
        # Losses accumulated over all batches of this pass
        print(f"Losses at iteration {epoch}: {losses}")



Losses at iteration 0: {'ner': 7753.64219232926}
Losses at iteration 1: {'ner': 5132.6565039421475}
Losses at iteration 2: {'ner': 4612.423197820806}
Losses at iteration 3: {'ner': 4203.122217077599}
Losses at iteration 4: {'ner': 3781.112536571112}
Losses at iteration 5: {'ner': 3705.317789390188}
Losses at iteration 6: {'ner': 3433.843288088124}
Losses at iteration 7: {'ner': 3204.850498067952}
Losses at iteration 8: {'ner': 3004.441510987616}
Losses at iteration 9: {'ner': 2948.825118249772}
Losses at iteration 10: {'ner': 2818.615239390111}
Losses at iteration 11: {'ner': 2677.60405686413}
Losses at iteration 12: {'ner': 2537.2625970231607}
Losses at iteration 13: {'ner': 2575.8202687639223}
Losses at iteration 14: {'ner': 2516.2622853796124}
Losses at iteration 15: {'ner': 2336.867048578372}
Losses at iteration 16: {'ner': 2402.3776604299496}
Losses at iteration 17: {'ner': 2278.242813518251}
Losses at iteration 18: {'ner': 2243.5570789066587}
Losses at iteration 19: {'ner': 2196.

In [14]:
# Model evaluation helper
def evaluate_model(text):
    """Run the pipeline on ``text``, print its entities and render them.

    Uses the notebook-level ``nlp_model``. Prints a list of
    ``(entity text, entity label)`` tuples, then calls displaCy's entity
    visualizer on the processed document.

    Args:
        text: The raw string to analyse.
    """
    # Imported here so the function works regardless of whether the cell that
    # imports displacy at notebook level has been executed yet.
    from spacy import displacy

    doc = nlp_model(text)
    print("Entities:", [(ent.text, ent.label_) for ent in doc.ents])
    displacy.render(doc, style="ent")

In [15]:
from spacy import displacy

# Test the model: print and render the entities found in a sample Indonesian sentence
evaluate_model("Kementerian Perhubungan tidak mewajibkan rapid test COVID-19 untuk perjalanan darat lintas daerah, kecuali untuk tujuan Bali.")

Entities: [('COVID-19', 'DISEASE'), ('Bali', 'LOCATION')]




In [16]:
# Test the model on a longer, multi-sentence Indonesian passage
evaluate_model("Calon gubernur (cagub)-calon wakil gubernur (cawagub) Ridwan Kamil-Suswono menyoroti kesehatan mental di Jakarta. Menurut Ridwan Kamil, Jakarta menduduki peringkat ke-9 sebagai kota paling stres di dunia. Dia mengutip data Dinas Kesehatan (Dinkes) Jakarta yang menunjukkan bahwa penderita skizofrenia, jumlahnya lebih banyak dari pasien pneumonia, diare, diabetes hingga DBD.")


Entities: [('Suswono', 'PERSON'), ('Jakarta', 'LOCATION'), ('Ridwan Kamil', 'PERSON'), ('Jakarta', 'LOCATION'), ('Dinas Kesehatan (Dinkes) Jakarta', 'ORGANIZATION')]


