In [13]:
from transformers import AutoTokenizer
from collections import Counter
import pandas as pd
import re

# Load the tokenizer of the sentiment model (so that tokenization matches the Indonesian BERT model)
tokenizer = AutoTokenizer.from_pretrained("taufiqdp/indonesian-sentiment")

In [18]:
import re
import pandas as pd
from transformers import AutoTokenizer
from flashtext import KeywordProcessor  # 

# Load the tokenizer from HuggingFace
tokenizer = AutoTokenizer.from_pretrained("taufiqdp/indonesian-sentiment")

# Extended entity dictionary (multi-token phrases are allowed too).
# Keys are entity types, values are the surface forms that trigger that type.
# Entries that were repeated inside one list have been removed, and the
# correctly spelled "kebakaran" has been added next to the misspelled
# "kebakran" (kept so noisy text still matches).
# NOTE(review): "gelap" and "berisik" are listed under both PROB and DESC.
# The keyword -> type mapping built later in this file stores a single type
# per keyword, so only one of the two labels can ever be emitted for them.
ENTITY_DICTIONARY = {
    "INFRA": [
        "jalan", "jalan tol", "tol", "aspal", "trotoar", "jembatan", "flyover", "underpass", "rel", "lintasan", "busway", "pohon", "menara",
        "halte", "terminal", "stasiun", "bandara", "pelabuhan", "gerbang", "pos", "troli", "tiang", "pagar", "tembok", "dinding",
        "lampu", "penerangan", "lampu merah", "lampu jalan", "rambu", "traffic light", "marka", "pembatas", "batas", "monumen", "saluran air", "patung",
        "drainase", "got", "selokan", "parit", "saluran", "irigasi", "bendungan", "embung", "waduk",
        "sumur", "pompa", "pipa", "tandon", "manhole", "gorong-gorong", "rumah", "gedung", "kantor", "balai", "puskesmas",
        "rumah sakit", "klinik", "sekolah", "kampus", "prasarana umum", "sarana", "prasarana", "prasda",
        "masjid", "gereja", "wihara", "pura", "pos ronda", "balai desa", "fasilitas umum"
    ],
    "PROB": [
        "rusak", "retak", "berlubang", "bolong", "hancur", "rapuh", "amblas", "terbakar",
        "terbakar habis", "terbakar sebagian", "kebakaran", "kebakran", "tumbang", "roboh",
        "mati", "padam", "gelap", "terendam", "banjir", "macet", "tersumbat", "longsor",
        "berkarat", "lapuk", "bocor", "kotor", "berdebu", "berisik", "bau", "bakar", "demo",
        "patah", "hilang", "terpotong", "terkelupas", "mengelupas", "melebar", "runtuh", "ambruk"
    ],
    "LOC": [
        "jakarta", "bandung", "medan", "surabaya", "semarang", "makassar", "denpasar", "yogyakarta",
        "bogor", "depok", "tangerang", "bekasi", "malang", "padang", "pekanbaru", "palembang", "sidoarjo", "kebon siri raya",
        "sudirman", "thamrin", "merdeka", "gatot", "subroto", "ahmad yani", "djuanda", "margonda", "senayan",
        "diponegoro", "hayam wuruk", "cut nyak dien", "imam bonjol",
        "utara", "selatan", "timur", "barat", "tengah", "pusat", "kec.", "kec", "kel.", "kel", "kab.", "kab",
        "jl.", "jl", "jln", "jln.", "jalan sudirman", "jalan thamrin", "jalan merdeka", "gatot subroto",
        "jalan djuanda", "margonda raya", "polda", "polres", "polsek", "spbu", "pom bensin", "pom minyak",
        "pasar", "perempatan", "simpang", "bundaran", "tugu", "alun-alun", "monas", "monumen nasional", "pajak",
        "kelurahan", "kecamatan", "kabupaten", "desa", "daerah", "provinsi", "rt", "rw", "wilayah", "kota"
    ],
    "TIME": [
        "kemarin", "tadi", "tadi malam", "tadi pagi", "siang", "malam", "subuh", "sore",
        "bulan", "tahun", "hari", "besok", "lusa", "lalu", "baru saja", "sebentar",
        "dinihari", "musim", "pekan", "akhir pekan", "hari ini", "setiap hari", "setiap minggu", "senin", "selasa", "rabu", "kamis", "jumat", "sabtu", "minggu",
        "januari", "februari", "maret", "april", "mei", "juni", "juli", "agustus", "september", "oktober", "november", "desember",
        "awal tahun", "pertengahan tahun", "akhir tahun", "awal bulan", "pertengahan bulan", "akhir bulan",
        "pagi hari", "siang hari", "sore hari", "malam hari", "awal pekan", "pertengahan pekan",
        "jam", "menit", "detik", "waktu", "periode",
        "Jam 1", "Jam 2", "Jam 3", "Jam 4", "Jam 5", "Jam 6", "Jam 7", "Jam 8", "Jam 9", "Jam 10", "Jam 11", "Jam 12",
        "WIB", "WITA", "WIT"
    ],
    "DESC": [
        "bahaya", "berantakan", "ramai", "sepi", "gelap", "terang", "rawan", "licin", "padat",
        "semrawut", "susah", "ribet", "membahayakan", "menakutkan", "mengganggu", "aman", "nyaman",
        "kacau", "berisik", "kumuh", "jorok", "rapi", "tertib", "panas", "dekat", "jauh", "tinggi", "rendah"
    ],
    "SEV": [
        "ringan", "sedang", "parah", "fatal", "berat", "kronis", "akut", "darurat", "gawat",
        "kritis", "serius", "ekstrem"
    ]
}

# 🔹 Extra: load LOC.csv and merge its names into the built-in LOC keywords.
# Names are stripped as well as lower-cased so that whitespace variants of the
# same place collapse into one entry (the reported total then reflects the
# number of distinct keywords), empty names are dropped, and the merged list
# is sorted so that its order is the same on every run.
try:
    loc_df = pd.read_csv("LOC.csv")
    loc_names = loc_df["name"].dropna().astype(str).str.strip().str.lower()
    loc_list = [name for name in loc_names.tolist() if name]
    ENTITY_DICTIONARY["LOC"] = sorted(set(ENTITY_DICTIONARY["LOC"]) | set(loc_list))
    print(f"✅ LOC diperbarui, total entitas LOC: {len(ENTITY_DICTIONARY['LOC'])}")
except Exception as e:
    # Deliberate best-effort: without LOC.csv (or its "name" column) the
    # notebook continues with the built-in LOC keywords only.
    print(f"⚠️ Tidak bisa load LOC.csv: {e}")

# 🔹 Prepare FlashText plus the keyword -> entity-type lookup table.
# Every phrase is registered in its stripped, lower-cased form. When the same
# phrase is listed under several entity types, the type visited last
# overwrites the earlier ones in `keyword2type`.
keyword_processor = KeywordProcessor(case_sensitive=False)
keyword2type = {}

for category, phrase_list in ENTITY_DICTIONARY.items():
    for raw_phrase in phrase_list:
        normalized_phrase = raw_phrase.strip().lower()
        if not normalized_phrase:
            # Nothing left after stripping: skip blank entries.
            continue
        keyword_processor.add_keyword(normalized_phrase)
        keyword2type[normalized_phrase] = category


def _lowercase_with_alignment(text):
    """Lowercase ``text`` and map lowered positions back to ``text``.

    Returns:
        ``(lowered, index_map)``. ``index_map`` is ``None`` when lower-casing
        kept the length unchanged (positions in both strings coincide);
        otherwise ``index_map[i]`` is the index in ``text`` of the character
        that produced ``lowered[i]``.
    """
    lowered = text.lower()
    if len(lowered) == len(text):
        return lowered, None
    index_map = []
    for position, char in enumerate(text):
        # Some characters (e.g. "İ") lowercase to more than one code point.
        index_map.extend([position] * len(char.lower()))
    return lowered, index_map


def label_sentence_with_flashtext(sentence, keyword_processor, keyword2type, tokenizer):
    """Tokenize ``sentence`` and assign a BIO label to every token.

    Keywords are located in the lower-cased sentence with
    ``keyword_processor.extract_keywords(..., span_info=True)``; every token
    whose character offsets overlap a keyword span receives ``B-<type>`` (first
    overlapping token) or ``I-<type>`` (following ones). All other tokens,
    including whatever special tokens the tokenizer adds, stay ``"O"``.

    Args:
        sentence: Text to label.
        keyword_processor: Object exposing
            ``extract_keywords(text, span_info=True)`` that yields
            ``(keyword, start_char, end_char)`` triples.
        keyword2type: Mapping from lower-cased keyword to entity type; matches
            whose keyword is missing from it are ignored.
        tokenizer: Callable tokenizer that accepts
            ``return_offsets_mapping=True`` and returns ``"input_ids"`` and
            ``"offset_mapping"``, and that provides ``convert_ids_to_tokens``.

    Returns:
        A list of ``(token, label)`` pairs, one per produced token.
    """
    encoding = tokenizer(
        sentence,
        return_offsets_mapping=True,
        return_attention_mask=False,
        return_special_tokens_mask=False
    )
    tokens = tokenizer.convert_ids_to_tokens(encoding["input_ids"])
    offsets = encoding["offset_mapping"]
    labels = ["O"] * len(tokens)

    # Token offsets refer to the ORIGINAL sentence, while keyword spans are
    # found in the lower-cased copy. Lower-casing can change the string length,
    # so keep a map to translate spans back to original coordinates.
    lowered, index_map = _lowercase_with_alignment(sentence)

    # Find entities with FlashText: (keyword, start, end)
    matches = keyword_processor.extract_keywords(lowered, span_info=True)

    for ent, start_char, end_char in matches:
        ent_type = keyword2type.get(ent.lower(), None)
        if not ent_type or end_char <= start_char:
            continue

        if index_map is not None:
            # Translate the half-open span back to the original sentence.
            start_char, end_char = index_map[start_char], index_map[end_char - 1] + 1

        first = True
        for idx, (off_start, off_end) in enumerate(offsets):
            # Zero-width offsets cover no character of the sentence (this is
            # how special tokens are reported), so they can never be part of
            # an entity. ``None`` offsets are tolerated for the same reason.
            if off_start is None or off_end is None or off_start == off_end:
                continue
            if off_start < end_char and off_end > start_char:
                labels[idx] = f"B-{ent_type}" if first else f"I-{ent_type}"
                first = False

    # Sanity check
    assert len(tokens) == len(labels), f"Mismatch: {len(tokens)} tokens vs {len(labels)} labels"
    return list(zip(tokens, labels))


# 🔹 Smoke test: label one example sentence and print one "token<TAB>label" line per token
test = "Lampu jalan di Jl. Sudirman mati sejak malam kemarin dan kondisi sangat gelap parah"
labeled = label_sentence_with_flashtext(
    sentence=test,
    keyword_processor=keyword_processor,
    keyword2type=keyword2type,
    tokenizer=tokenizer,
)
for token_label_pair in labeled:
    print("\t".join(token_label_pair))


# 🔹 Label the full dataset: one list of (token, label) pairs per non-null "cleaned_text" row
df = pd.read_csv("./preprocessed_data.csv")
all_sentences = [
    label_sentence_with_flashtext(text, keyword_processor, keyword2type, tokenizer)
    for text in df["cleaned_text"].dropna().tolist()
]

# 🔹 Save in CoNLL layout: one "token<TAB>label" line per token, blank line after each sentence
with open("./ner_dataset_convertedV3.conll", "w", encoding="utf-8") as conll_file:
    for sentence_pairs in all_sentences:
        conll_file.writelines(f"{token}\t{label}\n" for token, label in sentence_pairs)
        conll_file.write("\n")

print("✅ Dataset CoNLL berhasil dibuat dengan FlashText + mapping entitas")

✅ LOC diperbarui, total entitas LOC: 7339
[CLS]	O
lampu	B-INFRA
jalan	I-INFRA
di	O
jl	B-LOC
.	I-LOC
sudirman	B-LOC
mati	B-PROB
sejak	O
malam	B-TIME
kemarin	B-TIME
dan	O
kondisi	O
sangat	O
gelap	B-DESC
parah	B-SEV
[SEP]	O
✅ Dataset CoNLL berhasil dibuat dengan FlashText + mapping entitas


In [None]:
# import re
# import pandas as pd
# from transformers import AutoTokenizer

# # Load tokenizer dari HuggingFace
# tokenizer = AutoTokenizer.from_pretrained("taufiqdp/indonesian-sentiment")

# # Kamus entitas diperluas (multi-token frasa juga)
# ENTITY_DICTIONARY = {
#     "INFRA": [
#         "jalan", "jalan tol", "aspal", "trotoar", "jembatan", "flyover", "underpass", "rel", "lintasan",
#         "halte", "terminal", "stasiun", "bandara", "pelabuhan", "gerbang", "pos", "troli", "tiang",
#         "lampu", "penerangan", "lampu merah", "lampu jalan", "rambu", "traffic light", "marka", "pembatas", "batas",
#         "drainase", "got", "selokan", "parit", "saluran", "irigasi", "bendungan", "embung", "waduk",
#         "sumur", "pompa", "pipa", "tandon", "manhole", "gorong-gorong", "rumah", "gedung", "kantor", "balai", "puskesmas", 
#         "rumah sakit", "klinik", "sekolah", "kampus", "prasarana umum","sarana", "prasarana", "prasda",
#         "masjid", "gereja", "wihara", "pura", "pos ronda", "balai desa", "fasilitas umum"
#     ],
#     "PROB": [
#         "rusak", "retak", "berlubang", "bolong", "hancur", "rapuh", "amblas", "terbakar",
#         "mati", "padam", "gelap", "terendam", "banjir", "macet", "tersumbat", "longsor",
#         "berkarat", "lapuk", "bocor", "kotor", "berdebu", "berisik", "bau", "bakar", "demo",
#         "patah", "hilang", "terpotong", "terkelupas", "mengelupas", "melebar", "runtuh", "ambruk"
#     ],
#     "LOC": [
#         "jakarta", "bandung", "medan", "surabaya", "semarang", "makassar", "denpasar", "yogyakarta",
#         "bogor", "depok", "tangerang", "bekasi", "malang", "padang", "pekanbaru", "palembang", "sidoarjo", "kebon siri raya",
#         "sudirman", "thamrin", "merdeka", "gatot", "subroto", "ahmad yani", "djuanda", "margonda",
#         "diponegoro", "hayam wuruk", "cut nyak dien", "imam bonjol",
#         "jalan sudirman", "jalan thamrin", "jalan merdeka", "gatot subroto", "ahmad yani",
#         "jalan djuanda", "margonda raya", "polda", "polres", "polsek",
#         "kelurahan", "kecamatan", "kabupaten", "desa", "daerah", "provinsi", "rt", "rw", "wilayah", "kota"
#     ],
#     "TIME": [
#         "kemarin", "tadi", "tadi malam", "tadi pagi", "siang", "malam", "subuh", "sore",
#         "minggu", "bulan", "tahun", "hari", "besok", "lusa", "lalu", "baru saja", "sebentar",
#         "dinihari", "musim", "pekan", "akhir pekan", "hari ini", "setiap hari", "setiap minggu"
#     ],
#     "DESC": [
#         "bahaya", "berantakan", "ramai", "sepi", "gelap", "terang", "rawan", "licin", "padat",
#         "semrawut", "susah", "ribet", "membahayakan", "menakutkan", "mengganggu",
#         "kacau", "berisik", "kumuh", "jorok", "rapi", "tertib", "panas", "dekat", "jauh", "tinggi", "rendah"
#     ],
#     "SEV": [
#         "ringan", "sedang", "parah", "fatal", "berat", "kronis", "akut", "darurat", "gawat",
#         "kritis", "serius", "ekstrem"
#     ]
# }

# # 🔹 Tambahan: load LOC.csv dan gabungkan dengan LOC yang sudah ada
# try:
#     loc_df = pd.read_csv("LOC.csv")
#     loc_list = loc_df["name"].dropna().astype(str).str.lower().tolist()
#     ENTITY_DICTIONARY["LOC"] = list(set(ENTITY_DICTIONARY["LOC"] + loc_list))
#     print(f"✅ LOC diperbarui, total entitas LOC: {len(ENTITY_DICTIONARY['LOC'])}")
# except Exception as e:
#     print(f"⚠️ Tidak bisa load LOC.csv: {e}")

# # Persiapkan pola
# ENTITY_PATTERNS = []
# for ent_type, keywords in ENTITY_DICTIONARY.items():
#     for kw in keywords:
#         phrase = kw.strip().lower()
#         if phrase:
#             ENTITY_PATTERNS.append((phrase, ent_type))
# ENTITY_PATTERNS.sort(key=lambda x: len(x[0].split()), reverse=True)


# def label_sentence_with_bert_tokenizer(sentence, patterns, tokenizer):
#     encoding = tokenizer(
#         sentence,
#         return_offsets_mapping=True,
#         return_attention_mask=False,
#         return_special_tokens_mask=False
#     )
#     tokens = tokenizer.convert_ids_to_tokens(encoding["input_ids"])
#     offsets = encoding["offset_mapping"]

#     labels = ["O"] * len(tokens)
#     lowered = sentence.lower()

#     for phrase, ent_type in patterns:
#         for match in re.finditer(re.escape(phrase), lowered):
#             start_char, end_char = match.start(), match.end()
#             first = True
#             for idx, (off_start, off_end) in enumerate(offsets):
#                 if off_start is None or off_end is None:
#                     continue
#                 if not (off_end <= start_char or off_start >= end_char):
#                     if first:
#                         labels[idx] = f"B-{ent_type}"
#                         first = False
#                     else:
#                         labels[idx] = f"I-{ent_type}"

#     # 🔹 Validasi tambahan
#     # 1. Pastikan jumlah token == jumlah label
#     assert len(tokens) == len(labels), f"Mismatch: {len(tokens)} tokens vs {len(labels)} labels"

#     # 2. Pastikan tidak ada entity yang dimulai langsung dengan I-
#     for i, lab in enumerate(labels):
#         if lab.startswith("I-") and (i == 0 or labels[i-1] == "O"):
#             print(f"⚠️ Warning: entity tanpa B- di token {tokens[i]} ({lab})")

#     return list(zip(tokens, labels))


# # Uji coba
# test = "Lampu jalan di Jl. Sudirman mati sejak malam kemarin dan kondisi sangat gelap parah"
# labeled = label_sentence_with_bert_tokenizer(test, ENTITY_PATTERNS, tokenizer)
# for tok, lab in labeled:
#     print(f"{tok}\t{lab}")


# # Proses dataset
# df = pd.read_csv("./preprocessed_data.csv")
# all_sentences = []
# for sent in df["cleaned_text"].dropna().tolist():
#     labeled = label_sentence_with_bert_tokenizer(sent, ENTITY_PATTERNS, tokenizer)
#     all_sentences.append(labeled)

# # Simpan ke CoNLL
# with open("./ner_dataset_convertedV2.conll", "w", encoding="utf-8") as f:
#     for sent in all_sentences:
#         for token, label in sent:
#             f.write(f"{token}\t{label}\n")
#         f.write("\n")

# print("✅ Dataset CoNLL berhasil dibuat dengan validasi")


In [None]:
# # Kamus entitas diperluas (multi-token frasa juga)

# ENTITY_DICTIONARY = {
#     "INFRA": [
#         # Transportasi & jalan
#         "jalan", "jalan tol", "aspal", "trotoar", "jembatan", "flyover", "underpass", "rel", "lintasan",
#         "halte", "terminal", "stasiun", "bandara", "pelabuhan", "gerbang", "pos", "troli",
#         # Penerangan & lalu lintas
#         "lampu", "penerangan", "lampu merah", "lampu jalan", "rambu", "traffic light", "marka",
#         # Air & drainase
#         "drainase", "got", "selokan", "parit", "saluran", "irigasi", "bendungan", "embung", "waduk",
#         "sumur", "pompa", "pipa", "tandon", "bak", "manhole", "gorong-gorong",
#         # Bangunan publik
#         "gedung", "kantor", "balai", "puskesmas", "rumah sakit", "klinik", "sekolah", "kampus",
#         "masjid", "gereja", "wihara", "pura", "pos ronda", "balai desa"
#     ],

#     "PROB": [
#         "rusak", "retak", "berlubang", "bolong", "hancur", "rapuh", "amblas", "terbakar",
#         "mati", "padam", "gelap", "terendam", "banjir", "macet", "tersumbat", "longsor",
#         "berkarat", "lapuk", "bocor", "kotor", "berdebu", "berisik", "bau",
#         "patah", "hilang", "terpotong", "terkelupas", "mengelupas", "melebar", "runtuh", "ambruk"
#     ],

#     "LOC": [
#         # Kota besar
#         "jakarta", "bandung", "medan", "surabaya", "semarang", "makassar", "denpasar", "yogyakarta",
#         "bogor", "depok", "tangerang", "bekasi", "malang", "padang", "pekanbaru", "palembang",
#         # Nama jalan umum
#         "sudirman", "thamrin", "merdeka", "gatot", "subroto", "ahmad yani", "djuanda", "margonda",
#         "diponegoro", "hayam wuruk", "cut nyak dien", "imam bonjol",
#         "jalan sudirman", "jalan thamrin", "jalan merdeka", "gatot subroto", "ahmad yani",
#         "jalan djuanda", "margonda raya",
#         # Wilayah administrasi
#         "kelurahan", "kecamatan", "kabupaten", "desa", "provinsi", "rt", "rw"
#     ],

#     "TIME": [
#         "kemarin", "tadi", "tadi malam", "tadi pagi", "siang", "malam", "subuh", "sore",
#         "minggu", "bulan", "tahun", "hari", "besok", "lusa", "lalu", "baru saja", "sebentar",
#         "dinihari", "musim", "pekan", "akhir pekan", "hari ini", "setiap hari", "setiap minggu"
#     ],

#     "DESC": [
#         "bahaya", "berantakan", "ramai", "sepi", "gelap", "terang", "rawan", "licin", "padat",
#         "semrawut", "susah", "ribet", "membahayakan", "menakutkan", "mengganggu",
#         "kacau", "berisik", "kumuh", "jorok", "rapi", "tertib"
#     ],

#     "SEV": [
#         "ringan", "sedang", "parah", "fatal", "berat", "kronis", "akut", "darurat", "gawat",
#         "kritis", "serius", "ekstrem"
#     ]
# }


# # Persiapkan pola (phrase, ent_type), dan urutkan berdasarkan jumlah kata (agar pencocokan multi-token dulu)
# ENTITY_PATTERNS = []
# for ent_type, keywords in ENTITY_DICTIONARY.items():
#     for kw in keywords:
#         # cleanup spasi ekstra & lowercase
#         phrase = kw.strip().lower()
#         if phrase:
#             ENTITY_PATTERNS.append((phrase, ent_type))
# # sort, frasa panjang (banyak kata) dulu
# ENTITY_PATTERNS.sort(key=lambda x: len(x[0].split()), reverse=True)


# def label_sentence_with_bert_tokenizer(sentence, patterns, tokenizer):
#     """
#     Tokenisasi dengan tokenizer BERT → menghasilkan input_ids + tokens (wordpiece),
#     lalu cocokkan entitas multi-token di level kata → ubah label ke token wordpiece.
#     """

#     # Tokenisasi dengan output mapping ke kata asli
#     encoding = tokenizer(sentence, return_offsets_mapping=True, return_attention_mask=False, return_special_tokens_mask=False)
#     tokens = tokenizer.convert_ids_to_tokens(encoding["input_ids"])
#     offsets = encoding["offset_mapping"]  # tiap token wordpiece punya (start, end) karakter dalam kalimat
#     # offsets[0] dan offsets[-1] biasanya special tokens ([CLS], [SEP]) tergantung tokenizer

#     labels = ["O"] * len(tokens)

#     lowered = sentence.lower()

#     for phrase, ent_type in patterns:
#         # cari semua posisi kemunculan frasa (start char index) di teks
#         for match in re.finditer(re.escape(phrase), lowered):
#             start_char = match.start()
#             end_char = match.end()
#             # token-level: cari semua token whose offsets overlap dengan interval [start_char, end_char)
#             # Kita beri label B-ent untuk token pertama overlapping, sisanya I-ent
#             first = True
#             for idx, (off_start, off_end) in enumerate(offsets):
#                 if off_start is None or off_end is None:
#                     continue
#                 # cek overlap
#                 if not (off_end <= start_char or off_start >= end_char):
#                     if first:
#                         labels[idx] = f"B-{ent_type}"
#                         first = False
#                     else:
#                         labels[idx] = f"I-{ent_type}"

#     # Pair token + label
#     return list(zip(tokens, labels))


# # Uji coba kecil
# # contoh kalimat
# test = "Lampu jalan di Jl. Sudirman mati sejak malam kemarin dan kondisi sangat gelap parah"
# labeled = label_sentence_with_bert_tokenizer(test, ENTITY_PATTERNS, tokenizer)
# for tok, lab in labeled:
#     print(f"{tok}\t{lab}")

# # Kalau mau proses dataset:
# df = pd.read_csv("./preprocessed_data.csv")
# all_sentences = []
# for sent in df["cleaned_text"].dropna().tolist():
#     labeled = label_sentence_with_bert_tokenizer(sent, ENTITY_PATTERNS, tokenizer)
#     all_sentences.append(labeled)

# # Simpan ke CoNLL
# with open("./ner_dataset_converted.conll", "w", encoding="utf-8") as f:
#     for sent in all_sentences:
#         for token, label in sent:
#             f.write(f"{token}\t{label}\n")
#         f.write("\n")
