In [None]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd

# Pastikan path file-nya benar
df = pd.read_csv('/content/drive/MyDrive/Datasets/artikel_gabungan.csv')

In [None]:
# Tampilkan 5 baris pertama
print(df.head())

                                               Judul  \
0  Leasing Astra Group (ASII) Jadi Perusahaan Mul...   
1  Multifinance Gencar Berburu Surat Utang, Amuni...   
2  Saldo Warga Kelas Menengah Atas RI Susut saat ...   
3  Indonesian Market Suffers Whiplash from US Tar...   
4  Harga Emas Antam Hari Ini Bisa Ditebus Rp1,9 J...   

                                         Isi Artikel  
0  Perusahaan multifinance di bawah PT Astra Inte...  
1  Perusahaan multifinance atau leasing gencar me...  
2  Lembaga Penjamin Simpanan (LPS) mencatat bahwa...  
3                                    Tidak ditemukan  
4  Harga emas Antam tercatat stagnan pada akhir p...  


## Crawling

In [None]:
import requests
from bs4 import BeautifulSoup
import csv
import time
import validators

# Daftar jalur yang dilarang berdasarkan robots.txt
DISALLOWED_PATHS = [
    "/wp-admin/",
    "/wp-content/",
    "/assets/",
    "/admin/",
    "/javascript/",
    "/rss/",
    "/interaktif/",
    "/cdn-cgi/",
    "/bisnis/",
    "/static/",
    "/multimedia/",
    "/bisnis-syariah/",
    "/feed/",
    "/kabar-banten/",
    "/spektrum/",
    "/fake_image.png"
]

# Fungsi untuk memeriksa apakah URL diizinkan oleh robots.txt
def is_allowed_url(url):
    if not validators.url(url) or url == "#" or url.startswith("javascript:"):
        return False
    for path in DISALLOWED_PATHS:
        if path in url:
            return False
    return True

# Fungsi untuk mengambil semua link dari halaman indeks
def crawl_index_page(url, headers):
    all_links = set()
    pagination_links = set()

    try:
        print(f"Accessing: {url}")
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, "html.parser")

            # Ambil semua tag <a>
            links = soup.find_all("a")
            for link in links:
                href = link.get("href")
                if href:
                    if href.startswith("/"):
                        href = "https://www.bisnis.com" + href
                    if is_allowed_url(href):
                        all_links.add(href)

            # Ambil link paginasi (tag <a> dengan class="pagingLink")
            paging_links = soup.find_all("a", class_="pagingLink")
            for link in paging_links:
                href = link.get("href")
                if href:
                    if href.startswith("/"):
                        href = "https://www.bisnis.com" + href
                    if is_allowed_url(href):
                        pagination_links.add(href)

        else:
            print(f"Gagal mengakses {url}. Status code: {response.status_code}")
    except Exception as e:
        print(f"Error saat mengakses {url}: {str(e)}")

    return all_links, pagination_links

# Fungsi untuk mengambil link artikel lain dari halaman artikel
def crawl_article_page(url, headers):
    related_links = set()

    try:
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, "html.parser")

            # Ambil semua tag <a> di dalam konten artikel
            content = soup.find("div", class_="content") or soup.find("article")
            if content:
                links = content.find_all("a")
                for link in links:
                    href = link.get("href")
                    if href:
                        if href.startswith("/"):
                            href = "https://www.bisnis.com" + href
                        if is_allowed_url(href) and ("/read/" in href or "/artikel/" in href):
                            related_links.add(href)
        else:
            print(f"Gagal mengakses artikel {url}. Status code: {response.status_code}")
    except Exception as e:
        print(f"Error saat mengakses artikel {url}: {str(e)}")

    return related_links

# Fungsi utama
def main():
    base_url = "https://www.bisnis.com/index?categoryId=5&type=indeks"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }

    # Set untuk menyimpan semua link
    all_links = set()
    visited_pages = set()
    pagination_urls = {base_url}

    # Crawl semua halaman indeks
    print("Crawling halaman indeks...")
    while pagination_urls:
        url = pagination_urls.pop()
        if url in visited_pages or not is_allowed_url(url):
            continue

        visited_pages.add(url)
        page_links, new_pagination_links = crawl_index_page(url, headers)

        # Tambahkan link ke set utama
        all_links.update(page_links)
        # Tambahkan link paginasi baru
        pagination_urls.update(new_pagination_links)

        print(f"Processed: {url} | Links found: {len(page_links)} | Pagination links: {len(new_pagination_links)}")
        time.sleep(2)  # Delay untuk menghindari rate-limiting

    # Crawl halaman artikel untuk link terkait
    print("\nCrawling halaman artikel untuk link terkait...")
    article_links = {link for link in all_links if "/read/" in link or "/artikel/" in link}
    for i, article_url in enumerate(article_links, 1):
        related_links = crawl_article_page(article_url, headers)
        all_links.update(related_links)
        print(f"Processed article {i}/{len(article_links)}: {article_url} | Related links found: {len(related_links)}")
        time.sleep(2)  # Delay untuk menghindari rate-limiting

    # Simpan semua link ke file CSV
    with open("bisnis_news_links.csv", "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["Link"])
        for link in all_links:
            writer.writerow([link])

    print(f"\nSelesai! Total link unik: {len(all_links)}")
    print("Hasil disimpan ke bisnis_news_links.csv")

if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        print(f"Error di main: {str(e)}")

## Scraping

In [None]:
import requests
from bs4 import BeautifulSoup
import csv
import time
import logging
from urllib.parse import urljoin, urlparse, urlunparse
import validators

# Setup logging
logging.basicConfig(filename='scrape_bisnis.log', level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')

# Header untuk menyamarkan sebagai browser
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# File input dan output
input_file = '/content/drive/MyDrive/usk/Semester 6/NLP/semester 6/bisnis_news.csv'  # File dari crawling sebelumnya
output_file = '/content/drive/MyDrive/usk/Semester 6/NLP/semester 6/artikel_bisnis.csv'
progress_file = '/content/drive/MyDrive/usk/Semester 6/NLP/semester 6/crawled_urls.txt'

# Daftar jalur yang dilarang berdasarkan robots.txt
DISALLOWED_PATHS = [
    "/wp-admin/", "/wp-content/", "/assets/", "/admin/", "/javascript/",
    "/rss/", "/interaktif/", "/cdn-cgi/", "/bisnis/", "/static/",
    "/multimedia/", "/bisnis-syariah/", "/feed/", "/kabar-banten/",
    "/spektrum/", "/fake_image.png"
]

# Fungsi untuk normalisasi URL
def normalize_url(url):
    parsed = urlparse(url)
    return urlunparse((parsed.scheme, parsed.netloc, parsed.path.rstrip('/'), '', '', ''))

# Fungsi untuk memeriksa apakah URL diizinkan oleh robots.txt
def is_allowed_url(url):
    if not validators.url(url) or url == "#" or url.startswith("javascript:"):
        return False
    for path in DISALLOWED_PATHS:
        if path in url:
            return False
    return "/read/" in url or "/artikel/" in url  # Hanya ambil link artikel

# Fungsi untuk menyimpan URL yang sudah di-crawl
def save_progress(url):
    with open(progress_file, 'a', encoding='utf-8') as f:
        f.write(url + '\n')

# Fungsi untuk memeriksa URL yang sudah di-crawl
def load_progress():
    try:
        with open(progress_file, 'r', encoding='utf-8') as f:
            return set(f.read().splitlines())
    except FileNotFoundError:
        return set()

# Fungsi untuk scrape satu artikel
def scrape_article(url):
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Ekstrak judul
        title = soup.find('h1', class_='detailsTitleCaption')
        title_text = title.text.strip() if title else 'Tidak ditemukan'

        # Ekstrak tanggal
        date = soup.find('div', class_='detailsAttributeDates')
        date_text = date.text.strip() if date else 'Tidak ditemukan'

        # Ekstrak isi artikel
        article_content = soup.find('article', class_='detailsContent force-17 mt40')
        paragraphs = article_content.find_all('p') if article_content else []
        content_text = '\n'.join(p.text.strip() for p in paragraphs) if paragraphs else 'Tidak ditemukan'

        return {
            'Judul': title_text,
            'Tanggal': date_text,
            'Isi Artikel': content_text
        }

    except requests.exceptions.RequestException as e:
        logging.error(f"Error mengakses {url}: {e}")
        return {'Judul': 'Error', 'Tanggal': 'Error', 'Isi Artikel': str(e)}
    except Exception as e:
        logging.error(f"Error scraping {url}: {e}", exc_info=True)
        return {'Judul': 'Error', 'Tanggal': 'Error', 'Isi Artikel': str(e)}

# Fungsi untuk scrape daftar artikel dari file CSV
def scrape_articles_from_csv(max_articles=1200):
    articles_data = []
    crawled_urls = load_progress()
    article_urls = set()

    # Baca link dari file CSV
    try:
        with open(input_file, 'r', encoding='utf-8') as f:
            reader = csv.reader(f)
            next(reader)  # Lewati header
            for row in reader:
                if row and len(row) > 0:
                    url = normalize_url(row[0])
                    if is_allowed_url(url) and url not in crawled_urls:
                        article_urls.add(url)
    except FileNotFoundError:
        logging.error(f"File {input_file} tidak ditemukan.")
        print(f"File {input_file} tidak ditemukan.")
        return articles_data

    logging.info(f"Ditemukan {len(article_urls)} URL artikel untuk di-scrape.")
    print(f"Ditemukan {len(article_urls)} URL artikel untuk di-scrape.")

    # Scraping artikel
    with open(output_file, 'a', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=['Judul', 'Tanggal', 'Isi Artikel'])
        if file.tell() == 0:  # Tulis header jika file kosong
            writer.writeheader()

        for i, url in enumerate(article_urls, 1):
            if url not in crawled_urls and i <= max_articles:
                print(f"Scraping artikel {i}/{min(max_articles, len(article_urls))}: {url}")
                logging.info(f"Scraping artikel: {url}")
                article_data = scrape_article(url)
                writer.writerow(article_data)
                articles_data.append(article_data)
                save_progress(url)
                crawled_urls.add(url)
                time.sleep(1)  # Jeda antar artikel

    return articles_data

# Jalankan scraper
if __name__ == "__main__":
    logging.info("Memulai scraping...")
    print("Memulai scraping...")
    scrape_articles_from_csv(max_articles=1200)
    print(f"Scraping selesai. Data disimpan ke {output_file}")
    logging.info("Scraping selesai.")

Memulai scraping...
Ditemukan 19 URL artikel untuk di-scrape.
Scraping artikel 1/19: https://finansial.bisnis.com/read/20250422/215/1870821/iuran-dan-aset-dana-pensiun-sukarela-lesu-ini-analisis-bos-dapen-bca
Scraping artikel 2/19: https://finansial.bisnis.com/read/20250425/90/1872060/bank-sulutgo-kantongi-laba-rp9914-miliar-per-kuartal-i2025-naik-328
Scraping artikel 3/19: https://finansial.bisnis.com/read/20250411/563/1868457/ojk-catat-outstanding-fintech-p2p-lending-tembus-rp8007-triliun-tumbuh-3106
Scraping artikel 4/19: https://finansial.bisnis.com/read/20250320/55/1863240/tips-tetap-bisa-menabung-agar-tak-terus-menerus-makan-tabungan
Scraping artikel 5/19: https://finansial.bisnis.com/read/20250424/563/1871592/idscore-rata-rata-plafon-pinjaman-paylater-warga-ri-rp994000-per-bulan
Scraping artikel 6/19: https://market.bisnis.com/read/20250430/192/1873255/biaya-kredit-jadi-tantangan-bfi-finance-bfin-saat-bukukan-kinerja-impresif
Scraping artikel 7/19: https://semarang.bisnis.com/re

In [None]:
!pip install -U spacy
!python -m spacy download id_core_news_sm


[38;5;1m✘ No compatible package found for 'id_core_news_sm' (spaCy v3.8.5)[0m



In [None]:
import pandas as pd
import re
import spacy
from tqdm import tqdm

# Load spaCy model untuk bahasa Indonesia, dengan fallback ke model kosong + sentencizer
try:
    nlp = spacy.load("id_core_news_sm")  # Model spaCy untuk bahasa Indonesia
    print("Menggunakan model spaCy 'id_core_news_sm'.")
except:
    nlp = spacy.blank("id")  # Model kosong untuk bahasa Indonesia
    nlp.add_pipe("sentencizer")  # Tambahkan sentencizer untuk pemisahan kalimat
    print("Model spaCy 'id_core_news_sm' tidak ditemukan. Menggunakan model kosong dengan sentencizer.")

def clean_text(text):
    """Membersihkan teks dari noise, metadata, dan format yang tidak diinginkan."""
    # Hapus metadata dan frasa berulang spesifik dari artikel
    text = re.sub(r'Bisnis\.com, JAKARTA —', '', text, flags=re.IGNORECASE)
    text = re.sub(r'Baca Juga.*?\n', '', text, flags=re.IGNORECASE)
    text = re.sub(r'\b(dikutip pada.*?)\.', '', text, flags=re.IGNORECASE)

    # Hapus karakter khusus, kecuali yang relevan untuk entitas (misalnya, tanda baca dasar)
    text = re.sub(r'[^\w\s.,-/%]', '', text)

    # Hapus spasi berlebih dan baris kosong
    text = re.sub(r'\s+', ' ', text).strip()

    # Normalisasi encoding
    text = text.encode('ascii', 'ignore').decode('ascii')

    return text

def tokenize_sentences(text):
    """Memecah teks menjadi kalimat menggunakan spaCy."""
    doc = nlp(text)
    sentences = [sent.text.strip() for sent in doc.sents if len(sent.text.strip()) > 5]
    return sentences

def preprocess_articles(input_file, output_file):
    """Fungsi utama untuk preprocessing artikel."""
    # Baca data artikel
    try:
        df = pd.read_csv(input_file)
    except Exception as e:
        raise ValueError(f"Gagal membaca file CSV: {e}")

    # Pastikan kolom yang diperlukan ada
    required_columns = ['Judul', 'Isi Artikel']
    if not all(col in df.columns for col in required_columns):
        raise ValueError(f"Kolom {required_columns} tidak ditemukan di dataset.")

    # 1. Hapus duplikasi berdasarkan Judul atau Isi Artikel
    df = df.drop_duplicates(subset=['Judul', 'Isi Artikel'], keep='first')

    # 2. Terapkan pembersihan teks
    tqdm.pandas(desc="Cleaning texts")
    df['teks_bersih'] = df['Isi Artikel'].progress_apply(clean_text)

    # 3. Tokenisasi ke kalimat
    tqdm.pandas(desc="Tokenizing sentences")
    df['kalimat'] = df['teks_bersih'].progress_apply(tokenize_sentences)

    # 4. Filter artikel dengan teks kosong atau terlalu pendek
    df = df[df['teks_bersih'].str.len() > 50]  # Minimal 50 karakter

    # 5. Simpan hasil preprocessing
    df.to_csv(output_file, index=False)
    print(f"Hasil preprocessing disimpan ke: {output_file}")
    print(f"Jumlah artikel setelah preprocessing: {len(df)}")

    return df

# Contoh penggunaan
if __name__ == "__main__":
    input_file = "/content/drive/MyDrive/usk/Semester 6/NLP/semester 6/artikel_bisnis.csv"  # Ganti dengan path file Anda
    output_file = "/content/drive/MyDrive/usk/Semester 6/NLP/semester 6/articles_preprocessed.csv"

    # Jalankan preprocessing
    processed_df = preprocess_articles(input_file, output_file)

    # Tampilkan 5 baris pertama hasil
    print("\nContoh 5 baris pertama hasil preprocessing:")
    print(processed_df[['Judul', 'teks_bersih', 'kalimat']].head())

Model spaCy 'id_core_news_sm' tidak ditemukan. Menggunakan model kosong dengan sentencizer.


Cleaning texts: 100%|██████████| 19/19 [00:00<00:00, 2335.63it/s]
Tokenizing sentences: 100%|██████████| 19/19 [00:00<00:00, 140.71it/s]

Hasil preprocessing disimpan ke: /content/drive/MyDrive/usk/Semester 6/NLP/semester 6/articles_preprocessed.csv
Jumlah artikel setelah preprocessing: 17

Contoh 5 baris pertama hasil preprocessing:
                                               Judul  \
0  Iuran dan Aset Dana Pensiun Sukarela Lesu, Ini...   
1  Bank Sulutgo Kantongi Laba Rp99,14 Miliar per ...   
2  OJK Catat Outstanding Fintech P2P Lending Temb...   
3  Tips Tetap Bisa Menabung Agar Tak Terus Meneru...   
4  IdScore: Rata-Rata Plafon Pinjaman Paylater Wa...   

                                         teks_bersih  \
0  Bisnis.com, JAKARTA Otoritas Jasa Keuangan OJK...   
1  PT Bank Pembangunan Daerah Sulawesi Utara Goro...   
2  Otoritas Jasa Keuangan OJK mencatatkan pertumb...   
3  Bisnis.com, JAKARTA -- Fenomenn Makan Tabungan...   
4  Pefindo Biro Kredit IdScore mengeluarkan data ...   

                                             kalimat  
0  [Bisnis.com, JAKARTA Otoritas Jasa Keuangan OJ...  
1  [PT Bank Pemban




## Anotasi - label studio

In [None]:
import pandas as pd
import nltk
import re
import json
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from nltk.tokenize import sent_tokenize, word_tokenize
import warnings

# Suppress specific warnings
warnings.filterwarnings("ignore", message=".*`grouped_entities` is deprecated.*")
warnings.filterwarnings("ignore", message=".*Asking to truncate to max_length.*")
warnings.filterwarnings("ignore", message=".*Some weights of the model checkpoint.*")

# Unduh resource NLTK
print("Mengunduh resource NLTK...")
try:
    nltk.download('punkt')
    nltk.download('punkt_tab')
    print("Resource NLTK berhasil diunduh.")
except Exception as e:
    print(f"Error mengunduh resource NLTK: {e}")
    exit(1)

# --- 1. Pra-pemrosesan Dataset ---
def preprocess_data(csv_path):
    print(f"Membaca CSV dari {csv_path}...")
    try:
        df = pd.read_csv(csv_path)
        print(f"Berhasil membaca CSV dengan {len(df)} baris.")
    except FileNotFoundError:
        print(f"File {csv_path} tidak ditemukan. Pastikan file ada di direktori yang sama.")
        return []
    except Exception as e:
        print(f"Error membaca CSV: {e}")
        return []

    data = []
    for idx, row in df.iterrows():
        try:
            text = row['Judul'] + ' ' + row['Isi Artikel']
            sentences = sent_tokenize(text)
            for sentence in sentences:
                data.append({
                    'article_id': idx,
                    'sentence': sentence,
                    'tokens': word_tokenize(sentence)
                })
        except KeyError:
            print(f"Kolom 'Judul' atau 'Isi Artikel' tidak ditemukan di baris {idx}. Pastikan nama kolom benar.")
            columns = df.columns.tolist()
            print(f"Kolom yang tersedia: {columns}")
            return []
        except Exception as e:
            print(f"Error memproses baris {idx}: {e}")
            continue

    print(f"Berhasil memproses {len(data)} kalimat dari {len(df)} artikel.")
    with open('sentences.json', 'w') as f:
        json.dump(data, f)
    return data

# --- 2. Anotasi Berbasis Aturan untuk B-FIN ---
def annotate_fin(sentence):
    patterns = [
        r'Rp\.?\s?\d+[,.]?\d*\s?(miliar|triliun|juta|ribu)',  # Rp 1,5 triliun
        r'\$\s?\d+[,.]?\d*\s?(million|billion|trillion)',     # $1.5 billion
        r'\d+[,.]?\d*\s?(miliar|triliun|juta)(?=\s[^orang|jiwa])',  # 10 miliar
        r'\d+[,.]?\d*(?=\s*(laba|pembiayaan|utang|pinjaman|transaksi|biaya))'  # Konteks finansial
    ]
    tokens = word_tokenize(sentence)
    labels = ['O'] * len(tokens)
    for pattern in patterns:
        matches = re.finditer(pattern, sentence)
        for match in matches:
            matched_text = match.group()
            matched_tokens = word_tokenize(matched_text)
            start_idx = len(word_tokenize(sentence[:match.start()]))
            if start_idx < len(labels):
                labels[start_idx] = 'B-FIN'
                for i in range(1, len(matched_tokens)):
                    if start_idx + i < len(labels):
                        labels[start_idx + i] = 'I-FIN'
    return labels

# --- Load model NER sekali saja ---
print("Memuat model NER Indonesia...")
try:
    model_name = "cahya/bert-base-indonesian-NER"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForTokenClassification.from_pretrained(model_name)
    ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple", framework="pt")
    print("Model NER berhasil dimuat.")
except Exception as e:
    print(f"Error saat memuat model NER: {e}")
    ner_pipeline = None

# --- 3. Aturan untuk Memfilter Orang Finansial ---
# Konteks untuk orang finansial (eksekutif)
FINANCIAL_ROLES = [
    'Ketua', 'Direktur', 'CEO', 'CFO', 'Presiden', 'Wakil', 'Manajer', 'Pimpinan',
    'Komisaris', 'Anggota', 'Dewan', 'Eksekutif', 'Pengurus'
]
# Blocklist untuk peran non-finansial
NON_FINANCIAL_ROLES = ['Reporter', 'Editor', 'Penulis', 'Jurnalis','Mahasiswa']

def is_financial_person(sentence, entity_text, start_idx):
    # Token sebelum nama untuk mendeteksi peran
    tokens = word_tokenize(sentence[:start_idx])
    prev_tokens = tokens[-3:] if len(tokens) >= 3 else tokens  # Ambil 3 token sebelumnya
    prev_text = ' '.join(prev_tokens).lower()

    # Periksa apakah ada peran finansial
    for role in FINANCIAL_ROLES:
        if role.lower() in prev_text:
            return True

    # Periksa apakah ada peran non-finansial
    for role in NON_FINANCIAL_ROLES:
        if role.lower() in prev_text:
            print(f"Nama '{entity_text}' dianggap non-finansial karena peran '{role}'.")
            return False

    # Default: Anggap finansial jika tidak ada konteks non-finansial
    return True

# --- 4. Anotasi Semi-Otomatis untuk B-ORG dan B-PER ---
def annotate_org_per(sentence, ner_pipe=None):
    if ner_pipe is None:
        print("NER pipeline tidak tersedia.")
        return ['O'] * len(word_tokenize(sentence))

    tokens = word_tokenize(sentence)
    try:
        entities = ner_pipe(sentence)
    except Exception as e:
        print(f"Error memproses kalimat: {sentence[:50]}... Error: {e}")
        return ['O'] * len(tokens)

    labels = ['O'] * len(tokens)
    for entity in entities:
        start = entity['start']
        end = entity['end']
        label = entity['entity_group']
        entity_text = sentence[start:end]
        entity_tokens = word_tokenize(entity_text)

        try:
            token_idx = len(word_tokenize(sentence[:start]))
            if label == 'ORG' and token_idx < len(labels):
                labels[token_idx] = 'B-ORG'
                for i in range(1, len(entity_tokens)):
                    if token_idx + i < len(labels):
                        labels[token_idx + i] = 'I-ORG'
            elif label == 'PER' and token_idx < len(labels):
                # Hanya label sebagai B-PER jika konteks finansial
                if is_financial_person(sentence, entity_text, start):
                    labels[token_idx] = 'B-PER'
                    for i in range(1, len(entity_tokens)):
                        if token_idx + i < len(labels):
                            labels[token_idx + i] = 'I-PER'
                else:
                    print(f"Nama '{entity_text}' tidak diberi label B-PER karena bukan orang finansial.")
        except Exception as e:
            print(f"Error saat mengindeks token: {e}")
            continue

    return labels

# --- 5. Penggabungan Anotasi ---
def combine_annotations(fin_labels, org_per_labels, tokens):
    labels = ['O'] * len(tokens)
    for i, fin_label in enumerate(fin_labels):
        if fin_label != 'O':
            labels[i] = fin_label
    for i, org_per_label in enumerate(org_per_labels):
        if labels[i] == 'O':
            labels[i] = org_per_label
    return labels

# --- 6. Validasi BIO ---
def validate_bio(data):
    errors = []
    for item in data:
        tokens, labels = item['tokens'], item['labels']
        for i, label in enumerate(labels):
            if label.startswith('I-') and (i == 0 or labels[i-1] not in [label, label.replace('I-', 'B-')]):
                errors.append(f"Error di {tokens}: I- tanpa B- pada indeks {i}")
    return errors

# --- 7. Ekspor untuk Label Studio ---
def export_to_label_studio(data, output_file):
    label_studio_data = []
    for item in data:
        text = ' '.join(item['tokens'])
        predictions = []
        result = []
        current_pos = 0
        for i, (token, label) in enumerate(zip(item['tokens'], item['labels'])):
            if label != 'O':
                start = current_pos
                end = current_pos + len(token)
                result.append({
                    'value': {
                        'start': start,
                        'end': end,
                        'text': token,
                        'labels': [label]
                    },
                    'from_name': 'label',
                    'to_name': 'text',
                    'type': 'labels'
                })
            current_pos += len(token) + 1
        label_studio_data.append({
            'data': {'text': text},
            'predictions': [{'result': result}]
        })
    with open(output_file, 'w') as f:
        json.dump(label_studio_data, f)
    print(f"Data Label Studio disimpan ke {output_file}")

# --- 8. Main Pipeline ---
def main(csv_path, output_json='annotated_data.json', label_studio_output='label_studio_data.json', batch_size=50):
    print("Memproses dataset...")
    data = preprocess_data(csv_path)
    if not data:
        print("Tidak ada data untuk diproses. Program berhenti.")
        return

    annotated_data = []
    total = len(data)

    print(f"Mulai anotasi untuk {total} kalimat...")
    for i, item in enumerate(data):
        if i % 10 == 0:
            print(f"Memproses {i}/{total} ({(i/total)*100:.1f}%)...")

        sentence = item['sentence']
        tokens = item['tokens']

        fin_labels = annotate_fin(sentence)
        org_per_labels = annotate_org_per(sentence, ner_pipeline)

        if len(fin_labels) != len(tokens) or len(org_per_labels) != len(tokens):
            print(f"Peringatan: Jumlah token/label tidak cocok untuk kalimat: {sentence[:50]}...")
            continue

        combined_labels = combine_annotations(fin_labels, org_per_labels, tokens)
        annotated_data.append({
            'article_id': item['article_id'],
            'sentence': sentence,
            'tokens': tokens,
            'labels': combined_labels
        })

        if (i + 1) % batch_size == 0 or i == total - 1:
            with open(f"{output_json}.temp", 'w') as f:
                json.dump(annotated_data, f)

    print(f"Selesai memproses {len(annotated_data)} kalimat dari total {total}.")

    print("Memvalidasi format BIO...")
    errors = validate_bio(annotated_data)
    if errors:
        print(f"Kesalahan ditemukan: {len(errors)} kesalahan")
        for i, error in enumerate(errors[:10]):
            print(error)
        if len(errors) > 10:
            print(f"...dan {len(errors) - 10} kesalahan lainnya.")
    else:
        print("Tidak ada kesalahan BIO.")

    print(f"Menyimpan hasil ke {output_json}...")
    with open(output_json, 'w') as f:
        json.dump(annotated_data, f)

    print("Mengekspor ke format Label Studio...")
    export_to_label_studio(annotated_data, label_studio_output)

    print(f"Proses selesai. Hasil disimpan di {output_json} dan {label_studio_output}")

if __name__ == "__main__":
    csv_path = 'artikel_gabungan.csv'  # Ganti dengan path CSV Anda
    main(csv_path, batch_size=50)

Mengunduh resource NLTK...


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Resource NLTK berhasil diunduh.
Memuat model NER Indonesia...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.97k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/230k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/443M [00:00<?, ?B/s]

Some weights of the model checkpoint at cahya/bert-base-indonesian-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0


Model NER berhasil dimuat.
Memproses dataset...
Membaca CSV dari artikel_gabungan.csv...
Berhasil membaca CSV dengan 2199 baris.


model.safetensors:   0%|          | 0.00/443M [00:00<?, ?B/s]

Error memproses baris 1062: can only concatenate str (not "float") to str
Error memproses baris 1103: can only concatenate str (not "float") to str
Error memproses baris 1105: can only concatenate str (not "float") to str
Error memproses baris 1167: can only concatenate str (not "float") to str
Error memproses baris 1217: can only concatenate str (not "float") to str
Berhasil memproses 36327 kalimat dari 2199 artikel.


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Mulai anotasi untuk 36327 kalimat...
Memproses 0/36327 (0.0%)...


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Memproses 10/36327 (0.0%)...
Memproses 20/36327 (0.1%)...
Memproses 30/36327 (0.1%)...
Memproses 40/36327 (0.1%)...
Memproses 50/36327 (0.1%)...
Memproses 60/36327 (0.2%)...
Memproses 70/36327 (0.2%)...
Memproses 80/36327 (0.2%)...
Memproses 90/36327 (0.2%)...
Memproses 100/36327 (0.3%)...
Memproses 110/36327 (0.3%)...
Memproses 120/36327 (0.3%)...
Memproses 130/36327 (0.4%)...
Memproses 140/36327 (0.4%)...
Memproses 150/36327 (0.4%)...
Memproses 160/36327 (0.4%)...
Memproses 170/36327 (0.5%)...
Memproses 180/36327 (0.5%)...
Memproses 190/36327 (0.5%)...
Memproses 200/36327 (0.6%)...
Memproses 210/36327 (0.6%)...
Memproses 220/36327 (0.6%)...
Memproses 230/36327 (0.6%)...
Memproses 240/36327 (0.7%)...
Memproses 250/36327 (0.7%)...
Memproses 260/36327 (0.7%)...
Memproses 270/36327 (0.7%)...
Memproses 280/36327 (0.8%)...
Memproses 290/36327 (0.8%)...
Memproses 300/36327 (0.8%)...
Memproses 310/36327 (0.9%)...
Memproses 320/36327 (0.9%)...
Memproses 330/36327 (0.9%)...
Memproses 340/36327

KeyboardInterrupt: 

In [None]:
import json
with open('corrected_data.json', 'r') as f:
    data = json.load(f)
orgs, persons, fins = [], [], []
for item in data:
    for annotation in item['predictions'][0]['result']:
        label = annotation['value']['labels'][0]
        text = annotation['value']['text']
        if label.startswith('B-ORG'):
            orgs.append(text)
        elif label.startswith('B-PER'):
            persons.append(text)
        elif label.startswith('B-FIN'):
            fins.append(text)
print(f"Organisasi: {set(orgs)}")
print(f"Orang: {set(persons)}")
print(f"Angka Finansial: {set(fins)}")

Organisasi: {'OJK', 'Lending', 'Ketua', 'Umum', 'CNAF', 'Editor', 'lending', 'Otoritas', 'BI', 'ASPI', 'Bank', 'PT', 'Informatika', 'PJP', 'Presiden', 'AFPI'}
Orang: {'Ristiawan', 'Kontan', 'ASPI', 'Entjik', 'UMi'}
Angka Finansial: {'Rp1,74', '90', 'Rp', '85'}


In [None]:
import json
with open('annotated_data.json', 'r') as f:
    data = json.load(f)
for item in data[:5]:
    print(f"Kalimat: {item['sentence']}")
    print(f"Token: {item['tokens']}")
    print(f"Label: {item['labels']}\n")

Kalimat: Leasing Astra Group (ASII) Jadi Perusahaan Multifinance Paling Cuan pada 2024 Perusahaan multifinance di bawah PT Astra International Tbk.
Token: ['Leasing', 'Astra', 'Group', '(', 'ASII', ')', 'Jadi', 'Perusahaan', 'Multifinance', 'Paling', 'Cuan', 'pada', '2024', 'Perusahaan', 'multifinance', 'di', 'bawah', 'PT', 'Astra', 'International', 'Tbk', '.']
Label: ['B-ORG', 'I-ORG', 'I-ORG', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'O']

Kalimat: (ASII) menduduki peringkat pertama perusahaan multifinance dengan laba bersih paling besar untuk tahun buku 2024.
Token: ['(', 'ASII', ')', 'menduduki', 'peringkat', 'pertama', 'perusahaan', 'multifinance', 'dengan', 'laba', 'bersih', 'paling', 'besar', 'untuk', 'tahun', 'buku', '2024', '.']
Label: ['O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

Kalimat: Berdasarkan laporan keuangan audited perusahaan tahun buku 2024, PT Fed

Anotasi tanpa Label Studio

In [None]:
import pandas as pd
import nltk
import re
import json
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from nltk.tokenize import sent_tokenize, word_tokenize
import warnings

# Suppress specific warnings
warnings.filterwarnings("ignore", message=".*`grouped_entities` is deprecated.*")
warnings.filterwarnings("ignore", message=".*Asking to truncate to max_length.*")
warnings.filterwarnings("ignore", message=".*Some weights of the model checkpoint.*")

# Unduh resource NLTK
print("Mengunduh resource NLTK...")
try:
    nltk.download('punkt')
    nltk.download('punkt_tab')
    print("Resource NLTK berhasil diunduh.")
except Exception as e:
    print(f"Error mengunduh resource NLTK: {e}")
    exit(1)

# --- 1. Pra-pemrosesan Dataset ---
def preprocess_data(csv_path):
    print(f"Membaca CSV dari {csv_path}...")
    try:
        df = pd.read_csv(csv_path)
        print(f"Berhasil membaca CSV dengan {len(df)} baris.")
    except FileNotFoundError:
        print(f"File {csv_path} tidak ditemukan.")
        return []
    except Exception as e:
        print(f"Error membaca CSV: {e}")
        return []

    data = []
    for idx, row in df.iterrows():
        try:
            text = row['Judul'] + ' ' + row['Isi Artikel']
            sentences = sent_tokenize(text)
            for sentence in sentences:
                data.append({
                    'article_id': idx,
                    'sentence': sentence,
                    'tokens': word_tokenize(sentence)
                })
        except KeyError:
            print(f"Kolom 'Judul' atau 'Isi Artikel' tidak ditemukan di baris {idx}.")
            columns = df.columns.tolist()
            print(f"Kolom yang tersedia: {columns}")
            return []
        except Exception as e:
            print(f"Error memproses baris {idx}: {e}")
            continue

    print(f"Berhasil memproses {len(data)} kalimat.")
    with open('sentences.json', 'w') as f:
        json.dump(data, f)
    return data

# --- 2. Anotasi Berbasis Aturan untuk B-FIN ---
def annotate_fin(sentence):
    patterns = [
        r'Rp\.?\s?\d+[,.]?\d*\s?(miliar|triliun|juta|ribu)',
        r'\$\s?\d+[,.]?\d*\s?(million|billion|trillion)',
        r'\d+[,.]?\d*\s?(miliar|triliun|juta)(?=\s[^orang|jiwa])',
        r'\d+[,.]?\d*(?=\s*(laba|pembiayaan|utang|pinjaman|transaksi|biaya))'
    ]
    tokens = word_tokenize(sentence)
    labels = ['O'] * len(tokens)
    for pattern in patterns:
        matches = re.finditer(pattern, sentence)
        for match in matches:
            matched_text = match.group()
            matched_tokens = word_tokenize(matched_text)
            start_idx = len(word_tokenize(sentence[:match.start()]))
            if start_idx < len(labels):
                labels[start_idx] = 'B-FIN'
                for i in range(1, len(matched_tokens)):
                    if start_idx + i < len(labels):
                        labels[start_idx + i] = 'I-FIN'
    return labels

# --- Load model NER sekali saja ---
print("Memuat model NER Indonesia...")
try:
    model_name = "cahya/bert-base-indonesian-NER"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForTokenClassification.from_pretrained(model_name)
    ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple", framework="pt")
    print("Model NER berhasil dimuat.")
except Exception as e:
    print(f"Error saat memuat model NER: {e}")
    ner_pipeline = None

# --- 3. Aturan untuk Memfilter Orang Finansial ---
FINANCIAL_ROLES = [
    'Ketua', 'Direktur', 'CEO', 'CFO', 'Presiden', 'Wakil', 'Manajer', 'Pimpinan',
    'Komisaris', 'Anggota', 'Dewan', 'Eksekutif', 'Pengurus'
]
NON_FINANCIAL_ROLES = ['Reporter', 'Editor', 'Penulis', 'Jurnalis', 'Mahasiswa']

def is_financial_person(sentence, entity_text, start_idx):
    tokens = word_tokenize(sentence[:start_idx])
    prev_tokens = tokens[-3:] if len(tokens) >= 3 else tokens
    prev_text = ' '.join(prev_tokens).lower()
    for role in FINANCIAL_ROLES:
        if role.lower() in prev_text:
            return True
    for role in NON_FINANCIAL_ROLES:
        if role.lower() in prev_text:
            return False
    return True

# --- 4. Anotasi Semi-Otomatis untuk B-ORG dan B-PER ---
def annotate_org_per(sentence, ner_pipe=None):
    if ner_pipe is None:
        return ['O'] * len(word_tokenize(sentence))
    tokens = word_tokenize(sentence)
    try:
        entities = ner_pipe(sentence)
    except Exception as e:
        print(f"Error memproses kalimat: {sentence[:50]}... Error: {e}")
        return ['O'] * len(tokens)
    labels = ['O'] * len(tokens)
    for entity in entities:
        start = entity['start']
        end = entity['end']
        label = entity['entity_group']
        entity_text = sentence[start:end]
        entity_tokens = word_tokenize(entity_text)
        try:
            token_idx = len(word_tokenize(sentence[:start]))
            if label == 'ORG' and token_idx < len(labels):
                labels[token_idx] = 'B-ORG'
                for i in range(1, len(entity_tokens)):
                    if token_idx + i < len(labels):
                        labels[token_idx + i] = 'I-ORG'
            elif label == 'PER' and token_idx < len(labels):
                if is_financial_person(sentence, entity_text, start):
                    labels[token_idx] = 'B-PER'
                    for i in range(1, len(entity_tokens)):
                        if token_idx + i < len(labels):
                            labels[token_idx + i] = 'I-PER'
        except Exception as e:
            print(f"Error saat mengindeks token: {e}")
            continue
    return labels

# --- 5. Penggabungan Anotasi ---
def combine_annotations(fin_labels, org_per_labels, tokens):
    labels = ['O'] * len(tokens)
    for i, fin_label in enumerate(fin_labels):
        if fin_label != 'O':
            labels[i] = fin_label
    for i, org_per_label in enumerate(org_per_labels):
        if labels[i] == 'O':
            labels[i] = org_per_label
    return labels

# --- 6. Validasi BIO ---
def validate_bio(data):
    errors = []
    for item in data:
        tokens, labels = item['tokens'], item['labels']
        for i, label in enumerate(labels):
            if label.startswith('I-') and (i == 0 or labels[i-1] not in [label, label.replace('I-', 'B-')]):
                errors.append(f"Error di {tokens}: I- tanpa B- pada indeks {i}")
    return errors

# --- 7. Main Pipeline ---
def main(csv_path, output_json='annotated_data.json', batch_size=50):
    print("Memproses dataset...")
    data = preprocess_data(csv_path)
    if not data:
        print("Tidak ada data untuk diproses.")
        return

    annotated_data = []
    total = len(data)

    for i, item in enumerate(data):
        if i % 100 == 0:
            print(f"Memproses {i}/{total} ({(i/total)*100:.1f}%)...")
        sentence = item['sentence']
        tokens = item['tokens']
        fin_labels = annotate_fin(sentence)
        org_per_labels = annotate_org_per(sentence, ner_pipeline)

        if len(fin_labels) != len(tokens) or len(org_per_labels) != len(tokens):
            print(f"Peringatan: Jumlah token/label tidak cocok untuk kalimat: {sentence[:50]}...")
            continue

        combined_labels = combine_annotations(fin_labels, org_per_labels, tokens)
        annotated_data.append({
            'article_id': item['article_id'],
            'sentence': sentence,
            'tokens': tokens,
            'labels': combined_labels
        })

        if (i + 1) % batch_size == 0 or i == total - 1:
            with open(f"{output_json}.temp", 'w') as f:
                json.dump(annotated_data, f)

    print(f"Selesai memproses {len(annotated_data)} kalimat.")

    print("Memvalidasi format BIO...")
    errors = validate_bio(annotated_data)
    if errors:
        print(f"Kesalahan BIO: {len(errors)} ditemukan")
        for i, error in enumerate(errors[:5]):
            print(error)
        if len(errors) > 5:
            print(f"...dan {len(errors) - 5} kesalahan lainnya.")
    else:
        print("Tidak ada kesalahan BIO.")

    print(f"Menyimpan hasil ke {output_json}...")
    with open(output_json, 'w') as f:
        json.dump(annotated_data, f)

    print("Proses selesai.")

if __name__ == "__main__":
    csv_path = 'artikel_gabungan.csv'
    main(csv_path, batch_size=50)

Mengunduh resource NLTK...
Resource NLTK berhasil diunduh.
Memuat model NER Indonesia...


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
Some weights of the model checkpoint at cahya/bert-base-indonesian-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0


Model NER berhasil dimuat.
Memproses dataset...
Membaca CSV dari artikel_gabungan.csv...
Berhasil membaca CSV dengan 2199 baris.
Error memproses baris 1062: can only concatenate str (not "float") to str
Error memproses baris 1103: can only concatenate str (not "float") to str
Error memproses baris 1105: can only concatenate str (not "float") to str
Error memproses baris 1167: can only concatenate str (not "float") to str
Error memproses baris 1217: can only concatenate str (not "float") to str
Berhasil memproses 36327 kalimat.


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Memproses 0/36327 (0.0%)...
Memproses 100/36327 (0.3%)...
Memproses 200/36327 (0.6%)...
Memproses 300/36327 (0.8%)...
Memproses 400/36327 (1.1%)...
Memproses 500/36327 (1.4%)...
Memproses 600/36327 (1.7%)...
Memproses 700/36327 (1.9%)...
Memproses 800/36327 (2.2%)...
Memproses 900/36327 (2.5%)...
Memproses 1000/36327 (2.8%)...
Memproses 1100/36327 (3.0%)...
Memproses 1200/36327 (3.3%)...
Memproses 1300/36327 (3.6%)...
Memproses 1400/36327 (3.9%)...
Memproses 1500/36327 (4.1%)...
Memproses 1600/36327 (4.4%)...
Memproses 1700/36327 (4.7%)...
Memproses 1800/36327 (5.0%)...
Memproses 1900/36327 (5.2%)...
Memproses 2000/36327 (5.5%)...
Memproses 2100/36327 (5.8%)...
Memproses 2200/36327 (6.1%)...
Memproses 2300/36327 (6.3%)...
Memproses 2400/36327 (6.6%)...
Memproses 2500/36327 (6.9%)...
Memproses 2600/36327 (7.2%)...
Memproses 2700/36327 (7.4%)...
Memproses 2800/36327 (7.7%)...
Memproses 2900/36327 (8.0%)...
Memproses 3000/36327 (8.3%)...
Memproses 3100/36327 (8.5%)...
Memproses 3200/36327

## Fine Tuning

In [None]:
# awal
# ======================== Tambahan Setup Google Colab ========================
# Jalankan cell ini dulu
try:
    import google.colab
    print("📍 Menjalankan di Google Colab...")

    # Install library hanya jika belum tersedia
    #!pip install -q --upgrade datasets transformers seqeval accelerate

    # Cek dan aktifkan GPU
    import torch
    if torch.cuda.is_available():
        print(f"✅ GPU aktif: {torch.cuda.get_device_name(0)}")
    else:
        print("⚠️ GPU tidak terdeteksi. Aktifkan lewat: Runtime → Change runtime type → GPU")

    # Mount Google Drive
    from google.colab import drive
    drive.mount('/content/drive')
except:
    print("🖥️ Bukan di Google Colab, melanjutkan tanpa setup tambahan.")
# =============================================================================



📍 Menjalankan di Google Colab...
✅ GPU aktif: Tesla T4
Mounted at /content/drive


In [None]:
# Fine-tuning indoroberta

import json
import numpy as np
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
from seqeval.metrics import classification_report
import torch
import transformers
import random

# --- Cek versi transformers ---
print(f"Versi Transformers: {transformers.__version__}")

# --- 1. Persiapan Dataset ---
def load_annotated_data(json_path):
    print(f"Memuat data dari {json_path}...")
    with open(json_path, 'r') as f:
        data = json.load(f)
    valid_labels = set(['O', 'B-FIN', 'I-FIN', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER'])
    for item in data:
        for label in item['labels']:
            assert label in valid_labels, f"Label tidak valid: {label}"
    print(f"Berhasil memuat {len(data)} item data.")
    return data

def prepare_dataset(data, tokenizer, label_list):
    tokenized_inputs = {'input_ids': [], 'attention_mask': [], 'labels': []}
    label_to_id = {label: idx for idx, label in enumerate(label_list)}

    for idx, item in enumerate(data):
        tokens = item['tokens']
        labels = item['labels']
        if len(tokens) != len(labels):
            print(f"Peringatan: Jumlah token ({len(tokens)}) dan label ({len(labels)}) tidak cocok di kalimat: {item['sentence'][:50]}... (index: {idx})")
            continue
        # Tokenisasi ulang dengan tokenizer model, tambahkan padding ke max_length
        encoding = tokenizer(
            tokens,
            is_split_into_words=True,
            return_offsets_mapping=True,
            truncation=True,
            max_length=512,
            padding='max_length'
        )
        word_ids = encoding.word_ids()
        aligned_labels = [-100] * len(encoding['input_ids'])  # -100 untuk token khusus (CLS, SEP)

        # Menyelaraskan label dengan subword token
        for i, word_idx in enumerate(word_ids):
            if word_idx is not None and word_idx < len(labels):
                aligned_labels[i] = label_to_id[labels[word_idx]]

        tokenized_inputs['input_ids'].append(encoding['input_ids'])
        tokenized_inputs['attention_mask'].append(encoding['attention_mask'])
        tokenized_inputs['labels'].append(aligned_labels)

    print(f"Berhasil menyiapkan {len(tokenized_inputs['input_ids'])} data tokenisasi.")
    return tokenized_inputs

# --- 2. Fine-Tuning Model ---
def fine_tune_model(model_name, dataset, label_list, output_dir):
    print(f"Memuat model dan tokenizer dari {model_name}...")
    # Muat tokenizer dan model
    tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)
    model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=len(label_list))

    # Konversi ke dataset Hugging Face
    print("Menyiapkan dataset...")
    tokenized_data = prepare_dataset(dataset, tokenizer, label_list)

    # 🔧 Perubahan: pembagian berdasarkan data tokenized, bukan raw dataset
    total = len(tokenized_data['input_ids'])
    train_end = int(0.8 * total)
    val_end = int(0.9 * total)

    dataset_dict = DatasetDict({
        'train': Dataset.from_dict({
            'input_ids': tokenized_data['input_ids'][:train_end],
            'attention_mask': tokenized_data['attention_mask'][:train_end],
            'labels': tokenized_data['labels'][:train_end]
        }),
        'validation': Dataset.from_dict({
            'input_ids': tokenized_data['input_ids'][train_end:val_end],
            'attention_mask': tokenized_data['attention_mask'][train_end:val_end],
            'labels': tokenized_data['labels'][train_end:val_end]
        }),
        'test': Dataset.from_dict({
            'input_ids': tokenized_data['input_ids'][val_end:],
            'attention_mask': tokenized_data['attention_mask'][val_end:],
            'labels': tokenized_data['labels'][val_end:]
        })
    })

    print(f"Dataset split: Train={len(dataset_dict['train'])}, Validation={len(dataset_dict['validation'])}, Test={len(dataset_dict['test'])}")

    # Gunakan DataCollatorForTokenClassification untuk padding dinamis
    data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, padding=True)

    # Konfigurasi pelatihan
    print("Mengatur argumen pelatihan...")
    training_args = TrainingArguments(
        output_dir=output_dir,
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        logging_dir="./logs",      # log akan disimpan di sini
        report_to="tensorboard"    # aktifkan TensorBoard
    )

    # Fungsi untuk menghitung metrik
    def compute_metrics(p):
        predictions, labels = p
        predictions = np.argmax(predictions, axis=2)
        true_labels = [[label_list[l] for l in label if l != -100] for label in labels]
        pred_labels = [[label_list[p] for p, l in zip(pred, label) if l != -100] for pred, label in zip(predictions, labels)]

        # Log beberapa contoh untuk debugging
        print("Contoh true_labels (5 pertama):", true_labels[:2])
        print("Contoh pred_labels (5 pertama):", pred_labels[:2])

        try:
            results = classification_report(true_labels, pred_labels, output_dict=True)
            print("Hasil classification_report:", results)

            # Gunakan weighted avg untuk metrik keseluruhan
            precision = results.get('weighted avg', {}).get('precision', 0.0)
            recall = results.get('weighted avg', {}).get('recall', 0.0)
            f1 = results.get('weighted avg', {}).get('f1-score', 0.0)

            return {
                "precision": precision,
                "recall": recall,
                "f1": f1
            }
        except Exception as e:
            print(f"Error dalam compute_metrics: {e}")
            return {"precision": 0.0, "recall": 0.0, "f1": 0.0}

    # Inisialisasi Trainer
    print("Menginisialisasi trainer...")
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset_dict['train'],
        eval_dataset=dataset_dict['validation'],
        compute_metrics=compute_metrics,
        data_collator=data_collator
    )

    # Latih model
    print("Memulai pelatihan...")
    trainer.train()

    # Evaluasi pada test set
    print("Mengevaluasi model pada test set...")
    test_results = trainer.evaluate(dataset_dict['test'])
    print(f"Hasil evaluasi pada test set: {test_results}")

    # Simpan model
    print(f"Menyimpan model ke {output_dir}...")
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    return model, tokenizer

# --- 3. Main ---
if __name__ == "__main__":

    # Install dependensi jika dijalankan di Colab
    try:
        import google.colab
        print("Mendeteksi Google Colab environment, menginstall dependensi...")
        from google.colab import drive
        drive.mount('/content/drive')
        output_dir = "/content/drive/MyDrive/NER_Model"
    except:
        print("Bukan di Google Colab, melanjutkan tanpa install dependensi...")
        output_dir = "./indo_roberta_ner"

    # Daftar label
    label_list = ['O', 'B-FIN', 'I-FIN', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER']

    # Muat data anotasi (pastikan file annotated_data.json sudah diupload)
    data = load_annotated_data('annotated_data.json')

    # Fine-tune IndoRoBERTa
    print("Fine-tuning IndoRoBERTa...")
    indo_roberta_model, indo_roberta_tokenizer = fine_tune_model(
        model_name="cahya/roberta-base-indonesian-522M",
        dataset=data,
        label_list=label_list,
        output_dir = output_dir
    )


Versi Transformers: 4.52.0
Mendeteksi Google Colab environment, menginstall dependensi...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Memuat data dari annotated_data.json...
Berhasil memuat 36327 item data.
Fine-tuning IndoRoBERTa...
Memuat model dan tokenizer dari cahya/roberta-base-indonesian-522M...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/926k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/468k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/507M [00:00<?, ?B/s]

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at cahya/roberta-base-indonesian-522M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Menyiapkan dataset...


model.safetensors:   0%|          | 0.00/507M [00:00<?, ?B/s]

Berhasil menyiapkan 36327 data tokenisasi.
Dataset split: Train=29061, Validation=3633, Test=3633
Mengatur argumen pelatihan...
Menginisialisasi trainer...
Memulai pelatihan...


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.1086,0.109967,0.758697,0.759659,0.759052
2,0.0698,0.099945,0.815816,0.74499,0.778246
3,0.05,0.104198,0.817384,0.761493,0.787797


Contoh true_labels (5 pertama): [['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']]
Contoh pred_labels (5 pertama): [['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']]
Hasil classification_report: {'FIN': {'precision': np.float64(0.9484936831875608), 'recall': np.float64(0.9740518962075848), 'f1-score': np.float64(0.9611029049729197), 'support': np.int64(1002)}, 'ORG': {'precision': np.float64(0.6802471961547265), 'recall': np.float64(0.6644310306282137), 'f1-score': np.float64(0.6722460981678353), 'support': np.int64(4473)}, 'PER': {'precision': np.float64(0.8331084120557805), 'recall': np.float64(0.8574074074074074), 'f1-score': np.float64(0.8450832762947751), 'support': np.int64(2160)}, 'micro avg': {'precision': np.float64(0.7610549796614617), 'recall': np.float64(0.759

Contoh true_labels (5 pertama): [['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']]
Contoh pred_labels (5 pertama): [['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']]
Hasil classification_report: {'FIN': {'precision': np.float64(0.9646680942184154), 'recall': np.float64(0.9646680942184154), 'f1-score': np.float64(0.9646680942184154), 'support': np.int64(934)}, 'ORG': {'precision': np.float64(0.7538143263511766), 'recall': np.float64(0.7185112151836333), 'f1-score': np.float64(0.7357395254921757), 'support': np.int64(4057)}, 'PER': {'precision': np.float64(0.8881151346332404), 'recall': np.float64(0.8860583603520148), 'f1-score': np.float64(0.887

### Fine Tuning Roberta

#### Evaluasi Model Roberta

In [None]:
import json
import numpy as np
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments, DataCollatorForTokenClassification
from seqeval.metrics import classification_report
import torch
import logging
import plotly.express as px
import pandas as pd

# Setup logging
logging.basicConfig(filename='evaluation.log', level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# --- 1. Fungsi untuk Memuat Data Anotasi ---
def load_annotated_data(json_path):
    logging.info(f"Memuat data dari {json_path}...")
    try:
        with open(json_path, 'r') as f:
            data = json.load(f)
        valid_labels = set(['O', 'B-FIN', 'I-FIN', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER'])
        for item in data:
            if not all(key in item for key in ['sentence', 'tokens', 'labels']):
                raise ValueError(f"Item tidak memiliki kunci yang diperlukan: {item}")
            if len(item['tokens']) != len(item['labels']):
                raise ValueError(f"Jumlah token dan label tidak cocok di item: {item['sentence'][:50]}...")
            for label in item['labels']:
                if label not in valid_labels:
                    raise ValueError(f"Label tidak valid: {label}")
        logging.info(f"Berhasil memuat {len(data)} item data.")
        return data
    except Exception as e:
        logging.error(f"Gagal memuat data: {e}")
        raise

# --- 2. Fungsi untuk Menyiapkan Dataset ---
def prepare_dataset(data, tokenizer, label_list):
    tokenized_inputs = {'input_ids': [], 'attention_mask': [], 'labels': []}
    label_to_id = {label: idx for idx, label in enumerate(label_list)}

    for idx, item in enumerate(data):
        logging.info(f"Memproses item {idx}: {item['sentence'][:50]}...")
        tokens = item['tokens']
        labels = item['labels']
        if len(tokens) != len(labels):
            logging.warning(f"Jumlah token dan label tidak cocok di item {idx}")
            continue

        encoding = tokenizer(
            tokens,
            is_split_into_words=True,
            return_offsets_mapping=True,
            truncation=True,
            max_length=512,
            padding='max_length'
        )
        word_ids = encoding.word_ids()
        aligned_labels = [-100] * len(encoding['input_ids'])
        previous_word_idx = None
        for i, word_idx in enumerate(word_ids):
            if word_idx is None:
                continue
            if word_idx != previous_word_idx:
                aligned_labels[i] = label_to_id[labels[word_idx]]
            previous_word_idx = word_idx

        tokenized_inputs['input_ids'].append(encoding['input_ids'])
        tokenized_inputs['attention_mask'].append(encoding['attention_mask'])
        tokenized_inputs['labels'].append(aligned_labels)

    logging.info(f"Berhasil menyiapkan {len(tokenized_inputs['input_ids'])} data tokenisasi.")
    return tokenized_inputs

# --- 3. Fungsi untuk Evaluasi Model ---
def evaluate_model(model_path, dataset, label_list):
    logging.info(f"Memuat model dan tokenizer dari {model_path}...")
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_path, add_prefix_space=True)
        model = AutoModelForTokenClassification.from_pretrained(model_path)
    except Exception as e:
        logging.error(f"Gagal memuat model atau tokenizer: {e}")
        raise

    # Siapkan dataset test (10% terakhir dari data)
    tokenized_data = prepare_dataset(dataset, tokenizer, label_list)
    test_dataset = Dataset.from_dict({
        'input_ids': tokenized_data['input_ids'][int(0.9 * len(tokenized_data['input_ids'])):],
        'attention_mask': tokenized_data['attention_mask'][int(0.9 * len(tokenized_data['input_ids'])):],
        'labels': tokenized_data['labels'][int(0.9 * len(tokenized_data['input_ids'])):]
    })
    logging.info(f"Ukuran dataset test: {len(test_dataset)}")

    # Inisialisasi DataCollator
    data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, padding=True)

    # Fungsi untuk menghitung metrik
    def compute_metrics(p):
        predictions, labels = p
        predictions = np.argmax(predictions, axis=2)
        true_labels = [[label_list[l] for l in label if l != -100] for label in labels]
        pred_labels = [[label_list[p] for p, l in zip(pred, label) if l != -100] for pred, label in zip(predictions, labels)]

        try:
            results = classification_report(true_labels, pred_labels, output_dict=True)
            logging.info(f"Hasil classification_report: {results}")

            # Metrik keseluruhan (weighted avg)
            precision = results.get('weighted avg', {}).get('precision', 0.0)
            recall = results.get('weighted avg', {}).get('recall', 0.0)
            f1 = results.get('weighted avg', {}).get('f1-score', 0.0)

            # Metrik per kelas
            per_class_metrics = {
                label: {
                    "precision": results.get(label, {}).get('precision', 0.0),
                    "recall": results.get(label, {}).get('recall', 0.0),
                    "f1-score": results.get(label, {}).get('f1-score', 0.0)
                } for label in label_list if label in results
            }

            return {
                "precision": precision,
                "recall": recall,
                "f1": f1,
                "per_class_metrics": per_class_metrics
            }
        except Exception as e:
            logging.error(f"Error dalam compute_metrics: {e}")
            return {"precision": 0.0, "recall": 0.0, "f1": 0.0, "per_class_metrics": {}}

    # Inisialisasi Trainer untuk evaluasi
    trainer = Trainer(
        model=model,
        args=TrainingArguments(
            output_dir="./eval_results",
            per_device_eval_batch_size=16,
            report_to="none"
        ),
        eval_dataset=test_dataset,
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )

    # Evaluasi model
    logging.info("Mengevaluasi model pada test set...")
    eval_results = trainer.evaluate()

    # Tampilkan hasil
    print(f"\nHasil Evaluasi Model IndoRoBERTa ({model_path}):")
    print(f"Precision (weighted avg): {eval_results['eval_precision']:.4f}")
    print(f"Recall (weighted avg): {eval_results['eval_recall']:.4f}")
    print(f"F1-Score (weighted avg): {eval_results['eval_f1']:.4f}")
    print("\nMetrik per Kelas:")
    for label, metrics in eval_results['eval_per_class_metrics'].items():
        print(f"{label}:")
        print(f"  Precision: {metrics['precision']:.4f}")
        print(f"  Recall: {metrics['recall']:.4f}")
        print(f"  F1-Score: {metrics['f1-score']:.4f}")

    return eval_results

# --- 4. Visualisasi Hasil Evaluasi ---
def plot_metrics(eval_results, model_name):
    labels = list(eval_results['eval_per_class_metrics'].keys())
    precision = [eval_results['eval_per_class_metrics'][label]['precision'] for label in labels]
    recall = [eval_results['eval_per_class_metrics'][label]['recall'] for label in labels]
    f1 = [eval_results['eval_per_class_metrics'][label]['f1-score'] for label in labels]

    # Buat DataFrame untuk Plotly
    df = pd.DataFrame({
        'Label': labels * 3,
        'Metrik': ['Precision'] * len(labels) + ['Recall'] * len(labels) + ['F1-Score'] * len(labels),
        'Nilai': precision + recall + f1
    })

    # Plot menggunakan Plotly
    fig = px.bar(
        df,
        x='Label',
        y='Nilai',
        color='Metrik',
        barmode='group',
        title=f'Metrik Evaluasi {model_name}',
        color_discrete_map={'Precision': '#1f77b4', 'Recall': '#ff7f0e', 'F1-Score': '#2ca02c'}
    )
    fig.update_yaxes(range=[0, 1])
    return fig

# --- 5. Main ---
if __name__ == "__main__":
    # Daftar label
    label_list = ['O', 'B-FIN', 'I-FIN', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER']

    # Muat data anotasi
    try:
        data = load_annotated_data('/content/drive/MyDrive/usk/Semester 6/NLP/semester 6/NER_Model/annotated_data.json')
    except Exception as e:
        print(f"Error: Gagal memuat annotated_data.json: {e}")
        exit(1)

    # Tentukan path model
    model_path = "/content/drive/MyDrive/usk/Semester 6/NLP/semester 6/NER_Model/Roberta"  # Ganti dengan "/content/drive/MyDrive/NER_Model" jika menggunakan Google Colab

    # Evaluasi model IndoRoBERTa
    print("Mengevaluasi IndoRoBERTa...")
    try:
        indo_roberta_results = evaluate_model(
            model_path=model_path,
            dataset=data,
            label_list=label_list
        )

        # Visualisasi hasil
        fig = plot_metrics(indo_roberta_results, "IndoRoBERTa")
        fig.show()

        # Simpan hasil evaluasi ke file (opsional)
        with open('evaluation_results.json', 'w') as f:
            json.dump(indo_roberta_results, f, indent=4)
        print("Hasil evaluasi disimpan ke evaluation_results.json")

    except Exception as e:
        print(f"Error selama evaluasi: {e}")
        logging.error(f"Error selama evaluasi: {e}")

Mengevaluasi IndoRoBERTa...



Hasil Evaluasi Model IndoRoBERTa (/content/drive/MyDrive/usk/Semester 6/NLP/semester 6/NER_Model/Roberta):
Precision (weighted avg): 0.8296
Recall (weighted avg): 0.8026
F1-Score (weighted avg): 0.8157

Metrik per Kelas:


Hasil evaluasi disimpan ke evaluation_results.json


## Fine Tuning IndoBERT

In [None]:
import json
import numpy as np
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    Trainer,
    TrainingArguments,
    DataCollatorForTokenClassification
)
from seqeval.metrics import classification_report
import torch
import logging
import os

# Setup logging
logging.basicConfig(filename='fine_tune_indobert.log', level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# --- 1. Fungsi untuk Memuat Data Anotasi ---
def load_annotated_data(json_path):
    logging.info(f"Memuat data dari {json_path}...")
    try:
        with open(json_path, 'r') as f:
            data = json.load(f)
        valid_labels = set(['O', 'B-FIN', 'I-FIN', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER'])
        for item in data:
            if not all(key in item for key in ['sentence', 'tokens', 'labels']):
                raise ValueError(f"Item tidak memiliki kunci yang diperlukan: {item}")
            if len(item['tokens']) != len(item['labels']):
                raise ValueError(f"Jumlah token dan label tidak cocok di item: {item['sentence'][:50]}...")
            for label in item['labels']:
                if label not in valid_labels:
                    raise ValueError(f"Label tidak valid: {label}")
        logging.info(f"Berhasil memuat {len(data)} item data.")
        return data
    except Exception as e:
        logging.error(f"Gagal memuat data: {e}")
        raise

# --- 2. Fungsi untuk Menyiapkan Dataset ---
def prepare_dataset(data, tokenizer, label_list):
    tokenized_inputs = {'input_ids': [], 'attention_mask': [], 'labels': []}
    label_to_id = {label: idx for idx, label in enumerate(label_list)}

    for idx, item in enumerate(data):
        logging.info(f"Memproses item {idx}: {item['sentence'][:50]}...")
        tokens = item['tokens']
        labels = item['labels']
        if len(tokens) != len(labels):
            logging.warning(f"Jumlah token dan label tidak cocok di item {idx}")
            continue

        encoding = tokenizer(
            tokens,
            is_split_into_words=True,
            return_offsets_mapping=True,
            truncation=True,
            max_length=512,
            padding='max_length'
        )
        word_ids = encoding.word_ids()
        aligned_labels = [-100] * len(encoding['input_ids'])
        previous_word_idx = None
        for i, word_idx in enumerate(word_ids):
            if word_idx is None:
                continue
            if word_idx != previous_word_idx:
                aligned_labels[i] = label_to_id[labels[word_idx]]
            previous_word_idx = word_idx

        tokenized_inputs['input_ids'].append(encoding['input_ids'])
        tokenized_inputs['attention_mask'].append(encoding['attention_mask'])
        tokenized_inputs['labels'].append(aligned_labels)

    logging.info(f"Berhasil menyiapkan {len(tokenized_inputs['input_ids'])} data tokenisasi.")
    return tokenized_inputs

# --- 3. Fungsi untuk Fine-Tuning Model ---
def fine_tune_model(model_name, data, label_list, output_dir):
    logging.info(f"Memulai fine-tuning model {model_name}...")

    # Muat tokenizer dan model
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)
        model = AutoModelForTokenClassification.from_pretrained(
            model_name,
            num_labels=len(label_list),
            id2label={i: label for i, label in enumerate(label_list)},
            label2id={label: i for i, label in enumerate(label_list)}
        )
    except Exception as e:
        logging.error(f"Gagal memuat model atau tokenizer: {e}")
        raise

    # Siapkan dataset
    tokenized_data = prepare_dataset(data, tokenizer, label_list)

    # Split dataset: 80% train, 10% validation, 10% test
    dataset = Dataset.from_dict({
        'input_ids': tokenized_data['input_ids'],
        'attention_mask': tokenized_data['attention_mask'],
        'labels': tokenized_data['labels']
    })
    train_size = int(0.8 * len(dataset))
    val_size = int(0.1 * len(dataset))
    train_dataset = dataset.select(range(train_size))
    val_dataset = dataset.select(range(train_size, train_size + val_size))
    test_dataset = dataset.select(range(train_size + val_size, len(dataset)))

    dataset_dict = DatasetDict({
        'train': train_dataset,
        'validation': val_dataset,
        'test': test_dataset
    })
    logging.info(f"Ukuran dataset: Train={len(train_dataset)}, Val={len(val_dataset)}, Test={len(test_dataset)}")

    # Inisialisasi DataCollator
    data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, padding=True)

    # Fungsi untuk menghitung metrik
    def compute_metrics(p):
        predictions, labels = p
        predictions = np.argmax(predictions, axis=2)
        true_labels = [[label_list[l] for l in label if l != -100] for label in labels]
        pred_labels = [[label_list[p] for p, l in zip(pred, label) if l != -100] for pred, label in zip(predictions, labels)]

        try:
            results = classification_report(true_labels, pred_labels, output_dict=True)
            logging.info(f"Hasil classification_report: {results}")

            precision = results.get('weighted avg', {}).get('precision', 0.0)
            recall = results.get('weighted avg', {}).get('recall', 0.0)
            f1 = results.get('weighted avg', {}).get('f1-score', 0.0)

            per_class_metrics = {
                label: {
                    "precision": results.get(label, {}).get('precision', 0.0),
                    "recall": results.get(label, {}).get('recall', 0.0),
                    "f1-score": results.get(label, {}).get('f1-score', 0.0)
                } for label in label_list if label in results
            }

            return {
                "precision": precision,
                "recall": recall,
                "f1": f1,
                "per_class_metrics": per_class_metrics
            }
        except Exception as e:
            logging.error(f"Error dalam compute_metrics: {e}")
            return {"precision": 0.0, "recall": 0.0, "f1": 0.0, "per_class_metrics": {}}

    # Setup TrainingArguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        report_to="none"
    )

    # Inisialisasi Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset_dict['train'],
        eval_dataset=dataset_dict['validation'],
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    # Fine-tuning
    logging.info("Memulai pelatihan...")
    try:
        trainer.train()
    except Exception as e:
        logging.error(f"Error selama pelatihan: {e}")
        raise

    # Evaluasi pada test set
    logging.info("Mengevaluasi model pada test set...")
    eval_results = trainer.evaluate(dataset_dict['test'])

    # Tampilkan hasil evaluasi
    print(f"\nHasil Evaluasi Model IndoBERT ({output_dir}):")
    print(f"Precision (weighted avg): {eval_results['eval_precision']:.4f}")
    print(f"Recall (weighted avg): {eval_results['eval_recall']:.4f}")
    print(f"F1-Score (weighted avg): {eval_results['eval_f1']:.4f}")
    if eval_results.get('eval_per_class_metrics'):
        print("\nMetrik per Kelas:")
        for label, metrics in eval_results['eval_per_class_metrics'].items():
            print(f"{label}:")
            print(f"  Precision: {metrics['precision']:.4f}")
            print(f"  Recall: {metrics['recall']:.4f}")
            print(f"  F1-Score: {metrics['f1-score']:.4f}")

    # Simpan model dan tokenizer
    os.makedirs(output_dir, exist_ok=True)
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    logging.info(f"Model dan tokenizer disimpan ke {output_dir}")

    # Simpan hasil evaluasi
    eval_results_path = os.path.join(output_dir, 'evaluation_results.json')
    with open(eval_results_path, 'w') as f:
        json.dump(eval_results, f, indent=4)
    logging.info(f"Hasil evaluasi disimpan ke {eval_results_path}")

    return eval_results

# --- 4. Main ---
if __name__ == "__main__":
    # Daftar label
    label_list = ['O', 'B-FIN', 'I-FIN', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER']

    # Tentukan environment dan output directory
    try:
        import google.colab
        print("Mendeteksi Google Colab environment, menginstall dependensi...")
        from google.colab import drive
        # Cek apakah Drive sudah dimount
        if not os.path.exists('/content/drive'):
            drive.mount('/content/drive')
        else:
            print("Drive sudah dimount.")
        output_dir = "/content/drive/MyDrive/usk/Semester 6/NLP/semester 6/NER_Model/IndoBERT"
        # Instal dependensi
        os.system("pip install transformers datasets seqeval torch")
    except ImportError:
        print("Bukan di Google Colab, melanjutkan tanpa install dependensi...")
        output_dir = "./indo_bert_ner"

    # Muat data anotasi
    try:
        data = load_annotated_data('/content/drive/MyDrive/usk/Semester 6/NLP/semester 6/NER_Model/annotated_data.json')
    except Exception as e:
        print(f"Error: Gagal memuat annotated_data.json: {e}")
        exit(1)

    # Fine-tune IndoBERT
    print("Mengevaluasi IndoBERT...")
    try:
        eval_results = fine_tune_model(
            model_name="indobenchmark/indobert-base-p1",
            data=data,
            label_list=label_list,
            output_dir=output_dir
        )
    except Exception as e:
        print(f"Error selama fine-tuning IndoBERT: {e}")
        logging.error(f"Error selama fine-tuning: {e}")

Mendeteksi Google Colab environment, menginstall dependensi...
Drive sudah dimount.
Mengevaluasi IndoBERT...


Some weights of BertForTokenClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Per Class Metrics
1,0.0744,0.072631,0.784093,0.787859,0.78542,{}
2,0.0448,0.063604,0.818964,0.796868,0.807538,{}
3,0.0294,0.067885,0.820635,0.797297,0.808314,{}



Hasil Evaluasi Model IndoBERT (/content/drive/MyDrive/usk/Semester 6/NLP/semester 6/NER_Model/IndoBERT):
Precision (weighted avg): 0.8418
Recall (weighted avg): 0.8167
F1-Score (weighted avg): 0.8288


In [None]:
import json
import pandas as pd
import plotly.express as px

def load_evaluation_results(model_name):
    path = f"/content/drive/MyDrive/usk/Semester 6/NLP/semester 6/NER_Model/{model_name}/evaluation_results.json"
    with open(path, 'r') as f:
        return json.load(f)

eval_results_roberta = load_evaluation_results("Roberta")
eval_results_bert = load_evaluation_results("IndoBERT")

df = pd.DataFrame({
    'Model': ['IndoRoBERTa', 'IndoBERT'],
    'Precision': [eval_results_roberta['eval_precision'], eval_results_bert['eval_precision']],
    'Recall': [eval_results_roberta['eval_recall'], eval_results_bert['eval_recall']],
    'F1-Score': [eval_results_roberta['eval_f1'], eval_results_bert['eval_f1']]
})
fig = px.bar(df.melt(id_vars='Model', value_vars=['Precision', 'Recall', 'F1-Score'], var_name='Metrik', value_name='Nilai'),
             x='Model', y='Nilai', color='Metrik', barmode='group',
             title='Perbandingan IndoRoBERTa vs IndoBERT')
fig.show()