<a href="https://colab.research.google.com/github/billycemerson/purbaya-net/blob/main/src/purbaya-ner.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Clone the project repository into the current working directory
!git clone https://github.com/billycemerson/purbaya-net

In [None]:
# Move into the cloned repository so the relative paths used later
# (e.g. data/kompas.csv) resolve from its root, then list its contents
%cd purbaya-net
!ls

#### Install Packages

In [None]:
!pip install torch transformers pandas

#### Import Packages

In [None]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

#### Load Model

In [None]:
# Checkpoint identifier shared by the tokenizer and the model
model_name = "cahya/bert-base-indonesian-NER"

# Load the token-classification model and its matching tokenizer
model = AutoModelForTokenClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Build the NER pipeline; the "simple" aggregation strategy groups
# tokens into entities
ner_pipeline = pipeline(
    task="ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
)

#### Load Data

In [None]:
# Read the article data from the repository's CSV file
df = pd.read_csv(filepath_or_buffer="data/kompas.csv")

# Preview the first rows
df.head(n=5)

In [None]:
# Use just the first row as a quick test case
row = df.iloc[0]

# Concatenate the title and the description into a single text for NER
text = "{}. {}".format(row["title"], row["description"])
print(text)

#### Apply NER

In [None]:
# Run the NER pipeline on the sample text
entities = ner_pipeline(text)

# Print every detected entity as "word -> label (score)", score to 2 decimals
for ent in entities:
    word, group, score = ent["word"], ent["entity_group"], ent["score"]
    print(f"{word} -> {group} ({score:.2f})")

In [None]:
# Run NER over every article and collect one record per detected entity.
# `df_results` is a plain list of dicts; it is turned into a DataFrame once,
# after the loop.
df_results = []

# Iterate over the index and the two columns that are actually used
for i, title, description in zip(df.index, df['title'], df['description']):
    # Missing values come out of pandas as NaN, which string formatting would
    # render as the literal text "nan" and feed to the model. Keep only the
    # fields that are present.
    parts = [str(part) for part in (title, description) if pd.notna(part)]
    if not parts:
        continue  # neither title nor description: nothing to analyse
    text = ". ".join(parts)

    entities = ner_pipeline(text)
    for ent in entities:
        df_results.append({
            'article_id': i,  # index label of the source row in `df`
            'entity': ent['word'],
            'label': ent['entity_group'],
            'score': ent['score']
        })

# Save results. Explicit columns keep the schema (and the CSV header) stable
# even when no entity was found at all.
df_ner = pd.DataFrame(
    df_results,
    columns=['article_id', 'entity', 'label', 'score']
)
df_ner.to_csv("ner_results.csv", index=False)

In [None]:
# Preview the first rows of the per-entity results
df_ner.head()

In [None]:
# Print a summary (columns, dtypes, non-null counts) of the results table
df_ner.info()