# Overview

## Task
In this competition, you’ll work on addresses collected by us to build a model to correctly extract Point of Interest (POI) Names and Street Names from unformatted Indonesia addresses.

Participants are expected to build their own model for this competition, submissions by teams which directly call any third party APIs on the test set will not be taken into consideration.

In [3]:
import pandas as pd
import warnings
import numpy as np

warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Bana\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Bana\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
df = pd.read_csv("train.csv")[:100]

In [5]:
df

Unnamed: 0,id,raw_address,POI/street
0,0,jl kapuk timur delta sili iii lippo cika 11 a ...,/jl kapuk timur delta sili iii lippo cika
1,1,"aye, jati sampurna",/
2,2,setu siung 119 rt 5 1 13880 cipayung,/siung
3,3,"toko dita, kertosono",toko dita/
4,4,jl. orde baru,/jl. orde baru
...,...,...,...
95,95,"cau terr, gal,",causal terrace/gal
96,96,taman kota kedaung kali angke gg.h.musanif no ...,taman kota/
97,97,aren jaya sumb iv 319 rt 3 10 17111 bekasi timur,/sumb iv
98,98,"shi mel, nanggalo",/shi mel


## Build NER using Huggingface

## Create BIO Tagging

In [6]:
import pandas as pd

def create_bio_tags(text, poi_street):
    poi, street = poi_street.split('/')
    raw_tokens = text.split()
    poi_tokens = poi.split()
    street_tokens = street.split()

    bio_tags = []

    i = 0
    while i < len(raw_tokens):
        token = raw_tokens[i]
        
        if token in poi_tokens:
            if i == 0 or bio_tags[-1] == 'O':
                bio_tags.append('B-POI')
            else:
                bio_tags.append('I-POI')
        elif token in street_tokens:
            if i == 0 or bio_tags[-1] == 'O':
                bio_tags.append('B-Street')
            else:
                bio_tags.append('I-Street')
        else:
            bio_tags.append('O')
        
        i += 1

    return raw_tokens, bio_tags



In [7]:
df['tokens'] = df.apply(lambda row: create_bio_tags(row['raw_address'], row['POI/street'])[0], axis=1)
df['bio_tags'] = df.apply(lambda row: create_bio_tags(row['raw_address'], row['POI/street'])[1], axis=1)

In [8]:
sentences = list(df.raw_address.values)
tags = list(df.bio_tags.values)

In [9]:
df[['raw_address', 'bio_tags']]

Unnamed: 0,raw_address,bio_tags
0,jl kapuk timur delta sili iii lippo cika 11 a ...,"[B-Street, I-Street, I-Street, I-Street, I-Str..."
1,"aye, jati sampurna","[O, O, O]"
2,setu siung 119 rt 5 1 13880 cipayung,"[O, B-Street, O, O, O, O, O, O]"
3,"toko dita, kertosono","[B-POI, O, O]"
4,jl. orde baru,"[B-Street, I-Street, I-Street]"
...,...,...
95,"cau terr, gal,","[O, O, O]"
96,taman kota kedaung kali angke gg.h.musanif no ...,"[B-POI, I-POI, O, O, O, O, O, O, O, O, O]"
97,aren jaya sumb iv 319 rt 3 10 17111 bekasi timur,"[O, O, B-Street, I-Street, O, O, O, O, O, O, O]"
98,"shi mel, nanggalo","[B-Street, O, O]"


In [10]:
np.unique(np.hstack(tags))

array(['B-POI', 'B-Street', 'I-POI', 'I-Street', 'O'], dtype='<U8')

In [11]:
from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

# Tokenize the sentences and align the labels with tokens
def tokenize_and_preserve_labels(sentence, text_labels):
    tokenized_inputs = tokenizer(sentence, truncation=True, is_split_into_words=True, padding='max_length', max_length=32)
    labels = []
    word_ids = tokenized_inputs.word_ids()
    previous_word_idx = None
    for word_idx in word_ids:
        if word_idx is None:
            labels.append(-100)
        elif word_idx != previous_word_idx:
            labels.append(label_map[text_labels[word_idx]])
        else:
            labels.append(label_map[text_labels[word_idx]] if label_all_tokens else -100)
        previous_word_idx = word_idx
    return tokenized_inputs['input_ids'], tokenized_inputs['attention_mask'], labels

label_list = ['B-POI', 'B-Street', 'I-POI', 'I-Street', 'O']
label_map = {label: i for i, label in enumerate(label_list)}
label_all_tokens = True

tokenized_inputs = []
tokenized_labels = []

for sentence, label in zip(sentences, tags):
    input_ids, attention_mask, labels = tokenize_and_preserve_labels(sentence.split(), label)
    tokenized_inputs.append({'input_ids': input_ids, 'attention_mask': attention_mask})
    tokenized_labels.append(labels)


In [12]:
label_map

{'B-POI': 0, 'B-Street': 1, 'I-POI': 2, 'I-Street': 3, 'O': 4}

In [14]:
import torch
from datasets import Dataset

# Combine the tokenized inputs and labels into a dataset
dataset = Dataset.from_dict({
    'input_ids': [input['input_ids'] for input in tokenized_inputs],
    'attention_mask': [input['attention_mask'] for input in tokenized_inputs],
    'labels': tokenized_labels
})


In [15]:
from transformers import BertForTokenClassification, Trainer, TrainingArguments

model = BertForTokenClassification.from_pretrained('bert-base-cased', num_labels=len(label_list))

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=dataset
)

# Train the model
trainer.train()


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
                                               
 33%|███▎      | 13/39 [00:25<00:39,  1.52s/it]

{'eval_loss': 0.9207975268363953, 'eval_runtime': 3.6806, 'eval_samples_per_second': 27.169, 'eval_steps_per_second': 3.532, 'epoch': 1.0}


                                               
 67%|██████▋   | 26/39 [00:50<00:20,  1.59s/it]

{'eval_loss': 0.8023850321769714, 'eval_runtime': 4.0764, 'eval_samples_per_second': 24.532, 'eval_steps_per_second': 3.189, 'epoch': 2.0}


                                               
100%|██████████| 39/39 [01:16<00:00,  1.97s/it]

{'eval_loss': 0.7640522122383118, 'eval_runtime': 4.0934, 'eval_samples_per_second': 24.43, 'eval_steps_per_second': 3.176, 'epoch': 3.0}
{'train_runtime': 76.8407, 'train_samples_per_second': 3.904, 'train_steps_per_second': 0.508, 'train_loss': 0.9543897188626803, 'epoch': 3.0}





TrainOutput(global_step=39, training_loss=0.9543897188626803, metrics={'train_runtime': 76.8407, 'train_samples_per_second': 3.904, 'train_steps_per_second': 0.508, 'train_loss': 0.9543897188626803, 'epoch': 3.0})

In [16]:
# Evaluate the model
trainer.evaluate()

100%|██████████| 13/13 [00:03<00:00,  3.50it/s]


{'eval_loss': 0.7640522122383118,
 'eval_runtime': 4.1414,
 'eval_samples_per_second': 24.146,
 'eval_steps_per_second': 3.139,
 'epoch': 3.0}

In [17]:
new_sentences = ["jl. sipayang barat, depan monumen pancasila"]
tokenized_inputs = tokenizer(new_sentences, truncation=True, padding=True, is_split_into_words=True, return_tensors="pt")

with torch.no_grad():
    output = model(**tokenized_inputs)

logits = output.logits
predictions = torch.argmax(logits, dim=2)
predictions = predictions.numpy()

# Convert predictions to tags
pred_tags = [[label_list[p] for p in pred] for pred in predictions]
print(pred_tags)


[['O', 'B-Street', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']]


## Using LLM

In [1]:
# Chat with an intelligent assistant in your terminal
from openai import OpenAI

# Point to the local server
client = OpenAI(base_url="http://localhost:1234/v1", api_key="not-needed")

history = [
    {"role": "system", "content": "Kamu adalah NER engine yang dapat mengidentifikasi alamat dan patokan jalan."},
    {"role": "user", "content": "Hello, perkenalkan dirimu"},
]

while True:
    completion = client.chat.completions.create(
        model="local-model", # this field is currently unused
        messages=history,
        temperature=0.7,
        stream=True,
    )

    new_message = {"role": "assistant", "content": ""}
    
    for chunk in completion:
        if chunk.choices[0].delta.content:
            print(chunk.choices[0].delta.content, end="", flush=True)
            new_message["content"] += chunk.choices[0].delta.content

    history.append(new_message)

    print()
    history.append({"role": "user", "content": input("> ")})

Nama saya adalah AI, dan saya dapat membantu Anda dengan berbagai tugas, termasuk mencari informasi tentang alamat dan patokan jalan. Bagaimana saya bisa membantu Anda?
Alamat dan patokan jalan "Jalan karet pedurenan no 39 depan hotel fave" dapat ditemukan di dalam kata-kata "Jalan karet pedurenan".
Alamat dan patokan jalan "jl. sipayang Barat, depan Monumen Pancasila" dapat ditemukan di dalam kata-kata "jl. sipayang Barat, depan Monumen Pancasila".
Alamat dan patokan jalan "jl kapuk timur delta sili iii lippo cika 11" tidak ditemukan di dalam teks yang diberikan.


BadRequestError: Error code: 400 - {'error': "'messages' array must only contain objects with a 'content' field that is not empty."}