# Overview

## Task
In this competition, you’ll work on addresses collected by us to build a model to correctly extract Point of Interest (POI) Names and Street Names from unformatted Indonesia addresses.

Participants are expected to build their own model for this competition, submissions by teams which directly call any third party APIs on the test set will not be taken into consideration.

In [1]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import pandas as pd
import warnings
import numpy as np

warnings.filterwarnings('ignore')
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Bana\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Bana\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
df = pd.read_csv("train.csv")[:1000]

In [3]:
df

Unnamed: 0,id,raw_address,POI/street
0,0,jl kapuk timur delta sili iii lippo cika 11 a ...,/jl kapuk timur delta sili iii lippo cika
1,1,"aye, jati sampurna",/
2,2,setu siung 119 rt 5 1 13880 cipayung,/siung
3,3,"toko dita, kertosono",toko dita/
4,4,jl. orde baru,/jl. orde baru
...,...,...,...
995,995,"pawon solo, aryo beba,",pawon solo/aryo beba
996,996,"mawar iii, no 19 telukjambe timur",/mawar iii
997,997,soek - hatta 60 kepuhkembeng peterongan,/soek - hatta
998,998,"yaya pelayanan halieluyah, tebet raya, 30d rw ...",yayasan pelayanan halieluyah/tebet raya


In [4]:
def clean_text(text):
    # Lowercase text
    text = text.lower()
    
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    
    # Remove punctuation and special characters
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)  # Remove non-ASCII characters
    
    # Tokenize text
    tokens = word_tokenize(text)
    
    # Remove stopwords
    stop_words = set(stopwords.words('indonesian'))
    filtered_tokens = [word for word in tokens if word not in stop_words]
    
    # Join tokens back into text
    cleaned_text = ' '.join(filtered_tokens)
    
    return cleaned_text

In [6]:
import pandas as pd

def create_bio_tags(text, poi_street):
    poi, street = poi_street.split('/')
    raw_tokens = text.split()
    poi_tokens = poi.split()
    street_tokens = street.split()

    bio_tags = []

    i = 0
    while i < len(raw_tokens):
        token = raw_tokens[i]
        
        if token in poi_tokens:
            if i == 0 or bio_tags[-1] == 'O':
                bio_tags.append('B-POI')
            else:
                bio_tags.append('I-POI')
        elif token in street_tokens:
            if i == 0 or bio_tags[-1] == 'O':
                bio_tags.append('B-Street')
            else:
                bio_tags.append('I-Street')
        else:
            bio_tags.append('O')
        
        i += 1

    return raw_tokens, bio_tags



In [7]:
df['tokens'] = df.apply(lambda row: create_bio_tags(row['raw_address'], row['POI/street'])[0], axis=1)
df['bio_tags'] = df.apply(lambda row: create_bio_tags(row['raw_address'], row['POI/street'])[1], axis=1)

In [8]:
sentences = list(df.raw_address.values)
tags = list(df.bio_tags.values)

In [9]:
from transformers import BertTokenizerFast

# Initialize the tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

# Define a label map
label_list = ['B-POI', 'B-Street', 'I-POI', 'I-Street', 'O']
label_map = {label: i for i, label in enumerate(label_list)}

# Function to align labels with tokens
def tokenize_and_align_labels(sentences, labels):
    tokenized_inputs = tokenizer(sentences, truncation=True, padding=True, is_split_into_words=True)
    aligned_labels = []
    
    for i, label in enumerate(labels):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)  # Ignore [CLS] and [SEP] tokens
            elif word_idx != previous_word_idx:
                label_ids.append(label_map[label[word_idx]])
            else:
                label_ids.append(label_map[label[word_idx]] if label_all_tokens else -100)
            previous_word_idx = word_idx
        aligned_labels.append(label_ids)
    
    tokenized_inputs["labels"] = aligned_labels
    return tokenized_inputs

label_all_tokens = True
tokenized_inputs = tokenize_and_align_labels(sentences, tags)


IndexError: list index out of range

In [31]:
label_map

{'B-POI': 0, 'B-Street': 1, 'I-POI': 2, 'I-Street': 3, 'O': 4}

In [17]:
import torch
from datasets import Dataset

# Combine the tokenized inputs and labels into a dataset
dataset = Dataset.from_dict({
    'input_ids': [input['input_ids'] for input in tokenized_inputs],
    'attention_mask': [input['attention_mask'] for input in tokenized_inputs],
    'labels': tokenized_labels
})


In [None]:
from transformers import BertForTokenClassification, Trainer, TrainingArguments

model = BertForTokenClassification.from_pretrained('bert-base-cased', num_labels=len(label_list))

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=dataset
)

# Train the model
trainer.train()


In [None]:
# Evaluate the model
trainer.evaluate()

# Make predictions on new sentences
new_sentences = ["Jl Karet Pedurenan No 39, depan masjid al mukminin"]
tokenized_inputs = tokenizer(new_sentences, truncation=True, padding=True, is_split_into_words=True, return_tensors="pt")

with torch.no_grad():
    output = model(**tokenized_inputs)

logits = output.logits
predictions = torch.argmax(logits, dim=2)
predictions = predictions.numpy()

# Convert predictions to tags
pred_tags = [[label_list[p] for p in pred] for pred in predictions]
print(pred_tags)
