In [None]:
import pandas as pd
import re

def tokenize_amharic(text):
    """Split *text* into tokens on runs of whitespace.

    This is a deliberately simple tokenizer: punctuation that is attached
    to a word stays part of that token. An empty or whitespace-only string
    yields an empty list.
    """
    tokens = text.split()
    return tokens

# An amount is either comma-grouped ("1,500") or a plain run of digits ("1500").
_AMOUNT = r'(?:\d{1,3}(?:,\d{3})+|\d+)'

# A price is "ዋጋ" (price) + optional colon + amount + optional "ብር" (birr),
# or an amount directly followed by "ብր".replace is NOT used; see pattern below.
_PRICE_RE = re.compile(
    r'ዋጋ[:፦]?\s*' + _AMOUNT + r'(?:\s*ብር)?'
    r'|' + _AMOUNT + r'\s*ብር'
)

# A price expression spans at most three whitespace tokens: cue, amount, currency.
_PRICE_WINDOW = 3

# Substrings that mark a token as part of a location mention.
_LOC_KEYWORDS = ('አድራሻ', 'ሜክሲኮ', 'አዲስ', 'ቦሌ', 'ኮሜርስ', 'ፕላዛ', 'ኪኔሬት', 'ሕንፃ')

# Without a price to delimit it, the product is assumed to be at most this long.
_MAX_PRODUCT_TOKENS = 4


def _price_span_length(tokens, start):
    """Return how many tokens, beginning at *start*, form a price expression.

    The next few tokens are re-joined with single spaces so the price pattern
    can match across token boundaries (the tokens come from a whitespace
    split, so a per-token match could never see "ዋጋ፦ 1000 ብር" as a whole).
    Returns 0 when no price starts at *start*.
    """
    window = tokens[start:start + _PRICE_WINDOW]
    match = _PRICE_RE.match(' '.join(window))
    if match is None:
        return 0
    covered = 0
    offset = 0  # start offset of the current token inside the joined window
    for tok in window:
        if offset >= match.end():
            break
        covered += 1
        offset += len(tok) + 1  # +1 for the joining space
    return covered


def label_tokens(tokens):
    """Assign heuristic BIO labels (product, price, location) to *tokens*.

    Rules, applied in this order:

    * Price: every price expression (see ``_PRICE_RE``) becomes one span,
      ``B-PRICE`` followed by ``I-PRICE``, even when it is spread over
      several tokens.
    * Location: each contiguous run of tokens containing a location keyword
      becomes one span, ``B-LOC`` followed by ``I-LOC``. Tokens already
      inside a price span are left alone.
    * Product: the leading tokens of the message, up to the first price if
      one exists after position 0, otherwise at most
      ``_MAX_PRODUCT_TOKENS`` tokens. The span always stops before the
      first price or location token, so no ``I-Product`` is ever left
      without a preceding ``B-Product``.

    Args:
        tokens: Sequence of string tokens for a single message.

    Returns:
        A list of ``(token, label)`` tuples, one per input token. An empty
        input yields an empty list.
    """
    n = len(tokens)
    labels = ['O'] * n

    # --- Price spans -----------------------------------------------------
    first_price = None
    i = 0
    while i < n:
        span = _price_span_length(tokens, i)
        if not span:
            i += 1
            continue
        if first_price is None:
            first_price = i
        labels[i] = 'B-PRICE'
        for j in range(i + 1, i + span):
            labels[j] = 'I-PRICE'
        i += span  # do not re-scan tokens already inside this span

    # --- Location spans --------------------------------------------------
    in_location = False
    for i, token in enumerate(tokens):
        is_location = labels[i] == 'O' and any(
            keyword in token for keyword in _LOC_KEYWORDS
        )
        if is_location:
            # Continue the running span instead of restarting it, so that
            # multi-token locations keep their I-LOC tags.
            labels[i] = 'I-LOC' if in_location else 'B-LOC'
        in_location = is_location

    # --- Product span ----------------------------------------------------
    if first_price is not None and first_price > 0:
        limit = first_price
    else:
        limit = min(_MAX_PRODUCT_TOKENS, n)
    # Stop at the first token that is already a price or location.
    prod_end = next((i for i in range(limit) if labels[i] != 'O'), limit)
    if prod_end > 0:
        labels[0] = 'B-Product'
        for i in range(1, prod_end):
            labels[i] = 'I-Product'

    return list(zip(tokens, labels))

# Keep up to 40 rows of df_sample that actually have a message.
sample_df = df_sample.dropna(subset=['Original Message']).head(40)

# Build the CoNLL text: one "token label" pair per line, and an empty
# line after every message to separate the sequences.
conll_lines = []
for msg in sample_df['Original Message']:
    conll_lines.extend(
        f"{token} {label}"
        for token, label in label_tokens(tokenize_amharic(msg))
    )
    conll_lines.append("")

# Written as UTF-8 so the Amharic (Ethiopic script) tokens are preserved.
with open("amharic_ner_conll.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(conll_lines))