In [6]:
import re
import pandas as pd

In [7]:
# Function to clean and tokenize Amharic text
def preprocess_text(text):
    if isinstance(text, str):  # Ensure that text is a string
        # Remove non-Amharic characters and emojis
        cleaned_text = re.sub(r'[^\u1200-\u137F ]+', '', text)  # Amharic Unicode range
        # Tokenize the text by splitting based on spaces
        tokens = cleaned_text.split()
        return tokens
    else:
        return []  # Return an empty list if the message is not a string

# Apply the preprocessing function
df['tokens'] = df['message'].apply(preprocess_text)

In [11]:
import pandas as pd

# Example messages from your dataset (to be replaced by actual data)
data = {
    'Message': [
        'የቤት እቃ ዋጋ 1000 ብር ቦሌ',
        'አስተማማኝ እንቅስቃሴ በ 500 ብር ሰላም ማርካት',
        'ጫማ ዋጋ 2000 ብር አዲስ አበባ'
    ]
}
df = pd.DataFrame(data)

# Entity labeling rules (you should modify these based on your dataset and actual requirements)
def label_token(token):
    # Rules for Product entity (just an example, expand as needed)
    product_keywords = ['ቤት', 'እቃ', 'ጫማ']  # Add more product-related keywords
    price_keywords = ['ብር', 'ዋጋ']  # Add more price-related keywords
    location_keywords = ['ቦሌ', 'አዲስ', 'አበባ', 'ሰላም']  # Add more location-related keywords

    if token in product_keywords:
        return 'B-Product'
    elif token in price_keywords:
        return 'B-PRICE'
    elif any(char.isdigit() for char in token):  # Detect numbers in tokens (for prices)
        return 'I-PRICE'
    elif token in location_keywords:
        return 'B-LOC'
    else:
        return 'O'

# Function to convert dataset to CoNLL format
def convert_to_conll_format(df):
    conll_data = []

    # Iterate through each message
    for idx, row in df.iterrows():
        message = row['Message']

        # Ensure message is a string
        if isinstance(message, str):
            tokens = message.split()  # Tokenize the message by spaces
            labels = [label_token(token) for token in tokens]  # Apply labeling logic

            # Append tokens and labels to conll_data
            for token, label in zip(tokens, labels):
                conll_data.append([token, label])

            # Append an empty row to signify the end of a message
            conll_data.append(["", ""])  # Empty row between messages

    return conll_data

# Convert to CoNLL format
conll_data = convert_to_conll_format(df)

# Create a DataFrame from the CoNLL data
conll_df = pd.DataFrame(conll_data, columns=['Token', 'Label'])

# Save to CSV for easier manual management
conll_df.to_csv('conll_formatted_data.csv', index=False)

# Display the CoNLL-formatted data
print(conll_df)


     Token      Label
0      የቤት          O
1       እቃ  B-Product
2       ዋጋ    B-PRICE
3     1000    I-PRICE
4       ብር    B-PRICE
5       ቦሌ      B-LOC
6                    
7   አስተማማኝ          O
8   እንቅስቃሴ          O
9        በ          O
10     500    I-PRICE
11      ብር    B-PRICE
12     ሰላም      B-LOC
13    ማርካት          O
14                   
15      ጫማ  B-Product
16      ዋጋ    B-PRICE
17    2000    I-PRICE
18      ብር    B-PRICE
19     አዲስ      B-LOC
20     አበባ      B-LOC
21                   
