In [None]:
import pandas as pd
import re
import os

# Load the preprocessed data
preprocessed_csv = "data/preprocessed_messages.csv"
df = pd.read_csv(preprocessed_csv)

# Print the ID and cleaned text of the first 10 messages as candidates for manual labeling
preview_columns = ['Message_ID', 'Cleaned_Text']
print(df[preview_columns].head(10))

# --- MANUAL LABELING ---
# Hand-labeled sample, one (token, tag) pair per entry. Tags follow the scheme
# B-Product / I-Product / B-LOC / I-LOC / B-PRICE / I-PRICE / O.
# The pairs are stored as two-element lists: [token, tag].
labeled_sentences = [
    [token, tag]
    for token, tag in (
        ("አዲስ", "B-LOC"),
        ("አበባ", "I-LOC"),
        ("ቤት", "B-Product"),
        ("ቤት", "I-Product"),
        ("ዋጋ", "B-PRICE"),
        ("1000", "I-PRICE"),
        ("ብር", "I-PRICE"),
        ("አልባሽ", "O"),
        ("አሰራር", "O"),
    )
]

# Convert labeled sentences into CoNLL format
def save_conll_format(labeled_data, filename):
    """Write labeled sentences to a text file in CoNLL style.

    Each token is written as ``"<word> <tag>"`` on its own line, and every
    sentence is followed by one blank line.

    Args:
        labeled_data: Iterable of sentences, where each sentence is an
            iterable of two-element ``(word, tag)`` pairs.
        filename: Path of the UTF-8 text file to create or overwrite.
    """
    with open(filename, "w", encoding="utf-8") as conll_out:
        for tagged_tokens in labeled_data:
            # One "word tag" line per token, streamed straight to the file.
            conll_out.writelines(
                f"{token} {label}\n" for token, label in tagged_tokens
            )
            # A blank line marks the end of the sentence.
            conll_out.write("\n")

# Write the manually labeled sample to disk.
conll_file = "data/labeled_conll.txt"
# The target directory is the directory part of conll_file, i.e. "data".
os.makedirs(os.path.dirname(conll_file), exist_ok=True)
# labeled_sentences is one flat list of pairs, so it is wrapped to form a single sentence.
save_conll_format([labeled_sentences], conll_file)
print(f"✅ Labeled dataset saved to {conll_file}")


 ### Import libraries and Load Preprocessed Data:

In [3]:
import pandas as pd
import re
import os

# Load the preprocessed data (absolute path on the author's machine)
preprocessed_csv = "/home/bbm/Documents/projects/Amharic-NER-Telegram/data/preprocessed_messages.csv"
df = pd.read_csv(preprocessed_csv)

# Print the ID and cleaned text of the first 20 messages as candidates for manual labeling
preview_columns = ['Message_ID', 'Cleaned_Text']
print(df[preview_columns].head(20))

    Message_ID                                       Cleaned_Text
0         6282                                                NaN
1         6281                                                NaN
2         6280                                                NaN
3         6279                                                NaN
4         6278                                                NaN
5         6277                                                NaN
6         6276  ፀረ ሌባ አላርም \n\n   በር  መስኮቶ ሲከፈት እስከ  ሜትር የሚሰማ ...
7         6275                                                NaN
8         6274                                                NaN
9         6273                                                NaN
10        6272  Door stoper\n\n__Prevents a door from opening ...
11        6271  ድብቅ ካሜራPen Security Camera \n\nየእስክሪብቶ ካሜራ\n\n...
12        6270                                                NaN
13        6269                                                NaN
14        

##### Read and Format Existing Labeled Data in CoNLL Format

Because of time constraints, the messages were not labeled manually. Instead, this step reads the already-labeled data from `labeled_teleram_product_price_location.txt` and reformats it into CoNLL format for NER training.

In [5]:
# Both files live in the project's data directory; keep the shared prefix in one place.
project_data_dir = "/home/bbm/Documents/projects/Amharic-NER-Telegram/data"
# Pre-labeled source file that is read by the conversion step.
input_file = f"{project_data_dir}/labeled_teleram_product_price_location.txt"
# Destination of the converted data (standard CoNLL file name).
output_file = f"{project_data_dir}/labeled_data.conll"

# Function to Process the Existing Labeled Data
def process_labeled_data(input_path, output_path):
    """Rewrite a pre-labeled text file as two-column CoNLL (``token tag``).

    Each non-blank input line is split on whitespace. Lines with at least two
    fields are written as ``"<first field> <last field>"`` (any fields in
    between are dropped); lines with a single field are reported on stdout
    and skipped. Blank input lines are written through as blank lines, which
    separate sentences.

    Problems are reported with ``print`` rather than raised, so the caller
    never sees an exception from this function.

    Args:
        input_path: Path of the UTF-8 labeled file to read.
        output_path: Path of the CoNLL file to write (UTF-8, overwritten).
            Missing parent directories are created.

    Returns:
        None.
    """
    try:
        # Open the input on its own so that only a missing *input* produces
        # the "Input file ... not found" message. A missing output directory
        # raises the same exception type and would otherwise be misreported.
        try:
            infile = open(input_path, 'r', encoding='utf-8')
        except FileNotFoundError:
            print(f"Error: Input file {input_path} not found.")
            return

        with infile:
            # Create the destination directory only after the input is known
            # to exist, so a failed call leaves no stray directory behind.
            output_dir = os.path.dirname(output_path)
            if output_dir:
                os.makedirs(output_dir, exist_ok=True)

            with open(output_path, 'w', encoding='utf-8') as outfile:
                for line in infile:
                    line = line.strip()
                    if not line:
                        # Blank line: keep it as the sentence separator.
                        outfile.write("\n")
                        continue

                    parts = line.split()
                    if len(parts) >= 2:
                        # First field is the token, last field is the tag;
                        # a line may carry more than two fields.
                        word = parts[0]
                        tag = parts[-1]
                        outfile.write(f"{word} {tag}\n")
                    else:
                        # A lone field has no tag; report it and skip the line.
                        print(f"Warning: Invalid line format in input file: {line}")

        print(f"✅ Labeled data processed and saved to {output_path}")

    except Exception as e:
        # Best-effort reporting: any other failure is printed, not raised.
        print(f"An error occurred: {e}")

## Process and Save the Data


In [None]:
# Make sure the data directory exists before the converter opens its output file for writing.
os.makedirs("/home/bbm/Documents/projects/Amharic-NER-Telegram/data/", exist_ok=True)
# Convert the pre-labeled text file (input_file) into the CoNLL file (output_file).
process_labeled_data(input_file, output_file)

def read_conll(filepath):
    """Load a two-column CoNLL file into a list of sentences.

    Sentences are separated by blank (or whitespace-only) lines. Within a
    sentence, every line that splits into exactly two whitespace-separated
    fields becomes a ``(token, label)`` tuple; lines with any other number of
    fields are ignored and do not end the sentence.

    Args:
        filepath: Path of the UTF-8 CoNLL file to read.

    Returns:
        A list of sentences, each a non-empty list of ``(token, label)``
        tuples. An input with no valid lines yields an empty list.
    """
    sentences = []
    current = []
    with open(filepath, 'r', encoding='utf-8') as conll_in:
        for raw_line in conll_in:
            fields = raw_line.split()
            if not fields:
                # Separator line: close the sentence in progress, if any.
                if current:
                    sentences.append(current)
                    current = []
                continue
            if len(fields) == 2:
                current.append((fields[0], fields[1]))
    # The file may end without a trailing blank line.
    if current:
        sentences.append(current)
    return sentences

# Sanity check (run after labeled_data.conll has been created):
# read the file back and show its first labeled sentence.
conll_path = "/home/bbm/Documents/projects/Amharic-NER-Telegram/data/labeled_data.conll"
try:
    labeled_data = read_conll(conll_path)
    print("\nExample of read CoNLL data:")
    # Indexing raises IndexError when no sentence was read from the file.
    first_sentence = labeled_data[0]
    print(first_sentence)
except FileNotFoundError:
    print("labeled_data.conll not found. Make sure you have created the file.")
except IndexError:
    print("labeled_data.conll is empty. Please label data")
except ValueError:
    print("Invalid CoNLL format. Each line must have two elements.")