# **Task 2: Label a Subset of the Dataset in CoNLL Format**

In [13]:
# Import necessary libraries
import pandas as pd
import logging
import os, sys
import matplotlib.pyplot as plt
from matplotlib import font_manager
from collections import Counter
# Add the 'scripts' directory to the Python path for module imports
sys.path.append(os.path.abspath(os.path.join('..', 'scripts')))
# Import data preprocessor class
from text_processor import AmharicTextPreprocessor
from labeler import AmharicNERLabeler

# Let pandas render up to 200 columns and 200 rows before truncating
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 200)

# Emit INFO-and-above records with a "timestamp - level - message" layout
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

logger.info("Imported libraries and configured logging.")

2025-01-21 22:15:46,846 - INFO - Imported libraries and configured logging.


# Load the data

In [14]:
# Load the cleaned dataset from disk, then preview its first five rows
clean_data_path = '../data/clean_data.csv'
data = pd.read_csv(clean_data_path)
data.head(n=5)

Unnamed: 0,Channel Title,Channel Username,ID,Message,Date,Media Path
0,SINA KIDS/ሲና ኪድስⓇ,@sinayelj,14839,ለኮንዶሚኒየም ለጠባብ ቤቶች ገላግሌ የሆነ ከንፁህ የሲልከን ጥሬ እቃ የተ...,2024-09-26 12:24:51+00:00,photos\@sinayelj_14839.jpg
1,SINA KIDS/ሲና ኪድስⓇ,@sinayelj,14835,Baby romper ከላዩ ፈር ውስጡ ኮተን የሆነ\n\n2000 birr\...,2024-09-26 12:24:11+00:00,photos\@sinayelj_14835.jpg
2,SINA KIDS/ሲና ኪድስⓇ,@sinayelj,14832,Chicco 5 in 1 mama bag\nኦሪጅናል ማቴሪያል\nበሳይዙ ትልቅ\...,2024-09-26 12:20:22+00:00,photos\@sinayelj_14832.jpg
3,SINA KIDS/ሲና ኪድስⓇ,@sinayelj,14825,aiebao HIP SEAT CARRIER\nምቹ ጠንካራ የልጆች ማዘያ\nበተለ...,2024-09-26 12:19:20+00:00,photos\@sinayelj_14825.jpg
4,SINA KIDS/ሲና ኪድስⓇ,@sinayelj,14815,Kids mat \n\n4100 ብር\n\nSize 1.80*2m\n\nfree ...,2024-09-25 06:15:53+00:00,photos\@sinayelj_14815.jpg


# Preprocess text data

In [15]:
# Preprocess the Amharic text held in the 'Message' column.
# The call runs directly at cell level: an `if __name__ == "__main__":` guard
# has no purpose in a notebook and would leave `tokens` undefined for the
# cells below whenever this code is executed under a different module name.
preprocessor = AmharicTextPreprocessor()

# NOTE(review): despite its name, `tokens` is a DataFrame -- per the rendered
# output it holds the original columns plus 'preprocessed_message'.
tokens = preprocessor.preprocess_dataframe(data, 'Message')
display(tokens)

Unnamed: 0,Channel Title,Channel Username,ID,Message,Date,Media Path,preprocessed_message
0,SINA KIDS/ሲና ኪድስⓇ,@sinayelj,14839,ለኮንዶሚኒየም ለጠባብ ቤቶች ገላግሌ የሆነ ከንፁህ የሲልከን ጥሬ እቃ የተ...,2024-09-26 12:24:51+00:00,photos\@sinayelj_14839.jpg,ለኮንዶሚኒየም ለጠባብ ቤቶች ገላግሌ የሆነ ከንፁህ የሲልከን ጥሬ እቃ የተ...
1,SINA KIDS/ሲና ኪድስⓇ,@sinayelj,14835,Baby romper ከላዩ ፈር ውስጡ ኮተን የሆነ\n\n2000 birr\...,2024-09-26 12:24:11+00:00,photos\@sinayelj_14835.jpg,ከላዩ ፈር ውስጡ ኮተን የሆነ 2000 0909003864 0905707448 ...
2,SINA KIDS/ሲና ኪድስⓇ,@sinayelj,14832,Chicco 5 in 1 mama bag\nኦሪጅናል ማቴሪያል\nበሳይዙ ትልቅ\...,2024-09-26 12:20:22+00:00,photos\@sinayelj_14832.jpg,5 1 ኦሪጅናል ማቴሪያል በሳይዙ ትልቅ 3200 ብር 0909003864 09...
3,SINA KIDS/ሲና ኪድስⓇ,@sinayelj,14825,aiebao HIP SEAT CARRIER\nምቹ ጠንካራ የልጆች ማዘያ\nበተለ...,2024-09-26 12:19:20+00:00,photos\@sinayelj_14825.jpg,ምቹ ጠንካራ የልጆች ማዘያ በተለይ ለወንድ ልጆች ፍሬያቸው እንዳይጎዳ ተደ...
4,SINA KIDS/ሲና ኪድስⓇ,@sinayelj,14815,Kids mat \n\n4100 ብር\n\nSize 1.80*2m\n\nfree ...,2024-09-25 06:15:53+00:00,photos\@sinayelj_14815.jpg,4100 ብር 1802 0909003864 0905707448 እቃ ለማዘዝ ከስር...
...,...,...,...,...,...,...,...
1350,SINA KIDS/ሲና ኪድስⓇ,@sinayelj,674,ምቹ የልጆች ፖፖ የሶፍት ማስቀመጫ ያለው 1000ብር\nFree deliver...,2021-04-28 19:12:19+00:00,photos\@sinayelj_674.jpg,ምቹ የልጆች የሶፍት ማስቀመጫ ያለው 1000ብር 0905707448 09450...
1351,SINA KIDS/ሲና ኪድስⓇ,@sinayelj,640,ለልጆች ምቹ እና ኮንፈርተብል ማት \nከላይ ማጫወቻ ያለዉ ከተፈለገ ማጫወ...,2021-04-20 11:16:59+00:00,photos\@sinayelj_640.jpg,ለልጆች ምቹ እና ኮንፈርተብል ማት ከላይ ማጫወቻ ያለዉ ከተፈለገ ማጫወቻዉ...
1352,SINA KIDS/ሲና ኪድስⓇ,@sinayelj,605,Baby potty \nበባትሪ የሚሰራ ድምፅ ያለው\nየልጆች ፖፖ 899 ብር...,2021-04-16 18:27:22+00:00,photos\@sinayelj_605.jpg,በባትሪ የሚሰራ ድምፅ ያለው የልጆች 899 ብር 0905707448 09450...
1353,SINA KIDS/ሲና ኪድስⓇ,@sinayelj,598,Baby potty\n0905707448\n0945097042,2021-04-16 18:12:10+00:00,photos\@sinayelj_598.jpg,0905707448 0945097042


In [16]:
# Discard rows whose 'Message' value is missing, mutating `data` in place
data.dropna(axis=0, subset=['Message'], inplace=True)

In [17]:
# Keep only the non-null preprocessed messages, as a plain Python list
preprocessed_texts = (
    tokens.loc[tokens['preprocessed_message'].notna(), 'preprocessed_message']
    .tolist()
)
# Rebind `data` to a two-column frame: a positional 'index' column and 'message'
data = pd.Series(preprocessed_texts, name='message').reset_index()

In [18]:
# Preview the first five rows of the rebuilt frame
data.head(n=5)

Unnamed: 0,index,message
0,0,ለኮንዶሚኒየም ለጠባብ ቤቶች ገላግሌ የሆነ ከንፁህ የሲልከን ጥሬ እቃ የተ...
1,1,ከላዩ ፈር ውስጡ ኮተን የሆነ 2000 0909003864 0905707448 ...
2,2,5 1 ኦሪጅናል ማቴሪያል በሳይዙ ትልቅ 3200 ብር 0909003864 09...
3,3,ምቹ ጠንካራ የልጆች ማዘያ በተለይ ለወንድ ልጆች ፍሬያቸው እንዳይጎዳ ተደ...
4,4,4100 ብር 1802 0909003864 0905707448 እቃ ለማዘዝ ከስር...


# Label a Subset of the Dataset in CoNLL Format

In [19]:
# Create the labeler instance
labeler = AmharicNERLabeler()

# Rebuild the working frame from the non-null preprocessed messages
preprocessed_texts = tokens['preprocessed_message'].dropna().tolist()
data = pd.Series(preprocessed_texts, name='message').reset_index()

# Whitespace-split each message into a list of tokens
data['Tokenized'] = data['message'].map(str.split)

# Run the labeler over the 'Tokenized' column
labeled_data = labeler.label_dataframe(data, 'Tokenized')

# Persist the labeled frame through the labeler's CoNLL writer
labeler.save_conll_format(labeled_data, '../telegram_labeled_data.conll')

In [20]:
# Remove the 'index' column from the labeled frame, in place.
# errors='ignore' keeps the cell safe to re-run: without it, a second
# execution raises KeyError because the column has already been dropped.
labeled_data.drop(columns=['index'], errors='ignore', inplace=True)

In [21]:
# Count rows whose 'message' text repeats an earlier row
labeled_data.duplicated(subset=['message']).sum()

272