In [1]:
import pandas as pd
import re

In [2]:

def data_cleaning(df):
    """Report and drop rows whose 'Message' value is missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'Message' column.

    Returns
    -------
    pandas.DataFrame
        A new frame without the rows where 'Message' is NaN/None. The input
        frame is not modified.

    Raises
    ------
    KeyError
        If `df` has no 'Message' column.
    """
    print("Checking for NaN values in the 'Message' column:")
    missing_count = df['Message'].isnull().sum()
    print(f"Number of NaN values in 'Message' column: {missing_count}")

    # dropna returns a new frame; bind it to a new name so the input stays
    # recognisable as the raw data.
    df_clean = df.dropna(subset=['Message'])

    # DataFrame.info() writes its summary to stdout itself and returns None,
    # so it must not be wrapped in print() (that would emit a stray "None").
    df_clean.info()
    return df_clean

In [3]:

# Compiled once at definition time instead of on every row.
# NOTE(review): the final range (U+24C2 - U+1F251) is far wider than "emoji":
# it already contains the Dingbats range listed above it and also covers
# unrelated scripts such as CJK ideographs and Ethiopic Extended
# (U+2D80 - U+2DDF). It is kept unchanged here so cleaning results stay the
# same; narrow it if such text must survive cleaning.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F700-\U0001F77F"  # alchemical symbols
    "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
    "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
    "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    "\U0001FA00-\U0001FA6F"  # Chess Symbols
    "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
    "\U00002702-\U000027B0"  # Dingbats
    "\U000024C2-\U0001F251"
    "]+",
    flags=re.UNICODE
)


def message_clean(df):
    """Strip emoji characters from every value of the 'Message' column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Message' column of strings (rows with a missing message
        are expected to have been dropped already).

    Returns
    -------
    pandas.DataFrame
        A new frame whose 'Message' column has every run of matching
        characters removed. The frame passed in is left untouched, so the
        cell can be re-run and the uncleaned data stays available.

    Raises
    ------
    KeyError
        If `df` has no 'Message' column.
    TypeError
        If a 'Message' value is not a string.
    """
    # Print the shape of the dataset after dropping NaN values in the "Message" column
    print(f"Dataset shape after dropping NaN values in 'Message' column: {df.shape}")

    def remove_emojis(text):
        # Delete each run of characters matched by the emoji character class.
        return _EMOJI_PATTERN.sub(r'', text)

    # assign() builds a new frame rather than writing into the caller's frame
    # (which would also trigger SettingWithCopyWarning when `df` is a slice).
    cleaned_df = df.assign(Message=df['Message'].apply(remove_emojis))

    # Display the updated DataFrame
    print("Updated DataFrame:")
    print(cleaned_df.head())

    return cleaned_df

In [4]:
def label_messages(df, sample_size=30, random_state=None):
    """Sample messages and attach rule-based, token-per-line entity labels.

    Each sampled message is split on whitespace and every token is emitted as
    "<token> <tag>" on its own line, where the tag is B-PRICE, B-LOC or O.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Message' column of strings.
    sample_size : int, default 30
        Maximum number of messages to sample. If `df` has fewer rows, all of
        them are used instead of raising.
    random_state : int or None, default None
        Passed to `DataFrame.sample`; set it to make the sample reproducible.

    Returns
    -------
    pandas.DataFrame
        The sampled rows with an extra 'Labeled_Message' column.
    """
    # Built once per call rather than once per token.
    bare_number = re.compile(r'^\d+(\.\d{1,2})?$')
    price_markers = ('ETB', 'ዋጋ', '$', 'ብር')
    # NOTE(review): tokens never contain whitespace, so the multi-word entries
    # 'Addis Ababa' and 'ለቡ መዳህኒዓለም' can never be a substring of a token
    # ('ለቡ' alone still matches). Kept as-is to leave the labels unchanged.
    location_names = ('Addis Ababa', 'ለቡ', 'ለቡ መዳህኒዓለም', 'መገናኛ', 'ቦሌ', 'ሜክሲኮ')

    def label_message_utf8_with_birr(message):
        labeled_tokens = []

        # Tokenize the message on whitespace
        tokens = re.findall(r'\S+', message)

        for token in tokens:
            # Price: a bare number, or a token containing a currency/price marker.
            if bare_number.match(token) or any(marker in token for marker in price_markers):
                # NOTE(review): every price token gets B-PRICE; no I-PRICE
                # continuation tag is ever produced.
                labeled_tokens.append(f"{token} B-PRICE")
            # Location: token contains one of the known place names.
            elif any(loc in token for loc in location_names):
                labeled_tokens.append(f"{token} B-LOC")
            # Everything else is outside any entity.
            else:
                labeled_tokens.append(f"{token} O")

        return "\n".join(labeled_tokens)

    # DataFrame.sample raises ValueError when n exceeds the number of rows,
    # so cap the request at the population size.
    n_rows = min(sample_size, len(df))
    sample_df = df.sample(n=n_rows, random_state=random_state)

    # Apply the labeling function to the sampled messages
    sample_df = sample_df.assign(
        Labeled_Message=sample_df['Message'].apply(label_message_utf8_with_birr)
    )

    return sample_df

In [5]:
def label_telegram_data(df, output_path='labeled_telegram_product_price_location.txt'):
    """Print the labeled sample and write it to a text file, one message per block.

    Each 'Labeled_Message' value is written followed by a blank line, so
    messages are separated by an empty line in the output file.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'Message' and 'Labeled_Message' columns.
    output_path : str, default 'labeled_telegram_product_price_location.txt'
        Destination file; it is overwritten if it already exists.

    Returns
    -------
    str
        The path the labels were written to.
    """
    # Display the labeled messages
    print("Sampled Labeled Messages:")
    print(df[['Message', 'Labeled_Message']])

    # Save the labeled dataset to a file in CoNLL format
    with open(output_path, 'w', encoding='utf-8') as f:
        for labeled in df['Labeled_Message']:
            # A message with no tokens (e.g. one that was empty after
            # cleaning) has an empty label string; writing it would add an
            # empty sentence to the file, so skip it. Non-string values
            # (such as NaN) are skipped for the same reason.
            if not isinstance(labeled, str) or not labeled.strip():
                continue
            f.write(f"{labeled}\n\n")

    print(f"Labeled data saved to {output_path}")

    return output_path

In [6]:
def main(csv_path='../../telegram_data.csv'):
    """Run the pipeline: load the CSV, clean it, label a sample, save the labels.

    Parameters
    ----------
    csv_path : str, default '../../telegram_data.csv'
        Location of the scraped Telegram data; it must contain a 'Message'
        column.

    Returns
    -------
    None
    """
    # Load the dataset
    raw_df = pd.read_csv(csv_path)
    print(raw_df.head(5))

    # Drop rows without a message
    no_nan_df = data_cleaning(raw_df)

    # Remove emojis from the remaining messages
    clean_df = message_clean(no_nan_df)

    # Sample messages and attach rule-based labels
    labeled_df = label_messages(clean_df)

    # Print the labeled sample and write it to disk (only the sampled rows
    # are labeled and saved, not the whole dataset)
    label_telegram_data(labeled_df)

    

In [7]:
# Run the pipeline end to end: load the CSV, clean it, label a sample, and write the labels to disk.
main()

  Channel Title Channel Username    ID  \
0        ምርጥ ዕቃ        @MerttEka  6821   
1        ምርጥ ዕቃ        @MerttEka  6820   
2        ምርጥ ዕቃ        @MerttEka  6819   
3        ምርጥ ዕቃ        @MerttEka  6818   
4        ምርጥ ዕቃ        @MerttEka  6817   

                                             Message  \
0                                                NaN   
1                                                NaN   
2  📣 🔠🔠🔠🔠🔠🔠🔠 🔠🔠🔠🔠🔠\n\n✔️ ዘመናዊ በኤሌክትሮኒክስ የሚሰራ ሚዛን\...   
3  📣 🔠🔠🔠🔠🔠 🔠🔠🔠🔠🔠🔠🔠🔠 🔠🔠🔠🔠🔠🔠\n\n✔️ ቦርጭ የሚሰበስብ\n✔️የቦ...   
4  📣 Multi Functional Shoe and Hat Rack\n\n📎 ይሄንን...   

                        Date                 Media Path  
0  2024-09-29 15:35:04+00:00  photos/@MerttEka_6821.jpg  
1  2024-09-29 15:35:03+00:00  photos/@MerttEka_6820.jpg  
2  2024-09-29 15:35:03+00:00  photos/@MerttEka_6819.jpg  
3  2024-09-29 15:13:37+00:00  photos/@MerttEka_6818.jpg  
4  2024-09-29 14:53:49+00:00                        NaN  
Checking for NaN values in the 'Message' column:
Number of NaN