In [7]:
import pandas as pd
import os

- read data

In [3]:
# Read telegram_data.csv into a DataFrame; the path is relative to the
# directory the notebook kernel is running in.
telegram_csv_path = '../data/output_data/telegram_data.csv'
df = pd.read_csv(telegram_csv_path)

- extract the message text column (as strings) from the rest of the metadata columns

In [4]:
# Take the 'Message' column on its own and coerce every entry to str.
# NOTE(review): missing entries, if any, are stringified too rather than
# dropped -- confirm that is what later steps expect.
message_column = df['Message']
messages = message_column.astype(str)

- function to remove emojis

In [5]:
import re
# Compiled once at definition time so that per-row calls (e.g. through
# Series.apply) reuse the same pattern object.
#
# Only emoji / pictographic blocks are listed. A single catch-all range such as
# U+24C2..U+1F251 must NOT be used: it spans most of the Basic Multilingual
# Plane above U+24C2 (CJK ideographs, Hangul, box drawing, full-width forms)
# and would silently delete ordinary text in those scripts.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
    "\U0001F000-\U0001F251"  # mahjong/cards, enclosed alphanumerics, flags, enclosed ideographs
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002700-\U000027BF"  # dingbats
    "\U000025A0-\U000025FF"  # geometric shapes
    "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
    "\U00002934\U00002935"   # curved arrows
    "\U00003030\U0000303D\U00003297\U00003299"  # wavy dash, part alternation mark, circled ideographs
    "\U000024C2"             # circled latin capital M
    "\U0000FE0F"             # variation selector-16 (emoji presentation)
    "]+",
    flags=re.UNICODE,
)


def remove_emojis(text):
    """Return ``text`` with emoji and pictographic symbols removed.

    Parameters
    ----------
    text : str or None or float
        The message to clean. ``None`` and float NaN (the value pandas uses
        for an empty CSV cell) are treated as "no text".

    Returns
    -------
    str
        ``text`` with every run of characters matched by ``_EMOJI_PATTERN``
        deleted (nothing is inserted in their place). Returns ``""`` for
        ``None`` / NaN input.

    Raises
    ------
    TypeError
        If ``text`` is any other non-string object.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    return _EMOJI_PATTERN.sub("", text)

- Normalization: map Amharic punctuation to Latin equivalents and collapse whitespace

In [6]:
def normalize_amharic_punctuation(text):
    """Swap Ethiopic punctuation marks for Latin ones and tidy whitespace.

    Four marks are translated one-for-one: '።' -> '.', '፣' -> ',',
    '፤' -> ';' and '፧' -> '?'. After that, every run of whitespace is
    collapsed to a single space and leading/trailing whitespace is removed.

    Parameters
    ----------
    text : str
        The message to normalize.

    Returns
    -------
    str
        The normalized message.
    """
    # Single-character substitutions, applied in one pass over the string.
    ethiopic_to_latin = str.maketrans({
        '።': '.',
        '፣': ',',
        '፤': ';',
        '፧': '?',
    })
    latinized = text.translate(ethiopic_to_latin)

    # Squeeze any whitespace run down to one space, then trim both ends.
    squeezed = re.sub(r'\s+', ' ', latinized)
    return squeezed.strip()

- Cleaning

In [None]:
print("Applying cleaning steps...")
# Cells with no text come back from read_csv as NaN (a float), and the cleaning
# helpers operate on strings. Replace missing values with an empty string and
# coerce everything to str before applying them, so one blank row cannot abort
# the whole column.
message_text = df['Message'].fillna('').astype(str)
df['cleaned_message'] = (
    message_text
    .apply(remove_emojis)
    .apply(normalize_amharic_punctuation)
)
print("Cleaning complete.")

- Tokenization

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer configuration and vocabulary files for the checkpoint.
# NOTE(review): confirm this checkpoint's vocabulary actually covers Ethiopic
# script; if it does not, most Amharic words may collapse to unknown tokens.
checkpoint_name = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
print("Tokenizer loaded successfully!")

# One list of subword tokens per cleaned message.
df['raw_tokens'] = df['cleaned_message'].apply(tokenizer.tokenize)
print("Tokenization complete. Displaying first few rows with cleaned messages and raw tokens:")
preview_columns = ['Message', 'cleaned_message', 'raw_tokens']
print(df[preview_columns].head())  # Show a small sample of the result

- save the processed data

In [None]:
# Keep only the columns written to the labeling file; .copy() detaches the
# result from df so later edits to one do not affect the other.
processed_df = df[['Id', 'cleaned_message', 'raw_tokens']].copy()

# Write next to the input data. The raw CSV is read from '../data/output_data',
# so the output folder is resolved from the same '..' root; a bare 'data/...'
# path would create a separate tree inside the notebook's working directory.
processed_data_folder = os.path.join('..', 'data', 'output_data', 'processed_for_labeling')
os.makedirs(processed_data_folder, exist_ok=True)  # Ensure the directory exists

processed_csv_path = os.path.join(processed_data_folder, 'processed_telegram_messages.csv')

# Save the DataFrame to a new CSV file.
# List-valued cells (the 'raw_tokens' column) are written as their string form,
# e.g. "['token1', 'token2']", and must be parsed back into lists when reloaded.
processed_df.to_csv(processed_csv_path, index=False, encoding='utf-8')

print(f"Processed data saved to: {processed_csv_path}")