In [2]:
import pandas as pd
import os
import re

- Read the Telegram message data from CSV

In [3]:
# Input CSV with the Telegram data; the path is resolved relative to the
# current working directory.
raw_csv_path = '../data/output_data/telegram_data.csv'
df = pd.read_csv(raw_csv_path)

- Separate metadata from message (in this step: make sure the `Message` text column is string-typed)

In [4]:
# Missing values in 'Message' are loaded by pandas as NaN. A bare astype(str)
# would turn them into the literal text 'nan', which the later cleaning and
# tokenization steps would treat as a real message. Replace them with empty
# strings before casting so every row holds genuine (possibly empty) text.
df['Message'] = df['Message'].fillna('').astype(str)

- function to remove emojis

In [5]:
import emoji

def remove_emojis(text):
    """Return ``text`` with emojis removed.

    Delegates to ``emoji.replace_emoji`` with an empty replacement string,
    so each emoji it finds is deleted rather than substituted.
    """
    stripped = emoji.replace_emoji(text, replace='')
    return stripped

- Normalization: replace Amharic punctuation with Latin equivalents and collapse whitespace

In [6]:
def normalize_amharic_punctuation(text):
    """Map Amharic punctuation to Latin equivalents and tidy whitespace.

    The following single-character substitutions are applied:
    '።' -> '.', '፣' -> ',', '፤' -> ';', '፧' -> '?'.
    Afterwards every run of whitespace (spaces, tabs, newlines) is collapsed
    to a single space and leading/trailing whitespace is stripped.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string.
    """
    # One translation table handles all punctuation swaps in a single pass.
    punctuation_map = str.maketrans({
        '።': '.',
        '፣': ',',
        '፤': ';',
        '፧': '?',
    })
    latinized = text.translate(punctuation_map)
    return re.sub(r'\s+', ' ', latinized).strip()

- Cleaning

In [7]:
print("Applying cleaning steps...")
# Strip emojis first, then normalize punctuation and whitespace; both passes
# are chained so 'cleaned_message' is assigned once with the final text.
df['cleaned_message'] = (
    df['Message']
    .apply(remove_emojis)
    .apply(normalize_amharic_punctuation)
)
print("Cleaning complete.")

Applying cleaning steps...
Cleaning complete.


- Tokenization

In [8]:
from transformers import AutoTokenizer

# Model identifier whose tokenizer is used for the subword split below.
tokenizer_name = "Davlan/bert-base-multilingual-cased-finetuned-amharic"

# Load the pretrained tokenizer (its configuration and vocabulary files).
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
print("Tokenizer loaded successfully!")

# Each cleaned message is turned into a list of subword tokens; the bound
# method is passed directly instead of wrapping it in a lambda.
df['raw_tokens'] = df['cleaned_message'].apply(tokenizer.tokenize)
print("Tokenization complete. Displaying first few rows with cleaned messages and raw tokens:")

# Show the original text, the cleaned text and the tokens side by side.
preview_columns = ['Message', 'cleaned_message', 'raw_tokens']
print(df[preview_columns].head())

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


Tokenizer loaded successfully!


Token indices sequence length is longer than the specified maximum sequence length for this model (653 > 512). Running this sequence through the model will result in indexing errors


Tokenization complete. Displaying first few rows with cleaned messages and raw tokens:
                                             Message  \
0  NEW HAIR LOTION\n\nየፀጉር መከታ አለኝታ የሆነ ግሩም የ Tha...   
1  በግንኙነት ጊዜ ቶሎ እየጨረሱ ተጨግረዋል❓\nየፍቅር አጋሮን ስሜት ማርካት...   
2  💯 vape and Hookah flavour💯\n📌high quality Vape...   
3  🤔የብልት መጠን ማነስ እንዲሁም በግንኙነት ወቅት ቶሎ እየረጩና እየደከሙ ...   
4  😍ለሴቶች የሚሆን ዜና😍\n\nstretch mark cream (dr james...   

                                     cleaned_message  \
0  NEW HAIR LOTION የፀጉር መከታ አለኝታ የሆነ ግሩም የ Thaila...   
1  በግንኙነት ጊዜ ቶሎ እየጨረሱ ተጨግረዋል የፍቅር አጋሮን ስሜት ማርካት አ...   
2  vape and Hookah flavour high quality Vape and ...   
3  የብልት መጠን ማነስ እንዲሁም በግንኙነት ወቅት ቶሎ እየረጩና እየደከሙ አ...   
4  ለሴቶች የሚሆን ዜና stretch mark cream (dr james) ለሁሉ...   

                                          raw_tokens  
0  [NEW, HA, ##IR, LO, ##T, ##ION, የፀጉር, መከታ, አለኝ...  
1  [በግንኙነት, ጊዜ, ቶሎ, እየጨ, ##ረሱ, ተጨ, ##ግረዋል, የፍቅር, ...  
2  [v, ##ape, and, H, ##ook, ##ah, fl, ##av, ##ou...  
3  [የብልት, መጠን, ማነስ,

- save the processed data

In [9]:
# Select the id, cleaned text and token columns into an independent copy.
processed_df = df.loc[:, ['Id', 'cleaned_message', 'raw_tokens']].copy()

# Build the output folder and file paths up front.
processed_data_folder = os.path.join('../data', 'output_data', 'processed_for_labeling')
processed_csv_path = os.path.join(processed_data_folder, 'processed_telegram_messages.csv')

# Create the folder if needed; exist_ok=True makes this a no-op when it
# already exists.
os.makedirs(processed_data_folder, exist_ok=True)

# Write the frame to CSV without the index column. List-valued cells such as
# 'raw_tokens' are stored as their string form (e.g. "['token1', 'token2']")
# and have to be parsed back into lists when the file is read again.
processed_df.to_csv(processed_csv_path, encoding='utf-8', index=False)

print(f"Processed data saved to: {processed_csv_path}")

Processed data saved to: ../data/output_data/processed_for_labeling/processed_telegram_messages.csv
