# 📦 1. Import & Setup

In [1]:
import pandas as pd
from pathlib import Path

# Load our reusable cleaning functions
import sys
sys.path.append('../scripts')  # Add path to access preprocess.py

from preprocess import clean_text


In [2]:
from preprocess import preprocess_dataframe

# 2. Load raw Telegram data

In [3]:
# Location of the raw Telegram export; the relative path is resolved against
# the current working directory.
csv_path = Path('../data/raw/telegram_data.csv')

# NOTE(review): the preview below shows an "Unnamed: 0" column, which usually
# means the CSV was written together with its index. Passing index_col=0 would
# drop it, but that changes the columns seen downstream -- confirm first.
df = pd.read_csv(filepath_or_buffer=csv_path)

# Report how many rows were read, then display the first five as the cell output.
print(f"Total messages loaded: {df.shape[0]}")
df.head(n=5)

Total messages loaded: 11996


Unnamed: 0,Channel Title,Channel Username,ID,Message,Date,Media Path
0,መነሻዬ,@meneshayeofficial,1036,ለውድ ልጆዎ #ስጦታ ከመነሻዬ\nምንም አይነት ባትሪና ኤሌክትሪክ የማይፈል...,2025-06-20 15:46:18+00:00,
1,መነሻዬ,@meneshayeofficial,1035,አማርኛና እንግሊዘኛ በቀላሉ ማንበብና መፃፍ የሚያስችል 150 ካርድ ያለው...,2025-06-19 15:18:55+00:00,photos\@meneshayeofficial_1035.jpg
2,መነሻዬ,@meneshayeofficial,1034,የሂሳብ ሊቅ መነሻ ጥቅል\n\nልጆችዎ ቁጥርን ከመቁጠር ጀምሮ ሂሳብን...,2025-06-18 16:55:12+00:00,
3,መነሻዬ,@meneshayeofficial,1032,ከእርሳስ አያያዝ ጀምሮ አማርኛና እንግሊዘኛ በቀላሉ ማንበብና መፃፍ የሚያ...,2025-06-18 09:06:42+00:00,photos\@meneshayeofficial_1032.jpg
4,መነሻዬ,@meneshayeofficial,1031,ገላግሌ የልጆች ምግብ መስሪያ\nየልጆችን ልብ የሚያሸንፍ፣ የእናትን ጊዜ ...,2025-06-17 16:00:09+00:00,


# 3. Apply full preprocessing pipeline to the 'Message' column

In [4]:
# Run the shared pipeline from preprocess.py over the 'Message' column.
# Judging by the output below, the result has fewer rows than the input and
# gains 'cleaned_text' and 'tokens' columns.
df_processed = preprocess_dataframe(df, text_col='Message')

# Report the surviving row count, then display the first three rows.
print(f"Total messages after cleaning: {df_processed.shape[0]}")
df_processed.head(n=3)


Token indices sequence length is longer than the specified maximum sequence length for this model (773 > 512). Running this sequence through the model will result in indexing errors


Total messages after cleaning: 7171


Unnamed: 0,Channel Title,Channel Username,ID,Message,Date,Media Path,cleaned_text,tokens
0,መነሻዬ,@meneshayeofficial,1036,ለውድ ልጆዎ #ስጦታ ከመነሻዬ\nምንም አይነት ባትሪና ኤሌክትሪክ የማይፈል...,2025-06-20 15:46:18+00:00,,ውድ ልጆዎ ስጦታ መነሻዬ ምንም አይነት ባትሪና ኤሌክትሪክ ማይፈልጉ ልጆች...,"[▁ውድ, ▁ል, ጆ, ዎ, ▁ስጦታ, ▁መነሻ, ዬ, ▁ምንም, ▁አይነት, ▁,..."
1,መነሻዬ,@meneshayeofficial,1035,አማርኛና እንግሊዘኛ በቀላሉ ማንበብና መፃፍ የሚያስችል 150 ካርድ ያለው...,2025-06-19 15:18:55+00:00,photos\@meneshayeofficial_1035.jpg,አማርኛና እንግሊዘኛ ቀላሉ ማንበብና መፃፍ ሚያስችል 150 ካርድ ያለው ባ...,"[▁አማርኛ, ና, ▁, እንግ, ሊ, ዘ, ኛ, ▁ቀ, ላ, ሉ, ▁ማን, በብ,..."
2,መነሻዬ,@meneshayeofficial,1034,የሂሳብ ሊቅ መነሻ ጥቅል\n\nልጆችዎ ቁጥርን ከመቁጠር ጀምሮ ሂሳብን...,2025-06-18 16:55:12+00:00,,ሂሳብ ሊቅ መነሻ ጥቅል ልጆችዎ ቁጥርን መቁጠር ጀምሮ ሂሳብን ቀላሉ ተግባ...,"[▁, ሂ, ሳብ, ▁ሊ, ቅ, ▁መነሻ, ▁ጥ, ቅል, ▁ልጆች, ዎ, ▁ቁጥር,..."


# 4. View some examples with tokens

In [5]:
# Print a handful of randomly chosen rows so the raw message, its cleaned text
# and its tokens can be compared side by side.
#
# The sample size is capped at the number of available rows: DataFrame.sample
# raises ValueError when asked for more rows than exist (sampling without
# replacement), which would break this cell on a small or empty dataset.
sample_size = min(5, len(df_processed))
separator = "-" * 60  # visual divider between examples, built once

# The row label returned by iterrows() is not needed; only the row is printed.
for _row_label, row in df_processed.sample(sample_size).iterrows():
    print(f"Original Message:\n{row['Message']}\n")
    print(f"Cleaned Text:\n{row['cleaned_text']}\n")
    print(f"Tokens:\n{row['tokens']}\n")
    print(separator)


Original Message:
Nike roshe run high
Made in Vietnam
Price 1300 birr
Contact me @Sofonias12 or call 0920238243

Cleaned Text:
1300 12 0920238243

Tokens:
['▁1300', '▁12', '▁09', '20', '23', '82', '43']

------------------------------------------------------------
Original Message:
❗️ጃር ዉሃ  ከመግዛት ይገላገሉ❗️
የውሀ ማጣሪያ በ በ 16 ሊትር የቀረበ
    ✔️ ውሀን በጥሩ ሁኔታ እና በጥራት ከላምንም ኬሚካል የሚያጣራ !

       16 ሊትር ✔️ 3200 ብር
ያሉበት ቦታ ያለተጨማሪ ክፍያ ይዘዙ
#ክፍያዎን_ ዕቃዉ _እጅዎ ሲደርስ_ከፈለጉ_በካሽ_አልያም_በሞባይል_ባንኪንግ_መፈፀም_ይችላሉ በተጨማሪ #ከ_1000_ብር በላይ የሚተመኑ #ሁለት_ዕቃዎችን_ ሲገዙ ስጦታ እንልክለዎታለን 
🎁 T.me/LeyueQa 👈ቻናላችንን ለጓደኛዎ ሸር ማድረግዎን አይርሱ 
           
  0933334444      @LeMazez_z
  0944109295      @Lemaze_z
  0946242424      @Le_Mazez

Cleaned Text:
ጃር ዉሃ መግዛት ይገላገሉ ውሀ ማጣሪያ በ በ 16 ሊትር ቀረበ ውሀን ጥሩ ሁኔታ እና ጥራት ላምንም ኬሚካል ሚያጣራ ! 16 ሊትር 3200 ብር ያሉበት ቦታ ያለተጨማሪ ክፍያ ይዘዙ ክፍያዎን ዕቃዉ እጅዎ ሲደርስከፈለጉበካሽአልያምበሞባይልባንኪንግመፈፀምይችላሉ ተጨማሪ 1000ብር ላይ ሚተመኑ ሁለትዕቃዎችን ሲገዙ ስጦታ እንልክለዎታለን . ቻናላችንን ጓደኛዎ ሸር ማድረግዎን አይርሱ 0933334444 0944109295 0946242424

Tokens:
['▁', 'ጃ', 'ር', '▁', '

# 5. Save the cleaned and tokenized data for next steps


In [7]:
# Destination file for the cleaned and tokenized messages.
output_path = Path('../data/processed/preprocessed_messages.csv')

# Make sure the target folder exists (creating missing parents) before writing.
output_path.parent.mkdir(exist_ok=True, parents=True)

# Write without the DataFrame index; 'utf-8-sig' prepends a UTF-8 byte-order mark.
# NOTE(review): 'tokens' appears to hold Python lists (see the printed examples);
# to_csv stores them as their string representation, so a later reader would
# have to parse them back into lists -- confirm this suits the next step.
df_processed.to_csv(path_or_buf=output_path, encoding='utf-8-sig', index=False)
print(f"[✅] Saved cleaned and tokenized data to {output_path}")


[✅] Saved cleaned and tokenized data to ..\data\processed\preprocessed_messages.csv
