In [5]:
import sys
from pathlib import Path

import pandas as pd

# Make ../scripts importable so the reusable cleaning helpers in preprocess.py can be loaded.
# The membership check keeps this cell idempotent: re-running it no longer appends a
# duplicate entry to sys.path each time.
SCRIPTS_DIR = '../scripts'
if SCRIPTS_DIR not in sys.path:
    sys.path.append(SCRIPTS_DIR)

# All project-local imports live in this one cell so a fresh kernel only needs to run it once.
from preprocess import clean_text, preprocess_dataframe


In [6]:
from preprocess import preprocess_dataframe

In [7]:
# Load the raw Telegram message export; the path is relative to the notebook's working directory.
# NOTE(review): the recorded preview shows an 'Unnamed: 0' column, which usually means the CSV
# was written together with its index — consider index_col=0 if that column is not needed; confirm.
csv_path = Path('../data/raw/rawtelegram_data.csv')
df = pd.read_csv(csv_path)

# Report how many rows were read, then show the first 10 rows as the cell's rich output.
print(f"Total messages loaded: {len(df)}")
df.head(10)

Total messages loaded: 26713


Unnamed: 0,Channel Title,Channel Username,Message ID,Message Text,Date,Views,Replies,Forwards
0,Zemen Express®,@ZemenExpress,7007,💥💥...................................💥💥\n\n🎯 L...,2025-06-24 11:49:18+00:00,1483,0,1
1,Zemen Express®,@ZemenExpress,7006,💥💥...................................💥💥\n\n🎯 L...,2025-06-24 11:49:01+00:00,1223,0,2
2,Zemen Express®,@ZemenExpress,7005,💥💥...................................💥💥\n\n🎯 L...,2025-06-24 11:48:41+00:00,1246,0,0
3,Zemen Express®,@ZemenExpress,7004,💥💥👀 ...........💥💥\n\n📌 Electric Charcoal Burne...,2025-06-23 14:55:46+00:00,2368,0,3
4,Zemen Express®,@ZemenExpress,7003,,2025-06-23 14:55:40+00:00,1996,0,0
5,Zemen Express®,@ZemenExpress,7002,,2025-06-23 14:55:40+00:00,1988,0,0
6,Zemen Express®,@ZemenExpress,7001,,2025-06-23 14:55:40+00:00,1987,0,0
7,Zemen Express®,@ZemenExpress,7000,💥💥👀 ...........💥💥\n\n📌 Electric Charcoal Burne...,2025-06-23 14:55:40+00:00,1920,0,0
8,Zemen Express®,@ZemenExpress,6999,💥💥👀 ...........💥💥\n\n📌 Electric Charcoal Burne...,2025-06-23 14:55:30+00:00,1949,0,1
9,Zemen Express®,@ZemenExpress,6998,,2025-06-23 08:23:14+00:00,2813,0,4


In [8]:
# Run the shared preprocessing helper (imported from preprocess.py) over the 'Message Text' column.
# In the recorded output the result carries two additional columns: 'cleaned_text' and 'tokens'.
# NOTE(review): the recorded run shrinks from 26713 to 14772 rows — presumably rows with
# missing/empty text are dropped inside preprocess_dataframe; confirm in preprocess.py.
# NOTE(review): the recorded run also emits a tokenizer warning (843 > 512 tokens); per that
# warning, such sequences would cause indexing errors if passed to the model untruncated — confirm
# truncation/chunking happens before any model call.
df_processed = preprocess_dataframe(df, text_col='Message Text')

# Report the remaining row count, then preview the first 3 processed rows.
print(f"Total messages after cleaning: {len(df_processed)}")
df_processed.head(3)


Token indices sequence length is longer than the specified maximum sequence length for this model (843 > 512). Running this sequence through the model will result in indexing errors


Total messages after cleaning: 14772


Unnamed: 0,Channel Title,Channel Username,Message ID,Message Text,Date,Views,Replies,Forwards,cleaned_text,tokens
0,Zemen Express®,@ZemenExpress,7007,💥💥...................................💥💥\n\n🎯 L...,2025-06-24 11:49:18+00:00,1483,0,1,................................... ልጅዎ 8.5 ልጆ...,"[▁..., ................, ................, ▁ልጅ..."
1,Zemen Express®,@ZemenExpress,7006,💥💥...................................💥💥\n\n🎯 L...,2025-06-24 11:49:01+00:00,1223,0,2,................................... ልጅዎ 8.5 ልጆ...,"[▁..., ................, ................, ▁ልጅ..."
2,Zemen Express®,@ZemenExpress,7005,💥💥...................................💥💥\n\n🎯 L...,2025-06-24 11:48:41+00:00,1246,0,0,................................... ልጅዎ 8.5 ልጆ...,"[▁..., ................, ................, ▁ልጅ..."


In [9]:
# Spot-check the cleaning step: print a handful of messages next to their cleaned text and tokens.
# A fixed seed makes the sampled rows identical on every Restart & Run All, so the printed
# examples stay in sync with any commentary written about them.
SAMPLE_SEED = 42
# Cap the sample size so the cell still runs when fewer than 5 rows survive preprocessing.
sample_size = min(5, len(df_processed))
for _idx, row in df_processed.sample(sample_size, random_state=SAMPLE_SEED).iterrows():
    print(f"Original Message:\n{row['Message Text']}\n")
    print(f"Cleaned Text:\n{row['cleaned_text']}\n")
    print(f"Tokens:\n{row['tokens']}\n")
    print("-"*60)


Original Message:
BERR BRAND 
25-30 ቁጥር
1550birr
0905707448
0945097042

Cleaned Text:
2530 ቁጥር 1550 0905707448 0945097042

Tokens:
['▁25', '30', '▁ቁጥር', '▁15', '50', '▁09', '05', '707', '448', '▁09', '450', '970', '42']

------------------------------------------------------------
Original Message:
ተገጣጣሚ 4D HUMAN ANATOMY
ትምህርታዊ መጫዎቻዎች!!

መነሻዬ የልጆችዎን መነሻ የሚያቀሉ ትምህርታዊ መጫዎቻዎችን የሚገኙበት ስፍራ!!

📍ጉርድ ሾላ ሆሊ ሲቲ ሴንተር 3ኛ ፎቅ

ይደውሉ 0989939393 / 0930323334
website:- https://meneshaye-official.com/en/meneshaye-4d-anatomy.html

#meneshaye #መነሻዬ #educational #humananatomy #anatomy #anatomydrawing #medicine  #drawing #medical  #anatomystudy #anatomyart #medicalstudent #physiology #artist #anatomynotes #biology #muscles   #humanbody #study #anatomysketch  #illustration #science #anatomypractice  #human

Cleaned Text:
ተገጣጣሚ 4 ትምህርታዊ መጫዎቻዎች!! መነሻዬ ልጆችዎን መነሻ ሚያቀሉ ትምህርታዊ መጫዎቻዎችን ሚገኙበት ስፍራ!! ጉርድ ሾላ ሆሊ ሲቲ ሴንተር 3ኛ ፎቅ ይደውሉ 0989939393 0930323334 መነሻዬ

Tokens:
['▁ተገ', 'ጣጣ', 'ሚ', '▁4', '▁ት', 'ምህር', 'ታዊ', '▁መ', 'ጫ', 

In [10]:
# Write the processed frame to ../data/processed, creating the folder (and any parents) if missing.
output_path = Path('../data/processed/cleaned_messages.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)

# index=False keeps the DataFrame index out of the file; 'utf-8-sig' prepends a UTF-8 BOM
# (presumably so spreadsheet tools detect the encoding of the non-ASCII text — confirm).
# NOTE(review): the 'tokens' column prints as Python lists in the sample output; CSV stores such
# values as plain strings, so a downstream reader has to parse them back — confirm this is intended.
df_processed.to_csv(output_path, index=False, encoding='utf-8-sig')
print(f"[✅] Saved cleaned and tokenized data to {output_path}")


[✅] Saved cleaned and tokenized data to ..\data\processed\cleaned_messages.csv
