In [1]:
%pip install telethon

Defaulting to user installation because normal site-packages is not writeable
Collecting telethon
  Using cached Telethon-1.40.0-py3-none-any.whl.metadata (3.9 kB)
Collecting pyaes (from telethon)
  Using cached pyaes-1.6.1-py3-none-any.whl
Collecting rsa (from telethon)
  Using cached rsa-4.9.1-py3-none-any.whl.metadata (5.6 kB)
Collecting pyasn1>=0.1.3 (from rsa->telethon)
  Using cached pyasn1-0.6.1-py3-none-any.whl.metadata (8.4 kB)
Using cached Telethon-1.40.0-py3-none-any.whl (722 kB)
Using cached rsa-4.9.1-py3-none-any.whl (34 kB)
Using cached pyasn1-0.6.1-py3-none-any.whl (83 kB)
Installing collected packages: pyaes, pyasn1, rsa, telethon
Successfully installed pyaes-1.6.1 pyasn1-0.6.1 rsa-4.9.1 telethon-1.40.0
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
import asyncio
import re
import unicodedata
from datetime import datetime
from telethon import TelegramClient
import pandas as pd
import json
import os
from pathlib import Path

In [10]:
# Configuration for the Telegram API.
# Credentials are read from environment variables so that no secret is stored in
# the notebook itself. Obtain the values from my.telegram.org and export
# TELEGRAM_API_ID, TELEGRAM_API_HASH and TELEGRAM_PHONE before starting Jupyter.
# A variable that is not set yields None here.
# NOTE(review): the credentials that used to be hardcoded in this cell were
# exposed in the notebook and should be revoked and regenerated.
api_id = os.getenv('TELEGRAM_API_ID')      # Application id from my.telegram.org
api_hash = os.getenv('TELEGRAM_API_HASH')  # Application hash from my.telegram.org
phone = os.getenv('TELEGRAM_PHONE')        # Phone number with country code
session_name = 'ethiomart'                 # Session name handed to TelegramClient

In [11]:
# Ethiopian e-commerce Telegram channels to ingest, written as t.me links
channels = [
    't.me/ZemenExpress',
    't.me/ethio_brand_collection',
    't.me/Leyueqa',
    't.me/AwasMart',
    't.me/marakibrand',
]

In [12]:
# Build the Telegram client shared by the cells below
client = TelegramClient(
    session=session_name,
    api_id=api_id,
    api_hash=api_hash,
)


In [13]:

def _document_file_name(document):
    """Return the document's file name without directory parts, or None.

    Every attribute is inspected for a ``file_name`` instead of assuming the
    first one carries it; the attribute list may be empty or may start with an
    attribute that has no ``file_name`` at all.
    """
    for attribute in getattr(document, 'attributes', None) or ():
        raw_name = getattr(attribute, 'file_name', None)
        if raw_name:
            # Keep only the last path component so a name supplied by the
            # channel cannot point outside data/documents.
            return Path(raw_name).name or None
    return None


async def _save_media(message, target_path):
    """Create the target's parent folder, download the media, return the result of download_media."""
    Path(target_path).parent.mkdir(parents=True, exist_ok=True)
    return await client.download_media(message, target_path)


async def fetch_messages(channel, limit=100):
    """
    Fetch messages from a specified Telegram channel.

    Photos are saved under data/images and documents under data/documents;
    both file names start with the channel handle and the message id.

    Args:
        channel (str): Telegram channel URL or username.
        limit (int): Number of messages to fetch.
    Returns:
        list: One dictionary per message with the keys 'channel',
            'message_id', 'timestamp', 'sender_id', 'text', 'has_image',
            'image_path', 'has_document' and 'document_path'. The path
            entries hold the value returned by ``client.download_media`` and
            stay None when the message has no such media.
    """
    # The last URL segment is the channel handle; tolerate a trailing slash.
    channel_name = channel.rstrip('/').split('/')[-1]

    messages_data = []
    async with client:
        entity = await client.get_entity(channel)
        async for message in client.iter_messages(entity, limit=limit):
            msg_data = {
                'channel': channel,
                'message_id': message.id,
                'timestamp': message.date.isoformat(),
                'sender_id': message.sender_id,
                'text': message.text or '',
                'has_image': bool(message.photo),
                'image_path': None,
                'has_document': bool(message.document),
                'document_path': None
            }

            # Download images if present
            if message.photo:
                image_path = f"data/images/{channel_name}_{message.id}.jpg"
                msg_data['image_path'] = await _save_media(message, image_path)

            # Download documents if present
            if message.document:
                doc_path = f"data/documents/{channel_name}_{message.id}"
                file_name = _document_file_name(message.document)
                if file_name:
                    doc_path = f"{doc_path}_{file_name}"
                # Without a file name the bare "<handle>_<id>" path is used.
                msg_data['document_path'] = await _save_media(message, doc_path)

            messages_data.append(msg_data)

    return messages_data


In [14]:

# Single-character substitutions applied by normalize_amharic_text.
_AMHARIC_TRANSLATION = str.maketrans({
    # Variant letters folded onto one canonical form
    'ሃ': 'ሀ',
    'ኃ': 'ሀ',
    'ሓ': 'ሐ',
    'ኅ': 'ሐ',
    'ፀ': 'ጸ',
    # Sentence terminators rewritten as an ASCII period
    '።': '.',
    '।': '.',
    # Add more mappings as needed
})


def normalize_amharic_text(text):
    """
    Normalize Amharic text by handling specific linguistic features.

    Steps: Unicode NFC normalization, folding of variant letters
    (ሃ/ኃ to ሀ, ሓ/ኅ to ሐ, ፀ to ጸ), replacing the sentence terminators
    ። and । with '.', then trimming and collapsing whitespace runs to a
    single space. A literal '|' is left untouched.

    Args:
        text (str): Input text in Amharic or mixed language.
    Returns:
        str: Normalized text; '' when the input is empty or None.
    """
    if not text:
        return ''

    # Normalize Unicode characters (e.g., combining characters in Amharic)
    text = unicodedata.normalize('NFC', text)

    # Letter variants and sentence terminators are all one-to-one character
    # replacements, so one translate() pass covers them.
    text = text.translate(_AMHARIC_TRANSLATION)

    # Remove surrounding whitespace and collapse inner runs to one space
    return re.sub(r'\s+', ' ', text.strip())


In [15]:

def tokenize_amharic_text(text):
    """
    Split text into word tokens and punctuation tokens.

    A token is either a run of word characters / characters in the range
    ሀ-ፙ, or one of the punctuation marks . , ! ? ;
    Anything else is dropped.

    Args:
        text (str): Normalized Amharic text.
    Returns:
        list: List of tokens.
    """
    token_pattern = re.compile(r'[\wሀ-ፙ]+|[.,!?;]', re.UNICODE)
    return [found.group(0) for found in token_pattern.finditer(text)]


In [16]:

def preprocess_message(msg_data):
    """
    Add normalized text and tokens to one message dictionary.

    The dictionary is modified in place and the same object is returned.

    Args:
        msg_data (dict): Message dictionary; its 'text' entry is processed.
    Returns:
        dict: The same dictionary with 'normalized_text' and 'tokens' added.
    """
    cleaned = normalize_amharic_text(msg_data['text'])
    msg_data.update(
        normalized_text=cleaned,
        tokens=tokenize_amharic_text(cleaned),
    )
    return msg_data


In [None]:

async def main():
    """
    Fetch, preprocess and store messages from every channel in ``channels``.

    Writes data/processed/telegram_messages.csv and
    data/processed/telegram_messages.json, and prints progress for each
    channel plus a final count.
    """
    # Make sure every output folder exists before anything is written
    for folder in ('data/images', 'data/documents', 'data/processed'):
        Path(folder).mkdir(parents=True, exist_ok=True)

    # Collect the preprocessed messages of all channels in one list
    collected = []
    for channel in channels:
        print(f"Fetching messages from {channel}...")
        fetched = await fetch_messages(channel, limit=100)
        collected.extend(preprocess_message(item) for item in fetched)

    # Tabular copy as CSV
    pd.DataFrame(collected).to_csv(
        'data/processed/telegram_messages.csv',
        index=False,
        encoding='utf-8',
    )

    # Machine-readable copy as JSON; non-ASCII text is written unescaped
    with open('data/processed/telegram_messages.json', 'w', encoding='utf-8') as json_file:
        json.dump(collected, json_file, ensure_ascii=False, indent=2)

    print(f"Processed {len(collected)} messages and saved to data/processed/")

# Run the ingestion and preprocessing pipeline in Jupyter Notebook.
# Top-level `await` is accepted by the notebook; a plain Python script would
# need asyncio.run(main()) instead.
await main()

Fetching messages from t.me/ZemenExpress...
