In [4]:
import os
import pandas as pd
from telethon.sync import TelegramClient
from dotenv import load_dotenv
import asyncio
import nest_asyncio

# Apply the nest_asyncio patch once, right after the imports, so asyncio code
# can run inside a Jupyter notebook (presumably because the notebook already
# has an event loop running -- confirm against the nest_asyncio docs).
nest_asyncio.apply()

In [6]:
# Load Credentials and Define Channels
# load_dotenv() is expected to copy variables from a local .env file into the
# process environment, so the Telegram credentials never live in the notebook.
load_dotenv()

# Fail early with an actionable message instead of the opaque
# "int() argument must be ... not 'NoneType'" raised when API_ID is unset,
# or a late authentication failure when API_HASH is unset.
missing_credentials = [name for name in ('API_ID', 'API_HASH') if not os.getenv(name)]
if missing_credentials:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(missing_credentials)
        + ". Define them in a .env file or in the environment before running this notebook."
    )

api_id = int(os.getenv('API_ID'))  # read from the environment as a string, converted to int here
api_hash = os.getenv('API_HASH')

# List of target Telegram channels (public usernames, without the leading '@').
# NOTE(review): in the recorded run, 'gebeyamart' and 'ethioamazon' could not be
# resolved ("Nobody is using this username") -- verify these usernames.
channels = [
    'Shageronlinestore',
    'ethioonlinemarket1',
    'gebeyamart',
    'ethioamazon',
    'onlinemarketethiopia'
]

In [7]:
# Cell 3: Asynchronous Scraping Function
async def fetch_messages(channel_name, limit=500):
    """Collect the text-bearing messages of a single Telegram channel.

    Args:
        channel_name: Channel identifier handed to ``client.iter_messages``.
        limit: Maximum number of messages to request (defaults to 500).

    Returns:
        A list of dicts with the keys ``channel``, ``message_id``, ``text``,
        ``date`` and ``views`` -- one entry per message whose ``text`` is
        truthy. If iterating the channel raises, the reason is printed and
        the entries gathered up to that point are returned.
    """
    collected = []
    # Session name 'anon': Telethon will create a .session file for it.
    async with TelegramClient('anon', api_id, api_hash) as client:
        print(f"Fetching messages from {channel_name}...")
        try:
            async for msg in client.iter_messages(channel_name, limit=limit):
                # Messages without text are not collected.
                if not msg.text:
                    continue
                record = {
                    'channel': channel_name,
                    'message_id': msg.id,
                    'text': msg.text,
                    'date': msg.date,
                    'views': msg.views,
                }
                collected.append(record)
        except Exception as err:
            # Best effort: report the failure and keep whatever was gathered.
            print(f"Could not fetch from {channel_name}. Reason: {err}")
    return collected

In [8]:
async def main():
    """Scrape every channel in ``channels`` and save the cleaned messages as CSV.

    Each channel is fetched in turn with ``fetch_messages``; the combined
    records are loaded into a DataFrame, rows with missing or whitespace-only
    ``text`` are dropped, and the result is written to
    ``../data/scraped_telegram_data.csv`` (UTF-8 with BOM, no index column).

    If no messages were collected at all, nothing is written and a notice is
    printed instead. Returns None.
    """
    all_channel_data = []
    for channel in channels:
        messages = await fetch_messages(channel)
        all_channel_data.extend(messages)
        print(f"Found {len(messages)} messages in {channel}.")

    # An empty record list yields a DataFrame without a 'text' column, which
    # would make the preprocessing below raise KeyError -- stop early instead.
    if not all_channel_data:
        print("\nNo messages were scraped from any channel; nothing was saved.")
        return

    # Convert to DataFrame
    df = pd.DataFrame(all_channel_data)

    # Basic Preprocessing (reassignment instead of inplace mutation)
    df = df.dropna(subset=['text'])
    df = df.assign(text=df['text'].str.strip())
    df = df[df['text'] != '']

    # Save the raw data; create the target directory first so a fresh checkout
    # without a data/ folder does not fail at write time.
    output_path = '../data/scraped_telegram_data.csv'
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    df.to_csv(output_path, index=False, encoding='utf-8-sig')
    print(f"\nSuccessfully scraped and saved {len(df)} total messages to {output_path}")

In [9]:
# Cell 5: Run the scraper
# Jupyter cells support top-level await, so the async main() coroutine can be
# awaited directly here; the progress messages and final summary are printed
# by main() and fetch_messages().
await main()

Signed in successfully as Abeni M; remember to not break the ToS or you will risk an account ban!
Fetching messages from Shageronlinestore...
Found 228 messages in Shageronlinestore.
Fetching messages from ethioonlinemarket1...
Found 0 messages in ethioonlinemarket1.
Fetching messages from gebeyamart...
Could not fetch from gebeyamart. Reason: Nobody is using this username, or the username is unacceptable. If the latter, it must match r"[a-zA-Z][\w\d]{3,30}[a-zA-Z\d]" (caused by ResolveUsernameRequest)
Found 0 messages in gebeyamart.
Fetching messages from ethioamazon...
Could not fetch from ethioamazon. Reason: Nobody is using this username, or the username is unacceptable. If the latter, it must match r"[a-zA-Z][\w\d]{3,30}[a-zA-Z\d]" (caused by ResolveUsernameRequest)
Found 0 messages in ethioamazon.
Fetching messages from onlinemarketethiopia...
Found 2 messages in onlinemarketethiopia.

Successfully scraped and saved 230 total messages to ../data/scraped_telegram_data.csv
