In [2]:
import os
import asyncio
import pandas as pd
from dotenv import load_dotenv
from telethon.sync import TelegramClient
from telethon.tl.types import MessageMediaPhoto, MessageMediaDocument

class TelegramScraper:
    """Scrape recent messages (and photos) from a list of Telegram channels.

    Credentials (``TELEGRAM_API_ID``, ``TELEGRAM_API_HASH``, ``TELEGRAM_PHONE``)
    are loaded from ``../../config/api_keys.env`` and the channels to scrape
    are read from ``channel_list.txt`` (one channel per line, blank lines
    ignored).  All paths are relative to the current working directory.
    """

    # Output locations, relative to the current working directory.
    RAW_DIR = os.path.join('..', '..', 'data', 'raw')
    MEDIA_DIR = os.path.join(RAW_DIR, 'media')

    # Column order of every message record / of the saved CSV.  Pinning it
    # keeps the CSV header present even when no message was collected.
    MESSAGE_COLUMNS = (
        'channel',
        'message_id',
        'date',
        'text',
        'has_media',
        'media_type',
        'media_path',
    )

    def __init__(self):
        """Load credentials, create the output directories, read the channel list.

        Raises:
            OSError: if ``channel_list.txt`` cannot be opened or the data
                directories cannot be created.
        """
        load_dotenv(os.path.join('..', '..', 'config', 'api_keys.env'))
        self.api_id = os.getenv('TELEGRAM_API_ID')
        self.api_hash = os.getenv('TELEGRAM_API_HASH')
        self.phone = os.getenv('TELEGRAM_PHONE')

        # Creating the media directory also creates its parent raw directory.
        os.makedirs(self.MEDIA_DIR, exist_ok=True)

        # Load channels to scrape.  ``utf-8-sig`` reads plain UTF-8 and also
        # drops a leading BOM so it cannot end up in the first channel name.
        with open('channel_list.txt', 'r', encoding='utf-8-sig') as f:
            stripped = (line.strip() for line in f)
            self.channels = [name for name in stripped if name]

    async def scrape_channel(self, channel_name, limit=100):
        """Scrape messages from a single Telegram channel.

        Args:
            channel_name: Channel identifier passed to ``iter_messages``.
            limit: Maximum number of messages to fetch.

        Returns:
            A list of dicts, one per message, keyed by ``MESSAGE_COLUMNS``.
            Photos are downloaded into the media directory.
        """
        client = TelegramClient('ethiomart_session', self.api_id, self.api_hash)
        try:
            # Start explicitly instead of via ``async with``: entering the
            # client as a context manager calls ``start()`` with its default
            # interactive phone prompt *before* the configured phone number
            # could be supplied.  Without a configured phone we keep that
            # interactive fallback.
            start_kwargs = {'phone': self.phone} if self.phone else {}
            await client.start(**start_kwargs)

            messages = []
            async for message in client.iter_messages(channel_name, limit=limit):
                record = await self._message_to_record(client, channel_name, message)
                messages.append(record)
            return messages
        finally:
            # Always release the connection, even when scraping fails midway.
            await client.disconnect()

    async def _message_to_record(self, client, channel_name, message):
        """Convert one message into a record dict, downloading a photo if present."""
        msg_data = {
            'channel': channel_name,
            'message_id': message.id,
            # Guard against a message without a date rather than crashing
            # the whole channel on a single odd entry.
            'date': message.date.isoformat() if message.date else None,
            'text': message.text,
            'has_media': bool(message.media),
            'media_type': None,
            'media_path': None,
        }

        if isinstance(message.media, MessageMediaPhoto):
            msg_data['media_type'] = 'photo'
            filename = f"{channel_name}_{message.id}.jpg"
            filepath = os.path.join(self.MEDIA_DIR, filename)
            await client.download_media(message.media, file=filepath)
            msg_data['media_path'] = filepath
        elif isinstance(message.media, MessageMediaDocument):
            # Documents are only labelled, not downloaded.
            msg_data['media_type'] = 'document'

        return msg_data

    async def scrape_all_channels(self):
        """Scrape all channels listed in channel_list.txt and save them as CSV.

        A failure on one channel is reported and skipped so the remaining
        channels are still scraped.

        Returns:
            A ``pandas.DataFrame`` with one row per collected message and the
            columns in ``MESSAGE_COLUMNS`` (also when no message was collected).
        """
        all_messages = []

        for channel in self.channels:
            try:
                print(f"Scraping {channel}...")
                messages = await self.scrape_channel(channel)
            except Exception as e:
                # Best-effort boundary: report and continue with the next channel.
                print(f"Error scraping {channel}: {e}")
            else:
                all_messages.extend(messages)
                print(f"Collected {len(messages)} messages from {channel}")

        # Save raw data
        df = pd.DataFrame(all_messages, columns=list(self.MESSAGE_COLUMNS))
        raw_data_path = os.path.join(self.RAW_DIR, 'telegram_messages.csv')
        df.to_csv(raw_data_path, index=False)
        print(f"Saved raw data to {raw_data_path}")
        return df

# Notebook entry point: build the scraper and run a full scrape of every
# configured channel.
# NOTE: the bare top-level ``await`` below is a syntax error in a plain Python
# script. When running outside Jupyter/IPython, drive the coroutine with
# ``asyncio.run(scraper.scrape_all_channels())`` instead.
if __name__ == "__main__":
    scraper = TelegramScraper()
    await scraper.scrape_all_channels()  # Only works in Jupyter/IPython

Scraping ZemenExpress...
Collected 100 messages from ZemenExpress
Scraping nevacomputer...
Collected 100 messages from nevacomputer
Scraping meneshayeofficial...
Collected 100 messages from meneshayeofficial
Scraping ethio_brand_collection...
Collected 100 messages from ethio_brand_collection
Scraping Leyueqa...
Collected 100 messages from Leyueqa
Saved raw data to ..\..\data\raw\telegram_messages.csv
