In [1]:
pip install telethon python-dotenv


Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import json
from datetime import datetime
from dotenv import load_dotenv
from telethon import TelegramClient
from telethon.errors import SessionPasswordNeededError
import logging


In [3]:
# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

# Fail fast with a readable message. Without this check an unset TELE_API_ID
# surfaces as an opaque "int() argument must be ... not 'NoneType'" TypeError,
# and a missing hash/phone number only fails later during login.
_REQUIRED_ENV_VARS = ('TELE_API_ID', 'TELE_API_HASH', 'TELE_PHONE_NUMBER')
_missing_env_vars = [name for name in _REQUIRED_ENV_VARS if not os.getenv(name)]
if _missing_env_vars:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_env_vars)
    )

API_ID = int(os.getenv('TELE_API_ID'))  # cast to int before it is handed to TelegramClient
API_HASH = os.getenv('TELE_API_HASH')
PHONE_NUMBER = os.getenv('TELE_PHONE_NUMBER')



Set Up Logging

In [4]:
# Write INFO-and-above records to a log file, one timestamped line per record.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    filename='telegram_scraping_new.log',
)


Connect to Telegram

In [5]:
# Single shared client; the login, scraping and download helpers below all use
# this module-level name. Credentials come from the environment-loaded constants.
client = TelegramClient("session_name", API_ID, API_HASH)

async def start_telegram_client():
    """Connect the shared ``client`` and run an interactive login if needed.

    When the session is not yet authorized, a login code is requested for
    ``PHONE_NUMBER`` and read from the user. If Telegram then reports that a
    password is required (``SessionPasswordNeededError``), the 2FA password is
    prompted for and used to finish signing in.
    """
    # Local import keeps this cell self-contained.
    from getpass import getpass

    await client.connect()
    if not await client.is_user_authorized():
        await client.send_code_request(PHONE_NUMBER)
        # Strip stray whitespace that often comes along when the code is pasted.
        code = input("Enter the code you received from Telegram: ").strip()
        try:
            await client.sign_in(PHONE_NUMBER, code)
        except SessionPasswordNeededError:
            # getpass hides the typed password; input() would echo it into the
            # cell output, which is stored with the notebook.
            password = getpass("Enter your 2FA password: ")
            await client.sign_in(password=password)
    print("✅ Telegram client connected and authorized.")

# Top-level await: valid in a notebook cell because IPython runs the cell inside an event loop.
await start_telegram_client()


✅ Telegram client connected and authorized.


Define Message Scraper Function

In [6]:
async def scrape_messages(channel_username, limit=100):
    """Collect messages from a channel as plain dictionaries.

    Iterates ``client.iter_messages`` for ``channel_username`` (passing
    ``limit`` through) and converts each message with ``to_dict()``. Any
    exception is logged instead of raised, so the returned list holds whatever
    was gathered before the failure, possibly nothing.
    """
    collected = []
    try:
        history = client.iter_messages(channel_username, limit=limit)
        async for msg in history:
            collected.append(msg.to_dict())
        logging.info(f"Scraped {len(collected)} messages from {channel_username}")
    except Exception as exc:
        logging.error(f"Error scraping messages from {channel_username}: {exc!s}")
    return collected


Define Sanitize and Save Functions

In [7]:
def sanitize(obj):
    """Recursively convert a value into something ``json.dump`` can serialize.

    Conversions:
      * ``datetime`` / ``date`` / ``time`` -> string from ``isoformat()``
      * ``bytes`` / ``bytearray`` -> text; undecodable bytes are dropped
      * ``dict`` -> new dict with sanitized values (keys are left untouched)
      * ``list`` -> new list with sanitized items
      * ``tuple`` -> new tuple with sanitized items
      * ``set`` / ``frozenset`` -> list with sanitized items

    Any other value is returned unchanged.
    """
    # Local import: the notebook's import cell only brings in ``datetime``.
    from datetime import date, time

    # ``datetime`` is a subclass of ``date``, so this covers all three types.
    if isinstance(obj, (date, time)):
        return obj.isoformat()
    if isinstance(obj, (bytes, bytearray)):
        return obj.decode(errors='ignore')
    if isinstance(obj, dict):
        return {key: sanitize(value) for key, value in obj.items()}
    if isinstance(obj, list):
        return [sanitize(item) for item in obj]
    if isinstance(obj, tuple):
        # Keep the tuple type but still reach datetimes/bytes nested inside it.
        return tuple(sanitize(item) for item in obj)
    if isinstance(obj, (set, frozenset)):
        # JSON has no set type; a list is the closest serializable shape.
        return [sanitize(item) for item in obj]
    return obj

def save_messages(messages, channel_username):
    """Write a channel's messages to a dated ``messages.json`` file.

    The messages are passed through ``sanitize`` and dumped as indented UTF-8
    JSON under ``../data/raw/telegram_messages/<today>/<channel_username>/``.
    The directory is created when missing, and the destination path is both
    logged and printed.
    """
    payload = sanitize(messages)

    today = datetime.now().strftime("%Y-%m-%d")
    target_dir = f"../data/raw/telegram_messages/{today}/{channel_username}"
    os.makedirs(target_dir, exist_ok=True)

    target_file = os.path.join(target_dir, "messages.json")
    with open(target_file, "w", encoding="utf-8") as out:
        json.dump(payload, out, ensure_ascii=False, indent=2)

    note = f"Saved messages to {target_file}"
    logging.info(note)
    print(f"✅ {note}")


Scrape All 6 Channels

In [8]:
channels = [
    "lobelia4cosmetics",
    "tikvahpharma",
    "CheMed123",
    "EAHCI",
    "Thequorachannel",
    "HakimApps_Guideline"
]

for channel in channels:
    print(f"📥 Scraping: {channel}")
    messages = await scrape_messages(channel, limit=100)
    save_messages(messages, channel)


📥 Scraping: lobelia4cosmetics
✅ Saved messages to ../data/raw/telegram_messages/2025-07-12/lobelia4cosmetics\messages.json
📥 Scraping: tikvahpharma
✅ Saved messages to ../data/raw/telegram_messages/2025-07-12/tikvahpharma\messages.json
📥 Scraping: CheMed123
✅ Saved messages to ../data/raw/telegram_messages/2025-07-12/CheMed123\messages.json
📥 Scraping: EAHCI
✅ Saved messages to ../data/raw/telegram_messages/2025-07-12/EAHCI\messages.json
📥 Scraping: Thequorachannel
✅ Saved messages to ../data/raw/telegram_messages/2025-07-12/Thequorachannel\messages.json
📥 Scraping: HakimApps_Guideline
✅ Saved messages to ../data/raw/telegram_messages/2025-07-12/HakimApps_Guideline\messages.json


In [9]:
async def download_images(channel_username, limit=50, download_path="../data/images"):
    """Save the photos found in a channel's messages to disk.

    Iterates ``client.iter_messages`` for ``channel_username`` (passing
    ``limit`` through) and downloads the media of every message that has a
    photo into ``<download_path>/<channel_username>``, creating that directory
    first. Any exception is logged instead of raised and ends the scan.
    """
    target_dir = os.path.join(download_path, channel_username)
    os.makedirs(target_dir, exist_ok=True)
    try:
        async for msg in client.iter_messages(channel_username, limit=limit):
            if not msg.photo:
                continue
            await msg.download_media(file=target_dir)
            logging.info(f"Downloaded image from {channel_username}")
    except Exception as exc:
        logging.error(f"Error downloading images from {channel_username}: {exc!s}")


In [10]:
# Images are downloaded only from this subset of the scraped channels.
image_channels = ["lobelia4cosmetics", "CheMed123"]

for channel in image_channels:
    print(f"📸 Downloading images from: {channel}")
    # limit caps how many messages are scanned per channel, not how many photos are saved.
    await download_images(channel, limit=30, download_path="../data/images")


📸 Downloading images from: lobelia4cosmetics
📸 Downloading images from: CheMed123
