In [25]:
# scrape_telegram.py

# If you haven't installed these yet:
# pip install telethon python-dotenv

import os
import logging
from pathlib import Path

from dotenv import load_dotenv
from telethon import TelegramClient
from telethon.errors import RPCError

def main():
    # 1) Locate and load .env from the current working directory
    cwd      = Path(os.getcwd())
    env_file = cwd / ".env"
    if not env_file.exists():
        raise FileNotFoundError(
            f"No .env found at {env_file!r}. "
            "Place your .env (with API_ID and API_HASH) in this folder."
        )
    load_dotenv(dotenv_path=env_file)

    # 2) Retrieve and validate credentials
    raw_id   = os.getenv("API_ID")
    raw_hash = os.getenv("API_HASH")
    print(f"DEBUG: raw API_ID={raw_id!r}, API_HASH={raw_hash!r}")

    if raw_id is None or raw_hash is None:
        raise RuntimeError(
            "Environment variables API_ID or API_HASH not set. "
            "Check your .env file."
        )
    try:
        api_id = int(raw_id)
    except ValueError:
        raise RuntimeError(f"API_ID value {raw_id!r} is not integer‐parsable.")

    api_hash = raw_hash  # remains a string

    # 3) Configure logging
    log_dir = cwd / "data" / "raw" / "telegram_messages"
    log_dir.mkdir(parents=True, exist_ok=True)
    logging.basicConfig(
        filename=str(log_dir / "scrape.log"),
        level=logging.INFO,
        format="%(asctime)s - %(levelname)s - %(message)s"
    )

    # 4) Initialize and start the Telegram client
    client = TelegramClient("session_scrape", api_id, api_hash)
    try:
        client.start()  # on first run, may prompt for your phone & code
        print("✅ Telegram client initialized and connected.")
    except RPCError as e:
        logging.error(f"Failed to start Telegram client: {e!r}")
        print(f"❌ Error initializing Telegram client: {e}")
        return

    # --- Your scraping logic goes here ---
    # e.g., dialogs = client.get_dialogs(), client.download_media(), etc.
    #
    # When done:
    # client.disconnect()

if __name__ == "__main__":
    main()


DEBUG: raw API_ID='6592689', API_HASH='5d9b4b87ca31121363a5421dafe3071a'
✅ Telegram client initialized and connected.


  client.start()  # on first run, may prompt for your phone & code


In [27]:
# scrape_telegram.py

import os
import json
import asyncio
import logging
from datetime import datetime
from pathlib import Path

from dotenv import load_dotenv
from telethon import TelegramClient
from telethon.errors import RPCError

# 1) Load credentials from .env
load_dotenv()  # looks for .env in cwd
raw_id   = os.getenv("API_ID")
raw_hash = os.getenv("API_HASH")

if raw_id is None or raw_hash is None:
    raise RuntimeError(
        "Missing API_ID or API_HASH in your .env file. "
        "Make sure .env contains API_ID=... and API_HASH=..."
    )
try:
    api_id = int(raw_id)
except ValueError:
    raise RuntimeError(f"API_ID {raw_id!r} is not an integer")

api_hash = raw_hash

# 2) Configure logging
today_str = datetime.now().strftime("%Y-%m-%d")
base_dir  = Path("data") / "raw" / "telegram_messages" / today_str
log_file  = base_dir / "scrape.log"
(base_dir / "media").mkdir(parents=True, exist_ok=True)

logging.basicConfig(
    filename=str(log_file),
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s"
)

# 3) Define your channels
channels = [
    "lobelia4cosmetics",
    "tikvahpharma",
    # add more usernames here...
]

# 4) Create the client
client = TelegramClient("session_scrape", api_id, api_hash)

async def main():
    try:
        await client.start()
        logging.info("✅ Telegram client started successfully")
        print("Telegram client connected ✅")
    except RPCError as e:
        logging.error(f"Failed to start Telegram client: {e!r}")
        print(f"Error initializing Telegram client: {e}")
        return

    # 5) Scrape each channel
    for channel in channels:
        try:
            logging.info(f"Start scraping channel: {channel}")
            messages_data = []

            # iterate in chronological order
            async for msg in client.iter_messages(channel, reverse=True):
                record = {
                    "message_id": msg.id,
                    "channel":     channel,
                    "date":        msg.date.strftime("%Y-%m-%d %H:%M:%S"),
                    "text":        msg.message,
                    "sender_id":   msg.sender_id,
                    "media_file":  None
                }

                # if there's media, download it
                if msg.media:
                    media_path = base_dir / "media" / f"{channel}_{msg.id}"
                    try:
                        file_path = await msg.download_media(file=str(media_path))
                        record["media_file"] = file_path
                    except Exception as ex:
                        logging.error(f"Media download error for {channel}#{msg.id}: {ex}")

                messages_data.append(record)

            # write out JSON
            out_json = base_dir / f"{channel}.json"
            with open(out_json, "w", encoding="utf-8") as f:
                json.dump(messages_data, f, ensure_ascii=False, indent=2)

            logging.info(f"Saved {len(messages_data)} msgs for {channel} → {out_json.name}")
            print(f"Scraped {len(messages_data)} messages from {channel}")

        except RPCError as e:
            logging.error(f"RPCError scraping {channel}: {e!r}")
            print(f"Error scraping {channel}: {e}")

    await client.disconnect()
    logging.info("🔌 Client disconnected")

if __name__ == "__main__":
    asyncio.run(main())


RuntimeError: asyncio.run() cannot be called from a running event loop