In [1]:
!pip install loguru
import os
import sys

# Create the utils folder and logger.py file
os.makedirs("utils", exist_ok=True)
with open("utils/logger.py", "w") as f:
    f.write("from loguru import logger\n")

# Add current directory to Python path
sys.path.append(os.path.abspath("."))

# Now safely import logger
from utils.logger import logger

logger.info("✅ Logger is working and utils/logger.py is found!")





[32m2025-07-13 14:40:46.764[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m16[0m - [1m✅ Logger is working and utils/logger.py is found![0m




In [2]:
# ✅ Install once per environment
!pip install --upgrade pip
!pip install telethon python-dotenv loguru nest_asyncio

# ✅ Imports
import os
import json
import asyncio
from datetime import datetime
from telethon import TelegramClient
from dotenv import load_dotenv
from loguru import logger
import nest_asyncio

# ✅ Allows async in Jupyter notebooks
nest_asyncio.apply()

# ✅ Load environment variables (values from a local .env file are picked up first)
load_dotenv()
api_id_raw, api_hash, session_name = (
    os.getenv(key) for key in ("API_ID", "API_HASH", "SESSION_NAME")
)

# All three settings are mandatory; stop early when any is missing or empty.
if not all((api_id_raw, api_hash, session_name)):
    raise ValueError("Please make sure API_ID, API_HASH, and SESSION_NAME are set in the .env file.")

# API_ID is converted to int; the other two are used as read.
API_ID, API_HASH, SESSION_NAME = int(api_id_raw), api_hash, session_name

# ✅ Telegram Channels to scrape
CHANNELS = [
    "https://t.me/CheMed123",
    "https://t.me/lobelia4cosmetics",
    "https://t.me/tikvahpharma",
]

# ✅ Prepare storage paths based on today's date (one folder per run date)
TODAY = datetime.now().strftime("%Y-%m-%d")

BASE_PATH = f"data/raw/telegram_messages/{TODAY}"
os.makedirs(BASE_PATH, exist_ok=True)

IMAGE_PATH = f"data/raw/images/{TODAY}"
os.makedirs(IMAGE_PATH, exist_ok=True)

# ✅ Initialize Telegram client (shared by every scraping coroutine below)
client = TelegramClient(SESSION_NAME, API_ID, API_HASH)

# ✅ Scrape channel function
async def scrape_channel(channel_url, limit=100):
    """Scrape the most recent messages of one Telegram channel to disk.

    Messages are fetched through the module-level ``client``. Attached photos
    are downloaded to ``IMAGE_PATH/<channel>/<message id>.jpg`` and the
    collected records are written to ``BASE_PATH/<channel>.json``.

    Args:
        channel_url: Channel link such as ``https://t.me/<name>``; a trailing
            slash is tolerated.
        limit: Maximum number of messages to request (defaults to 100).

    Returns:
        None. Failures are logged and not re-raised.
    """
    # rstrip so "https://t.me/name/" does not produce an empty channel name
    # (which would write ".json" and dump images into the shared folder).
    channel_name = channel_url.rstrip("/").split("/")[-1]
    logger.info(f"Scraping channel: {channel_name}")
    messages_data = []
    channel_img_path = os.path.join(IMAGE_PATH, channel_name)
    os.makedirs(channel_img_path, exist_ok=True)

    try:
        async for message in client.iter_messages(channel_url, limit=limit):
            msg = {
                "id": message.id,
                "date": str(message.date),
                "sender_id": message.sender_id,
                "text": message.text,
                "has_photo": bool(message.photo),
                "channel": channel_name,
                # Always present so every record shares the same keys;
                # stays None when there is no photo or the download fails.
                "image_path": None,
            }

            # Download image if it exists
            if message.photo:
                image_path = os.path.join(channel_img_path, f"{message.id}.jpg")
                try:
                    saved_path = await client.download_media(message.photo, file=image_path)
                except Exception as e:
                    logger.warning(f"Image download failed: {e}")
                else:
                    if saved_path:
                        # Record the path reported by the client rather than
                        # the requested one, since nothing may have been written.
                        msg["image_path"] = saved_path
                        logger.debug(f"Downloaded image: {saved_path}")
                    else:
                        logger.warning(
                            f"Image download failed: no file written for message {message.id}"
                        )

            messages_data.append(msg)

        # Save JSON file
        out_file = os.path.join(BASE_PATH, f"{channel_name}.json")
        with open(out_file, "w", encoding="utf-8") as f:
            json.dump(messages_data, f, ensure_ascii=False, indent=2)

        logger.success(f"Saved {len(messages_data)} messages from {channel_name}")

    except Exception as e:
        logger.error(f"Error scraping {channel_url}: {e}")

# ✅ Main async runner
async def main():
    """Start the shared client, scrape every configured channel concurrently, then disconnect.

    Blank entries in ``CHANNELS`` are skipped and surrounding whitespace is
    stripped from each URL before it is passed to ``scrape_channel``.
    """
    await client.start()
    try:
        tasks = [scrape_channel(url.strip()) for url in CHANNELS if url.strip()]
        await asyncio.gather(*tasks)
    finally:
        # Always release the connection, even if a scrape task raises.
        await client.disconnect()

# ✅ Run the scraping logic
# Uses top-level `await`, so this line only works where that is supported
# (e.g. a notebook cell); a plain .py script would need asyncio.run(main()).
await main()




[32m2025-07-13 14:41:03.431[0m | [1mINFO    [0m | [36m__main__[0m:[36mscrape_channel[0m:[36m50[0m - [1mScraping channel: CheMed123[0m
[32m2025-07-13 14:41:03.438[0m | [1mINFO    [0m | [36m__main__[0m:[36mscrape_channel[0m:[36m50[0m - [1mScraping channel: lobelia4cosmetics[0m
[32m2025-07-13 14:41:03.443[0m | [1mINFO    [0m | [36m__main__[0m:[36mscrape_channel[0m:[36m50[0m - [1mScraping channel: tikvahpharma[0m
[32m2025-07-13 14:41:05.980[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mscrape_channel[0m:[36m73[0m - [34m[1mDownloaded image: data/raw/images/2025-07-13\lobelia4cosmetics\18535.jpg[0m
[32m2025-07-13 14:41:07.572[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mscrape_channel[0m:[36m73[0m - [34m[1mDownloaded image: data/raw/images/2025-07-13\CheMed123\97.jpg[0m
[32m2025-07-13 14:41:08.564[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mscrape_channel[0m:[36m73[0m - [34m[1mDownloaded image: data/raw/images/20