In [1]:
import os
import psycopg2
from dotenv import load_dotenv

# Load variables from a .env file (python-dotenv) into the process environment.
load_dotenv()

# Read every connection setting in one pass; os.getenv returns None for a
# variable that is not set.
DB_NAME, DB_USER, DB_PASS, DB_HOST, DB_PORT = (
    os.getenv(env_key)
    for env_key in ("DB_NAME", "DB_USER", "DB_PASS", "DB_HOST", "DB_PORT")
)

# Module-level connection and cursor; the cells below use these names directly.
conn = psycopg2.connect(
    dbname=DB_NAME, user=DB_USER, password=DB_PASS, host=DB_HOST, port=DB_PORT
)
cursor = conn.cursor()

print("✅ Connected to PostgreSQL!")


✅ Connected to PostgreSQL!


In [2]:
# DDL for the raw landing table that load_json_to_postgres inserts into.
# IF NOT EXISTS makes this cell safe to re-run; it also means a table that
# already exists is left untouched, so editing the column list below does not
# change an existing table.
# NOTE(review): message_id alone is the primary key. If message ids are only
# unique within a channel (presumably the case for Telegram exports -- confirm),
# rows from different channels can collide; a (channel, message_id) key may be
# what is wanted.
create_table_sql = """
CREATE TABLE IF NOT EXISTS raw_telegram_messages (
    message_id BIGINT PRIMARY KEY,
    date TIMESTAMP,
    sender_id TEXT,
    text TEXT,
    channel TEXT,
    has_photo BOOLEAN
);
"""

# Runs on the module-level cursor/connection created in the connection cell.
# The explicit commit persists the CREATE TABLE (psycopg2 connections are not
# in autocommit mode by default).
cursor.execute(create_table_sql)
conn.commit()

print("✅ Table 'raw_telegram_messages' ensured in database")


✅ Table 'raw_telegram_messages' ensured in database


In [3]:
import json
from glob import glob
from datetime import datetime

def load_json_to_postgres(file_path, channel_name):
    """Insert every message from one exported JSON file into ``raw_telegram_messages``.

    The file is read as JSON and iterated as a sequence of message objects. For
    each message the ``id``, ``date``, ``from_id``, ``message`` and ``photo``
    keys are read; a missing key is stored as NULL (``False`` for ``has_photo``).

    Each message is inserted and committed on its own. Any failure for a single
    message -- a malformed entry, an unparseable ``date`` or a database error --
    is printed, the transaction is rolled back, and loading continues with the
    next message, so one bad record never aborts the rest of the file.

    Relies on the module-level ``cursor`` and ``conn`` objects.

    NOTE(review): conflicts are keyed on ``message_id`` alone, so a row whose id
    is already present is skipped regardless of which channel it came from.
    Confirm that ids are unique across channels.

    Args:
        file_path: Path of the JSON file to load.
        channel_name: Value stored in the ``channel`` column for every row.

    Raises:
        OSError: If ``file_path`` cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    insert_sql = """
                INSERT INTO raw_telegram_messages (message_id, date, sender_id, text, channel, has_photo)
                VALUES (%s, %s, %s, %s, %s, %s)
                ON CONFLICT (message_id) DO NOTHING;
                """

    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    for msg in data:
        # Defined up front so the skip message below is valid even when `msg`
        # itself is malformed (e.g. not a dict).
        msg_id = None
        try:
            msg_id = msg.get("id")

            msg_date = msg.get("date")
            if msg_date:
                # fromisoformat on older Pythons rejects a trailing "Z", so
                # rewrite it as an explicit UTC offset first.
                # NOTE(review): this yields a timezone-aware datetime while the
                # column is TIMESTAMP; how the offset is applied presumably
                # depends on the session time zone -- confirm.
                msg_date = datetime.fromisoformat(msg_date.replace("Z", "+00:00"))

            # `from_id` may be an object carrying `user_id`, or a bare value.
            sender = msg.get("from_id", {})
            sender_id = sender.get("user_id") if isinstance(sender, dict) else sender

            row = (
                msg_id,
                msg_date,
                sender_id,
                msg.get("message"),
                channel_name,
                bool(msg.get("photo")),
            )
            cursor.execute(insert_sql, row)
        except Exception as e:
            print(f"⚠️ Skipping message {msg_id} due to error: {e}")
            # Clears a failed transaction so the next insert can proceed.
            conn.rollback()
        else:
            conn.commit()

    print(f"✅ Loaded data from {channel_name} - {file_path}")


In [5]:
import os

raw_data_path = "../data/raw/telegram_messages"

# Match <raw_data_path>/<dir>/<dir>/messages.json -- exactly two directory
# levels deep (this is not a recursive search).
# glob() gives no ordering guarantee, and load order matters because the loader
# skips rows whose message_id already exists; sort so every run processes the
# files in the same order.
json_files = sorted(glob(os.path.join(raw_data_path, "*", "*", "messages.json")))

if not json_files:
    # A wrong working directory or an empty export would otherwise pass silently.
    print(f"⚠️ No messages.json files found under {raw_data_path}")

for file_path in json_files:
    # The channel name is the directory that directly contains messages.json.
    channel_name = os.path.basename(os.path.dirname(file_path))
    load_json_to_postgres(file_path, channel_name)


✅ Loaded data from CheMed123 - ../data/raw/telegram_messages\2025-07-10\CheMed123\messages.json
✅ Loaded data from EAHCI - ../data/raw/telegram_messages\2025-07-10\EAHCI\messages.json
✅ Loaded data from HakimApps_Guideline - ../data/raw/telegram_messages\2025-07-10\HakimApps_Guideline\messages.json
✅ Loaded data from lobelia4cosmetics - ../data/raw/telegram_messages\2025-07-10\lobelia4cosmetics\messages.json
✅ Loaded data from Thequorachannel - ../data/raw/telegram_messages\2025-07-10\Thequorachannel\messages.json
✅ Loaded data from tikvahpharma - ../data/raw/telegram_messages\2025-07-10\tikvahpharma\messages.json


In [6]:
# Tear down the database resources opened in the connection cell: the cursor
# first, then the connection it was created from.
# After this cell runs, `cursor` and `conn` are closed; re-run the connection
# cell before re-running any cell that uses them.
cursor.close()
conn.close()
print("✅ PostgreSQL connection closed.")


✅ PostgreSQL connection closed.
