In [1]:
# 1. Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import psycopg2
from sqlalchemy import create_engine
import os
from dotenv import load_dotenv

In [2]:
# Load environment variables
load_dotenv()

# PostgreSQL connection setup
DB_USER = os.getenv("POSTGRES_USER", "postgres")
DB_PASS = os.getenv("POSTGRES_PASSWORD", "postgres")
DB_HOST = os.getenv("POSTGRES_HOST", "db")
DB_PORT = os.getenv("POSTGRES_PORT", "5432")
DB_NAME = os.getenv("POSTGRES_DB", "telegram_data_db")

DATABASE_URL = f"postgresql://{DB_USER}:{DB_PASS}@{DB_HOST}:{DB_PORT}/{DB_NAME}"
engine = create_engine(DATABASE_URL)

In [6]:
query = """
SELECT * FROM public_staging.stg_telegram_messages;
"""
df = pd.read_sql(query, engine)
df.head()

Unnamed: 0,id,source_file,date,text,views,from_id,replies,forwards
0,1,CheMed123_2025-07-17,2023-02-10 12:23:06,"⚠️Notice!\nDear esteemed customers,\nDue to fo...",1222.0,,,1.0
1,2,CheMed123_2025-07-17,2023-02-02 08:58:52,Mela-One በውስጡ ሆርሞን ያለው ድንገተኛ ወሊድ መቆጣጠርያ ሲሆን ያለ...,1123.0,,,3.0
2,3,CheMed123_2025-07-17,2023-02-01 08:59:37,አዚትሮማይሲን በሃኪም መድሃኒት ማዘዣ ከሚታዘዙ አንቲባዮቲኮች አንዱ ሲሆን...,1015.0,,,4.0
3,4,CheMed123_2025-07-17,2023-01-31 09:19:53,Che-Med Trivia #3\n\nምግብና መጠጦች አንዳንድ መድሃኒቶች በደ...,784.0,,,1.0
4,5,CheMed123_2025-07-17,2023-01-30 09:45:25,"Che-Med Trivia #2\n\nእንደ Ciprofloxacin, Doxycy...",681.0,,,2.0


## Frequently mentioned Term?

In [25]:
query = """
SELECT
  word,
  COUNT(*) AS mention_count
FROM (
  SELECT
    unnest(string_to_array(lower(text), ' ')) AS word
  FROM public_staging.stg_telegram_messages
) sub
WHERE 
  -- Exclude stopwords
  word NOT IN (
    '', '-', 'of', 'and', 'the', 'a', 'an', 'in', 'on', 'for', 'to', 'with',
    'is', 'are', 'was', 'were', 'by', 'at', 'as', 'that', 'this', 'from',
    'እስከ', 'until', 'high', 'ከሰኞ'
  )
  -- Exclude digits
  AND word !~ '[0-9]'
  AND word ~ '^[a-zA-Zአ-ኸ]{8,}$' -- Exclude special characters (non-letter characters)
GROUP BY word
ORDER BY mention_count DESC
LIMIT 20;
"""
df = pd.read_sql(query, engine)
df


Unnamed: 0,word,mention_count
0,pharmacy,1548
1,delivery,1521
2,midnight,1313
3,cosmetics,1313
4,medhanialem,1313
5,centrifuge,426
6,surgical,291
7,measures,285
8,examination,241
9,pharmacist,228


## Posting Patterns - Daily 

In [5]:
query = """
SELECT
  DATE_TRUNC('day', date) AS day,
  COUNT(*) AS message_count
FROM public_staging.stg_telegram_messages
GROUP BY day
ORDER BY 2 desc;
"""
df = pd.read_sql(query, engine)
df.head(15)

Unnamed: 0,day,message_count
0,2025-07-16,174
1,2025-07-15,153
2,2025-06-04,144
3,2025-06-23,130
4,2025-07-17,126
5,2025-06-18,112
6,2025-07-14,111
7,2025-06-20,106
8,2025-06-16,106
9,2025-06-30,104
