In [1]:
import json
import os

import psycopg2
from dateutil import parser as dateparser

In [2]:
# ---------------------------------------------------------
# DATABASE CONNECTION
# ---------------------------------------------------------
# Connection settings are read from the standard libpq environment variables
# (PGHOST, PGDATABASE, PGUSER, PGPASSWORD) so that credentials do not have to
# live in the notebook. The previous hard-coded values are kept only as
# local-development fallbacks when a variable is not set.
conn = psycopg2.connect(
    host=os.environ.get("PGHOST", "localhost"),
    database=os.environ.get("PGDATABASE", "mates"),
    user=os.environ.get("PGUSER", "postgres"),
    password=os.environ.get("PGPASSWORD", "1"),
)
# With autocommit enabled each statement is committed on its own, so a failure
# part-way through one article does not roll back that article's earlier inserts.
conn.autocommit = True
# Module-level cursor shared by every helper function defined below.
cur = conn.cursor()

In [3]:
def get_or_create_source(source_url):
    """Upsert a row in ``sources`` keyed by URL and return its ``source_id``.

    The URL is written to both ``source_url`` and ``source_name``. On a
    ``source_url`` conflict the existing row's ``source_name`` is overwritten
    with that same value, so the statement returns an id for new and existing
    rows alike.

    Uses the module-level cursor ``cur``.
    """
    upsert_sql = """
        INSERT INTO sources (source_url, source_name)
        VALUES (%s, %s)
        ON CONFLICT (source_url) DO UPDATE SET source_name = EXCLUDED.source_name
        RETURNING source_id;
    """
    bind_values = (source_url, source_url)
    cur.execute(upsert_sql, bind_values)

    returned_row = cur.fetchone()
    return returned_row[0]

In [4]:
def get_or_create_category(category_name):
    """Upsert a category by name and return its ``category_id``.

    Returns ``None`` without touching the database when ``category_name`` is
    falsy (``None`` or an empty string). Otherwise the name is inserted into
    ``categories``; on a ``category_name`` conflict the row is updated with the
    same name so that an id is returned either way.

    Uses the module-level cursor ``cur``.
    """
    if category_name:
        upsert_sql = """
        INSERT INTO categories (category_name)
        VALUES (%s)
        ON CONFLICT (category_name) DO UPDATE SET category_name = EXCLUDED.category_name
        RETURNING category_id;
    """
        cur.execute(upsert_sql, (category_name,))
        returned_row = cur.fetchone()
        return returned_row[0]

    return None

In [5]:
def get_or_create_tag(tag_name):
    """Upsert a tag by name and return its ``tag_id``.

    The name is inserted into ``tags``; on a ``tag_name`` conflict the row is
    updated with the same name so that an id is returned either way.

    Uses the module-level cursor ``cur``.
    """
    upsert_sql = """
        INSERT INTO tags (tag_name)
        VALUES (%s)
        ON CONFLICT (tag_name) DO UPDATE SET tag_name = EXCLUDED.tag_name
        RETURNING tag_id;
    """
    bind_values = (tag_name,)
    cur.execute(upsert_sql, bind_values)

    returned_row = cur.fetchone()
    return returned_row[0]

In [6]:
def insert_article(article_data, source_id, category_id):
    """Upsert one article row keyed by URL and return its ``article_id``.

    Args:
        article_data: Mapping with required keys ``url``, ``title``,
            ``content``, ``publication_date`` and ``scrape_date``. The optional
            keys ``word_count``, ``sentence_count`` and ``character_count``
            default to 0, and ``category_confidence`` defaults to 0.0.
        source_id: Value stored in the ``source_id`` column.
        category_id: Value stored in the ``category_id`` column.

    Returns:
        The first column of the row returned by the statement (``article_id``).

    Raises:
        KeyError: If a required key is missing from ``article_data``.

    On a ``url`` conflict only the title, content, the three counts, the
    category fields and ``updated_at`` are refreshed; the source and both dates
    of the existing row are left as they are.

    Uses the module-level cursor ``cur``.
    """
    # Publication is stored as a date only; the scrape moment keeps its time part.
    published_on = dateparser.parse(article_data["publication_date"]).date()
    scraped_at = dateparser.parse(article_data["scrape_date"])

    upsert_sql = """
        INSERT INTO articles (
            url, source_id, category_id,
            publication_date, scrape_date,
            title, content,
            word_count, sentence_count, character_count,
            category_confidence
        )
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        ON CONFLICT (url) DO UPDATE SET
            title = EXCLUDED.title,
            content = EXCLUDED.content,
            word_count = EXCLUDED.word_count,
            sentence_count = EXCLUDED.sentence_count,
            character_count = EXCLUDED.character_count,
            category_id = EXCLUDED.category_id,
            category_confidence = EXCLUDED.category_confidence,
            updated_at = CURRENT_TIMESTAMP
        RETURNING article_id;
    """

    count_fields = ("word_count", "sentence_count", "character_count")
    # Order must match the column list in the INSERT above.
    row_values = (
        article_data["url"],
        source_id,
        category_id,
        published_on,
        scraped_at,
        article_data["title"],
        article_data["content"],
        *(article_data.get(field, 0) for field in count_fields),
        article_data.get("category_confidence", 0.0),
    )
    cur.execute(upsert_sql, row_values)

    returned_row = cur.fetchone()
    return returned_row[0]

In [7]:
def insert_article_tags(article_id, tags):
    """Link an article to each of its tags, creating tags as needed.

    Args:
        article_id: Id of the article row to attach the tags to.
        tags: Iterable of tag names. ``None`` is treated as "no tags", which
            covers input where the ``tags`` key is present but null.

    Falsy tag names (``None`` or an empty string) are skipped, mirroring the
    empty-name guard in ``get_or_create_category``. An already existing
    (article, tag) pair is left untouched by the ``ON CONFLICT`` clause.

    Uses the module-level cursor ``cur`` and ``get_or_create_tag``.
    """
    # `tags or []` turns None (and other empty values) into an empty iteration
    # instead of raising TypeError.
    for tag_name in tags or []:
        if not tag_name:
            # Do not create a nameless tag row.
            continue

        tag_id = get_or_create_tag(tag_name)
        cur.execute("""
            INSERT INTO article_tags (article_id, tag_id)
            VALUES (%s, %s)
            ON CONFLICT (article_id, tag_id) DO NOTHING;
        """, (article_id, tag_id))

In [8]:
# ---------------------------------------------------------
# PROCESS A SINGLE ARTICLE (CORRECT FLOW)
# ---------------------------------------------------------
def process_article(article):
    """Load one article record into the database.

    Resolves the source and category ids first, upserts the article itself,
    then attaches its tags. Progress is printed before and after.

    Args:
        article: Mapping with at least ``title`` and ``source``, plus whatever
            ``insert_article`` reads. ``primary_category`` and ``tags`` are
            optional.

    Raises:
        KeyError: If ``title`` or ``source`` is missing.
    """
    print(f"→ Processing: {article['title'][:50]}")

    # Parent rows are resolved before the article, which references their ids.
    resolved_source = get_or_create_source(article["source"])
    resolved_category = get_or_create_category(article.get("primary_category"))

    article_id = insert_article(article, resolved_source, resolved_category)

    # Tags are linked last because they need the article id.
    insert_article_tags(article_id, article.get("tags", []))

    print(f"Article inserted with ID {article_id}")

In [9]:
# ---------------------------------------------------------
# MAIN ETL FUNCTION
# ---------------------------------------------------------
def run_etl(json_path):
    """Load every article from a JSON file into the database.

    Args:
        json_path: Path to a UTF-8 JSON file holding a list of article
            objects.

    Each article is handed to ``process_article``. A failure in one article is
    reported and counted, and the run continues with the next one. The closing
    message says "Successfully" only when no article failed; otherwise it
    reports how many failed.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    print("Loading JSON...")
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    total = len(data)
    print(f"Found {total} articles.")

    failed = 0
    for position, article in enumerate(data):
        try:
            process_article(article)
        except Exception as e:
            # Best-effort load: keep going, but make the failure traceable to
            # a specific record and reflect it in the final summary.
            failed += 1
            if isinstance(article, dict):
                label = article.get("url") or article.get("title") or "<unknown>"
            else:
                label = "<not an object>"
            print(f"Error in article #{position} ({label}):", e)

    if failed:
        print(f"ETL completed with {failed} failed article(s) out of {total}.")
    else:
        print("ETL Completed Successfully!")

In [11]:
if __name__ == "__main__":
    # Entry point: run the full load against the cleaned article dump.
    articles_json_path = r"D:\Menghour\MATES\data\datasets\raw\all_articles_cleaned.json"
    run_etl(articles_json_path)

Loading JSON...
Found 7856 articles.
→ Processing: ចំនួននៃការវាយប្រហារ ទៅលើពលរដ្ឋរុស្ស៊ី នៅក្នុងប្រទេ
Article inserted with ID 1
→ Processing: មន្រ្តីជាន់ខ្ពស់ក្រសួងទេសចរណ៍ ស្នើឱ្យស្ថាប័នពាក់ព័
Article inserted with ID 2
→ Processing: លោក ម៉ឹង យូឡេង ប្រធានមន្ទីរសាធារណការ និងដឹកជញ្ជូនខ
Article inserted with ID 3
→ Processing: កូរ៉េខាងជើង សាកល្បងយានក្រោមទឹក គ្មានមនុស្សបើកបំពាក
Article inserted with ID 4
→ Processing: គេហទំព័រយោធាអាមេរិក ៖ វៀតណាមជាប់ជាប្រទេស ដែលយោធាមា
Article inserted with ID 5
→ Processing: បើកពិធីសម្ពោធពិព័រណ៍ ទំនិញគុណភាព ខ្ពស់អន្តរជាតិ ឆ្
Article inserted with ID 6
→ Processing: មន្រ្តីនាយកដ្ឋាន ដឹកជញ្ជូនរបស់វៀតណាម ដាក់បន្ទុកទៅល
Article inserted with ID 7
→ Processing: ខ្មែរ ប៊ែវើរីជីស លើកកម្ពស់សកម្មភាពការលេងកីឡាដើម្បី
Article inserted with ID 8
→ Processing: យន្តហោះ អ៊ីស្រាអែល វាយប្រហារ ទីតាំងបាញ់រ៉ុកកែត នៅល
Article inserted with ID 9
→ Processing: នាយចឺម ឆាតញ៉ែសង្សារគេ ត្រូវគេលបវាយឡើង បែកក្បាល
Article inserted with ID 10
→ Processing: ៖ ខ្លួននឹងជួសជុល បញ្ហាផ្នែកទន់ ប