<a href="https://colab.research.google.com/github/ZNAXNOR/AI-Blog-Posts/blob/main/AI_Blog_Posts.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# =============================
# 1. Install Dependencies
# =============================
!pip install feedparser python-dotenv openai requests sentence-transformers scikit-learn

Collecting feedparser
  Downloading feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting python-dotenv
  Downloading python_dotenv-1.1.1-py3-none-any.whl.metadata (24 kB)
Collecting sgmllib3k (from feedparser)
  Downloading sgmllib3k-1.0.0.tar.gz (5.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1

In [2]:
# =============================
# 2. Imports & Environment Setup
# =============================
import os
import json
import feedparser
import requests
import base64
import hashlib
from openai import OpenAI
from dotenv import load_dotenv
from google.colab import userdata
from collections import defaultdict
import re
import time
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Load environment variables
# load_dotenv() runs first; the assignments below then overwrite the four
# variables this notebook uses, so the values set here always win.
load_dotenv()

# Secrets are read from the Colab secret store via userdata.get; the WordPress
# URL and user name are plain configuration values.
# NOTE(review): the secret is stored under the name 'HF_Token', while the cell
# output warns that a secret named `HF_TOKEN` does not exist — consider
# renaming the Colab secret so both spellings agree.
os.environ["HF_TOKEN"] = userdata.get('HF_Token')
os.environ["WP_URL"] = "https://odlabagency.wpcomstaging.com"
os.environ["WP_USER"] = "odomkardalvi"
os.environ["WP_PASS"] = userdata.get('WP_OdLabsAgency_App')

# Module-level shortcuts used by the generation and WordPress functions below.
HF_TOKEN = os.environ["HF_TOKEN"]
WP_URL = os.environ["WP_URL"]
WP_USER = os.environ["WP_USER"]
WP_PASS = os.environ["WP_PASS"]

# OpenAI SDK client pointed at the Hugging Face router endpoint, authenticated
# with the Hugging Face token.
client = OpenAI(base_url="https://router.huggingface.co/v1", api_key=HF_TOKEN)
# Sentence-embedding model whose encode() output drives cluster_articles().
embed_model = SentenceTransformer("all-MiniLM-L6-v2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [3]:
# =============================
# 3. RSS Feed Lists
# =============================
# Technology feeds from international outlets.
GLOBAL_RSS_FEEDS = [
    "https://feeds.bbci.co.uk/news/technology/rss.xml",
    "https://www.reutersagency.com/feed/?best-topics=technology",
    "https://www.theverge.com/rss/index.xml",
    "https://feeds.arstechnica.com/arstechnica/technology-lab",
    "https://rss.nytimes.com/services/xml/rss/nyt/Technology.xml",
    "https://www.cnet.com/rss/news/",
    "https://www.technologyreview.com/feed/"
]

# Technology feeds from Indian outlets.
INDIA_RSS_FEEDS = [
    "https://www.thehindu.com/sci-tech/technology/feeder/default.rss",
    "https://economictimes.indiatimes.com/tech/rssfeeds/13357270.cms",
    "https://indianexpress.com/section/technology/feed/",
    "https://www.livemint.com/rss/technology",
    "https://timesofindia.indiatimes.com/rssfeeds/5880659.cms"
]

# Combined list (global feeds first) that run_pipeline() passes to fetch_articles().
RSS_FEEDS = GLOBAL_RSS_FEEDS + INDIA_RSS_FEEDS

def fetch_articles(feed_urls):
    """Download every feed in ``feed_urls`` and return all entries in one list.

    Each entry is tagged in place with a ``'source_feed'`` key holding the URL
    of the feed it came from, so later stages can tell how many distinct feeds
    cover a topic. Feeds that yield no entries contribute nothing.

    Args:
        feed_urls: Iterable of RSS/Atom feed URLs.

    Returns:
        A flat list of feed entries, in feed order and then entry order.
    """
    collected = []
    for url in feed_urls:
        entries = feedparser.parse(url).entries
        if not entries:
            continue
        for item in entries:
            item['source_feed'] = url
            collected.append(item)
    return collected

In [4]:
# =============================
# 4. Semantic Clustering
# =============================
def cluster_articles(articles, threshold=0.75):
    """Greedily group articles whose title + summary embeddings are similar.

    Articles are visited in order. Each article not yet assigned starts a new
    cluster and absorbs every later unassigned article whose cosine similarity
    to that seed article is at least ``threshold``. Similarity is always
    measured against the seed, not against other cluster members.

    Args:
        articles: Sequence of dict-like feed entries. ``'title'`` and
            ``'summary'`` are read if present; missing or empty values are
            treated as empty strings.
        threshold: Minimum cosine similarity to the seed for an article to
            join its cluster.

    Returns:
        A list of clusters, each a list of the original article objects.
        Every input article appears in exactly one cluster. An empty input
        yields an empty list.
    """
    if not articles:
        # Nothing to embed; avoid calling the model with an empty batch.
        return []

    texts = [
        (a.get('title') or '') + " " + (a.get('summary') or '')
        for a in articles
    ]
    embeddings = embed_model.encode(texts, convert_to_numpy=True)

    # One pairwise similarity matrix for the whole batch instead of one
    # cosine_similarity call per (i, j) pair inside the nested loop.
    similarity = cosine_similarity(embeddings)

    clusters, used = [], set()
    for i, art in enumerate(articles):
        if i in used:
            continue
        cluster = [art]
        used.add(i)
        seed_row = similarity[i]
        for j in range(i + 1, len(articles)):
            if j not in used and seed_row[j] >= threshold:
                cluster.append(articles[j])
                used.add(j)
        clusters.append(cluster)
    return clusters

In [5]:
# =============================
# 5. AI Post Generator with JSON Reliability and Sources Section
# =============================
def generate_post_with_ai(topic_articles, retries=2):
    """Ask the chat model to merge a cluster of articles into one blog post.

    The model is instructed to return a single JSON object. The first
    ``{...}`` span in the reply is parsed, checked for the keys the WordPress
    step needs, and literal backslash-n sequences are stripped from the body.
    Any failure (API error, empty reply, invalid or incomplete JSON) triggers
    another attempt with a short corrective prefix added to the prompt.

    Args:
        topic_articles: List of dict-like feed entries describing one topic.
            ``'title'``, ``'summary'`` and ``'link'`` are read when present.
        retries: Number of additional attempts after the first one.

    Returns:
        dict with at least ``'title'``, ``'excerpt'``, ``'tags'`` and
        ``'body'`` (plus whatever else the model returned).

    Raises:
        ValueError: If no attempt produced a usable JSON object. The last
            underlying error is attached as ``__cause__``.
    """
    titles = [a.get('title', '') for a in topic_articles]
    summaries = [a.get('summary', '') for a in topic_articles]
    sources = [(a.get('link', ''), a.get('title', '')) for a in topic_articles]

    schema_example = {
        "title": "Example Title",
        "excerpt": "Example excerpt...",
        "tags": ["tag1", "tag2"],
        "body": "<h3>Heading</h3><p>Paragraph...</p>",
        "image_prompt": "A futuristic AI lab scene"
    }
    # Keys consumed downstream when the post is sent to WordPress.
    required_keys = ("title", "excerpt", "tags", "body")

    # The line-break rule appears once, with an escaped backslash, so the
    # model sees the two characters "\n" rather than a real newline.
    base_prompt = f"""
    You are an expert journalist. Merge the following sources into one unified post.
    Return ONLY valid JSON matching this schema:
    {json.dumps(schema_example, ensure_ascii=False)}

    Requirements:
    - 900–1200 words, 3–5 min read
    - Conversational but well-researched
    - <h3> for headings, <h4> for subheadings, <p> for paragraphs
    - Do not use \\n for line breaks; use proper HTML tags only.
    - Add a <h3>Sources</h3> section at the bottom with an ordered <ol> list of clickable <a> links to each source.

    Articles:
    TITLES: {titles}
    SUMMARIES: {summaries}
    SOURCES: {sources}
    """
    retry_prefix = "Your last output was invalid JSON or empty, return only valid JSON this time.\n"

    total_attempts = retries + 1
    last_error = None
    for attempt in range(1, total_attempts + 1):
        # Build the prompt from the base each time so the corrective prefix
        # is added once, not stacked on every retry.
        prompt = base_prompt if last_error is None else retry_prefix + base_prompt
        try:
            completion = client.chat.completions.create(
                model="openai/gpt-oss-20b",
                messages=[{"role": "user", "content": prompt}],
                temperature=0.7,
                max_tokens=4000
            )

            # Support both chat-style (.message.content) and text-style
            # (.text) choices.
            choice = completion.choices[0]
            content = getattr(getattr(choice, "message", None), "content", None)
            if content is None:
                content = getattr(choice, "text", None)

            if not content or not content.strip():
                raise ValueError(f"Empty AI completion received. Raw: {completion}")

            # Models often wrap the JSON in prose or code fences; keep only
            # the outermost {...} span.
            match = re.search(r'\{[\s\S]*\}', content)
            if match:
                content = match.group(0)

            parsed = json.loads(content)
            if not isinstance(parsed, dict):
                raise ValueError(f"Expected a JSON object, got {type(parsed).__name__}")
            missing = [key for key in required_keys if key not in parsed]
            if missing:
                raise ValueError(f"AI response is missing required keys: {missing}")

            # Remove literal backslash-n sequences the model may have left in the HTML.
            parsed["body"] = re.sub(r'\\n', '', parsed["body"])
            return parsed

        except Exception as e:
            last_error = e
            print(f"Attempt {attempt}/{total_attempts} failed - {type(e).__name__}: {e}")
            if attempt < total_attempts:
                # Brief pause before retrying; pointless after the final attempt.
                time.sleep(2)

    raise ValueError(f"AI failed to return valid JSON after {retries+1} attempts") from last_error

In [6]:
# =============================
# 6. WordPress Posting
# =============================
def ensure_tags_exist(tag_names, timeout=30):
    """Resolve tag names to WordPress tag IDs, creating tags that do not exist.

    For each name the tags endpoint is searched and an existing tag is reused
    only when its name matches exactly (case-insensitive), because the search
    is a substring match and its first hit may be a different tag. Otherwise
    the tag is created. Names that can be neither found nor created are
    skipped.

    Args:
        tag_names: Iterable of tag names. Blank names are ignored.
        timeout: Seconds to wait for each HTTP request.

    Returns:
        List of tag IDs without duplicates, in the order first resolved.

    Raises:
        requests.RequestException: On connection errors or timeouts.
    """
    import html  # local import: decodes entity-escaped names in API responses

    token = base64.b64encode(f"{WP_USER}:{WP_PASS}".encode()).decode("utf-8")
    headers = {"Authorization": f"Basic {token}", "Content-Type": "application/json"}
    endpoint = f"{WP_URL}/wp-json/wp/v2/tags"

    tag_ids = []
    for tag in tag_names or []:
        name = str(tag).strip()
        if not name:
            continue

        tag_id = None
        # Pass the term via params so spaces, '&', '#', etc. are URL-encoded.
        r = requests.get(endpoint, headers=headers, params={"search": name}, timeout=timeout)
        if r.status_code == 200:
            wanted = name.casefold()
            for candidate in r.json() or []:
                found = html.unescape(str(candidate.get("name", ""))).strip().casefold()
                if found == wanted:
                    tag_id = candidate.get("id")
                    break

        if tag_id is None:
            cr = requests.post(endpoint, headers=headers, json={"name": name}, timeout=timeout)
            if cr.status_code in (200, 201):
                tag_id = cr.json().get("id")
            else:
                # NOTE(review): WordPress is expected to answer a duplicate
                # create with code "term_exists" and the existing id under
                # data.term_id — confirm against the target site's version.
                try:
                    err = cr.json()
                except ValueError:
                    err = {}
                if isinstance(err, dict) and err.get("code") == "term_exists":
                    tag_id = (err.get("data") or {}).get("term_id")

        if tag_id is not None and tag_id not in tag_ids:
            tag_ids.append(tag_id)
    return tag_ids

def post_to_wordpress(ai_post, timeout=30):
    """Create a WordPress draft from a generated post.

    Args:
        ai_post: dict with ``'title'`` and ``'body'`` (required) and
            ``'excerpt'`` / ``'tags'`` (optional; default to an empty excerpt
            and no tags).
        timeout: Seconds to wait for the create-post request, so a stalled
            connection cannot hang the notebook indefinitely.

    Returns:
        The ``link`` field of the created post, or ``None`` when WordPress
        answers with a status other than 200/201.

    Raises:
        requests.RequestException: On connection errors or timeouts.
    """
    token = base64.b64encode(f"{WP_USER}:{WP_PASS}".encode()).decode("utf-8")
    headers = {"Authorization": f"Basic {token}", "Content-Type": "application/json"}

    # Tags are sent as IDs, so names are resolved (and created) first.
    tag_ids = ensure_tags_exist(ai_post.get("tags") or [])

    payload = {
        "title": ai_post["title"],
        "content": ai_post["body"],
        "excerpt": ai_post.get("excerpt", ""),
        "status": "draft",  # never publish directly; a human reviews the draft
        "tags": tag_ids
    }
    r = requests.post(
        f"{WP_URL}/wp-json/wp/v2/posts",
        headers=headers,
        json=payload,
        timeout=timeout,
    )
    if r.status_code in (200, 201):
        print(f"✅ Draft created: {ai_post['title']}")
        return r.json().get("link")
    print(f"❌ Failed: {r.status_code} {r.text}")
    return None

In [7]:
# =============================
# 7. Duplicate Detection (Local + WordPress)
# =============================
def load_published():
    """Read the set of already-published topic signatures from disk.

    Returns:
        A set built from the JSON array stored in ``published_topics.json``
        in the current working directory, or an empty set when that file
        does not exist.
    """
    try:
        handle = open("published_topics.json", "r")
    except FileNotFoundError:
        # First run: nothing has been published yet.
        return set()
    with handle:
        stored = json.load(handle)
    return set(stored)

def save_published(published_set):
    """Persist the published-topic signatures as a JSON array.

    Args:
        published_set: Iterable of signature strings; written to
            ``published_topics.json`` in the current working directory,
            replacing any previous contents.
    """
    serialized = json.dumps(list(published_set))
    with open("published_topics.json", "w") as out:
        out.write(serialized)

def topic_signature(title):
    """Return a case-insensitive fingerprint of a post title.

    Args:
        title: Post title string.

    Returns:
        Hex-encoded SHA-256 digest of the lower-cased title.
    """
    normalized = title.lower()
    digest = hashlib.sha256()
    digest.update(normalized.encode())
    return digest.hexdigest()

In [8]:
# =============================
# 8. Main Flow
# =============================
def run_pipeline(max_posts=1):
    """Fetch feeds, cluster articles and draft posts for multi-source topics.

    A cluster qualifies when it holds at least two articles drawn from more
    than one feed. Qualifying clusters are tried in order: a post is
    generated, its title signature is checked against the published set, and
    new topics are sent to WordPress as drafts. A topic that was already
    published is skipped and the next cluster is tried, so a duplicate at the
    front of the list no longer ends the run with nothing created.

    Note that each qualifying cluster costs one AI generation before the
    duplicate check, since the signature is derived from the generated title.

    Args:
        max_posts: Maximum number of drafts to create in this run.

    Returns:
        None. Progress is reported by the functions it calls.

    Raises:
        ValueError: Propagated from generate_post_with_ai when the model
            never returns usable JSON.
    """
    published_topics = load_published()
    articles = fetch_articles(RSS_FEEDS)
    clusters = cluster_articles(articles)

    created = 0
    for cluster in clusters:
        if created >= max_posts:
            break

        feeds_covered = len({a['source_feed'] for a in cluster})
        if len(cluster) < 2 or feeds_covered <= 1:
            continue

        ai_post = generate_post_with_ai(cluster)
        sig = topic_signature(ai_post["title"])
        if sig in published_topics:
            # Already covered; move on instead of ending the run.
            continue

        wp_link = post_to_wordpress(ai_post)
        if not wp_link:
            # WordPress rejected the draft; stop rather than burn further AI
            # calls on posts that are likely to fail the same way.
            break

        published_topics.add(sig)
        # Save after every success so an interrupted run keeps its history.
        save_published(published_topics)
        created += 1

In [9]:
# =============================
# 9. Run
# =============================
# Executes the whole pipeline once: fetch feeds, cluster, generate, and create
# at most one WordPress draft. Has network side effects and writes
# published_topics.json on success.
run_pipeline()

✅ Draft created: Apple Fires Back at Musk’s App Store Bias Allegations
