<a href="https://colab.research.google.com/github/ZNAXNOR/AI-Blog-Posts/blob/main/AI_Blog_Posts.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# =============================
# 1. Install Dependencies
# =============================
!pip install feedparser python-dotenv openai requests sentence-transformers scikit-learn rapidfuzz

In [None]:
# =============================
# 2. Imports & Environment Setup
# =============================
import os
import json
import feedparser
import requests
import base64
import hashlib
from openai import OpenAI
from dotenv import load_dotenv
from google.colab import userdata
from collections import defaultdict
import re
import time
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity

# Load environment variables from a .env file (python-dotenv); the keys
# assigned explicitly below are overwritten regardless of what it provided.
load_dotenv()

# Secrets come from the Colab `userdata` store; the WordPress site URL and
# user name are set inline. All four values are mirrored into os.environ.
os.environ["HF_TOKEN"] = userdata.get('HF_Token')
os.environ["WP_URL"] = "https://odlabagency.wpcomstaging.com"
os.environ["WP_USER"] = "odomkardalvi"
os.environ["WP_PASS"] = userdata.get('WP_OdLabsAgency_App')

# Module-level shortcuts read by the functions defined in later cells.
HF_TOKEN = os.environ["HF_TOKEN"]
WP_URL = os.environ["WP_URL"]
WP_USER = os.environ["WP_USER"]
WP_PASS = os.environ["WP_PASS"]

# OpenAI-compatible client pointed at the Hugging Face router endpoint,
# authenticated with the Hugging Face token.
client = OpenAI(base_url="https://router.huggingface.co/v1", api_key=HF_TOKEN)
# Sentence-embedding model shared by clustering, scoring and duplicate checks.
embed_model = SentenceTransformer("all-MiniLM-L6-v2")

SEMANTIC_THRESHOLD = 0.80  # Scores below this get the "needs_review" flag
DUPLICATE_SIMILARITY = 0.90  # Similarity at/above this blocks a post as a duplicate
TOP_K = 3  # Default number of best source similarities averaged into the score

In [None]:
# =============================
# 3. RSS Feed Lists
# =============================
# Technology feeds from international publishers.
GLOBAL_RSS_FEEDS = [
    "https://feeds.bbci.co.uk/news/technology/rss.xml",
    "https://www.reutersagency.com/feed/?best-topics=technology",
    "https://www.theverge.com/rss/index.xml",
    "https://feeds.arstechnica.com/arstechnica/technology-lab",
    "https://rss.nytimes.com/services/xml/rss/nyt/Technology.xml",
    "https://www.cnet.com/rss/news/",
    "https://www.technologyreview.com/feed/",
    "https://www.wired.com/feed/category/gear/latest/rss",
    "https://techcrunch.com/feed/",
    "https://venturebeat.com/category/ai/feed/",
    "https://www.zdnet.com/news/rss.xml",
    "https://www.pcmag.com/feed"
]

# Technology feeds from Indian publishers.
INDIA_RSS_FEEDS = [
    "https://www.thehindu.com/sci-tech/technology/feeder/default.rss",
    "https://economictimes.indiatimes.com/tech/rssfeeds/13357270.cms",
    "https://indianexpress.com/section/technology/feed/",
    "https://www.livemint.com/rss/technology",
    "https://timesofindia.indiatimes.com/rssfeeds/5880659.cms"
]

# Google News search feeds: "technology" (IN edition),
# "artificial intelligence" (US edition) and "machine learning" (GB edition).
Google_Trends_Google_News_RSS_FEEDS = [
    "https://news.google.com/rss/search?q=technology&hl=en-IN&gl=IN&ceid=IN:en",
    "https://news.google.com/rss/search?q=artificial+intelligence&hl=en-US&gl=US&ceid=US:en",
    "https://news.google.com/rss/search?q=machine+learning&hl=en-GB&gl=GB&ceid=GB:en"
]

# Every feed URL, in the order the groups are listed above.
RSS_FEEDS = GLOBAL_RSS_FEEDS + INDIA_RSS_FEEDS + Google_Trends_Google_News_RSS_FEEDS

def fetch_articles(feed_urls):
    """Parse every feed in *feed_urls* and return all entries as one flat list.

    Each entry is tagged in place with a ``source_feed`` key holding the URL
    of the feed it was parsed from. Feeds that yield no entries contribute
    nothing to the result.
    """
    collected = []
    for url in feed_urls:
        parsed = feedparser.parse(url)
        # ``or ()`` keeps the "skip feeds without entries" behaviour.
        for item in parsed.entries or ():
            item['source_feed'] = url
            collected.append(item)
    return collected

In [None]:
# =============================
# 4. Semantic Clustering
# =============================
def cluster_articles(articles, threshold=0.75):
    """Greedily group articles whose title+summary embeddings are similar.

    Articles are visited in order. Each article not yet assigned becomes the
    seed of a new cluster and absorbs every later, still-unassigned article
    whose cosine similarity to the seed is at least *threshold*.

    Args:
        articles: Sequence of mapping-like feed entries. ``title`` and
            ``summary`` are both optional; missing or ``None`` values are
            treated as empty strings.
        threshold: Minimum cosine similarity to the seed article.

    Returns:
        A list of clusters, each a list of the original article objects.
        Every article appears in exactly one cluster. An empty input gives
        an empty list.
    """
    if not articles:
        return []

    # RSS items are not guaranteed to carry a title, so never index it directly.
    texts = [
        f"{a.get('title') or ''} {a.get('summary') or ''}"
        for a in articles
    ]
    embeddings = embed_model.encode(texts, convert_to_numpy=True)

    # One vectorised call for the full pairwise matrix instead of one
    # cosine_similarity() call per (i, j) pair inside the loops.
    similarity = cosine_similarity(embeddings)

    clusters, used = [], set()
    for i, seed in enumerate(articles):
        if i in used:
            continue
        used.add(i)
        cluster = [seed]
        for j in range(i + 1, len(articles)):
            if j not in used and similarity[i][j] >= threshold:
                cluster.append(articles[j])
                used.add(j)
        clusters.append(cluster)
    return clusters

In [None]:
# =============================
# 5. AI Post Generator
# =============================
def generate_post_with_ai(topic_articles, retries=2):
    """Ask the chat model to merge a cluster of articles into one blog post.

    Args:
        topic_articles: Mapping-like feed entries. ``title``, ``summary`` and
            ``link`` are read; any that are missing are sent as empty strings.
        retries: Number of additional attempts after the first one fails.

    Returns:
        The parsed JSON object as a dict. It is guaranteed to contain string
        ``title``, ``excerpt`` and ``body`` values and a ``tags`` list of
        strings, which are the keys the rest of the pipeline indexes.

    Raises:
        ValueError: If no attempt produced a usable JSON object. The last
            underlying error is attached as ``__cause__``.
    """
    titles = [a.get('title', '') for a in topic_articles]
    summaries = [a.get('summary', '') for a in topic_articles]
    sources = [(a.get('link', ''), a.get('title', '')) for a in topic_articles]

    schema_example = {
        "title": "Example Title",
        "excerpt": "Example excerpt...",
        "tags": ["tag1", "tag2"],
        "body": "<h3>Heading</h3><p>Paragraph...</p>",
        "image_prompt": "A futuristic AI lab scene"
    }

    base_prompt = f"""
    You are an expert journalist. Merge the following sources into one unified post.
    Return ONLY valid JSON matching this schema:
    {json.dumps(schema_example, ensure_ascii=False)}

    Requirements:
    - 900–1200 words
    - Conversational but well-researched
    - <h3> for headings, <h4> for subheadings, <p> for paragraphs
    - Add a <h3>Sources</h3> section with <ol> clickable links.

    Articles:
    TITLES: {titles}
    SUMMARIES: {summaries}
    SOURCES: {sources}
    """
    retry_notice = "Your last output was invalid JSON, return only valid JSON this time.\n"

    prompt = base_prompt
    last_error = None
    attempt = 0
    while attempt <= retries:
        try:
            completion = client.chat.completions.create(
                model="openai/gpt-oss-20b",
                messages=[{"role": "user", "content": prompt}],
                temperature=0.7,
                max_tokens=4000
            )

            choice = completion.choices[0]
            content = getattr(choice.message, "content", None) or getattr(choice, "text", None)
            if not content or not content.strip():
                raise ValueError("Empty AI completion")

            # Keep only the outermost {...} span so prose or code fences
            # around the JSON do not break parsing.
            match = re.search(r'\{[\s\S]*\}', content)
            if match:
                content = match.group(0)

            parsed = json.loads(content)
            if not isinstance(parsed, dict):
                raise ValueError("AI response is not a JSON object")

            # Validate here so a malformed reply triggers a retry instead of
            # a KeyError/TypeError later in the pipeline.
            for key in ("title", "excerpt", "body"):
                if not isinstance(parsed.get(key), str):
                    raise ValueError(f"AI response is missing a string '{key}'")
            if not isinstance(parsed.get("tags"), list):
                raise ValueError("AI response is missing a 'tags' list")
            parsed["tags"] = [str(tag) for tag in parsed["tags"]]

            # Strips literal two-character backslash-n sequences (e.g. from
            # double-escaped output); real newline characters are untouched.
            parsed["body"] = re.sub(r'\\n', '', parsed["body"])
            return parsed

        except Exception as err:
            # Deliberately broad: client/transport failures are retried too.
            last_error = err
            attempt += 1
            print(f"⚠️ AI attempt {attempt} failed: {err}")
            # Only nag the model about JSON when the reply itself was bad,
            # and rebuild from the base prompt so the notice never stacks.
            if isinstance(err, (ValueError, KeyError, TypeError)):
                prompt = retry_notice + base_prompt
            if attempt <= retries:
                time.sleep(2)

    raise ValueError("AI failed to return valid JSON") from last_error

In [None]:
# =============================
# 6. Semantic Similarity Check
# =============================
def compute_semantic_score(generated_html, source_articles, top_k=TOP_K):
    """Score how closely the generated post matches its source articles.

    The HTML is reduced to text by replacing tags with spaces, embedded, and
    compared by cosine similarity with the embedding of each source's
    title+summary. The score is the mean of the *top_k* highest similarities.

    Args:
        generated_html: HTML body of the generated post.
        source_articles: Mapping-like feed entries; ``title`` and ``summary``
            are optional and default to empty strings.
        top_k: Maximum number of best-matching sources to average.

    Returns:
        The mean similarity rounded to 4 decimals, or ``0.0`` when there is
        nothing to average (no sources, or ``top_k`` <= 0).
    """
    # No sources means nothing to compare against; bail out before the
    # division by k below can hit zero.
    if not source_articles:
        return 0.0

    gen_text = re.sub(r'<[^>]*>', ' ', generated_html)
    src_texts = [
        f"{a.get('title') or ''} {a.get('summary') or ''}"
        for a in source_articles
    ]

    gen_emb = embed_model.encode([gen_text], convert_to_tensor=True, normalize_embeddings=True)
    src_embs = embed_model.encode(src_texts, convert_to_tensor=True, normalize_embeddings=True)

    sims = util.cos_sim(gen_emb, src_embs).cpu().numpy().flatten()
    k = min(top_k, len(sims))
    if k <= 0:
        return 0.0
    best = sorted(sims, reverse=True)[:k]
    return round(float(sum(best) / k), 4)

In [None]:
# =============================
# 7. Cross-Day Duplicate Detection
# =============================
# JSON file (relative to the working directory) holding previously saved posts.
DUPLICATE_LOG = "published_posts.json"

def load_published_posts():
    """Return the list of posts stored in ``DUPLICATE_LOG``.

    Loading is best-effort: a missing file, an empty file, unparseable JSON
    or a top-level value that is not a list all yield ``[]`` so a damaged
    log cannot abort the pipeline. The last two cases print a warning.
    """
    try:
        with open(DUPLICATE_LOG, "r", encoding="utf-8") as f:
            raw = f.read()
    except FileNotFoundError:
        return []

    # An empty file (e.g. left by an interrupted write) is not valid JSON.
    if not raw.strip():
        return []

    try:
        posts = json.loads(raw)
    except json.JSONDecodeError as err:
        print(f"⚠️ Ignoring unreadable {DUPLICATE_LOG}: {err}")
        return []

    if not isinstance(posts, list):
        print(f"⚠️ Ignoring {DUPLICATE_LOG}: expected a JSON list")
        return []
    return posts

def save_published_post(post_data):
    """Append *post_data* to the log file named by ``DUPLICATE_LOG``.

    The full list is written to a sibling temporary file and then moved over
    the log with ``os.replace``. If serialisation fails or the process is
    interrupted mid-write, the existing log is left as it was rather than
    being truncated.

    Args:
        post_data: JSON-serialisable object describing the post.
    """
    posts = load_published_posts()
    posts.append(post_data)

    tmp_path = DUPLICATE_LOG + ".tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(posts, f)
    os.replace(tmp_path, DUPLICATE_LOG)

def is_duplicate(new_post):
    """Return True if *new_post* is semantically close to a stored post.

    Title, excerpt, tags and tag-stripped body are joined into one text per
    post. The new post counts as a duplicate when its cosine similarity to
    any post returned by ``load_published_posts()`` is at least
    ``DUPLICATE_SIMILARITY``.

    Args:
        new_post: Dict with optional ``title``, ``excerpt``, ``tags`` and
            ``body`` keys; missing or ``None`` values count as empty.

    Returns:
        ``True`` for a duplicate, ``False`` otherwise (including when no
        posts have been stored yet).
    """
    def as_text(post):
        # Same field order for new and stored posts so the texts are comparable.
        tags = " ".join(str(tag) for tag in (post.get("tags") or []))
        body = re.sub(r'<[^>]*>', ' ', post.get("body") or "")
        return " ".join([post.get("title") or "", post.get("excerpt") or "", tags, body])

    # Skip entries that are not dicts instead of failing on them.
    past_posts = [p for p in load_published_posts() if isinstance(p, dict)]
    if not past_posts:
        return False

    # Encode everything in one batch rather than one model call per stored post.
    texts = [as_text(new_post)] + [as_text(p) for p in past_posts]
    embeddings = embed_model.encode(texts, convert_to_tensor=True, normalize_embeddings=True)

    # Row 0 is the new post; compare it against all stored posts at once.
    sims = util.cos_sim(embeddings[:1], embeddings[1:])
    return float(sims.max()) >= DUPLICATE_SIMILARITY

In [None]:
# =============================
# 8. WordPress Posting with Meta
# =============================
def ensure_tags_exist(tag_names):
    """Resolve tag names to WordPress tag IDs, creating tags that are missing.

    For each name the tags endpoint is searched and only a result whose name
    equals the requested one (case-insensitively) is accepted; the search
    itself is a substring match, so its first hit may be a different tag.
    When no exact match exists the tag is created. Names that can be neither
    found nor created are skipped.

    Args:
        tag_names: Iterable of tag names; ``None`` or empty yields ``[]``.
            Blank names are ignored.

    Returns:
        List of unique tag IDs, in first-seen order.

    Raises:
        requests.RequestException: On connection failures or timeouts.
    """
    import html  # stdlib; imported locally to keep this block self-contained

    if not tag_names:
        return []

    token = base64.b64encode(f"{WP_USER}:{WP_PASS}".encode()).decode("utf-8")
    headers = {"Authorization": f"Basic {token}", "Content-Type": "application/json"}
    endpoint = f"{WP_URL}/wp-json/wp/v2/tags"

    tag_ids = []
    for tag in tag_names:
        name = str(tag).strip()
        if not name:
            continue
        wanted = name.casefold()
        tag_id = None

        # `params` lets requests URL-encode the name (spaces, '&', '#', ...).
        r = requests.get(
            endpoint,
            headers=headers,
            params={"search": name, "per_page": 100},
            timeout=30,
        )
        if r.status_code == 200:
            for found in r.json():
                # Names may come back HTML-escaped (e.g. "R&amp;D").
                if html.unescape(str(found.get("name", ""))).casefold() == wanted:
                    tag_id = found["id"]
                    break

        if tag_id is None:
            cr = requests.post(endpoint, headers=headers, json={"name": name}, timeout=30)
            if cr.status_code in (200, 201):
                tag_id = cr.json()['id']
            else:
                # A "term_exists" error carries the existing term's ID.
                try:
                    error = cr.json()
                except ValueError:
                    error = {}
                if isinstance(error, dict) and error.get("code") == "term_exists":
                    tag_id = (error.get("data") or {}).get("term_id")

        if tag_id is not None and tag_id not in tag_ids:
            tag_ids.append(tag_id)
    return tag_ids

def post_to_wordpress(ai_post, semantic_score):
    """Create a WordPress draft from *ai_post* and return its link.

    The draft carries ``semantic_score`` and a ``status_flag`` in its meta:
    ``approved_auto`` when the score reaches ``SEMANTIC_THRESHOLD``,
    otherwise ``needs_review``. Tag names are resolved to IDs through
    ``ensure_tags_exist`` before the post is sent.

    Returns:
        The ``link`` field of the response on HTTP 200/201, else ``None``
        (after printing the status code and response text).
    """
    if semantic_score >= SEMANTIC_THRESHOLD:
        review_flag = "approved_auto"
    else:
        review_flag = "needs_review"

    credentials = f"{WP_USER}:{WP_PASS}".encode()
    encoded = base64.b64encode(credentials).decode("utf-8")
    auth_headers = {
        "Authorization": f"Basic {encoded}",
        "Content-Type": "application/json",
    }

    # Resolve tags first, before any other field of ai_post is read.
    resolved_tags = ensure_tags_exist(ai_post["tags"])

    draft = {
        "title": ai_post["title"],
        "content": ai_post["body"],
        "excerpt": ai_post["excerpt"],
        "status": "draft",
        "tags": resolved_tags,
        "meta": {
            "semantic_score": semantic_score,
            "status_flag": review_flag,
        },
    }

    response = requests.post(
        f"{WP_URL}/wp-json/wp/v2/posts",
        headers=auth_headers,
        json=draft,
    )
    if response.status_code not in (200, 201):
        print(f"❌ Failed: {response.status_code} {response.text}")
        return None

    print(f"✅ Draft created: {ai_post['title']} (score={semantic_score}, flag={review_flag})")
    return response.json().get("link")

In [None]:
# =============================
# 9. Main Flow
# =============================
def run_pipeline():
    """Fetch feeds, cluster them, and draft one post for the first usable cluster.

    A cluster is usable when it has at least two articles drawn from more
    than one feed. Clusters whose generated post is a duplicate of a stored
    post are skipped. The run stops after the first non-duplicate post has
    been sent to WordPress, whether or not the upload succeeded.

    A successfully created draft is appended to the published-post log so
    that later runs can detect it as a duplicate.
    """
    articles = fetch_articles(RSS_FEEDS)
    clusters = cluster_articles(articles)
    for cluster in clusters:
        feeds_covered = len({a['source_feed'] for a in cluster})
        if len(cluster) < 2 or feeds_covered <= 1:
            continue

        ai_post = generate_post_with_ai(cluster)
        if is_duplicate(ai_post):
            print(f"⛔ Duplicate skipped: {ai_post['title']}")
            continue

        score = compute_semantic_score(ai_post["body"], cluster)
        wp_link = post_to_wordpress(ai_post, score)
        if wp_link:
            print(f"Post URL: {wp_link}")
            # Without this the log stays empty and is_duplicate() can never
            # match anything on later runs.
            save_published_post({**ai_post, "link": wp_link})
        break

In [None]:
# =============================
# 10. Run
# =============================
# Execute one end-to-end pipeline run when this cell is evaluated.
run_pipeline()