In [1]:
!pip install transformers torch sentence-transformers feedparser chromadb beautifulsoup4 requests accelerate bitsandbytes # bitsandbytes for faster model loading (optional but good)
# Add langchain if you decide to use it later: !pip install langchain

Collecting feedparser
  Downloading feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting chromadb
  Downloading chromadb-1.0.7-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.9 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloa

In [2]:
import feedparser
import requests
from bs4 import BeautifulSoup
import chromadb
from sentence_transformers import SentenceTransformer
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch
import time
import json
import uuid # To generate unique IDs for documents

In [3]:
# Per-user settings: "keywords" drive retrieval, "preferred_tone" is passed to the
# commentary prompt, and "excluded_sources" lists link substrings to skip when fetching.
user_preferences = dict(
    id="user123",
    keywords=[
        "artificial intelligence agents",
        "large language models",
        "reinforcement learning applications",
        "vector databases",
    ],
    preferred_tone="informative and slightly enthusiastic",
    excluded_sources=["example-bad-site.com"],  # Optional
)

In [4]:
# Feeds polled for candidate articles; add more relevant feeds to this list.
rss_feed_urls = [
    "http://feeds.feedburner.com/TechCrunch/artificial-intelligence",
    "https://news.mit.edu/topic/mitcobrand-artificial-intelligence2-rss.xml",
    # Example cybersecurity feed if keywords match
    "https://hackingbutlegal.com/feed/",
]

In [5]:
def fetch_articles_from_feeds(feed_urls, excluded_sources=None):
    """Download every feed in ``feed_urls`` and return its entries as article dicts.

    Args:
        feed_urls: Iterable of RSS/Atom feed URLs.
        excluded_sources: Optional iterable of substrings; an entry whose link
            contains any of them is skipped. When omitted, the
            ``"excluded_sources"`` list of the global ``user_preferences`` is used.

    Returns:
        A list of dicts with the keys ``id`` (fresh UUID4 string), ``title``,
        ``link``, ``published`` (``"N/A"`` when absent), ``summary`` and
        ``content`` (the entry's first content block, falling back to the
        summary). Entries without a link are skipped. A feed that raises is
        reported with ``print`` and the remaining feeds are still processed.
    """
    if excluded_sources is None:
        excluded_sources = user_preferences.get("excluded_sources", [])
    excluded_sources = list(excluded_sources)

    articles = []
    for url in feed_urls:
        try:
            feed = feedparser.parse(url)

            # feedparser signals download/parse problems through the `bozo` flag
            # rather than an exception; without this check a broken feed is
            # indistinguishable from a healthy feed that has no entries.
            if feed.get("bozo") and not feed.entries:
                print(f"Error fetching feed {url}: {feed.get('bozo_exception', 'feed could not be parsed')}")

            for entry in feed.entries:
                # Use .get throughout so one incomplete entry cannot abort the
                # rest of the feed via the outer except.
                link = entry.get("link", "")
                if not link:
                    continue  # nothing to cite or scrape later
                # Basic filtering - skip if source is excluded
                if any(excluded in link for excluded in excluded_sources):
                    continue

                summary = entry.get("summary", "")
                content_blocks = entry.get("content") or []
                # Prefer the feed's full content; fall back to the summary when
                # the content list is missing, empty, or has an empty value.
                content = (content_blocks[0].get("value") if content_blocks else "") or summary

                articles.append({
                    "id": str(uuid.uuid4()),  # Generate unique ID
                    "title": entry.get("title", "Untitled"),
                    "link": link,
                    "published": entry.get("published", "N/A"),
                    "summary": summary,
                    "content": content,
                })
            print(f"Fetched {len(feed.entries)} entries from {url}")
            time.sleep(1)  # Be polite to servers
        except Exception as e:
            print(f"Error fetching feed {url}: {e}")
    return articles

# Fetch the articles once at notebook level; the scraping and indexing cells
# further down iterate over `fetched_articles`.
fetched_articles = fetch_articles_from_feeds(rss_feed_urls)
print(f"\nFetched a total of {len(fetched_articles)} articles.")
# Optional: print one record to inspect the article dict layout
# if fetched_articles:
#     print("\nSample Article:")
#     print(json.dumps(fetched_articles[0], indent=2))

Fetched 0 entries from http://feeds.feedburner.com/TechCrunch/artificial-intelligence
Fetched 0 entries from https://news.mit.edu/topic/mitcobrand-artificial-intelligence2-rss.xml
Fetched 20 entries from https://hackingbutlegal.com/feed/

Fetched a total of 20 articles.


In [6]:
def scrape_article_content(url, max_chars=5000):
    """Download ``url`` and return a best-effort extraction of its main text.

    The extraction is heuristic: the first ``<article>``, ``<main>`` or
    ``<div class="content">`` element is used, otherwise all ``<p>`` tags.
    It needs refinement per site and is prone to breaking.

    Args:
        url: Page to fetch.
        max_chars: Maximum number of characters to return (default 5000).

    Returns:
        The whitespace-normalized text truncated to ``max_chars``, or ``None``
        when the request fails, parsing fails, or no text is found.
    """
    try:
        # Respect robots.txt (ideally check it programmatically, but for now, be mindful)
        headers = {'User-Agent': 'MyNewsletterBot/1.0 (+http://example.com/botinfo)'}  # Identify your bot
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()  # Raise error for bad responses (4xx or 5xx)

        soup = BeautifulSoup(response.content, 'html.parser')

        # Remove elements that normally hold page chrome rather than article
        # prose (navigation, share buttons, forms, scripts) so less boilerplate
        # ends up in the embedded/summarized text. extract() only detaches the
        # element, so nested matches can be detached again safely.
        for boilerplate in soup.find_all(['script', 'style', 'noscript', 'nav', 'footer', 'aside', 'form', 'button']):
            boilerplate.extract()

        # --- VERY basic content extraction; add more guesses as needed ---
        main_content = soup.find('article') or soup.find('main') or soup.find('div', class_='content')
        if main_content:
            text = ' '.join(main_content.stripped_strings)
        else:  # Fallback to just pulling paragraph tags
            text = ' '.join(p.get_text() for p in soup.find_all('p'))

        # Collapse whitespace before truncating so the cap counts real text.
        text = ' '.join(text.split())
        return text[:max_chars] or None

    except requests.exceptions.RequestException as e:
        print(f"Scraping Error for {url}: {e}")
        return None
    except Exception as e:
        print(f"Scraping Parsing Error for {url}: {e}")
        return None

# --- Integrate scraping (Optional) ---
# Swap each article's RSS text for the scraped page text whenever scraping returns something.
for article in fetched_articles:
    page_url = article['link']
    print(f"Attempting to scrape: {page_url}")
    scraped_text = scrape_article_content(page_url)
    if scraped_text:
        article['content'] = scraped_text
    # Pause between requests to be extra polite when scraping.
    time.sleep(2)

Attempting to scrape: https://www.hackingbutlegal.com/p/50-easter-discount
Attempting to scrape: https://www.hackingbutlegal.com/p/not-joking-trumps-third-term-strategy
Attempting to scrape: https://www.hackingbutlegal.com/p/the-politics-of-pathology
Attempting to scrape: https://www.hackingbutlegal.com/p/update-the-trump-administration-just
Attempting to scrape: https://www.hackingbutlegal.com/p/the-trump-administration-just-disregarded
Attempting to scrape: https://www.hackingbutlegal.com/p/us-treasurys-new-financial-surveillance
Attempting to scrape: https://www.hackingbutlegal.com/p/trumps-capital-one-lawsuit-the-conservative
Attempting to scrape: https://www.hackingbutlegal.com/p/ghosts-in-the-machine-the-rise-of
Attempting to scrape: https://www.hackingbutlegal.com/p/update-americas-cyber-surrender
Attempting to scrape: https://www.hackingbutlegal.com/p/americas-cyber-surrender
Attempting to scrape: https://www.hackingbutlegal.com/p/the-ghosts-of-american-eugenics
Attempting to s

In [7]:
# Load embedding model (runs on CPU or GPU if available)
# Using a smaller, faster model is fine for this project
print("Loading embedding model...")
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
print("Embedding model loaded.")

# Setup ChromaDB client
# Using in-memory for simplicity in Colab. For persistence, provide a path:
# client = chromadb.PersistentClient(path="./chroma_db")
client = chromadb.Client()  # In-memory client

# Create or get a collection (like a table in SQL)
# Embeddings are computed by this notebook and passed to Chroma explicitly.
collection_name = "newsletter_articles"
try:
    client.delete_collection(name=collection_name)  # Delete if exists for clean run
    print(f"Deleted existing collection: {collection_name}")
except Exception:
    # Collection didn't exist, which is fine. Catch Exception rather than using a
    # bare except so KeyboardInterrupt/SystemExit still stop the cell.
    pass
collection = client.create_collection(name=collection_name)
print(f"Created collection: {collection_name}")


# --- Add Articles to ChromaDB ---
print("Adding articles to Vector DB...")
# Four parallel buffers: article ids, embedding vectors, the main text stored in
# Chroma, and per-article metadata (title, link, etc.).
ids_to_add, embeddings_to_add, documents_to_add, metadata_to_add = [], [], [], []

def clean_text(text):
    """Collapse every run of whitespace in ``text`` to a single space and trim both ends."""
    words = text.split()
    return ' '.join(words)

# Collect id, text and metadata for every article that has usable content.
for article in fetched_articles:
    # Use the 'content' field we populated (either summary or scraped text).
    # A missing or None value is treated as empty so one bad record cannot
    # stop the whole indexing run.
    cleaned_content = clean_text(article.get('content') or '')
    if not cleaned_content:  # Skip if no content
        continue

    ids_to_add.append(article['id'])
    documents_to_add.append(cleaned_content)
    metadata_to_add.append({
        "title": article['title'],
        "link": article['link'],
        "published": article['published']
    })

if ids_to_add:
    # Embed all documents in one encode() call instead of one call per article;
    # the result is a 2-D array whose rows line up with documents_to_add, and
    # .tolist() gives the plain lists that are handed to Chroma.
    embeddings_to_add.extend(
        embedding_model.encode(documents_to_add, convert_to_tensor=False).tolist()
    )
    collection.add(
        ids=ids_to_add,
        embeddings=embeddings_to_add,
        documents=documents_to_add,
        metadatas=metadata_to_add
    )
    print(f"Added {len(ids_to_add)} articles to the collection.")
else:
    print("No valid articles found to add to the collection.")

Loading embedding model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Embedding model loaded.
Created collection: newsletter_articles
Adding articles to Vector DB...
Added 20 articles to the collection.


In [8]:
def retrieve_relevant_articles(query_keywords, top_n=5):
    """Return the stored articles closest to the given keywords.

    The keywords are joined into one query string, embedded with the global
    ``embedding_model`` and matched against the global Chroma ``collection``.

    Args:
        query_keywords: Iterable of keyword strings.
        top_n: Maximum number of articles to return; capped at the number of
            items in the collection.

    Returns:
        The dict returned by ``collection.query`` (with ``ids``, ``metadatas``
        and ``documents`` populated), or an empty list when the collection is
        empty.
    """
    stored_count = collection.count()
    if stored_count == 0:
        print("Collection is empty. Cannot retrieve.")
        return []

    # Combine keywords into a single query string for embedding
    query_text = " ".join(query_keywords)
    query_embedding = embedding_model.encode(query_text, convert_to_tensor=False).tolist()

    # Never request more neighbours than exist; NOTE(review): some Chroma
    # releases reject or warn about an n_results larger than the collection.
    n_results = min(top_n, stored_count)

    print(f"\nQuerying for articles related to: '{query_text}'")
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=n_results,
        include=['metadatas', 'documents']  # Include documents for the LLM
    )
    print(f"Retrieved {len(results['ids'][0])} articles.")
    return results

# Query the vector store with the user's keywords, keeping the 3 closest articles.
relevant_articles_data = retrieve_relevant_articles(
    query_keywords=user_preferences["keywords"],
    top_n=3,
)
print("\nRetrieved Data Sample:")
print(json.dumps(relevant_articles_data, indent=2))


Querying for articles related to: 'artificial intelligence agents large language models reinforcement learning applications vector databases'
Retrieved 3 articles.

Retrieved Data Sample:
{
  "ids": [
    [
      "113f321d-54e9-44ce-b97c-4e1f7a274536",
      "8fa1b113-e248-4dbe-b325-4715ac318b29",
      "baa2ecde-3db8-4757-b926-4ee5df97eb1c"
    ]
  ],
  "embeddings": null,
  "documents": [
    [
      "Share this post Hacking, but Legal Ghosts in the Machine: The Rise of Hidden AI on Social Media Copy link Facebook Email Notes More Ghosts in the Machine: The Rise of Hidden AI on Social Media Jackie Singh Mar 06, 2025 \u2219 Paid 40 Share this post Hacking, but Legal Ghosts in the Machine: The Rise of Hidden AI on Social Media Copy link Facebook Email Notes More 4 16 Share Photo by Gwendal Cottin on Unsplash Earlier today, I found myself scrolling through my X feed when I encountered a peculiar reply. A user called \"PUNISHER\" was tweeting at me about deep state conspiracy theories i

In [9]:
# --- LLM Setup ---
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

# Quantization settings: 4-bit NF4 weights with double quantization and a
# bfloat16 compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

print(f"Loading LLM: {model_id}")
# Llama 3 requires a Hugging Face login; the token is read from the Colab
# secret named 'HF_Token'.
from google.colab import userdata
from huggingface_hub import login

hf_token = userdata.get('HF_Token')
login(token=hf_token)

tokenizer = AutoTokenizer.from_pretrained(model_id)
if tokenizer.pad_token is None:
    # No pad token defined: reuse the EOS token for padding.
    print("Warning: pad_token is None. Setting pad_token to eos_token.")
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",  # Automatically place model layers on GPU/CPU
    quantization_config=bnb_config,
)
print("LLM loaded.")

# Wrap model + tokenizer in a text-generation pipeline used by the summary and
# commentary helpers.
llm_pipeline = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    device_map="auto",
    torch_dtype=torch.bfloat16,  # Match compute dtype
)

Loading LLM: meta-llama/Meta-Llama-3-8B-Instruct


tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

Device set to use cuda:0


LLM loaded.


In [10]:
def generate_summary(article_content, max_length=150):
    """Summarize ``article_content`` with the global ``llm_pipeline``.

    Args:
        article_content: Article text; it is truncated to at most 3000 tokens
            of the global ``tokenizer`` before being placed in the prompt.
        max_length: Target summary length in tokens; generation is allowed
            ``max_length + 50`` new tokens.

    Returns:
        The generated text with surrounding whitespace removed, or the string
        ``"Error generating summary."`` when the pipeline output cannot be read.
    """
    # Truncate input context if too long for the model. Be conservative.
    max_input_length = 3000
    # Encode without special tokens and drop any on decode: otherwise markers
    # the tokenizer adds (e.g. a BOS token) are decoded back into the article
    # text that is pasted into the middle of the prompt.
    content_ids = tokenizer.encode(
        article_content,
        max_length=max_input_length,
        truncation=True,
        add_special_tokens=False,
    )
    truncated_content = tokenizer.decode(content_ids, skip_special_tokens=True)

    messages = [
        {"role": "system", "content": "You are a helpful assistant that summarizes articles concisely."},
        {"role": "user", "content": f"Please summarize the following article:\n\n{truncated_content}\n\nSummary:"}
    ]
    # Prefer the tokenizer's own chat template; fall back to a plain-text prompt
    # for tokenizers without a usable template.
    try:
        prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    except Exception:
        prompt = f"System: You are a helpful assistant that summarizes articles concisely.\nUser: Please summarize the following article:\n\n{truncated_content}\n\nSummary:\nAssistant:"

    print("\nGenerating summary...")
    sequences = llm_pipeline(
        prompt,
        max_new_tokens=max_length + 50,  # Desired summary length + some buffer
        do_sample=True,  # Use sampling for more varied output
        temperature=0.7,
        top_k=50,
        top_p=0.95,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        # Ask the pipeline for the newly generated text only. Stripping the
        # prompt afterwards by searching for an "Assistant:" marker would cut
        # any summary that happens to contain that word.
        return_full_text=False,
    )

    try:
        summary = sequences[0]['generated_text'].strip()
        print("Summary generated.")
        return summary
    except Exception as e:
        print(f"Error processing LLM output: {e}")
        return "Error generating summary."


# --- Generate Summaries for Retrieved Articles ---
# Maps article id -> summary text for every article returned by the retrieval step.
summaries = {}
if relevant_articles_data and relevant_articles_data['ids']:
    for article_id, content, metadata in zip(
        relevant_articles_data['ids'][0],
        relevant_articles_data['documents'][0],
        relevant_articles_data['metadatas'][0],
    ):
        title = metadata['title']
        print(f"\nProcessing article: {title}")
        summaries[article_id] = generate_summary(content)
        # Small delay between LLM calls if needed
        time.sleep(1)
else:
    print("No relevant articles retrieved to summarize.")


Processing article: Ghosts in the Machine: The Rise of Hidden AI on Social Media

Generating summary...
Summary generated.

Processing article: Explainer: How AdTech Doubles as an Intelligence Platform

Generating summary...
Summary generated.

Processing article: Update: America's Cyber Surrender

Generating summary...
Summary generated.


In [11]:
def generate_commentary(summary, title, user_tone, max_length=75):
    """Write a 1-2 sentence newsletter comment about a summarized article.

    Args:
        summary: Summary text of the article.
        title: Article title, quoted in the prompt.
        user_tone: Tone description inserted into the system prompt.
        max_length: Target comment length in tokens; generation is allowed
            ``max_length + 30`` new tokens.

    Returns:
        The generated text with surrounding whitespace removed, or the string
        ``"Error generating commentary."`` when the pipeline output cannot be read.
    """
    messages = [
        {"role": "system", "content": f"You are a content curator writing brief, engaging commentary for a newsletter. Adopt a {user_tone} tone."},
        {"role": "user", "content": f"Write a short comment (1-2 sentences) about the following article summary titled '{title}'. Relate it briefly to general interests in AI if possible, but focus on being engaging.\n\nSummary: {summary}\n\nCommentary:"}
    ]
    # Prefer the tokenizer's own chat template; fall back to a plain-text prompt
    # for tokenizers without a usable template.
    try:
        prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    except Exception:
        prompt = f"System: You are a content curator writing brief, engaging commentary for a newsletter. Adopt a {user_tone} tone.\nUser: Write a short comment (1-2 sentences) about the following article summary titled '{title}'. Relate it briefly to general interests in AI if possible, but focus on being engaging.\n\nSummary: {summary}\n\nCommentary:\nAssistant:"

    print(f"Generating commentary for: {title}")
    sequences = llm_pipeline(
        prompt,
        max_new_tokens=max_length + 30,
        do_sample=True,
        temperature=0.8,  # Slightly higher temp for more creative commentary
        top_k=50,
        top_p=0.95,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        # Ask the pipeline for the newly generated text only. Stripping the
        # prompt afterwards by searching for an "Assistant:" marker would cut
        # any comment that happens to contain that word.
        return_full_text=False,
    )
    try:
        commentary = sequences[0]['generated_text'].strip()
        print("Commentary generated.")
        return commentary
    except Exception as e:
        print(f"Error processing LLM output for commentary: {e}")
        return "Error generating commentary."

# --- Generate Commentary ---
# Maps article id -> commentary text; only articles that already have a summary get one.
commentaries = {}
if relevant_articles_data and relevant_articles_data['ids']:
    for article_id, metadata in zip(
        relevant_articles_data['ids'][0],
        relevant_articles_data['metadatas'][0],
    ):
        if article_id not in summaries:
            continue
        title = metadata['title']
        summary_text = summaries[article_id]
        commentaries[article_id] = generate_commentary(
            summary_text,
            title,
            user_preferences["preferred_tone"],
        )
        time.sleep(1)

Generating commentary for: Ghosts in the Machine: The Rise of Hidden AI on Social Media
Commentary generated.
Generating commentary for: Explainer: How AdTech Doubles as an Intelligence Platform
Commentary generated.
Generating commentary for: Update: America's Cyber Surrender
Commentary generated.


In [12]:
def format_newsletter(retrieved_data, summaries_dict, commentaries_dict):
    """Render retrieved articles as a Markdown newsletter string.

    Args:
        retrieved_data: Query result whose ``ids[0]`` and ``metadatas[0]`` are
            parallel lists; each metadata dict needs ``title``, ``link`` and
            ``published``. A falsy value or an empty ``ids`` yields the
            "no articles" message.
        summaries_dict: Maps article id to summary text; missing ids render
            as "Summary not available.".
        commentaries_dict: Maps article id to commentary text; the
            "Quick Take" line is omitted when the entry is missing or empty.

    Returns:
        The newsletter as a single string.
    """
    parts = [
        "# Your AI Agent & Workflow Digest 📰\n\n",
        "Here are some articles curated based on your interests:\n\n",
    ]

    # Guard clause: nothing retrieved at all.
    if not (retrieved_data and retrieved_data.get('ids') and retrieved_data['ids'][0]):
        parts.append("No relevant articles found this time.")
        return "".join(parts)

    all_metadata = retrieved_data['metadatas'][0]
    for position, article_id in enumerate(retrieved_data['ids'][0]):
        metadata = all_metadata[position]
        summary = summaries_dict.get(article_id, "Summary not available.")
        commentary = commentaries_dict.get(article_id, "")

        parts.extend([
            f"## {metadata['title']}\n\n",
            f"**Source:** [{metadata['link']}]({metadata['link']})\n",
            f"**Published:** {metadata['published']}\n\n",
            f"**Summary:** {summary}\n\n",
        ])
        if commentary:
            parts.append(f"**Quick Take:** {commentary}\n\n")
        parts.append("---\n\n")

    return "".join(parts)

# --- Generate the final output ---
final_newsletter = format_newsletter(
    retrieved_data=relevant_articles_data,
    summaries_dict=summaries,
    commentaries_dict=commentaries,
)

# --- Print the result between a start and an end banner ---
print(
    "\n\n--- GENERATED NEWSLETTER ---",
    final_newsletter,
    "--- END OF NEWSLETTER ---",
    sep="\n",
)



--- GENERATED NEWSLETTER ---
# Your AI Agent & Workflow Digest 📰

Here are some articles curated based on your interests:

## Ghosts in the Machine: The Rise of Hidden AI on Social Media

**Source:** [https://www.hackingbutlegal.com/p/ghosts-in-the-machine-the-rise-of](https://www.hackingbutlegal.com/p/ghosts-in-the-machine-the-rise-of)
**Published:** Thu, 06 Mar 2025 21:53:05 GMT

**Summary:** The article discusses the growing presence of "hidden AI" on social media, which refers to artificial intelligence (AI) that is designed to mimic human behavior but is not immediately recognizable as such. The author, a researcher in information security, notes that the distinction between human and AI is becoming increasingly blurred online. The author shares a personal experience where they interacted with a Twitter account that seemed suspiciously artificial, with unnatural language and behavior. The article suggests that this type of AI is becoming more prevalent and raises questions about

In [None]:
def _rebuild_article_index(articles):
    """Replace the contents of the global Chroma ``collection`` with ``articles``; return how many were stored."""
    # Clear earlier entries first: every fetch assigns fresh ids, so adding on
    # top of a previous run would store the same article more than once.
    stale_ids = collection.get(include=[])["ids"]
    if stale_ids:
        collection.delete(ids=stale_ids)

    ids, documents, metadatas = [], [], []
    for article in articles:
        text = clean_text(article.get("content") or "")
        if not text:
            continue
        ids.append(article["id"])
        documents.append(text)
        metadatas.append({
            "title": article["title"],
            "link": article["link"],
            "published": article["published"],
        })

    if not ids:
        return 0

    embeddings = embedding_model.encode(documents, convert_to_tensor=False).tolist()
    collection.add(ids=ids, embeddings=embeddings, documents=documents, metadatas=metadatas)
    return len(ids)


def run_newsletter_workflow(user_preferences):
    """Run the pipeline end to end and return the finished newsletter.

    Steps: fetch the feeds in the global ``rss_feed_urls``, rebuild the vector
    index from them, retrieve the articles closest to the user's keywords,
    summarize each one, write a short commentary for each, and format the
    result. The optional page-scraping step is not part of this workflow; the
    text delivered by the feeds is used.

    Args:
        user_preferences: Dict providing ``"keywords"`` (list of strings used
            for retrieval) and ``"preferred_tone"`` (tone passed to the
            commentary prompt).

    Returns:
        ``(newsletter, message)``. ``newsletter`` is the Markdown string on
        success and ``None`` when a step produced nothing or raised;
        ``message`` describes the outcome.
    """
    try:
        # Step 1: Fetch data
        print("Fetching data...")
        fetched_data = fetch_articles_from_feeds(rss_feed_urls)
        if not fetched_data:
            return None, "No data fetched. Please check your preferences or sources."

        # Step 2: Process with RAG (index, then retrieve by keyword similarity)
        print("Processing data with RAG...")
        processed_data = None
        if _rebuild_article_index(fetched_data):
            processed_data = retrieve_relevant_articles(user_preferences["keywords"])
        if not processed_data or not processed_data.get("ids") or not processed_data["ids"][0]:
            return None, "Failed to process data with RAG."

        # ids, documents and metadatas are parallel lists for the single query.
        hits = list(zip(
            processed_data["ids"][0],
            processed_data["documents"][0],
            processed_data["metadatas"][0],
        ))

        # Step 3: Summarize with LLM
        print("Summarizing articles with LLM...")
        summaries = {article_id: generate_summary(document) for article_id, document, _ in hits}
        if not summaries:
            return None, "Failed to generate summaries."

        # Step 4: Generate commentary (one call per article: summary, title, tone)
        print("Generating commentary...")
        tone = user_preferences["preferred_tone"]
        commentaries = {
            article_id: generate_commentary(summaries[article_id], metadata["title"], tone)
            for article_id, _, metadata in hits
        }
        if not commentaries:
            return None, "Failed to generate commentaries."

        # Step 5: Format newsletter
        print("Formatting newsletter...")
        newsletter = format_newsletter(processed_data, summaries, commentaries)
        if not newsletter:
            return None, "Failed to format newsletter."

        return newsletter, "Newsletter generated successfully!"

    except Exception as e:
        print(f"Error in workflow: {e}")
        return None, f"An error occurred: {e}"