In [1]:
!pip install --upgrade pip
!pip install requests beautifulsoup4 tqdm python-dotenv openai pandas
# for local vector DB:
!pip install faiss-cpu sentence-transformers
# optional: pinecone
!pip install "pinecone-client>=2.0.0"


Collecting pip
  Downloading pip-25.3-py3-none-any.whl.metadata (4.7 kB)
Downloading pip-25.3-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.3
Collecting faiss-cpu
  Downloading faiss_cpu-1.13.0-cp39-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (7.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cu

In [2]:
!pip install google-generativeai

Collecting protobuf (from google-generativeai)
  Downloading protobuf-5.29.5-cp38-abi3-manylinux2014_x86_64.whl.metadata (592 bytes)
Collecting cachetools<6.0,>=2.0.0 (from google-auth>=2.15.0->google-generativeai)
  Downloading cachetools-5.5.2-py3-none-any.whl.metadata (5.4 kB)
Downloading cachetools-5.5.2-py3-none-any.whl (10 kB)
Downloading protobuf-5.29.5-cp38-abi3-manylinux2014_x86_64.whl (319 kB)
Installing collected packages: protobuf, cachetools
[2K  Attempting uninstall: protobuf
[2K    Found existing installation: protobuf 6.33.0
[2K    Uninstalling protobuf-6.33.0:
[2K      Successfully uninstalled protobuf-6.33.0
[2K  Attempting uninstall: cachetools━━━━━━━━━━━━━━[0m [32m0/2[0m [protobuf]
[2K    Found existing installation: cachetools 6.2.1[32m0/2[0m [protobuf]
[2K    Uninstalling cachetools-6.2.1:━━━━━━━━━━[0m [32m0/2[0m [protobuf]
[2K      Successfully uninstalled cachetools-6.2.1m [32m0/2[0m [protobuf]
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
import getpass, os

GOOGLE_API_KEY = getpass.getpass("Enter your Google AI Studio API Key: ")
os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY

# configure client
import google.generativeai as genai
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])


Enter your Google AI Studio API Key:  ········


In [5]:
import requests, hashlib, time
from bs4 import BeautifulSoup
from tqdm import tqdm
from urllib.parse import urljoin

COMPANIES = ["google.com", "facebook.com"]
USER_AGENT = "Mozilla/5.0 (compatible; CompetitorBot/1.0)"

def fetch_url(url, timeout=10):
    try:
        r = requests.get(url, headers={"User-Agent": USER_AGENT}, timeout=timeout)
        r.raise_for_status()
        return r.text
    except Exception as e:
        print("fetch error:", url, e)
        return ""

def extract_text_from_html(html):
    soup = BeautifulSoup(html, "html.parser")
    for s in soup(["script","style","noscript"]): s.extract()
    text = " ".join(p.get_text(" ", strip=True) 
                    for p in soup.find_all("p"))
    return text[:20000]

def url_hash(url):
    return hashlib.sha1(url.encode()).hexdigest()


In [8]:
collected = []

for company in COMPANIES:
    base = f"https://{company}"

    # homepage
    html = fetch_url(base)
    if html:
        collected.append({"url": base, "source": "homepage", "html": html, "ts": int(time.time())})

    # blog/news pages
    for p in ["/blog", "/news", "/press"]:
        url = urljoin(base, p)
        html = fetch_url(url)
        if html:
            collected.append({"url": url, "source": "blog", "html": html, "ts": int(time.time())})

# Google News mini scraper
def fetch_google_news_snippets(query, top_n=5):
    url = f"https://news.google.com/search?q={requests.utils.quote(query)}"
    html = fetch_url(url)
    soup = BeautifulSoup(html, "html.parser")
    items = []
    for a in soup.select("article a")[:top_n]:
        href = a.get("href")
        if not href: continue
        if href.startswith("."):  # convert relative → absolute
            href = "https://news.google.com" + href[1:]
        items.append({"url": href, "title": a.get_text().strip()})
    return items

for company in COMPANIES:
    hits = fetch_google_news_snippets(company, 5)
    for h in hits:
        html = fetch_url(h["url"])
        if html:
            collected.append({"url": h["url"], "source": "news", "html": html, "ts": int(time.time())})

len(collected)


18

In [9]:
import pandas as pd

docs = []
seen = set()

for item in collected:
    uid = url_hash(item["url"])
    if uid in seen: continue
    seen.add(uid)

    text = extract_text_from_html(item["html"])
    if len(text) < 200: continue

    docs.append({
        "id": uid,
        "url": item["url"],
        "source": item["source"],
        "text": text,
        "ts": item["ts"]
    })

df = pd.DataFrame(docs)
df.head()


Unnamed: 0,id,url,source,text,ts
0,a57c9027afa4c1a75e0f2cbd16f56801b786839e,https://google.com/blog,blog,AI Nano Banana Pro is our new image generation...,1763666227
1,14a5db8c1022d5f8c060230efd1d0e924ccac759,https://google.com/press,blog,Contact our press team Only members of the pre...,1763666228
2,4e7b6e179032055ed5c7e59fee5bf7ce7a136d53,https://facebook.com/press,blog,Shop the latest devices and expand your world ...,1763666231


In [12]:
#embeddings using Google text-embedding-004 + FAISS

import numpy as np
import faiss
from tqdm import tqdm

# chunk function
def chunk_text(text, chunk_size=250, overlap=40):
    words = text.split()
    chunks = []
    i = 0
    while i < len(words):
        chunks.append(" ".join(words[i:i+chunk_size]))
        i += chunk_size - overlap
    return chunks

# create chunk records
chunk_records = []
for d in docs:
    chunks = chunk_text(d["text"])
    for i, c in enumerate(chunks):
        chunk_records.append({
            "chunk_id": f"{d['id']}_{i}",
            "doc_id": d["id"],
            "url": d["url"],
            "text": c,
            "ts": d["ts"]
        })


import google.generativeai as genai

embeddings = []
for r in tqdm(chunk_records):
    result = genai.embed_content(
        model="models/text-embedding-004",
        content=r["text"]
    )
    embedding = result["embedding"]
    embeddings.append(embedding)

embed_matrix = np.array(embeddings, dtype="float32")


# Build FAISS index
dim = embed_matrix.shape[1]
index = faiss.IndexFlatL2(dim)
index.add(embed_matrix)

# Save vector index + metadata
import pickle
with open("chunk_records.pkl", "wb") as f:
    pickle.dump(chunk_records, f)

faiss.write_index(index, "faiss_index.bin")

len(chunk_records)


100%|██████████| 3/3 [00:00<00:00,  4.42it/s]


3

In [14]:
# Retriever (fixed) using genai.embed_content + FAISS
import numpy as np
import google.generativeai as genai

# retrieve function: embed the query using genai.embed_content, then search FAISS
def retrieve(query, k=5):
    # get embedding from Google AI Studio
    resp = genai.embed_content(model="models/text-embedding-004", content=query)
    q_emb = np.array(resp["embedding"], dtype="float32").reshape(1, -1)  # shape (1, dim)

    # sanity check: dimension must match index dimension
    if q_emb.shape[1] != index.d:
        raise ValueError(f"Embedding dim mismatch: query_emb={q_emb.shape[1]} but faiss index dim={index.d}")

    # search FAISS
    D, I = index.search(q_emb, k)  # D: distances, I: indices
    results = []
    for idx_pos in I[0]:
        # sometimes index.search returns -1 for empty slots — guard against that
        if idx_pos < 0 or idx_pos >= len(chunk_records):
            continue
        results.append(chunk_records[idx_pos])
    return results

# Quick test
print("Running quick retrieval test for: 'pricing changes' ...\n")
hits = retrieve("pricing changes", k=3)
for i, r in enumerate(hits):
    print(f"Result #{i+1}")
    print("URL:", r["url"])
    print("Snippet:", r["text"][:300].replace("\n"," ") + ("..." if len(r["text"])>300 else ""))
    print("-"*80)


Running quick retrieval test for: 'pricing changes' ...

Result #1
URL: https://google.com/press
Snippet: Contact our press team Only members of the press will receive a response. For all other inquiries please visit Google's Help Center . Images on this page may be used for publication with credit: "Source: Google." Let’s stay in touch. Get the latest news from Google in your inbox. Follow Us
--------------------------------------------------------------------------------
Result #2
URL: https://google.com/blog
Snippet: AI Nano Banana Pro is our new image generation and editing model from Google DeepMind. Naina Raisinghani Developers Gemini App Collection Gemini 3, our most intelligent model, combines all of Gemini’s capabilities together so you can bring any idea to life. We’re bringing together experts, students,...
--------------------------------------------------------------------------------
Result #3
URL: https://facebook.com/press
Snippet: Shop the latest devices and expand you

In [31]:
analysis_model = genai.GenerativeModel("gemini-2.5-flash-lite")

def build_rag_prompt(company, query, contexts):
    ctx_str = "\n\n".join([f"SOURCE: {c['url']}\n{c['text'][:600]}" for c in contexts])
    return f"""
You are a competitive intelligence analyst.

Company: {company}
Query: {query}

Use ONLY the sources below:

{ctx_str}

Tasks:
1. Provide a 3-sentence summary of notable recent changes.
2. Provide 3 tactical implications for product/marketing.
3. Each implication must include: (a) priority level, (b) confidence level, (c) source URL.
"""

def analyze(company, query):
    contexts = retrieve(company + " " + query, k=6)
    prompt = build_rag_prompt(company, query, contexts)
    resp = analysis_model.generate_content(prompt)
    return resp.text

print(analyze("facebook.com", "new product launches"))


Here's an analysis of Meta's (facebook.com) new product launches based on the provided sources:

**1. Notable Recent Changes Summary:**

Meta has launched the Meta Quest 3S, a new device aimed at expanding users' worlds and enabling hands-free capture and sharing. This product emphasizes mixed reality experiences and new ways to connect. The company also highlighted its ongoing commitment to safety, privacy, and making a positive impact through its technologies.

**2. Tactical Implications for Product/Marketing:**

*   **Implication:** Focus marketing efforts on the "hands-free" and "mixed reality" aspects of the Meta Quest 3S to highlight its unique selling propositions and potential for new use cases.
    *   **Priority:** High
    *   **Confidence:** High
    *   **Source URL:** https://facebook.com/press

*   **Implication:** Develop product demonstrations and content that showcase how the Meta Quest 3S facilitates new forms of connection and content creation, encouraging user-gene

In [32]:
ALERT_KEYWORDS = ["launch", "pricing", "acquired", "hiring", "beta", "announced"]

def check_alerts(company):
    alerts = []
    for kw in ALERT_KEYWORDS:
        hits = retrieve(company + " " + kw, k=5)
        for h in hits:
            if int(time.time()) - h["ts"] < 7*24*3600:
                alerts.append((kw, h))
    return alerts

for kw, h in check_alerts("facebook.com"):
    print("ALERT:", kw, "→", h["url"])
    print(h["text"][:200], "\n---\n")


ALERT: launch → https://google.com/press
Contact our press team Only members of the press will receive a response. For all other inquiries please visit Google's Help Center . Images on this page may be used for publication with credit: "Sour 
---

ALERT: launch → https://facebook.com/press
Shop the latest devices and expand your world Capture, share and stay in the moment, completely hands-free. Dive into the wonder of mixed reality with the new Meta Quest 3S. Connect in new ways with o 
---

ALERT: launch → https://google.com/blog
AI Nano Banana Pro is our new image generation and editing model from Google DeepMind. Naina Raisinghani Developers Gemini App Collection Gemini 3, our most intelligent model, combines all of Gemini’s 
---

ALERT: pricing → https://facebook.com/press
Shop the latest devices and expand your world Capture, share and stay in the moment, completely hands-free. Dive into the wonder of mixed reality with the new Meta Quest 3S. Connect in new ways with o 
---

ALER