In [2]:
# =========================
# RAG PIPELINE — WIKIPEDIA FETCH, CHROMA INDEX, LLM ANSWER
# =========================

import warnings, time, re, urllib.parse, requests, numpy as np
warnings.filterwarnings("ignore")

from typing import List, Dict
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from langchain_core.documents import Document
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from huggingface_hub import InferenceClient


In [17]:
# -------- CONFIG --------
# User-Agent header attached to every HTTP session built by session_with_retries().
USER_AGENT = "RAG-Notebook/0.1"
# Base URL for the Wikipedia page-search request issued in wikipedia_fetch().
WIKI_REST_API = "https://en.wikipedia.org/w/rest.php/v1"
# Endpoint queried for plain-text page extracts in wikipedia_fetch().
WIKI_ACTION_API = "https://en.wikipedia.org/w/api.php"
# Question used both as the Wikipedia search query and as the RAG query in the run cell.
TOPIC = "How has artificial intelligence been used in military operations?"


In [None]:


# -------- HELPERS --------
def clean_text(t):
    """Collapse every run of whitespace in ``t`` to a single space and trim both ends."""
    collapsed = re.sub(r"\s+", " ", t)
    return collapsed.strip()

def session_with_retries(retries=3, backoff=1.5):
    """Build a ``requests.Session`` carrying the notebook User-Agent and retry settings.

    The retry count and backoff base are stored as plain ``retries`` /
    ``backoff`` attributes on the session object; ``get_json`` reads them to
    drive its retry loop.
    """
    session = requests.Session()
    session.headers.update({"User-Agent": USER_AGENT})
    session.retries = retries
    session.backoff = backoff
    return session

def get_json(session, url, params):
    """GET ``url`` with ``params`` and return the decoded JSON body, retrying on failure.

    Args:
        session: Object exposing ``get(url, params=..., timeout=...)`` plus the
            ``retries`` (number of attempts) and ``backoff`` (base of the
            exponential delay) attributes set by ``session_with_retries``.
        url: Endpoint to request.
        params: Query parameters forwarded to ``session.get``.

    Returns:
        The value of ``response.json()`` from the first successful attempt.

    Raises:
        RuntimeError: When every attempt failed. The last underlying error is
            attached as ``__cause__`` so the real reason is not lost.
    """
    attempts = session.retries
    last_error = None
    for attempt in range(attempts):
        try:
            response = session.get(url, params=params, timeout=15)
            response.raise_for_status()
            return response.json()
        except Exception as exc:  # best-effort: any failure counts as a failed attempt
            last_error = exc
            # Only wait when another attempt will actually follow; sleeping
            # after the final failure just delays the error.
            if attempt < attempts - 1:
                time.sleep(session.backoff ** attempt)
    raise RuntimeError(f"Request failed: {url}") from last_error

def filter_near_duplicates(docs, embeddings, threshold=0.92):
    """Drop documents whose embedding is nearly identical to an earlier kept one.

    Documents are scanned in order. Each not-yet-discarded document is kept,
    and every document whose cosine similarity to it is ``>= threshold`` is
    marked as used so it is skipped later.

    Args:
        docs: Sequence of objects exposing a ``page_content`` string.
        embeddings: Object exposing ``embed_documents(list_of_texts)`` that
            returns one vector per text.
        threshold: Cosine-similarity cut-off at or above which two documents
            count as duplicates.

    Returns:
        A list with the kept documents, in their original order.
    """
    # Nothing to compare; also avoids calling the embedder with an empty batch.
    if not docs:
        return []

    vecs = np.array(embeddings.embed_documents([d.page_content for d in docs]))
    # Row norms do not change inside the loop, so compute them once.
    norms = np.linalg.norm(vecs, axis=1)

    keep, used = [], set()
    for i in range(len(docs)):
        if i in used:
            continue
        keep.append(i)
        # Cosine similarity of every row against row i (epsilon guards zero vectors).
        sims = (vecs @ vecs[i]) / (norms * norms[i] + 1e-9)
        used.update(int(j) for j in np.where(sims >= threshold)[0])
    return [docs[i] for i in keep]



In [11]:

# -------- WIKIPEDIA FETCH --------
def wikipedia_fetch(topic, max_pages=6):
    """Search Wikipedia for ``topic`` and return the plain text of the matching pages.

    Args:
        topic: Free-text search query.
        max_pages: Maximum number of search hits to request.

    Returns:
        A list of ``{"title", "url", "text"}`` dicts, in search-rank order.
        Pages whose cleaned extract is shorter than 300 characters are skipped.
        The list is empty when the search yields no pages.

    Raises:
        RuntimeError: Propagated from ``get_json`` when a request keeps failing.
    """
    s = session_with_retries()
    pages = get_json(
        s, f"{WIKI_REST_API}/search/page", {"q": topic, "limit": max_pages}
    ).get("pages", [])

    docs = []
    for hit in pages:
        title = hit["title"]
        # Whole-article extracts are limited to one page per request (several
        # are only returned together with exintro), so a single batched
        # "titles=a|b|c" query would yield text for just one article. Ask for
        # each title separately instead.
        result = get_json(
            s, WIKI_ACTION_API,
            {
                "action": "query",
                "format": "json",
                "prop": "extracts",
                "explaintext": "1",
                "exlimit": "1",
                "titles": title,
            },
        )
        # "query" may be absent from the response; treat that as no pages.
        for page in result.get("query", {}).get("pages", {}).values():
            text = clean_text(page.get("extract", ""))
            if len(text) < 300:
                continue
            page_title = page.get("title", title)
            url = f"https://en.wikipedia.org/wiki/{urllib.parse.quote(page_title.replace(' ', '_'))}"
            docs.append({"title": page_title, "url": url, "text": text})
    return docs



In [10]:

# -------- INGEST --------
# Download the source articles for the configured topic (performs HTTP requests).
docs = wikipedia_fetch(topic=TOPIC)


In [13]:

# Wrap the fetched article dicts as LangChain Documents. Items that already
# expose ``page_content`` are passed through, so re-running this cell after
# ``docs`` has been converted does not fail.
docs = [
    d if hasattr(d, "page_content")
    else Document(
        page_content=d["text"],
        metadata={"title": d["title"], "source": d["url"]},
    )
    for d in docs
]

# Split articles into overlapping chunks of at most 900 characters.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=900,
    chunk_overlap=120,
    separators=["\n\n", "\n", ". ", "? ", "! ", "; ", ": ", " "],
)
chunks = splitter.split_documents(docs)

embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
chunks = filter_near_duplicates(chunks, embeddings)

# Fail early with a readable message instead of an obscure vector-store error.
if not chunks:
    raise ValueError(
        "No chunks to index: the Wikipedia fetch returned no usable articles."
    )

collection_name = "industry_data"

# Without a persist_directory the collection lives in the kernel process, so a
# previous run of this cell would leave its chunks behind and this run would
# append the same chunks again. Drop any leftover collection first so the cell
# is idempotent.
Chroma(
    collection_name=collection_name,
    embedding_function=embeddings,
).delete_collection()

vectorstore = Chroma(
    collection_name=collection_name,
    embedding_function=embeddings,
)
vectorstore.add_documents(chunks)
# No vectorstore.persist() here: the store is created without a
# persist_directory, so there is nothing on disk to persist to.

# MMR retrieval: pick 4 diverse chunks out of the 50 nearest candidates.
retriever = vectorstore.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 4, "fetch_k": 50, "lambda_mult": 0.7},
)


Loading weights: 100%|██████████| 103/103 [00:00<00:00, 1531.33it/s, Materializing param=pooler.dense.weight]                             
BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


In [21]:


# -------- GENERATION --------
# Shared inference client used by generate_answer() and needs_retrieval().
# NOTE(review): no model or token is passed here — presumably credentials come
# from the environment / local Hugging Face login; confirm before sharing.
client = InferenceClient()

def generate_answer(
    query,
    docs,
    model="meta-llama/Llama-3.1-8B-Instruct",
    max_tokens=200,
    temperature=0.2,
):
    """Ask the chat model to answer ``query``, grounded in ``docs`` when provided.

    Args:
        query: The user question.
        docs: Iterable of objects exposing ``page_content``; may be empty or
            ``None`` when no retrieval was performed.
        model: Chat model identifier passed to the inference client.
        max_tokens: Upper bound on generated tokens (longer answers are cut off).
        temperature: Sampling temperature.

    Returns:
        The text content of the first completion choice.
    """
    context = "\n".join(d.page_content for d in docs) if docs else ""

    if context:
        messages = [
            {"role": "system", "content": "Answer only using the context."},
            {"role": "user", "content": f"Context:\n{context}\n\nQuestion:\n{query}"},
        ]
    else:
        # No retrieved context: do not instruct the model to rely on an empty
        # context block, which contradicts the no-retrieval path's intent.
        messages = [
            {"role": "system", "content": "Answer the question directly and concisely."},
            {"role": "user", "content": f"Question:\n{query}"},
        ]

    completion = client.chat.completions.create(
        model=model,
        temperature=temperature,
        max_tokens=max_tokens,
        messages=messages,
    )
    return completion.choices[0].message.content


In [19]:
def retrieve_tool(query: str):
    """Tool wrapper: run ``query`` through the module-level ``retriever`` and return its results."""
    results = retriever.invoke(query)
    return results

def answer_tool(query: str, docs=None):
    """Tool wrapper around ``generate_answer``.

    Args:
        query: The user question.
        docs: Optional retrieved documents; ``None`` is treated as an empty list.

    Returns:
        Whatever ``generate_answer`` returns for ``query`` and the given docs.
    """
    # generate_answer builds the context string itself, so nothing is precomputed here.
    return generate_answer(query, docs or [])


In [20]:
def needs_retrieval(query: str) -> bool:
    """Ask the chat model whether ``query`` requires external knowledge.

    Args:
        query: The user question to classify.

    Returns:
        ``True`` when the model's short reply contains the standalone word
        "YES" (case-insensitive); ``False`` otherwise, including when the
        reply is empty or missing.
    """
    decision_prompt = f"""
Decide if the following question requires external knowledge.

Question: {query}

Answer only YES or NO.
"""
    out = client.chat.completions.create(
        model="meta-llama/Llama-3.1-8B-Instruct",
        messages=[{"role": "user", "content": decision_prompt}],
        max_tokens=5,
        temperature=0.0,
    )
    # The message content may be None; treat that as an empty reply.
    reply = out.choices[0].message.content or ""
    # Match YES as a whole word so it is not picked up inside another word.
    return re.search(r"\bYES\b", reply.upper()) is not None


In [22]:
def agentic_answer(query: str):
    """Answer ``query``, retrieving supporting documents only if the model asks for them.

    ``needs_retrieval`` decides first; when it returns true the documents from
    ``retrieve_tool`` are passed to ``generate_answer``, otherwise an empty list is.
    """
    docs = retrieve_tool(query) if needs_retrieval(query) else []
    return generate_answer(query, docs)


In [None]:

# -------- RUN --------
# Plain RAG path: always retrieve for the configured topic, then generate and show the answer.
retrieved_docs = retriever.invoke(TOPIC)
answer = generate_answer(query=TOPIC, docs=retrieved_docs)
print(answer)


Artificial intelligence has been used in military operations in various countries, including Iraq, Syria, Israel, and Ukraine. Research is targeting intelligence collection and analysis, logistics, cyber operations, information operations, and semiautonomous and autonomous vehicles.


In [25]:
# Agentic path: needs_retrieval() decides whether the retriever is consulted before answering.
print(agentic_answer(query="How has AI been used in education?"))

AI has been increasingly used in education in various ways. Some of the key applications include:

1. **Personalized Learning**: AI-powered systems can analyze a student's learning style, pace, and abilities to provide customized learning experiences. This can help students learn more effectively and efficiently.

2. **Intelligent Tutoring Systems**: AI-based systems can offer one-on-one support to students, providing real-time feedback and guidance on complex topics. These systems can also adapt to a student's learning needs and adjust the level of difficulty accordingly.

3. **Automated Grading**: AI can help teachers with grading by automating the process of scoring assignments and exams. This can save teachers time and reduce the risk of human error.

4. **Natural Language Processing (NLP)**: AI-powered NLP can help students improve their language skills by providing interactive language learning tools, such as chatbots and virtual language assistants.

5. **Content Creation**: AI 