# Section 3 -- Image analysis & RAG

## Setup

In [11]:
from google.colab import userdata
from google.colab.userdata import SecretNotFoundError
from getpass import getpass

try:
    key = userdata.get('OPENAI_API_KEY')
    print("OPENAI_API_KEY is set in Colab Secrets.")
except SecretNotFoundError:
    print("No OPENAI_API_KEY found in Colab Secrets.")
    print("Open the 'Secrets' tab on the left (the key icon')")
    print("Click 'Add new secret'")
    print("Set the name to OPENAI_API_KEY")
    print("Set the value to your API key")
    print("Click on the 'notebook access' toggle to turn it on")

OPENAI_API_KEY is set in Colab Secrets.


## Setup — Install Dependencies

Installs runtime libraries. We pin FAISS to CPU flavor for portability. Also installs `gdown` to fetch the shared folder by ID if needed.

In [12]:
!pip -q install polars==1.* faiss-cpu openai==1.* tqdm pillow gdown

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m56.5 MB/s[0m eta [36m0:00:00[0m
[?25h

## Setup — Mount Drive or Auto-Download Shared Folder

The class folder is shared at:  
`https://drive.google.com/drive/folders/1XTbuR5vcz7FsikBRPh1SXwFjbUY4mx46`

This script will download the materials in that folder to your Google Drive. You will see a folder appear called `DPI681 Section Materials` in your drive. If you would like to put it elsewhere, change the file path in `STUDENT_WORKDIR`.

In [15]:
# @title
import os
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Preferred path if students added the shared folder (or a shortcut) to their MyDrive:
CLASS_FOLDER = "/content/drive/MyDrive/Teaching/DPI681 Materials/RAG Materials"

# Fallback: download shared folder by ID (read-only) into local runtime.
FOLDER_ID = "1XTbuR5vcz7FsikBRPh1SXwFjbUY4mx46"
LOCAL_FALLBACK = "/content/class_materials"

def ensure_class_folder():
    global CLASS_FOLDER
    os.makedirs(LOCAL_FALLBACK, exist_ok=True)
    # gdown will create a subdir named by the folder; normalize to the first/only dir if needed
    !gdown --folder "$FOLDER_ID" -O "$LOCAL_FALLBACK" -q
    # Try to detect the actual directory with our expected files.
    candidates = []
    for root, dirs, files in os.walk(LOCAL_FALLBACK):
        if "images.csv" in files or ("faiss_index.bin" in files and "metadata.json" in files):
            candidates.append(root)
    if candidates:
        candidates.sort(key=len)
        CLASS_FOLDER = candidates[0]
        print("Using downloaded class folder:", CLASS_FOLDER)
        return CLASS_FOLDER
    else:
        raise FileNotFoundError("Downloaded folder does not contain expected files. Please add the shared folder to your Drive or adjust paths.")

CLASS_FOLDER = ensure_class_folder()


Mounted at /content/drive
Using downloaded class folder: /content/class_materials


In [16]:
# Student output location (Change this if you would like)
STUDENT_WORKDIR = "/content/drive/MyDrive/DPI681 Section Materials"

os.makedirs(STUDENT_WORKDIR, exist_ok=True)
print("STUDENT_WORKDIR:", STUDENT_WORKDIR)

STUDENT_WORKDIR: /content/drive/MyDrive/DPI681 RAG Data


In [17]:
# Common imports
import os, json
from typing import List, Dict, Any
import polars as pl
import numpy as np
import faiss
from tqdm import tqdm
from openai import OpenAI

from google.colab import userdata
OPENAI_API_KEY = userdata.get("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise RuntimeError("OPENAI_API_KEY missing from Colab Secrets. Rerun the secrets cell.")
client = OpenAI(api_key=OPENAI_API_KEY)

---
## Activity 1 — Bulk Image Analysis (Multimodal)

**Goal:** send a text+image prompt to the OpenAI Responses API for each image in `images.csv` and save results.


In [27]:
IMAGES_CSV = os.path.join(CLASS_FOLDER, "images.csv")       # must include columns: image_id, url
RESULTS_CSV = os.path.join(STUDENT_WORKDIR, "images_analysis_results.csv")
MODEL = "gpt-5-mini"

### Load the dataset (Polars) and validate columns

In [22]:
if not os.path.exists(IMAGES_CSV):
    raise FileNotFoundError(f"images.csv not found at {IMAGES_CSV}. Check the shared folder or fallback download.")

df_images = pl.read_csv(IMAGES_CSV)
df_images.head(3)

image_id,url
i64,str
40,"""https://api.time.com/wp-conten…"
60,"""https://api.time.com/wp-conten…"
20,"""https://api.time.com/wp-conten…"


### Process images with the Responses API
Sends a single user message containing text **and** an image URL per row. Saves a Polars DataFrame to CSV in your `STUDENT_WORKDIR`.

In [29]:
results_raw: List[Dict[str, Any]] = []

# Your prompt: edit freely and rerun
PROMPT_TEXT = "Who is in the image?"

for row in tqdm(df_images.iter_rows(named=True), total=df_images.height, desc="Processing Images"):
    image_id = row["image_id"]
    image_url = row["url"]

    resp = client.responses.create(
        model=MODEL,
        input=[{
            "role": "user",
            "content": [
                {"type": "input_text", "text": PROMPT_TEXT},
                {"type": "input_image", "image_url": str(image_url)},
            ],
        }],
    )
    output_text = (resp.output_text or "").strip()

    results_raw.append({"image_id": image_id, "url": image_url, "output_text": output_text})

results = pl.DataFrame(results_raw)
results.write_csv(RESULTS_CSV)
print(f"Image analysis complete. Results saved to: {RESULTS_CSV}")

Processing Images: 100%|██████████| 5/5 [01:06<00:00, 13.40s/it]

Image analysis complete. Results saved to: /content/drive/MyDrive/DPI681 RAG Data/images_analysis_results.csv





In [30]:
results

image_id,url,output_text
i64,str,str
40,"""https://api.time.com/wp-conten…","""I’m sorry — I can’t help ident…"
60,"""https://api.time.com/wp-conten…","""Sorry — I can’t help identify …"
20,"""https://api.time.com/wp-conten…","""Sorry — I can’t help identify …"
45,"""https://api.time.com/wp-conten…","""Sorry — I can’t help identify …"
47,"""https://api.time.com/wp-conten…","""Sorry — I can’t help identify …"


In [31]:
results.write_csv(RESULTS_CSV)

---
## Activity 2 — Legal Assistant with RAG (One-Shot)

**Pipeline**
1. Load FAISS index and `metadata.json` (prebuilt).
2. Embed the query (`text-embedding-3-small`).
3. Retrieve top-K documents and format brief citations.
4. Call the **Responses API** with a system prompt + retrieved context + user question.  
**Model replies** and cites as `Chapter [X] Section [Y]` with a link.

In [41]:
FAISS_INDEX_FILE = os.path.join(CLASS_FOLDER, "faiss_index.bin")
METADATA_FILE = os.path.join(CLASS_FOLDER, "metadata.json")
EMBEDDING_MODEL = "text-embedding-3-small"
GEN_MODEL = "gpt-5-mini"
TOP_K = 3

if not os.path.exists(FAISS_INDEX_FILE) or not os.path.exists(METADATA_FILE):
    raise FileNotFoundError("Missing FAISS index or metadata.json in CLASS_FOLDER.")

faiss_index = faiss.read_index(FAISS_INDEX_FILE)
with open(METADATA_FILE, "r", encoding="utf-8") as f:
    metadata = json.load(f)

def get_embedding(text: str) -> np.ndarray:
    txt = text if isinstance(text, str) else str(text)
    txt = txt[:8150]
    r = client.embeddings.create(model=EMBEDDING_MODEL, input=txt)
    emb = r.data[0].embedding
    return np.asarray(emb, dtype=np.float32)

def retrieve_context(query: str, top_k: int = TOP_K) -> str:
    q = get_embedding(query)
    q = np.expand_dims(q, axis=0)
    D, I = faiss_index.search(q, top_k)
    lines = []
    for idx in I[0]:
        if 0 <= idx < len(metadata):
            doc = metadata[idx]
            citation = f"(Chapter {doc.get('chapter','?')} Section {doc.get('section','?')}, {doc.get('link','No link')})"
            text = (doc.get('full_text','').replace('\n',' ')).strip()
            lines.append(f"{citation}: {text}")
    return "Retrieved context:\n" + "\n".join(lines) + "\n" if lines else ""


### Try it (one-shot)

In [None]:
# System prompt to define the behavior
BASE_SYSTEM_PROMPT = (
    "You are a legal assistant helping non-lawyers understand Massachusetts real-estate law. "
    "You are not a lawyer and cannot provide legal advice. "
    "Point users to the relevant section of the law and explain how it applies to them in an easy to understand way. "
    "Do not return your replies in markdown, only plain text. "
    "Cite sources as 'Chapter [Chapter] Section [Section]' with a link at the end."
)

q = "What are the notice requirements for eviction in Massachusetts?"

# Get the necessary context from our vector database
ctx = retrieve_context(q)

# Add it to the prompt (+ trick we talked about last time)
sys_prompt = BASE_SYSTEM_PROMPT + "\n" + ctx

# Send request to model with the actual legal code appended to the query
resp = client.responses.create(
    model=MODEL,
    input=[
        {"role":"system","content":sys_prompt},
        {"role":"user","content":q}
    ],
)

print(resp.output_text)

---
## Legal Assistant with RAG (Chat)

Maintains a small conversation history and augments each turn with retrieved context.  
Streaming omitted for simplicity; we return final text.

In [45]:
# @title
from typing import List, Dict

conversation_history: List[Dict[str,str]] = []

def chat_once(user_text: str, max_context_msgs: int = 10) -> str:
    ctx = retrieve_context(user_text)
    sys_prompt = BASE_SYSTEM_PROMPT + "\n" + ctx
    msgs = [{"role":"system","content":sys_prompt}] + conversation_history[-max_context_msgs:] + [
        {"role":"user","content":user_text}
    ]
    r = client.responses.create(model=MODEL, input=msgs)
    out = (r.output_text or "").strip()
    conversation_history.append({"role":"user","content":user_text})
    conversation_history.append({"role":"assistant","content":out})
    return out

def chat():
    print("RAG Chat With Mass Real Estate Code Bot — type 'exit' to quit.\n")
    while True:
        try:
            u = input("> ").strip()
        except EOFError:
            break
        if u.lower() in ("exit","quit"):
            print("Goodbye.")
            break
        reply = chat_once(u)
        print("\n" + reply + "\n")

In [None]:
chat()