# Using RAG to Engage with IPCC Reports

## First Steps
- create .venv (I'm using Python 3.12)  
- install requirements.txt (either automatically while setting up, or run `pip install -r requirements.txt`)
- get API key from:
    - https://kisski.gwdg.de/leistungen/2-02-llm-service/ or
    - https://console.groq.com/keys  
    &rarr; Groq can be used with the OpenAI library with limitations, see https://console.groq.com/keys
  - create `.env` file with:
  > OPENAI_API_KEY = "YOUR-API-KEY"  
  > KISSKI_URL = "https://chat-ai.academiccloud.de/v1"  

  OR (for Groq):  

  > OPENAI_API_KEY = "YOUR-API-KEY"  
  > KISSKI_URL = "https://api.groq.com/openai/v1"  

  (if you change the variable name you'll have to change it in the code too)

## Overview

![graph.png](graph.png)

1. Extract information from official IPCC reports

2. Prepare the data for smart search

3. Use AI to answer relevant questions

4. Log everything for evaluation and improvement

## Code & Notes

In [2]:
from enum import Enum
import faiss
import os
from llama_index.vector_stores.faiss import FaissVectorStore
from llama_index.core import VectorStoreIndex, StorageContext, load_index_from_storage, SimpleDirectoryReader
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from openai import OpenAI
from dotenv import load_dotenv
import json
import textwrap
from IPython.display import Markdown, display
from bs4 import BeautifulSoup
import glob
from bert_score import score as bert_score
from rouge_score import rouge_scorer
from sentence_transformers import SentenceTransformer, util
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


- Text and data processing (BeautifulSoup, glob, json, os)

- Embedding and search (HuggingFace, FAISS)

- Language model connections (OpenAI, LLaMA)

- Scoring and evaluation tools (BERTScore, ROUGE)

In [3]:
#####
# Transform HTML
#####

input_folder = "html"
output_file = "txt/numbered_chunks.txt"

all_chunks = []

# Process the HTML files in sorted order so the chunk file is reproducible
# (glob returns paths in arbitrary order).
for html_file in sorted(glob.glob(os.path.join(input_folder, "*.html"))):
    with open(html_file, encoding="utf-8") as f:
        soup = BeautifulSoup(f, "html.parser")

    # Keep only paragraphs that carry a non-empty id and some text
    for p in soup.find_all("p"):
        pid = p.get("id")
        # Collapse all whitespace (including line breaks inside the paragraph)
        # so that every chunk occupies exactly one line of the output file.
        text = " ".join(p.get_text().split())
        if pid and text:
            all_chunks.append(f"[{pid}] {text}")

# Make sure the output folder exists before writing
output_folder = os.path.dirname(output_file)
if output_folder:
    os.makedirs(output_folder, exist_ok=True)

# Save all chunks to a text file, one "[id] text" entry per line
with open(output_file, "w", encoding="utf-8") as f:
    f.writelines(chunk + "\n" for chunk in all_chunks)

This step takes raw, complex IPCC reports (in HTML) and breaks them down into manageable pieces -- one paragraph per line, each with a unique ID.

Advantages: 

- Much more readable than the raw HTML  
&rarr; every answer can be traced back to the ID of the paragraph it came from

In [55]:
#####
# Enums and variables
#####


# Instruction shared by every prompt level: the model must cite the
# paragraph IDs it used.
ID_prompt = (
    "\n"
    "Pass back the full ID of the paragraph(s) from the input file you're taking the information from.\n"
)


class Prompt(Enum):
    """System prompts, one per audience level; each starts with ``ID_prompt``."""

    BASIC = ID_prompt + " You are explaining to someone with basic knowledge of the topic."
    ADVANCED = ID_prompt + " You are explaining to someone with advanced knowledge of the topic."

class Model(Enum):
    """Selectable chat models; the value is the model identifier sent to the API."""

    LLAMA = "meta-llama-3.1-8b-instruct"
    GEMMA = "gemma-3-27b-it"
    
class Embedding(Enum):
    """Selectable embedding models; the value is the HuggingFace model name."""

    MINILM = 'sentence-transformers/all-MiniLM-L6-v2'
    GTR = 'sentence-transformers/gtr-t5-base'
    MPNET = 'sentence-transformers/paraphrase-mpnet-base-v2'

# Vector size produced by each embedding model; the FAISS index must be
# created with the same dimension.
EMBED_DIM_MAP = {
    Embedding.MINILM: 384,
    Embedding.MPNET: 768,
    Embedding.GTR: 768,
}

# Every question / answer / context triple is appended here for evaluation
jsonl_filepath = "eval/log.jsonl"

# Active choices -- change these to switch LLM, answer depth, or embedding
llm_model = Model.LLAMA
answer_level = Prompt.BASIC
embed_model = Embedding.MINILM
# Derived from the selected embedding model so the two can never disagree
# (previously hard-coded to MINILM, which broke any other embedding choice).
vector_dimensions = EMBED_DIM_MAP[embed_model]

index_dir = "./faiss_index"  # where the FAISS index is persisted
input_dir = "./txt"          # folder with the text files to index
tokens_per_chunk = 1024      # chunk_size passed to SentenceSplitter
chunk_overlap = 200          # chunk_overlap passed to SentenceSplitter
force_rebuild = False        # True: ignore a stored index and rebuild it

All of these can be changed to suit the source texts better. 

- Different file paths  

- Different models for LLM & Embeddings

- Prompt base for the answers can be adjusted (give back paragraph IDs, depth of explanation)

In [56]:
#####
# Load, chunk, and embed input file
#####

def make_index(index_dir, embed_model, force_rebuild):
    """Load the persisted FAISS index from ``index_dir`` or build a new one.

    A stored index is reused only if ``force_rebuild`` is false, ``index_dir``
    exists and is not empty, and the stored vector dimension equals the
    module-level ``vector_dimensions``. Otherwise the documents found in the
    module-level ``input_dir`` are chunked, embedded and persisted to
    ``index_dir``.

    Args:
        index_dir: Directory the index is loaded from / persisted to.
        embed_model: Model name passed to ``HuggingFaceEmbedding``.
        force_rebuild: If true, ignore any stored index and rebuild it.

    Returns:
        The loaded or newly built index.
    """
    # Embed chunks (and, later, queries) with HuggingFace
    embedder = HuggingFaceEmbedding(model_name=embed_model)

    # Only touch the persisted store once we know there is something to load;
    # loading from a missing directory raises, which broke the first run.
    if not force_rebuild and os.path.isdir(index_dir) and os.listdir(index_dir):
        vector_store = FaissVectorStore.from_persist_dir(index_dir)
        stored_dim = vector_store.client.d

        # The stored vectors are only usable if their size matches the
        # currently selected embedding model.
        if stored_dim == vector_dimensions:
            storage_context = StorageContext.from_defaults(
                vector_store=vector_store, persist_dir=index_dir
            )
            index = load_index_from_storage(
                storage_context=storage_context, embed_model=embedder
            )
            print("Using stored index.")
            return index

        print(
            f"Stored index has dimension {stored_dim}, expected "
            f"{vector_dimensions}; rebuilding."
        )

    # Load the input file(s) from input_dir
    documents = SimpleDirectoryReader(input_dir=input_dir).load_data()
    print(f"Loaded {len(documents)} document(s).")

    # Chunk with SentenceSplitter, one document at a time
    splitter = SentenceSplitter(chunk_size=tokens_per_chunk, chunk_overlap=chunk_overlap)
    nodes = []
    for doc in documents:
        nodes.extend(splitter.get_nodes_from_documents([doc]))
    print(f"Generated {len(nodes)} chunks.")

    # Create an empty FAISS index sized for the embedding model
    faiss_index = faiss.IndexFlatL2(vector_dimensions)
    vector_store = FaissVectorStore(faiss_index=faiss_index)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)

    # Embedding happens here: every chunk is embedded and added to the index
    index = VectorStoreIndex(
        nodes,
        embed_model=embedder,
        storage_context=storage_context,
    )

    # Save index so later runs can reuse it
    index.storage_context.persist(persist_dir=index_dir)
    print(f"Index stored in {index_dir}")

    return index

Here we create a “searchable memory” of all the report paragraphs, using AI-powered embeddings.

In [57]:
#####
# LLM
#####

def load_llm(llm_model, answer_level):
    """Create a chat function bound to the configured endpoint and model.

    Loads ``.env`` and reads ``OPENAI_API_KEY`` and ``KISSKI_URL`` from the
    environment.

    Args:
        llm_model: Model identifier sent with every request.
        answer_level: Text used as the system message of every request.

    Returns:
        A function that takes a prompt string and returns the model's reply.

    Raises:
        ValueError: If the API key or the URL is missing or empty.
    """
    load_dotenv()

    api_key = os.getenv("OPENAI_API_KEY")
    base_url = os.getenv("KISSKI_URL")
    if not (api_key and base_url):
        raise ValueError("Missing API key or URL.")

    client = OpenAI(api_key=api_key, base_url=base_url)

    def ask_openai_llm(prompt: str) -> str:
        """Send ``prompt`` as the user message and return the reply text."""
        messages = [
            {"role": "system", "content": answer_level},
            {"role": "user", "content": prompt},
        ]
        completion = client.chat.completions.create(
            model=llm_model,
            messages=messages,
        )
        return completion.choices[0].message.content

    return ask_openai_llm


This function sets up the language model (“the brain” of the system).

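By default it connects to an API hosted by a German computing centre (KISSKI); any other OpenAI-compatible endpoint (e.g. Groq) can be used by changing `KISSKI_URL` in the `.env` file.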


In [58]:
#####
# Log for Eval
#####

def log_rag_example(filepath, question, answer, retrieved_context, reference=None):
    """Append one question/answer record to a JSON Lines log file.

    Args:
        filepath: Path of the ``.jsonl`` file; it and any missing parent
            folders are created on first use.
        question: The user's question.
        answer: The generated answer.
        retrieved_context: The context text that was sent to the model.
        reference: Optional reference answer; stored as ``null`` when
            omitted.
    """
    # Opening in append mode creates the file but not its folder, so a fresh
    # checkout without the log folder used to fail here.
    parent = os.path.dirname(filepath)
    if parent:
        os.makedirs(parent, exist_ok=True)

    record = {
        "question": question,
        "generated_answer": answer,
        "retrieved_context": retrieved_context,
        "reference_answer": reference,
    }
    with open(filepath, "a", encoding="utf-8") as f:
        f.write(json.dumps(record) + "\n")

        
#####
# Query
#####

def ask_question(index, ask_openai_llm):
    """Run an interactive question loop until the user types 'q'.

    For every question the matching chunks are retrieved from ``index``,
    joined into a context block, and sent together with the question to
    ``ask_openai_llm``. The question and answer are rendered as Markdown and
    the question, answer and context are appended to the log at the
    module-level ``jsonl_filepath``.

    Args:
        index: Index object providing ``as_retriever()``.
        ask_openai_llm: Callable taking the full prompt and returning the
            model's answer.
    """
    # The retriever does not depend on the question, so build it once
    # instead of on every loop iteration.
    retriever = index.as_retriever()

    while True:
        query = input("Enter your question (or type 'q'): ").strip()
        if query.lower() == 'q':
            print("Session ended.")
            break
        if not query:
            # Nothing to retrieve or answer for an empty line; ask again
            continue

        nodes = retriever.retrieve(query)
        context = "\n---\n".join(n.get_content() for n in nodes)

        # Same text as before, spelled out line by line
        full_prompt = (
            "\n"
            "  Context:\n"
            f"  {context}\n"
            "\n"
            "  Question:\n"
            f"  {query}"
        )

        # Coerce to str before displaying: textwrap.dedent needs a string,
        # and the coercion previously happened only after the display calls.
        answer = str(ask_openai_llm(full_prompt))

        print("\nQ:")
        display(Markdown(textwrap.dedent(query)))
        print("\nA:")
        display(Markdown(textwrap.dedent(answer)))
        print("___\n")

        log_rag_example(jsonl_filepath, query, answer, context, reference=None)


This is where each user question is combined with the retrieved context paragraphs into the prompt that is sent to the LLM (the system prompt chosen in `load_llm` is added to every request).

Additionally, every question and answer -- plus all supporting context -- are logged for evaluation.

In [59]:
#####
# Starting point
#####

# 1. Build the search index, or load the stored one
index = make_index(
    index_dir=index_dir,
    embed_model=embed_model.value,
    force_rebuild=force_rebuild,
)
# 2. Bind the chosen model and system prompt to a callable
ask_openai_llm = load_llm(llm_model=llm_model.value, answer_level=answer_level.value)
# 3. Start the interactive question loop
ask_question(index=index, ask_openai_llm=ask_openai_llm)

Using stored index.
Session ended.



This runs the main pipeline:

1. Builds or loads the search index

2. Loads the language model

3. Lets you ask a question and get an answer, with clear sourcing

In [52]:
#####
# Evaluation
#####

def load_examples(jsonl_path):
    """Read the logged examples from a JSON Lines file.

    Blank lines are skipped. A missing or ``null`` question or generated
    answer becomes ``""``. A missing or ``null`` reference answer becomes
    ``"\\n"``, a whitespace-only placeholder that callers can detect with
    ``str.strip()``.

    Args:
        jsonl_path: Path of the ``.jsonl`` log file.

    Returns:
        A tuple ``(questions, generated, references)`` of three lists of
        equal length, one entry per logged example.
    """
    questions, generated, references = [], [], []
    with open(jsonl_path, encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines, e.g. from manual edits of the log
                continue
            ex = json.loads(line)
            questions.append(ex.get("question") or "")
            generated.append(ex.get("generated_answer") or "")
            # The logger writes null when no reference was supplied, so the
            # key usually exists and a .get() default would not be applied.
            ref = ex.get("reference_answer")
            references.append("\n" if ref is None else ref)
    return questions, generated, references

def evaluate_bertscore(candidates, references, lang="en"):
    """Return the mean BERTScore precision, recall and F1 as plain floats.

    Args:
        candidates: Generated answers.
        references: Reference answers, aligned with ``candidates``.
        lang: Language code forwarded to ``bert_score``.
    """
    precision, recall, f1 = bert_score(candidates, references, lang=lang)
    per_metric = {"precision": precision, "recall": recall, "f1": f1}
    # Average over all examples and convert to a Python float
    return {name: float(values.mean()) for name, values in per_metric.items()}

def evaluate_rouge(candidates, references):
    """Return the mean ROUGE-1, ROUGE-2 and ROUGE-L F-measure over all pairs.

    Args:
        candidates: Generated answers.
        references: Reference answers, aligned with ``candidates``.
    """
    metric_names = ['rouge1', 'rouge2', 'rougeL']
    scorer = rouge_scorer.RougeScorer(metric_names, use_stemmer=True)
    # The scorer is called as score(reference, candidate)
    per_pair = [scorer.score(ref, cand) for cand, ref in zip(candidates, references)]
    return {
        key: np.mean([pair_scores[key].fmeasure for pair_scores in per_pair])
        for key in per_pair[0]
    }

def evaluate_cosine(candidates, references, model_name="all-mpnet-base-v2"):
    """Return the mean cosine similarity between aligned answer pairs.

    Args:
        candidates: Generated answers.
        references: Reference answers, aligned with ``candidates``.
        model_name: SentenceTransformer model used to embed both lists.
    """
    encoder = SentenceTransformer(model_name)
    reference_vecs = encoder.encode(references, convert_to_tensor=True)
    candidate_vecs = encoder.encode(candidates, convert_to_tensor=True)
    similarity = util.cos_sim(candidate_vecs, reference_vecs)
    # Only the diagonal pairs candidate i with reference i
    return {"cosine_similarity": float(similarity.diag().mean())}

# NOTE: the name shadows the built-in eval(); it is kept because callers use it.
def eval(path):
    """Score the logged answers against their references and print a report.

    Examples whose reference answer is empty or whitespace-only are left
    out. If none remain, a hint is printed and nothing is evaluated.

    Args:
        path: Path of the ``.jsonl`` log file.
    """
    _questions, generated, references = load_examples(path)

    # Keep only the pairs that actually have a reference answer
    pairs = [(gen, ref) for gen, ref in zip(generated, references) if ref.strip()]
    if not pairs:
        print("No reference answers found in data! Populate 'reference_answer' for proper eval.")
        return
    filtered_gen = [gen for gen, _ in pairs]
    filtered_ref = [ref for _, ref in pairs]

    print("Evaluating BERTScore...")
    bert = evaluate_bertscore(filtered_gen, filtered_ref)
    print("Evaluating ROUGE...")
    rouge = evaluate_rouge(filtered_gen, filtered_ref)
    print("Evaluating Cosine Similarity...")
    cosine = evaluate_cosine(filtered_gen, filtered_ref)

    print("\n=== RAG EVALUATION RESULTS ===")
    report = (
        ("BERTScore:", bert),
        ("ROUGE:", rouge),
        ("Cosine similarity:", cosine),
    )
    for heading, metrics in report:
        print(heading)
        for k, v in metrics.items():
            print(f"  {k}: {v:.4f}")

# Score every logged example that has a reference answer
eval(jsonl_filepath)


Evaluating BERTScore...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluating ROUGE...
Evaluating Cosine Similarity...

=== RAG EVALUATION RESULTS ===
BERTScore:
  precision: 0.6850
  recall: 0.8348
  f1: 0.7525
ROUGE:
  rouge1: 0.0000
  rouge2: 0.0000
  rougeL: 0.0000
Cosine similarity:
  cosine_similarity: -0.0556


Evaluates all previously stored Q&As that have a reference answer, using BERTScore, ROUGE, and cosine similarity.