In [57]:
import re
import json
import shutil
import gradio as gr
from tqdm.auto import tqdm
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional, Tuple

from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_openai import ChatOpenAI
from langchain_ollama import ChatOllama
from langchain_core.messages import SystemMessage, HumanMessage

USER CONFIGURATION

In [None]:
#unreal engine install root
ENGINE_ROOT = Path(r"C:\Program Files\Epic Games\UE_5.7")

#vector database
DB_DIR = Path("vectorstore_ue")
COLLECTION_NAME = "ue_source"

#embeddings
EMBED_MODEL = "BAAI/bge-base-en-v1.5"

#ingestion options
INCLUDE_PRIVATE_CPP = True
CLEAN_REBUILD = True


<h3>Why use bge-base-en-v1.5?</h3>

We can answer based on what we want/don't want.

What we want:
- dense retrieval
- passage matching
- handling technical identifiers (function names, type names, natural language v/s code)

What we don't want:
- semantic closeness
- paraphrase detection
- conversational similarity

This is why many popular embedding models would be a worse choice for this project.

In [28]:
assert ENGINE_ROOT.exists(), f"Engine root not found: {ENGINE_ROOT}"
assert (ENGINE_ROOT / "Engine").exists(), "Invalid Unreal root (missing Engine/)"

print("Engine root:", ENGINE_ROOT)

Engine root: C:\Program Files\Epic Games\UE_5.7


ENGINE VERSION DETECTION

In [None]:
def detect_engine_version(engine_root: Path) -> str:
    build_version = engine_root / "Engine" / "Build" / "Build.version"
    if build_version.exists():
        try:
            data = json.loads(build_version.read_text(encoding="utf-8", errors="ignore"))
            major = data.get("MajorVersion")
            minor = data.get("MinorVersion")
            patch = data.get("PatchVersion")
            if major is not None and minor is not None:
                if patch is None:
                    return f"{major}.{minor}"
                return f"{major}.{minor}.{patch}"
        except Exception:
            pass

    #fall back to folder name heuristic
    m = re.search(r"UE_(\d+\.\d+(\.\d+)?)", str(engine_root))
    return m.group(1) if m else "unknown"

SOURCE CHUNK DATACLASS

In [30]:
@dataclass
class SourceChunk:
    text: str
    file: str
    symbol: str
    kind: str
    line_start: int
    line_end: int
    module: str
    plugin: str
    engine_version: str

FILE HELPERS

In [None]:
def read_lines(path: Path) -> Optional[List[str]]:
    try:
        text = path.read_text(encoding="utf-8", errors="ignore")
        return text.replace("\r\n", "\n").replace("\r", "\n").splitlines() #unix style line endings normalization
    except Exception:
        return None

In [None]:
def iter_files(root: Path, exts: Tuple[str, ...]):
    for p in root.rglob("*"):
        if p.is_file() and p.suffix.lower() in exts:
            yield p

BRACE MATCHER

In [26]:
def find_matching_brace(lines: List[str], start_idx: int) -> Optional[int]:
    depth = 0
    for i in range(start_idx, len(lines)):
        line = lines[i].split("//")[0]
        for c in line:
            if c == "{":
                depth += 1
            elif c == "}":
                depth -= 1
                if depth == 0:
                    return i
    return None

DETERMINISTIC HEADER CHUNKER

In [31]:
RE_TYPE = re.compile(r"^\s*(UCLASS|USTRUCT|UENUM|class|struct|enum)\b")
RE_NAME = re.compile(r"(class|struct|enum)\s+([A-Za-z_]\w*)")

def chunk_header(path: Path, module: str, plugin: str) -> List[SourceChunk]:
    lines = read_lines(path)
    if not lines:
        return []

    chunks = []
    i = 0

    while i < len(lines):
        if RE_TYPE.match(lines[i]):
            name = "Unknown"
            m = RE_NAME.search(lines[i])
            if m:
                name = m.group(2)

            end = find_matching_brace(lines, i)
            if end is None:
                end = min(i + 50, len(lines) - 1)

            text = "\n".join(lines[i:end + 1]).strip()
            if len(text) > 200:
                chunks.append(SourceChunk(
                    text=text,
                    file=str(path),
                    symbol=name,
                    kind="type",
                    line_start=i + 1,
                    line_end=end + 1,
                    module=module,
                    plugin=plugin,
                    engine_version=ENGINE_VERSION,
                ))
            i = end + 1
        else:
            i += 1

    return chunks


CPP FUNCTTION CHUNKER

In [32]:
RE_METHOD = re.compile(r"(\w+)::(\w+)\s*\(")

def chunk_cpp(path: Path, module: str, plugin: str) -> List[SourceChunk]:
    lines = read_lines(path)
    if not lines:
        return []

    chunks = []
    i = 0

    while i < len(lines):
        m = RE_METHOD.search(lines[i])
        if m:
            symbol = f"{m.group(1)}::{m.group(2)}"
            brace_line = i
            while brace_line < len(lines) and "{" not in lines[brace_line]:
                brace_line += 1
            if brace_line >= len(lines):
                i += 1
                continue

            end = find_matching_brace(lines, brace_line)
            if end:
                text = "\n".join(lines[i:end + 1]).strip()
                if len(text) > 250:
                    chunks.append(SourceChunk(
                        text=text,
                        file=str(path),
                        symbol=symbol,
                        kind="function",
                        line_start=i + 1,
                        line_end=end + 1,
                        module=module,
                        plugin=plugin,
                        engine_version=ENGINE_VERSION,
                    ))
                i = end + 1
                continue
        i += 1

    return chunks


CHUNKS TO LANGCHAIN DOCS

In [33]:
def to_documents(chunks: List[SourceChunk]) -> List[Document]:
    return [
        Document(
            page_content=c.text,
            metadata={
                "file": c.file,
                "symbol": c.symbol,
                "kind": c.kind,
                "line_start": c.line_start,
                "line_end": c.line_end,
                "module": c.module,
                "plugin": c.plugin,
                "engine_version": c.engine_version,
            },
        )
        for c in chunks
    ]


LOCATE GAMEPLAY ABILITIES FOR EXPERIMENT

In [34]:
GAS_ROOT = ENGINE_ROOT / "Engine/Plugins/Runtime/GameplayAbilities/Source/GameplayAbilities"
assert GAS_ROOT.exists(), f"GAS plugin not found: {GAS_ROOT}"

PUBLIC_DIR = GAS_ROOT / "Public"
PRIVATE_DIR = GAS_ROOT / "Private"

print("GAS plugin found:", GAS_ROOT)

GAS plugin found: C:\Program Files\Epic Games\UE_5.7\Engine\Plugins\Runtime\GameplayAbilities\Source\GameplayAbilities


RUN INGESTION

In [35]:
if CLEAN_REBUILD and DB_DIR.exists():
    shutil.rmtree(DB_DIR)

all_chunks = []

# Headers
for f in iter_files(PUBLIC_DIR, (".h", ".hpp", ".inl")):
    all_chunks.extend(chunk_header(f, "GameplayAbilities", "GameplayAbilities"))

# CPP (optional)
if INCLUDE_PRIVATE_CPP and PRIVATE_DIR.exists():
    for f in iter_files(PRIVATE_DIR, (".cpp",)):
        all_chunks.extend(chunk_cpp(f, "GameplayAbilities", "GameplayAbilities"))

documents = to_documents(all_chunks)

print(f"Total chunks created: {len(documents)}")


Total chunks created: 1771


In [6]:
embeddings = HuggingFaceEmbeddings(
    model_name=EMBED_MODEL,
    encode_kwargs={"normalize_embeddings": True},
)

vectordb = Chroma(
    collection_name=COLLECTION_NAME,
    embedding_function=embeddings,
    persist_directory=str(DB_DIR),
)

print("Stored documents:", vectordb._collection.count()) #check persistant vectordb(run imports and user config)

Stored documents: 5313


BUILD VECTOR STORE

In [None]:
shutil.rmtree(DB_DIR)

BATCH_SIZE = 128
print(f"Adding {len(documents)} documents to vectorstore...")

for i in tqdm(range(0, len(documents), BATCH_SIZE), desc="Embedding & indexing"):
    batch = documents[i : i + BATCH_SIZE]
    vectordb.add_documents(batch)

print("Vectorstore built and persisted at:", DB_DIR)

In [7]:
query = "Why does FGameplayEffectContextHandle not replicate SourceObject?"

results = vectordb.similarity_search(query, k=5)

for i, d in enumerate(results, 1):
    m = d.metadata
    print(f"\n--- Result {i} ---")
    print(f"{m['file']}:{m['line_start']}-{m['line_end']}")
    print(f"Symbol: {m['symbol']}")
    print(d.page_content[:600])


--- Result 1 ---
C:\Program Files\Epic Games\UE_5.7\Engine\Plugins\Runtime\GameplayAbilities\Source\GameplayAbilities\Private\Abilities\GameplayAbility.cpp:1786-1800
Symbol: UGameplayAbility::GetContextFromOwner
FGameplayEffectContextHandle UGameplayAbility::GetContextFromOwner(FGameplayAbilityTargetDataHandle OptionalTargetData) const
{
	ensure(CurrentActorInfo);
	FGameplayEffectContextHandle Context = MakeEffectContext(CurrentSpecHandle, CurrentActorInfo);
	
	for (auto Data : OptionalTargetData.Data)
	{
		if (Data.IsValid())
		{
			Data->AddTargetDataToContext(Context, true);
		}
	}

	return Context;
}

--- Result 2 ---
C:\Program Files\Epic Games\UE_5.7\Engine\Plugins\Runtime\GameplayAbilities\Source\GameplayAbilities\Private\Abilities\GameplayAbility.cpp:1786-1800
Symbol: UGameplayAbility::GetContextFromOwner
FGameplayEffectContextHandle UGameplayAbility::GetContextFromOwner(FGameplayAbilityTargetDataHandle OptionalTargetData) const
{
	ensure(CurrentActorInfo);
	FGameplayEffectCon

DEDUPLICATION

In [8]:
def dedupe_by_symbol(docs):
    seen = set()
    deduped = []

    for d in docs:
        meta = d.metadata
        key = (meta.get("file"), meta.get("symbol"))
        if key not in seen:
            seen.add(key)
            deduped.append(d)

    return deduped

In [9]:
def retrieve_deduplicated(
    vectordb,
    query: str,
    k: int = 15,
    max_results: int = 5,
):
    raw_results = vectordb.similarity_search(query, k=k)
    deduped_results = dedupe_by_symbol(raw_results)[:max_results]

    print(f"Query: {query}")
    print(f"Raw hits: {len(raw_results)} | Deduplicated: {len(deduped_results)}")

    return deduped_results

CITATION OUTPUT

In [10]:
def print_citations(docs):
    """
    Print retrieved Documents in a clean, citation-style format.
    """
    for i, d in enumerate(docs, 1):
        m = d.metadata

        file_name = Path(m["file"]).name
        line_span = f"{m['line_start']}-{m['line_end']}"
        symbol = m.get("symbol", "Unknown")

        print(f"\n[{i}] {file_name}:{line_span}")
        print(f"    Symbol: {symbol}")

        #print first meaningful line
        preview = next(
            (line for line in d.page_content.splitlines() if line.strip()),
            ""
        )
        print(f"    Preview: {preview[:180]}")

In [11]:
query = "Why does FGameplayEffectContextHandle not replicate SourceObject?"

results = retrieve_deduplicated(
    vectordb,
    query=query,
    k=15,
    max_results=5,
)

print_citations(results)

Query: Why does FGameplayEffectContextHandle not replicate SourceObject?
Raw hits: 15 | Deduplicated: 5

[1] GameplayAbility.cpp:1786-1800
    Symbol: UGameplayAbility::GetContextFromOwner
    Preview: FGameplayEffectContextHandle UGameplayAbility::GetContextFromOwner(FGameplayAbilityTargetDataHandle OptionalTargetData) const

[2] GameplayEffectContextNetSerializer.cpp:606-634
    Symbol: FGameplayEffectContextAccessorForNetSerializer::CopyReplicatedFieldsFrom
    Preview: void FGameplayEffectContextAccessorForNetSerializer::CopyReplicatedFieldsFrom(const FGameplayEffectContextAccessorForNetSerializer& GE)

[3] GameplayAbility.cpp:1904-1925
    Symbol: UGameplayAbility::MakeEffectContext
    Preview: FGameplayEffectContextHandle UGameplayAbility::MakeEffectContext(const FGameplayAbilitySpecHandle Handle, const FGameplayAbilityActorInfo *ActorInfo) const

[4] GameplayEffect.cpp:2236-2244
    Symbol: FGameplayEffectSpec::SetContext
    Preview: void FGameplayEffectSpec::SetContext(FGamep

LLM SETUP

In [None]:
CHAT_MODEL = "llama3.1:8b"
#CHAT_MODEL = "qwen2.5:7b"

llm = ChatOllama(
    model=CHAT_MODEL,
    temperature=0.0,
)

CONTEXT FORMATTER

In [46]:
def format_context_with_citations(docs):
    blocks = []

    for i, d in enumerate(docs, 1):
        m = d.metadata
        citation = (
            f"Source: {Path(m['file']).name}:"
            f"{m['line_start']}-{m['line_end']} "
            f"({m['symbol']})"
        )

        blocks.append(
            f"[BEGIN SOURCE]\n"
            f"{citation}\n\n"
            f"{d.page_content}\n"
            f"[END SOURCE]"
        )

    return "\n\n".join(blocks)

In [112]:
SYSTEM_PROMPT = """You are an Unreal Engine engineer.

You are answering questions about Unreal Engine by analyzing the provided
engine source code excerpts.

Conversation context (IMPORTANT):
- Questions may be follow-ups that refer implicitly to concepts discussed earlier.
- If a question refers to “that”, “this”, “the data structure”, or similar,
  resolve the reference using the most recently discussed Unreal Engine type
  or structure in the conversation.
- Do NOT switch to a different engine type or structure unless the question
  explicitly names it.

Rules (STRICT):
- Answer ONLY using the provided source context.
- Do NOT use Unreal documentation, prior knowledge, or assumptions.
- Do NOT speculate or generalize beyond the code shown.
- If the answer cannot be determined from the sources, respond with:
  "The provided source code does not contain this information."

Citation rules (MANDATORY):
- Every factual claim MUST have exactly ONE citation.
- Citations MUST be written as plain text (not in code blocks or backticks).
- Use exactly this format:

  Source: <FileName>:<LineStart>-<LineEnd> (<Symbol>)

- Do NOT invent file names, symbols, or line numbers.

Output structure (MANDATORY):
- Write the answer as a Markdown bullet list.
- Each bullet must contain:
  1. One factual claim
  2. Followed immediately by its Source line on the next line
- The Source line MUST be copied verbatim from one of the provided [BEGIN SOURCE] blocks.
- Do NOT group citations at the end.
- Do NOT write paragraphs.

Formatting rules:
- Use Markdown bullets only.
- Do NOT wrap citations in backticks or code blocks.
"""

EXPLAINER

In [40]:
def explain_with_citations(query: str, docs):
    context = format_context_with_citations(docs)

    messages = [
        SystemMessage(content=SYSTEM_PROMPT),
        HumanMessage(
            content=(
                f"Question:\n{query}\n\n"
                f"Source code context:\n\n{context}\n\n"
                "Provide a concise explanation with citations."
            )
        ),
    ]

    response = llm.invoke(messages)
    return response.content

TEST

In [41]:
query = "Why does FGameplayEffectContextHandle not replicate SourceObject?"

docs = retrieve_deduplicated(
    vectordb,
    query=query,
    k=15,
    max_results=5,
)

explanation = explain_with_citations(query, docs)

print(explanation)

Query: Why does FGameplayEffectContextHandle not replicate SourceObject?
Raw hits: 15 | Deduplicated: 5
* FGameplayEffectContextHandle does not replicate SourceObject because it is not marked as replicated in the CopyReplicatedFieldsFrom function.
Source: GameplayEffectContextNetSerializer.cpp:623-624 (void)

* The bReplicateSourceObject flag is checked before copying the SourceObject, indicating that replication of this field is optional.
Source: GameplayEffectContextNetSerializer.cpp:627-628 (if)

* In the MakeEffectContext function, the SourceObject is added to the context only if the AbilitySystemComponent and AbilitySpec are valid.
Source: GameplayAbility.cpp:1912-1915 (if)

* The SetContext function does not modify the EffectContext's SourceObject when it is already initialized.
Source: GameplayEffect.cpp:2238-2241 (if)


CITATION & GROUNDING VALIDATOR

In [49]:
SOURCE_PATTERN = re.compile(
    r"Source:\s*(?P<file>[^:]+):(?P<start>\d+)-(?P<end>\d+)\s*\((?P<symbol>[^)]+)\)"
)

def validate_citations(answer: str, docs):
    allowed = []

    for d in docs:
        m = d.metadata
        allowed.append({
            "file": Path(m["file"]).name,
            "start": int(m["line_start"]),
            "end": int(m["line_end"]),
            "symbol": m["symbol"],
        })

    errors = []
    paragraphs = [p.strip() for p in answer.split("\n\n") if p.strip()]

    for i, p in enumerate(paragraphs, 1):
        matches = SOURCE_PATTERN.findall(p)
        if not matches:
            errors.append(f"Paragraph {i} is missing a Source citation.")
            continue

        for file, start, end, symbol in matches:
            start, end = int(start), int(end)

            valid = any(
                file == a["file"]
                and symbol == a["symbol"]
                and a["start"] <= start <= end <= a["end"]
                for a in allowed
            )

            if not valid:
                errors.append(
                    f"Paragraph {i} contains invalid citation: "
                    f"{file}:{start}-{end} ({symbol})"
                )

    return errors

In [43]:
errors = validate_citations(explanation, docs)

if errors:
    print("Validation failed:\n")
    for e in errors:
        print("-", e)
else:
    print("All citations are valid and grounded.")

Validation failed:

- Paragraph 1 contains invalid citation: GameplayEffectContextNetSerializer.cpp:623-624 (void)
- Paragraph 2 contains invalid citation: GameplayEffectContextNetSerializer.cpp:627-628 (if)
- Paragraph 3 contains invalid citation: GameplayAbility.cpp:1912-1915 (if)
- Paragraph 4 contains invalid citation: GameplayEffect.cpp:2238-2241 (if)


In [44]:
def explain_with_validation(query, vectordb):
    docs = retrieve_deduplicated(
        vectordb,
        query=query,
        k=15,
        max_results=5,
    )

    answer = explain_with_citations(query, docs)
    errors = validate_citations(answer, docs)

    if errors:
        print("Answer rejected due to citation violations:\n")
        for e in errors:
            print("-", e)
        print("\nThe model output was:\n")
        print(answer)
        return None

    print("Answer accepted.\n")
    print(answer)
    return answer

In [56]:
explain_with_validation(
    "What is the difference between FGameplayEffectContext and FGameplayEffectContextHandle?",
    vectordb,
)

Query: What is the difference between FGameplayEffectContext and FGameplayEffectContextHandle?
Raw hits: 15 | Deduplicated: 5
Answer accepted.

* FGameplayEffectContext is used to store context information for an effect, and it can be created from scratch or retrieved from an existing handle.
Source: GameplayAbility.cpp:1904-1925 (UGameplayAbility::MakeEffectContext)

* FGameplayEffectContextHandle is a handle that references a FGameplayEffectContext object, allowing it to be passed around and used without having to store the entire context object itself.
Source: GameplayAbility.cpp:1786-1800 (UGameplayAbility::GetContextFromOwner)

* The main difference between FGameplayEffectContext and FGameplayEffectContextHandle is that a handle can be created from an existing context or retrieved from a spec, while a context must be created from scratch.
Source: AbilitySystemBlueprintLibrary.cpp:1184-1196 (UAbilitySystemBlueprintLibrary::GetEffectContext)

* A FGameplayEffectContextHandle can be 

'* FGameplayEffectContext is used to store context information for an effect, and it can be created from scratch or retrieved from an existing handle.\nSource: GameplayAbility.cpp:1904-1925 (UGameplayAbility::MakeEffectContext)\n\n* FGameplayEffectContextHandle is a handle that references a FGameplayEffectContext object, allowing it to be passed around and used without having to store the entire context object itself.\nSource: GameplayAbility.cpp:1786-1800 (UGameplayAbility::GetContextFromOwner)\n\n* The main difference between FGameplayEffectContext and FGameplayEffectContextHandle is that a handle can be created from an existing context or retrieved from a spec, while a context must be created from scratch.\nSource: AbilitySystemBlueprintLibrary.cpp:1184-1196 (UAbilitySystemBlueprintLibrary::GetEffectContext)\n\n* A FGameplayEffectContextHandle can be used to retrieve the underlying FGameplayEffectContext object using the Get() method.\nSource: GameplayAbility.cpp:1904-1925 (UGamepla

INTERFACE

In [109]:
def format_context(docs):
    if not docs:
        return "*No source context retrieved.*"

    md = "## Retrieved Source Context\n\n"
    for i, d in enumerate(docs, 1):
        m = d.metadata
        md += (
            f"---\n"
            f"### Result {i}\n\n"
            f"Source: `{m.get('file', 'unknown')}:"
            f"{m.get('line_start', '?')}-{m.get('line_end', '?')}`\n\n"
            f"```cpp\n{d.page_content.strip()}\n```\n\n"
        )
    return md

In [110]:
def chat(message, history):
    message = (message or "").strip()
    if not message:
        return "Enter a question.", ""

    docs = list(retrieve_deduplicated(
        vectordb,
        query=message,
        k=15,
        max_results=5,
    ))

    answer = explain_with_citations(message, docs)
    context = format_context(docs)

    return answer, context

In [None]:
with gr.Blocks(title="Unreal Engine Source-Grounded Assistant") as ui:
    gr.Markdown(
        "# Unreal Engine Source-Grounded Assistant\n"
        "Answers are grounded in Unreal Engine source code with strict citations."
    )

    with gr.Accordion("Retrieved Source Context", open=False):
        context_md = gr.Markdown(
            value="*Retrieved source context will appear here*",
            container=True,
        )

    chat = gr.ChatInterface(
        fn=chat,
        additional_outputs=[context_md],
        title=None,
        description=None,
    )

ui.launch(inbrowser=True)


* Running on local URL:  http://127.0.0.1:7878
* To create a public link, set `share=True` in `launch()`.




Query: What data does FGameplayEffectContext store, and which engine functions populate it?
Raw hits: 15 | Deduplicated: 5
Query: Why is that data structure not copied by value when creating gameplay effect specs, and what concrete engine issues would result if it were?
Raw hits: 15 | Deduplicated: 5
