### 1) extract text from PDFs 

In [1]:
from pypdf import PdfReader
from pathlib import Path


def extract_text_from_pdf(pdf_path):
    reader = PdfReader(pdf_path)
    pages = []

    for p in reader.pages:
        text = p.extract_text() or ""
        pages.append(text)
    return "\n\n".join(pages)


def save_text(pdf_path: str, out_txt_path: str):
    text = extract_text_from_pdf(pdf_path=pdf_path)
    Path(out_txt_path).write_text(text, encoding="utf-8")


# PDF paths
pdf1 = "../data/raw/Automotive_SPICE_PAM_31_EN.pdf"
out1 = "../data/txt/automotivespice.txt"

pdf2 = "../data/raw/AUTOSAR_SWS_ECUStateManager.pdf"
out2 = "../data/txt/autosar_ecum.txt"

# Run extraction
save_text(pdf1, out1)
save_text(pdf2, out2)


### 2) Parse handling convert text to Markdown 

In [2]:
import re, uuid
from typing import List, Dict

numbered_header_re = re.compile(
    r"^\s*(\d+(?:\.\d+){0,3})\s*\.?\s+(?P<title>[A-Z][^\n]{3,200})$", re.M
)

code_header_re = re.compile(r"^(?P<code>[A-Z]{2,5}\.\d+)\s+(?P<title>.+)$", re.M)

def parse_numbered_headings(text: str) -> List[Dict]:
    """
    Parse text into hierarchical chunks with full content under each header.
    """
    matches = list(numbered_header_re.finditer(text))
    if not matches:
        matches = list(code_header_re.finditer(text))
    
    if not matches:
        return [{
            "id": str(uuid.uuid4()),
            "level": 1,
            "title": "Document",
            "parent_id": None,
            "content": text.strip(),
        }]

    chunks = []
    boundaries = [(m.start(), m.end(), m.group(0), m.groupdict().get("title") or m.group(0)) for m in matches]
    boundaries.append((len(text), len(text), "", ""))  

    for i in range(len(boundaries)-1):
        header_line = boundaries[i][2]
        title = boundaries[i][3].strip()
        content = text[boundaries[i][1]:boundaries[i+1][0]].strip()
        num_match = re.match(r"^\s*(\d+(?:\.\d+)*)", header_line)
        level = num_match.group(1).count(".")+1 if num_match else 2
        chunks.append({
            "id": str(uuid.uuid4()),
            "level": level,
            "title": title,
            "parent_num": num_match.group(1) if num_match else None,
            "content": content,
        })

    for c in chunks:
        if c.get("parent_num"):
            parts = c["parent_num"].split(".")
            found_parent = None
            for candidate in reversed(chunks):
                pnum = candidate.get("parent_num")
                if pnum and pnum == ".".join(parts[:-1]):
                    found_parent = candidate["id"]
                    break
            c["parent_id"] = found_parent
        else:
            c["parent_id"] = None

    return chunks


def save_markdown(chunks: List[Dict], out_md_path: Path):
    out_md_path.parent.mkdir(parents=True, exist_ok=True)
    lines = []
    for c in chunks:
        header_prefix = "#" * c["level"]
        lines.append(f"{header_prefix} {c['title']}\n")
        lines.append(c["content"] + "\n")
    out_md_path.write_text("\n".join(lines), encoding="utf-8")
    print(f"Saved Markdown: {out_md_path}")

txt_files = Path("../data/txt").glob("*.txt")
for txt_file in txt_files:
    text = txt_file.read_text(encoding="utf-8")
    chunks = parse_numbered_headings(text)
    md_path = Path("../data/md") / txt_file.with_suffix(".md").name
    save_markdown(chunks, md_path)


Saved Markdown: ..\data\md\automotivespice.md
Saved Markdown: ..\data\md\autosar_ecum.md


### 3) build index 
####  convert each chunk to embedding and build FAISS index one for each file 

In [3]:
from sentence_transformers import SentenceTransformer
import faiss, json
from pathlib import Path

EMB_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

def build_embeddings(chunks, model_name=EMB_MODEL):
    model = SentenceTransformer(model_name)
    texts = [c["content"] or c["title"] for c in chunks]
    embeddings = model.encode(texts, show_progress_bar=True, convert_to_numpy=True)
    return embeddings

def build_faiss_index(chunks, embeddings, out_dir: str, index_name: str):
    Path(out_dir).mkdir(parents=True, exist_ok=True)
    d = embeddings.shape[1]
    faiss.normalize_L2(embeddings)
    index = faiss.IndexFlatIP(d)
    index.add(embeddings)
    faiss.write_index(index, f"{out_dir}/{index_name}.index")

    # save metadata
    meta = {
        i: {
            "id": chunks[i]["id"],
            "title": chunks[i].get("title"),
            "level": chunks[i]["level"],
            "parent_id": chunks[i].get("parent_id"),
            "content": chunks[i].get("content")  # مهم جدًا
        }
        for i in range(len(chunks))
    }
    Path(f"{out_dir}/{index_name}_meta.json").write_text(
        json.dumps(meta, ensure_ascii=False, indent=2), encoding="utf-8"
    )
    print(f"Saved index + meta for {index_name}")
    return index



if __name__ == "__main__":
    md_dir = Path("../data/md")
    out_dir = Path("../outputs/indices")

    # process all md files
    for md_file in md_dir.glob("*.md"):
        text = md_file.read_text(encoding="utf-8")
        chunks = parse_numbered_headings(text)
        embeddings = build_embeddings(chunks)
        index_name = md_file.stem + "_clauses"
        build_faiss_index(chunks, embeddings, out_dir=out_dir, index_name=index_name)




Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Saved index + meta for automotivespice_clauses




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Saved index + meta for autosar_ecum_clauses


### 4) hierarchical Retrieval
#### headers -> sections -> clauses

In [5]:
# hierarchical Retrieval
## headers -> sections -> clauses
import faiss, json
from sentence_transformers import SentenceTransformer
from pathlib import Path

EMB_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

class HierarchicalRetriever:
    def __init__(self, indices_meta, emb_model=EMB_MODEL):
        self.indices = []
        self.metas = []
        for idx_path, meta_path in indices_meta:
            self.indices.append(faiss.read_index(idx_path))
            self.metas.append(json.loads(Path(meta_path).read_text(encoding="utf-8")))
        self.model = SentenceTransformer(emb_model)

    def _embed(self, query):
        v = self.model.encode([query], convert_to_numpy=True)
        faiss.normalize_L2(v)
        return v

    def search_topk(self, query, k=5):
        v = self._embed(query)
        results = []
        for index, meta in zip(self.indices, self.metas):
            D, I = index.search(v, k)
            for idx, score in zip(I[0], D[0]):
                if idx == -1:
                    continue
                info = meta[str(idx)] if str(idx) in meta else meta[idx]
                results.append({"idx": idx, "score": float(score), **info})
        results = sorted(results, key=lambda x: -x["score"])
        return results[:k]

    def hierarchical_retrieve(self, query, k_header=3, k_section=3, k_clause=3):
        top = self.search_topk(query, k=max(k_header,k_section,k_clause))
        headers = [r for r in top if r["level"]==1][:k_header]
        if not headers:
            headers = top[:k_header]

        final_clauses = []
        for h in headers:
            q_section = f"{query} context: {h.get('title')}"
            sections = self.search_topk(q_section, k=k_section)
            if sections:
                top_section = sections[0]
                q_clause = f"{query} context: {h.get('title')} > {top_section.get('title')}"
                clauses = self.search_topk(q_clause, k=k_clause)
                final_clauses.extend(clauses)

        # deduplicate & sort
        seen = set()
        deduped = []
        for c in sorted(final_clauses, key=lambda x: -x["score"]):
            if c['idx'] in seen:
                continue
            seen.add(c['idx'])
            deduped.append(c)
        return {"headers": headers, "clauses": deduped[:k_clause]}

# usage
if __name__ == "__main__":
    indices_meta = [
        (
            "../outputs/indices/automotivespice_clauses.index",
            "../outputs/indices/automotivespice_clauses_meta.json",
        ),
        (
            "../outputs/indices/autosar_ecum_clauses.index",
            "../outputs/indices/autosar_ecum_clauses_meta.json",
        ),
    ]
    hr = HierarchicalRetriever([
    ("../outputs/indices/automotivespice_clauses.index",
     "../outputs/indices/automotivespice_clauses_meta.json"),
    ("../outputs/indices/autosar_ecum_clauses.index",
     "../outputs/indices/autosar_ecum_clauses_meta.json"),
])

res = hr.hierarchical_retrieve("What does SYS.2 require?", k_header=2, k_section=2, k_clause=3)

retrieved_texts = []
for clause in res['clauses']:
    retrieved_texts.append(f"{clause['title']}\n{clause['content']}")  

context = "\n\n".join(retrieved_texts)
print(context)


SYS.2


SYS.5


Reuse Program
Management
System Engineering Process Group (SYS)


LLM Layer 

In [6]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.schema import HumanMessage, SystemMessage
from dotenv import load_dotenv
import os 

load_dotenv()
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
llm = ChatGoogleGenerativeAI(
  api_key = GEMINI_API_KEY,
  model = "gemini-2.5-pro", 
  temperature=0.2,
  max_retries=2
)

In [7]:
from langchain.schema import HumanMessage, SystemMessage

system_prompt = SystemMessage(content="You are an expert in automotive standards and AUTOSAR.")
human_prompt = HumanMessage(content=f"Answer the question based on this context:\n{context}\nQuestion: What does SYS.2 require?")

response = llm([system_prompt, human_prompt])
print(response.content)


  response = llm([system_prompt, human_prompt])


Based on the context provided, it is impossible to determine what SYS.2 requires. The text only lists "SYS.2" as an item under the "System Engineering Process Group (SYS)" without any further description.

---

However, as an expert in automotive standards, I can tell you that **SYS.2** is a standard process area identifier from **Automotive SPICE (ASPICE)**.

In ASPICE, **SYS.2** is titled **"System Requirements Analysis"**.

The purpose of this process is to transform the stakeholder requirements into a complete, consistent, and technically correct set of system requirements.

**SYS.2 requires an organization to:**

1.  **Specify system requirements:** Elicit and document the functional and non-functional requirements for the system.
2.  **Structure system requirements:** Organize and categorize the requirements logically.
3.  **Analyze system requirements:** Ensure the requirements are correct, technically feasible, verifiable, and consistent with each other.
4.  **Analyze the impac