In [1]:
import json
from pathlib import Path

JSON_PATH = "/content/draft_1.json"  # uploaded file

with open(JSON_PATH, "r", encoding="utf-8") as f:
    ecc_data = json.load(f)

def extract_ecc_controls(ecc_json):
    records = []

    for domain in ecc_json.get("domains", []):
        domain_name = domain["domain_name"]

        for sub in domain.get("subdomains", []):
            subdomain_name = sub["subdomain_name"]

            for ctrl in sub.get("controls", []):
                base_text = ctrl["control_text"]

                # include subcontrols in same chunk
                if "subcontrols" in ctrl:
                    subs = "\n".join(
                        f"- {s['subcontrol_text']}"
                        for s in ctrl["subcontrols"]
                    )
                    base_text = f"{base_text}\n{subs}"

                records.append({
                    "control_id": ctrl["control_id"],
                    "domain": domain_name,
                    "subdomain": subdomain_name,
                    "text": base_text,
                    "language": "en"
                })

    return records

ecc_controls = extract_ecc_controls(ecc_data)

print(f"Total ECC control chunks: {len(ecc_controls)}")
print(ecc_controls[0])


Total ECC control chunks: 108
{'control_id': '1-1-1', 'domain': 'Cybersecurity Governance', 'subdomain': 'Cybersecurity Strategy', 'text': 'The cybersecurity strategy of the entity shall be identified, documented, and approved, and it shall be supported by the head of the entity or his/her delegate. The strategy goals shall be in line with the relevant legislative and regulatory requirements.', 'language': 'en'}


In [2]:
pip install -U sentence-transformers faiss-cpu numpy



In [3]:
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("intfloat/multilingual-e5-large-instruct")

def embed_passages(texts, batch_size=16):
    texts = [f"passage: {t}" for t in texts]
    emb = model.encode(
        texts,
        batch_size=batch_size,
        normalize_embeddings=True,
        show_progress_bar=True
    )
    return np.asarray(emb, dtype=np.float32)

ecc_texts = [c["text"] for c in ecc_controls]
ecc_embeddings = embed_passages(ecc_texts)

dim = ecc_embeddings.shape[1]
ecc_index = faiss.IndexFlatIP(dim)
ecc_index.add(ecc_embeddings)

print("ECC vector index size:", ecc_index.ntotal)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/128 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_xlm-roberta_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/690 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/271 [00:00<?, ?B/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

ECC vector index size: 108


In [6]:
def embed_query(q):
    return model.encode(
        [f"query: {q}"],
        normalize_embeddings=True
    ).astype(np.float32)

query_ar = "هل يوجد متطلبات للمصادقة متعددة العوامل للوصول عن بعد؟"
q_emb = embed_query(query_ar)

scores, ids = ecc_index.search(q_emb, 5)

for i in ids[0]:
    print(ecc_controls[i]["control_id"], "-", ecc_controls[i]["text"][:120])

4-1-3 - Cybersecurity requirements for contracts and agreements with third parties providing IT or cybersecurity outsourcing or 
2-2-3 - Cybersecurity requirements for identity and access management of the entity shall include the following as a minimum:
- 
4-1-2 - Cybersecurity requirements for contracts and agreements with third parties shall include the following as a minimum:
- C
1-9-3 - Cybersecurity requirements prior to the commencement of the employment relationship between personnel and the entity sha
2-11-3 - Cybersecurity requirements for penetration testing shall include the following as a minimum:
- Scope of penetration test


In [7]:
query_en = "email security requirements like SPF and DMARC"
q_emb = embed_query(query_en)
scores, ids = ecc_index.search(q_emb, 5)

for i in ids[0]:
    print(ecc_controls[i]["control_id"], "-", ecc_controls[i]["text"][:120])

2-4-3 - Cybersecurity requirements for protection of the email service of the entity shall include the following as a minimum:
-
2-4-2 - Cybersecurity requirements for protection of email service of the entity shall be implemented.
2-4-1 - Cybersecurity requirements for protection of the email service of the entity shall be identified, documented, and approv
1-10-3 - The cybersecurity awareness program shall include how to protect the entity against the most important and latest cyber 
2-4-4 - The implementation of cybersecurity requirements for email service of the entity shall be periodically reviewed.


In [8]:
def embed_query(q):
    return model.encode(
        [f"query: {q}"],
        normalize_embeddings=True
    ).astype(np.float32)

query_ar = "هل يوجد متطلبات للمصادقة متعددة العوامل للوصول عن بعد؟"
q_emb = embed_query(query_ar)

scores, ids = ecc_index.search(q_emb, 5)

for i in ids[0]:
    print(ecc_controls[i]["control_id"], "-", ecc_controls[i]["text"][:120])

4-1-3 - Cybersecurity requirements for contracts and agreements with third parties providing IT or cybersecurity outsourcing or 
2-2-3 - Cybersecurity requirements for identity and access management of the entity shall include the following as a minimum:
- 
4-1-2 - Cybersecurity requirements for contracts and agreements with third parties shall include the following as a minimum:
- C
1-9-3 - Cybersecurity requirements prior to the commencement of the employment relationship between personnel and the entity sha
2-11-3 - Cybersecurity requirements for penetration testing shall include the following as a minimum:
- Scope of penetration test


In [9]:
query_en = "email security requirements like SPF and DMARC"
q_emb = embed_query(query_en)
scores, ids = ecc_index.search(q_emb, 5)

for i in ids[0]:
    print(ecc_controls[i]["control_id"], "-", ecc_controls[i]["text"][:120])

2-4-3 - Cybersecurity requirements for protection of the email service of the entity shall include the following as a minimum:
-
2-4-2 - Cybersecurity requirements for protection of email service of the entity shall be implemented.
2-4-1 - Cybersecurity requirements for protection of the email service of the entity shall be identified, documented, and approv
1-10-3 - The cybersecurity awareness program shall include how to protect the entity against the most important and latest cyber 
2-4-4 - The implementation of cybersecurity requirements for email service of the entity shall be periodically reviewed.
