In [46]:
from elasticsearch import Elasticsearch

client = Elasticsearch(
    "https://f0c40d3bde77471f83e6cace87537665.ap-southeast-1.aws.found.io",
    api_key="MU9oZlg1Y0JiN3JESEE4V0Y5aUg6N1lNZ0VaREtPU1pob3hYQWQtNU1zZw=="
)

index_name = "cve-index"

# Delete if exists (optional cleanup step for testing)
if client.indices.exists(index=index_name):
    client.indices.delete(index=index_name)

# Create index with proper mapping
mappings = {
    "mappings": {
        "properties": {
            "text": {
                "properties": {
                    "cve_id": {"type": "keyword"},
                    "description": {"type": "text"},
                    "severity": {"type": "keyword"}
                }
            }
        }
    }
}

mappings = {
    "mappings": {
        "properties": {
            "text": {"type": "text"},  # not an object
            "metadata": {
                "properties": {
                    "cve_id": {"type": "keyword"},
                    "severity": {"type": "keyword"},
                    "cvss_score": {"type": "float"},
                    "fix_version": {"type": "keyword"},
                    "published_date": {"type": "date"},
                    "last_updated": {"type": "date"},
                    # Add more fields as needed
                }
            }
        }
    }
}

mappings = {
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "vector": {
                "type": "dense_vector",
                "dims": 384  # match the size of your embedding model (e.g., BERT base = 768)
            },
            "metadata": {
                "properties": {
                    "cve_id": {"type": "keyword"},
                    "severity": {"type": "keyword"},
                    "cvss_score": {"type": "float"},
                    "fix_version": {"type": "keyword"},
                    "published_date": {"type": "date"},
                    "last_updated": {"type": "date"}
                }
            }
        }
    }
}
client.indices.create(index=index_name, body=mappings)

# Sample document
# doc = {
#     "text": {
#         "cve_id": "CVE-2024-12345",
#         "description": "This is a test CVE vulnerability.",
#         "severity": "High"
#     }
# }

# Index the document
# response = client.index(index=index_name, document=doc)
# print("Document indexed:", response)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'cve-index'})

POST /_security/api_key
{
  "name": "my-api-key",
  "role_descriptors": {
    "my-role": {
      "cluster": ["all"],
      "index": [
        {
          "names": ["cve-index"],
          "privileges": ["all"]
        }
      ]
    }
  }
}

In [8]:
from sentence_transformers import SentenceTransformer
from elasticsearch import Elasticsearch
MODEL_NAME = "all-MiniLM-L6-v2"  # small and fast
ES_INDEX = "cve-index"
model = SentenceTransformer(MODEL_NAME)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import re
import html
from urllib.parse import unquote


def parse_cpe(cpe_uri):
    """
    Parse a CPE 2.3 URI and return a dictionary of components.
    Example: cpe:2.3:a:vendor:product:version:*:*:*:*:*:*:*
    """
    parts = cpe_uri.split(":")
    if len(parts) >= 7:
        return {
            "type": parts[2],         # a / o / h
            "vendor": unquote(parts[3]),
            "product": unquote(parts[4]),
            "version": unquote(parts[5])
        }
    return {}


def extract_cve_data(json_data, embed_model=None):
    try:
        cve_id = json_data.get("cveMetadata", {}).get("cveId", "")
        containers = json_data.get("containers", {})
        cna = containers.get("cna", {})

        # Description
        description = next(
            (d.get("value") for d in cna.get("descriptions", []) if d.get("lang") == "en"),
            ""
        )
        description = html.unescape(description.strip())

        # CVSS metrics
        metrics = cna.get("metrics", [])
        cvss = {}
        for m in metrics:
            if "cvssV3_1" in m:
                cvss = m["cvssV3_1"]
                break

        # CWE
        cwes = []
        for pt in cna.get("problemTypes", []):
            for desc in pt.get("descriptions", []):
                if desc.get("lang") == "en":
                    cwes.append({
                        "id": desc.get("cweId"),
                        "description": desc.get("description")
                    })

        # Affected packages
        affected_packages = []
        for a in cna.get("affected", []):
            vendor = a.get("vendor", "")
            product = a.get("product", "")
            version_info = a.get("versions", [])
            for v in version_info:
                pkg = {
                    "vendor": vendor,
                    "product": product,
                    "version": v.get("version"),
                    "lessThanOrEqual": v.get("lessThanOrEqual"),
                    "version_type": v.get("versionType"),
                    "status": v.get("status")
                }
                # Optionally include parsed CPE info if available
                for cpe_entry in a.get("cpe", []):
                    cpe_data = parse_cpe(cpe_entry.get("cpe23Uri", ""))
                    pkg.update(cpe_data)
                affected_packages.append(pkg)

        # Infer fix versions
        fix_versions = sorted(
            {f"> {pkg['lessThanOrEqual']}" for pkg in affected_packages if pkg.get("lessThanOrEqual")}
        )
        fix_version = fix_versions[0] if fix_versions else None

        # References
        references = [r.get("url") for r in cna.get("references", []) if "url" in r]

        # Timeline
        timeline = [
            {"date": t["time"], "event": t["value"]}
            for t in cna.get("timeline", []) if "time" in t and "value" in t
        ]

        # Credits
        credits = [c.get("value") for c in cna.get("credits", []) if c.get("lang") == "en"]

        # SSVC (CISA enrichment)
        ssvc_data = {}
        for adp in containers.get("adp", []):
            for metric in adp.get("metrics", []):
                if "other" in metric and metric["other"].get("type") == "ssvc":
                    content = metric["other"].get("content", {})
                    ssvc_data = {
                        "role": content.get("role"),
                        "version": content.get("version"),
                        "exploitStatus": content.get("exploitStatus")
                    }
                    for opt in content.get("options", []):
                        ssvc_data.update(opt)
                    break

        # Embedding (optional)
        vector = []
        if embed_model:
            vector = embed_model.encode(description).tolist()

        # Final output
        return {
            "id": cve_id,
            "text": description,
            "vector": vector,
            "metadata": {
                "title": cve_id + " " + cna.get("title", ""),
                "published_date": json_data.get("cveMetadata", {}).get("datePublished", ""),
                "last_updated": json_data.get("cveMetadata", {}).get("dateUpdated", ""),
                "cvss_score": cvss.get("baseScore"),
                "cvss_vector": cvss.get("vectorString"),
                "severity": cvss.get("baseSeverity"),
                "cwe": cwes,
                "affected_packages": affected_packages,
                "fix_version": fix_version,
                "references": references,
                "timeline": timeline,
                "credits": credits,
                "assigner": json_data.get("cveMetadata", {}).get("assignerShortName"),
                "ssvc": ssvc_data
            },
            "sources": references
        }

    except Exception as e:
        print(f"Failed to parse CVE JSON: {e}")
        return {}

In [74]:
import os
from pathlib import Path

def convert_all(input_dir: str):
    input_path = Path(input_dir)

    all_json_files = list(input_path.rglob("*.json"))
    print(f"[i] Found {len(all_json_files)} JSON files in '{input_dir}'")

    for file_path in all_json_files:
        try:
            with file_path.open() as f:
                cve_json = json.load(f)
            doc = extract_cve_data(cve_json,model)
            # return doc
            res=client.index(index="cve-index", id=doc["id"], document=doc)
            # print(doc)
            print(res)
        except Exception as e:
            print(f"[!] Failed to process {file_path}: {e}")



doc=convert_all("/Users/ar2024/Downloads/cvelistV5-main/cves/2025/0xxx/")

[i] Found 738 JSON files in '/Users/ar2024/Downloads/cvelistV5-main/cves/2025/0xxx/'
{'_index': 'cve-index', '_id': 'CVE-2025-0749', '_version': 2, 'result': 'updated', '_shards': {'total': 2, 'successful': 2, 'failed': 0}, '_seq_no': 98, '_primary_term': 1}
{'_index': 'cve-index', '_id': 'CVE-2025-0175', '_version': 2, 'result': 'updated', '_shards': {'total': 2, 'successful': 2, 'failed': 0}, '_seq_no': 99, '_primary_term': 1}
{'_index': 'cve-index', '_id': 'CVE-2025-0525', '_version': 2, 'result': 'updated', '_shards': {'total': 2, 'successful': 2, 'failed': 0}, '_seq_no': 100, '_primary_term': 1}
{'_index': 'cve-index', '_id': 'CVE-2025-0460', '_version': 2, 'result': 'updated', '_shards': {'total': 2, 'successful': 2, 'failed': 0}, '_seq_no': 101, '_primary_term': 1}
{'_index': 'cve-index', '_id': 'CVE-2025-0899', '_version': 2, 'result': 'updated', '_shards': {'total': 2, 'successful': 2, 'failed': 0}, '_seq_no': 102, '_primary_term': 1}
{'_index': 'cve-index', '_id': 'CVE-2025-0

In [76]:
from sentence_transformers import SentenceTransformer

query = " linux systems "
query_vector = model.encode(query).tolist()

response = client.search(index="cve-index", body={
    "size": 5,
    "query": {
        "script_score": {
            "query": {"match_all": {}},
            "script": {
                "source": "cosineSimilarity(params.query_vector, 'vector') + 1.0",
                "params": {"query_vector": query_vector}
            }
        }
    }
})

for hit in response["hits"]["hits"]:
    print(f"{hit['_score']:.2f} - {hit['_id']}: {hit['_source']['text']}")

1.37 - CVE-2025-0288: Various Paragon Software products contain an arbitrary kernel memory vulnerability within biontdrv.sys, facilitated by the memmove function, which does not validate or sanitize user controlled input, allowing an attacker the ability to write arbitrary kernel memory and perform privilege escalation.
1.35 - CVE-2025-0356: NEC Corporation Aterm WX1500HP Ver.1.4.2 and earlier and WX3600HP Ver.1.5.3 and earlier allows a attacker to execute arbitrary OS commands via the network.
1.34 - CVE-2025-0286: Various Paragon Software products contain an arbitrary kernel memory write vulnerability within biontdrv.sys that is caused by a failure to properly validate the length of user supplied data, which can allow an attacker to execute arbitrary code on the victim machine.
1.32 - CVE-2025-0285: Various Paragon Software products contain an arbitrary kernel memory mapping vulnerability within biontdrv.sys that is caused by a failure to properly validate the length of user supplied

In [84]:
response = client.search(index="cve-index", body={
    "size": 10,
    "query": {
        "bool": {
            "filter": [
                {"term": {"metadata.severity": "HIGH"}},
            ]
        }
    }
})

for hit in response["hits"]["hits"]:
    print(f"{hit['_id']} - {hit['_source']['metadata']['severity']} - {hit['_source']['text']}")

CVE-2025-0749 - HIGH - The Homey theme for WordPress is vulnerable to authentication bypass in versions up to, and including, 2.4.3. This is due to the 'verification_id' value being set to empty, and the not empty check is missing in the dashboard user profile page. This makes it possible for unauthenticated attackers to log in to the first verified user.
CVE-2025-0460 - HIGH - A vulnerability, which was classified as critical, was found in Blog Botz for Journal Theme 1.0 on OpenCart. This affects an unknown part of the file /index.php?route=extension/module/blog_add. The manipulation of the argument image leads to unrestricted upload. It is possible to initiate the attack remotely. The exploit has been disclosed to the public and may be used. The vendor was contacted early about this disclosure but did not respond in any way.
CVE-2025-0358 - HIGH - During an annual penetration test conducted on behalf of Axis Communication, Truesec discovered a flaw in the VAPIX Device Configuration f

In [116]:
query = " what are the exploits will impact browser "
query_vector = model.encode(query).tolist()

response = client.search(index="cve-index", body={
    "size": 100,
    "query": {
        "script_score": {
            "query": {
                "bool": {
                    "filter": [
                        {"term": {"metadata.severity": "CRITICAL"}},
                        {"range": {"metadata.cvss_score": {"gte": 9.0}}}
                    ]
                }
            },
            "script": {
                "source": "cosineSimilarity(params.query_vector, 'vector') + 1.0",
                "params": {"query_vector": query_vector}
            }
        }
    }
})

print('Total Document Found : {}'.format(len(response['hits']['hits'])))
for hit in response["hits"]["hits"]:
    print(f"{hit['_id']} - {hit['_source']['metadata']['title']}")

Total Document Found : 29
CVE-2025-0674 - CVE-2025-0674 Elber Communications Equipment Authentication Bypass Using an Alternate Path or Channel
CVE-2025-0493 - CVE-2025-0493 MultiVendorX – The Ultimate WooCommerce Multivendor Marketplace Solution <= 4.2.14 - Unauthenticated Limited Local File Inclusion
CVE-2025-0471 - CVE-2025-0471 Unrestricted Upload of File with Dangerous Type vulnerability in PMB platform
CVE-2025-0637 - CVE-2025-0637 Inadequate access control in Beta10
CVE-2025-0585 - CVE-2025-0585 aEnrich Technology a+HRD - SQL Injection
CVE-2025-0680 - CVE-2025-0680 New Rock Technologies Cloud Connected Devices has a Improper Neutralization of Special Elements used in an OS Command ('OS Command Injection') vulnerability.
CVE-2025-0066 - CVE-2025-0066 Information Disclosure vulnerability in SAP NetWeaver AS for ABAP and ABAP Platform (Internet Communication Framework)
CVE-2025-0070 - CVE-2025-0070 Improper Authentication in SAP NetWeaver ABAP Server and ABAP Platform
CVE-2025-0177

In [104]:
response['hits']

{'total': {'value': 29, 'relation': 'eq'},
 'max_score': 1.2515438,
 'hits': [{'_index': 'cve-index',
   '_id': 'CVE-2025-0066',
   '_score': 1.2515438,
   '_source': {'id': 'CVE-2025-0066',
    'text': 'Under certain conditions SAP NetWeaver AS for ABAP and ABAP Platform (Internet Communication Framework) allows an attacker to access restricted information due to weak access controls. This can have a significant impact on the confidentiality, integrity, and availability of an application',
    'vector': [-0.05567854270339012,
     0.048889148980379105,
     -0.05899299308657646,
     -0.007609989959746599,
     0.004149011801928282,
     -0.03386954218149185,
     0.06040226295590401,
     0.04569008946418762,
     0.013219360262155533,
     0.01930377073585987,
     0.06589735299348831,
     0.027485355734825134,
     -0.03598302602767944,
     -0.03648963198065758,
     0.05324232950806618,
     0.04970042034983635,
     0.06178036332130432,
     -0.037145599722862244,
     -0.08162

In [120]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, pipeline
import torch

model_name = "markusbayer/CySecBERT"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForMaskedLM.from_pretrained(model_name)

# Example: Fill-mask
fill = pipeline("fill-mask", model=model, tokenizer=tokenizer)
print(fill("The latest zero-day CVE-2025-##### exploits a buffer [MASK]"))

text = "The vulnerability allows for remote code execution through buffer overflow in Apache Struts."

# Tokenize input
inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)


# Run model inference
with torch.no_grad():
    outputs = model(**inputs)

# Get prediction
logits = outputs.logits
predicted_class = torch.argmax(logits, dim=1).item()
print("Predicted class:", predicted_class)

Device set to use mps:0


[{'score': 0.5349510312080383, 'token': 2058, 'token_str': 'over', 'sequence': 'the latest zero - day cve - 2025 - # # # # # exploits a buffer over'}, {'score': 0.23651579022407532, 'token': 24672, 'token_str': 'overrun', 'sequence': 'the latest zero - day cve - 2025 - # # # # # exploits a buffer overrun'}, {'score': 0.08804173767566681, 'token': 3091, 'token_str': 'length', 'sequence': 'the latest zero - day cve - 2025 - # # # # # exploits a buffer length'}, {'score': 0.01977766864001751, 'token': 2946, 'token_str': 'size', 'sequence': 'the latest zero - day cve - 2025 - # # # # # exploits a buffer size'}, {'score': 0.007776712998747826, 'token': 1999, 'token_str': 'in', 'sequence': 'the latest zero - day cve - 2025 - # # # # # exploits a buffer in'}]


RuntimeError: Placeholder storage has not been allocated on MPS device!