Add mariner data source (#181)

Add functionality to vunnel to ingest vulnerabilities published for CBL-Mariner Linux. Signed-off-by: Will Murphy <will.murphy@anchore.com>
anchore · May 25, 2023 · 52392a2 · 52392a2
1 parent 4474074
commit 52392a2
Show file tree

Hide file tree

Showing 17 changed files with 1,402 additions and 5 deletions.
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -57,6 +57,7 @@ SQLAlchemy = ">= 1.4.46, < 3.0"  # note: 1.4.x currently required for enterprise
 mergedeep = "^1.3.4"
 future = "^0.18.3"
 importlib-metadata = "^6.1.0"
+xsdata = {extras = ["cli", "lxml", "soap"], version = "^22.12"}
 
 [tool.poetry.group.dev.dependencies]
 pytest = "^7.2.2"
@@ -103,6 +104,7 @@ exclude = '''(?x)(
     | ^src/vunnel/providers/amazon/parser\.py$    # ported from enterprise, never had type hints
     | ^src/vunnel/providers/debian/parser\.py$    # ported from enterprise, never had type hints
     | ^src/vunnel/providers/github/parser\.py$    # ported from enterprise, never had type hints
+    | ^src/vunnel/providers/mariner/model/        # generated code
     | ^src/vunnel/providers/nvd/parser\.py$       # ported from enterprise, never had type hints
     | ^src/vunnel/providers/oracle/parser\.py$    # ported from enterprise, never had type hints
     | ^src/vunnel/providers/rhel/parser\.py$      # ported from enterprise, never had type hints
@@ -135,6 +137,7 @@ exclude = '''
     | dist
     | data
     | backup
+    | src/vunnel/providers/mariner/model # files in here are generated
     | tests/quality/vulnerability-match-labels
     | tests/quality/.yardstick
     | tests/quality/data
@@ -221,4 +224,5 @@ ignore = [
 
 extend-exclude = [
   "**/tests/**",
+  "src/vunnel/providers/mariner/model/**" # these are generated
 ]
diff --git a/src/vunnel/cli/config.py b/src/vunnel/cli/config.py
@@ -18,6 +18,7 @@ class Providers:
     chainguard: providers.chainguard.Config = field(default_factory=providers.chainguard.Config)
     debian: providers.debian.Config = field(default_factory=providers.debian.Config)
     github: providers.github.Config = field(default_factory=providers.github.Config)
+    mariner: providers.mariner.Config = field(default_factory=providers.mariner.Config)
     nvd: providers.nvd.Config = field(default_factory=providers.nvd.Config)
     oracle: providers.oracle.Config = field(default_factory=providers.oracle.Config)
     rhel: providers.rhel.Config = field(default_factory=providers.rhel.Config)

diff --git a/src/vunnel/providers/__init__.py b/src/vunnel/providers/__init__.py
@@ -10,6 +10,7 @@
     chainguard,
     debian,
     github,
+    mariner,
     nvd,
     oracle,
     rhel,
@@ -31,6 +32,7 @@
     amazon.Provider.name(): amazon.Provider,
     debian.Provider.name(): debian.Provider,
     github.Provider.name(): github.Provider,
+    mariner.Provider.name(): mariner.Provider,
     nvd.Provider.name(): nvd.Provider,
     oracle.Provider.name(): oracle.Provider,
     rhel.Provider.name(): rhel.Provider,

diff --git a/src/vunnel/providers/mariner/DEVELOPING.md b/src/vunnel/providers/mariner/DEVELOPING.md
@@ -0,0 +1,102 @@
+# Developing for the Mariner Provider
+
+This provider gets its own DEVELOPING.md because it introduces a new pattern for how OVAL XML is parsed in Vunnel.
+
+## How this provider works
+
+1. Vulnerabilities in Mariner Linux are published to https://github.com/microsoft/CBL-MarinerVulnerabilityData
+2. Each major version of Mariner Linux, currently 1.0 and 2.0, gets a file named `cbl-mariner-${MAJOR_VERSION}-oval.xml` in the root of that repo.
+3. Vunnel downloads this XML from GitHub's raw user content
+4. Vunnel normalizes and transforms that XML for output
+
+The transformation is handled differently by this provider:
+
+1. The file at `./model/generated.py` is generated by `./generate_models.py`, using the [`xsdata`](https://pypi.org/project/xsdata/) package.
+2. The XML files are loaded as an `etree` using `lxml`
+3. XPath expressions are used to enumerate nodes of different types from the document
+4. `xsdata` is then used to parse these nodes into generated data classes
+5. The transform logic is written in plain python, converting these data classes into Vunnel's `Vulnerability` type
+6. Base classes common to all vunnel providers handle the output after that
+
+## FAQ
+
+### What to do if the schema of the OVAL XML changes?
+
+1. Re-run `./generate_models.py`
+2. Fix the logic in `./parser.py` to account for any changes in data classes.
+
+### `xsdata` can parse the whole document, so why use `etree`?
+
+For error handling. `xsdata` can parse the entire document, but then the parse
+operation succeeds or fails as a whole. By looping over parts of the document
+and attempting to deserialize each part separately, we gain the flexibility to
+skip parts of the document that can't be parsed, and still get some information
+from the rest of the document.
+
+### `xsdata` can generate dataclasses from an XML schema, or from an example document. Why use the example document approach?
+
+We initially tried generating the dataclasses from the XSD files that the OVAL
+XML files name as their schemas, but `xsdata` could not parse the OVAL XML
+files into the resulting dataclasses.
+
+### The dataclasses represent a subset of the schema. What if something changes?
+
+If something changes we will need to fix the parser. However, we validated the existing parser via the following process:
+
+1. Clone the [Mariner Linux Vulnerability Repo](https://github.com/microsoft/CBL-MarinerVulnerabilityData).
+2. Check out every tag in the repo, and copy the available OVAL XML files into a different directory
+3. Run a script against every file in that directory to validate our assumptions about the data shape.
+
+Also, we specifically asked whether the schema was stable and [were told it was](https://github.com/anchore/grype/issues/1220#issuecomment-1548447284).
+
+Below is the script used to validate our assumptions about the XML. The only difference we found was that the
+field of test objects that is now called "evr" used to be called "version" in older files.
+
+``` python
+import os
+from xml.etree import ElementTree as ET
+from lxml import etree
+
+EXAMPLE_DIR="/Users/willmurphy/work/scratch/xsdata-experiments/examples/"
+
+def main():
+    for file in os.listdir(EXAMPLE_DIR):
+        validate_file(os.path.join(EXAMPLE_DIR, file))
+
+    print('yay')
+
+def validate_file(file):
+    root = etree.parse(file)
+    nsmap = etree.XPath("/*")(root)[0].nsmap
+    default = nsmap[None]
+    nsmap["default"] = default
+    del nsmap[None]
+    selection = etree.XPath("//default:definition", namespaces=nsmap)
+    definitions = selection(root)
+
+    tests = etree.XPath('//linux-def:rpminfo_test', namespaces=nsmap)(root)
+    objects = etree.XPath('//linux-def:rpminfo_object', namespaces=nsmap)(root)
+    states = etree.XPath('//linux-def:rpminfo_state', namespaces=nsmap)(root)
+    evrs = etree.XPath('//linux-def:evr', namespaces=nsmap)(root)
+    # at 59187, used version not evr
+    versions = etree.XPath('//linux-def:version', namespaces=nsmap)(root)
+    all_versions = evrs + versions
+    expected_evr_operations = { "less than", "less than or equal" }
+    unique_evr_operations = {k.attrib['operation'] for k in all_versions}
+    if len(unique_evr_operations) == 0 or not unique_evr_operations.issubset(expected_evr_operations):
+        raise Exception("surprise evr operation!")
+    unique_evr_datatypes = {k.attrib['datatype'] for k in all_versions}
+    expected_evr_datatypes={ "version", "evr_string" }
+    if len(unique_evr_datatypes) != 1 or not unique_evr_datatypes.issubset(expected_evr_datatypes):
+        raise Exception(f"surprise evr datatype! {expected_evr_datatypes.difference(unique_evr_datatypes)}")
+    all_criteria_trees = etree.XPath('//default:definition/default:criteria', namespaces=nsmap)(root)
+    for c_node in all_criteria_trees:
+        children = c_node.getchildren()
+        if len(children) != 1:
+            raise Exception("different criterion tree!")
+        pass
+
+
+if __name__ == "__main__":
+    main()
+```
diff --git a/src/vunnel/providers/mariner/__init__.py b/src/vunnel/providers/mariner/__init__.py
@@ -0,0 +1,55 @@
+from __future__ import annotations
+
+import os
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING
+
+from vunnel import provider, result, schema
+from vunnel.providers.mariner.parser import Parser
+
+if TYPE_CHECKING:
+    import datetime
+
+
+@dataclass
+class Config:
+    runtime: provider.RuntimeConfig = field(
+        default_factory=lambda: provider.RuntimeConfig(
+            result_store=result.StoreStrategy.SQLITE,
+            existing_results=provider.ResultStatePolicy.KEEP,
+        ),
+    )
+    request_timeout: int = 125
+    allow_versions: list[str] = field(default_factory=lambda: ["1.0", "2.0"])
+
+
+class Provider(provider.Provider):
+    def __init__(self, root: str, config: Config | None = None):
+        if not config:
+            config = Config()
+        super().__init__(root, runtime_cfg=config.runtime)
+        self.config = config
+
+        self.logger.debug(f"config: {config}")
+        self.schema = schema.OSSchema()
+        self.parser = Parser(
+            workspace=self.workspace,
+            allow_versions=self.config.allow_versions,
+            download_timeout=self.config.request_timeout,
+            logger=self.logger,
+        )
+
+    @classmethod
+    def name(cls) -> str:
+        return "mariner"
+
+    def update(self, last_updated: datetime.datetime | None) -> tuple[list[str], int]:
+        with self.results_writer() as writer:
+            for namespace, vuln_id, record in self.parser.get():
+                writer.write(
+                    identifier=os.path.join(namespace, vuln_id),
+                    schema=self.schema,
+                    payload=record,
+                )
+                pass
+        return self.parser.urls, len(writer)
diff --git a/src/vunnel/providers/mariner/generate_models.py b/src/vunnel/providers/mariner/generate_models.py
@@ -0,0 +1,47 @@
+import os
+import tempfile
+from subprocess import PIPE, Popen
+
+import requests
+
+MARINER_URL_BASE = "https://raw.githubusercontent.com/microsoft/CBL-MarinerVulnerabilityData/main/{}"
+MARINER_URL_FILENAME = "cbl-mariner-{}-oval.xml"
+
+
+def download_version(version: str, dest_dir: str) -> None:
+    filename = MARINER_URL_FILENAME.format(version)
+    url = MARINER_URL_BASE.format(filename)
+    r = requests.get(url, timeout=125)
+    destination = os.path.join(dest_dir, filename)
+    with open(destination, "wb") as w:
+        w.write(r.content)
+
+
+def main() -> None:
+    versions = ["2.0"]
+    dest_path = tempfile.TemporaryDirectory()
+    for v in versions:
+        download_version(v, dest_path.name)
+
+    script_dir = os.path.realpath(os.path.dirname(__file__))
+    args = [
+        "xsdata",
+        "generate",
+        dest_path.name,
+        "-r",
+        "--relative-imports",
+        "--compound-fields",
+        "--package",
+        "model.generated",
+        "--structure-style",
+        "single-package",
+    ]
+    process = Popen(args=args, stderr=PIPE, stdout=PIPE, cwd=script_dir)
+    stdout, stderr = process.communicate()
+    print(stdout)
+    print(stderr)
+    pass
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/vunnel/providers/mariner/model/__init__.py b/src/vunnel/providers/mariner/model/__init__.py
@@ -0,0 +1,41 @@
+from .generated import (
+    Affected,
+    Criteria,
+    Criterion,
+    Definition,
+    Definitions,
+    Evr,
+    Generator,
+    Metadata,
+    Object,
+    Objects,
+    OvalDefinitions,
+    Reference,
+    RpminfoObject,
+    RpminfoState,
+    RpminfoTest,
+    State,
+    States,
+    Tests,
+)
+
+__all__ = [
+    "Affected",
+    "Criteria",
+    "Criterion",
+    "Definition",
+    "Definitions",
+    "Evr",
+    "Generator",
+    "Metadata",
+    "Object",
+    "Objects",
+    "OvalDefinitions",
+    "Reference",
+    "RpminfoObject",
+    "RpminfoState",
+    "RpminfoTest",
+    "State",
+    "States",
+    "Tests",
+]