Label PRs when the json schema changes (#2240)

* label PRs when the json schema changes Signed-off-by: Alex Goodman <wagoodman@users.noreply.github.com> * moderate pr comments Signed-off-by: Alex Goodman <wagoodman@users.noreply.github.com> * be more strict about processing file names Signed-off-by: Alex Goodman <wagoodman@users.noreply.github.com> --------- Signed-off-by: Alex Goodman <wagoodman@users.noreply.github.com>
anchore · Oct 20, 2023 · 8f6bdde · 8f6bdde
1 parent ef43294
commit 8f6bdde
Show file tree

Hide file tree

Showing 6 changed files with 354 additions and 1 deletion.
diff --git a/.github/scripts/labeler.py b/.github/scripts/labeler.py
@@ -0,0 +1,224 @@
+from __future__ import annotations
+
+import sys
+import glob
+import subprocess
+import os
+import re
+
+# When True, run() only prints the commands it is given instead of executing them.
+# main() switches this on when there is no PR number or when local file lists are supplied.
+DRY_RUN = False
+
+
+def main(changed_files: str | None = None, merge_base_schema_files: str | None = None):
+    """Label a PR, and queue a PR comment, based on the JSON schema files it touches.
+
+    With no arguments the PR number comes from GITHUB_PR_NUMBER, the changed files are
+    fetched with the `gh` CLI and the merge-base schemas are listed from the checkout.
+    When ``changed_files`` is given, both arguments are paths to files containing one
+    file name per line and every command is only printed (dry run).
+
+    The comment text is written to the path named by CI_COMMENT_FILE. The process exits
+    with status 1 when required inputs are missing.
+    """
+    global DRY_RUN
+
+    pr_number = os.environ.get("GITHUB_PR_NUMBER")
+    comment_file_path = os.environ.get("CI_COMMENT_FILE")
+
+    if not comment_file_path:
+        print("CI_COMMENT_FILE not set")
+        sys.exit(1)
+
+    # never execute commands without a PR to act on, or when driven by local file lists
+    if not pr_number or changed_files:
+        DRY_RUN = True
+
+    if not changed_files:
+        if not is_ci():
+            print("Not in CI")
+            sys.exit(1)
+
+        if not pr_number:
+            print("Not a PR")
+            sys.exit(1)
+
+        pr_changed_files = get_pr_changed_files(pr_number)
+        # This runs in the context of pull_request_target, so the checkout is the merge base:
+        # the main branch of the original repo, NOT the PR branch (whether it lives in a fork
+        # or in the target repo). Listing the currently checked-out files therefore shows
+        # which schemas existed before the merge.
+        og_json_schema_files = list_json_schema_files()
+    else:
+        # read lines from file... this is useful for local testing
+        with open(changed_files) as changed_fh:
+            pr_changed_files = changed_fh.read().splitlines()
+
+        with open(merge_base_schema_files) as merge_base_fh:
+            og_json_schema_files = sort_json_schema_files(merge_base_fh.read().splitlines())
+
+    pr_json_schema_files = filter_to_schema_files(pr_changed_files)
+
+    if not og_json_schema_files:
+        print("No schema files found in merge base")
+        sys.exit(1)
+
+    # pr_json_schema_files holds every schema file the PR adds, removes, or changes;
+    # anything not already in the merge base is new, the remainder already existed
+    pr_schema_set = set(pr_json_schema_files)
+    new_schema_files = pr_schema_set - set(og_json_schema_files)
+    removed_or_modified_schema_files = pr_schema_set - set(new_schema_files)
+
+    print("new schemas:                ", summarize_schema_files(new_schema_files))
+    print("removed or modified schemas:", summarize_schema_files(removed_or_modified_schema_files))
+
+    # any new or modified schema earns the PR the "json-schema" label...
+    if new_schema_files or removed_or_modified_schema_files:
+        print("\nAdding json-schema label...")
+        add_label(pr_number, "json-schema")
+    else:
+        remove_label(pr_number, "json-schema")
+
+    # new schema files are compared against the latest existing version: a major version
+    # bump is a breaking change and is flagged with the "breaking-change" label...
+    latest_schema_file = og_json_schema_files[-1]
+    if is_breaking_change(new_schema_files, latest_schema_file):
+        print("\nBreaking change detected...")
+        add_label(pr_number, "breaking-change")
+    else:
+        remove_label(pr_number, "breaking-change")
+
+    if not removed_or_modified_schema_files:
+        return
+
+    # modifying an existing schema could be a breaking change and removing one should never
+    # be allowed, so both cases are called out on the PR via a warning comment...
+    print("\nRemoved or modified schema detected...")
+    schemas = sort_json_schema_files(list(removed_or_modified_schema_files))
+    schemas_str = "\n".join(f" - {schema}" for schema in schemas)
+    add_comment(comment_file_path, f"Detected modification or removal of existing json schemas:\n{schemas_str}", warning=True)
+
+
+def add_comment(comment_file_path: str, comment: str, warning: bool = False, important: bool = False):
+    """Write ``comment`` to ``comment_file_path`` and echo it to stdout.
+
+    When ``warning`` or ``important`` is set, every line of the comment is prefixed
+    with "> " and a "> [!WARNING]" or "> [!IMPORTANT]" header line is placed on top
+    (``warning`` wins if both are set). Any existing file content is overwritten.
+    """
+    if warning or important:
+        comment = "\n".join(f"> {line}" for line in comment.splitlines())
+
+    if warning:
+        comment = f"> [!WARNING]\n{comment}"
+    elif important:
+        comment = f"> [!IMPORTANT]\n{comment}"
+
+    # create any parent directories if they don't exist; a bare file name has no parent
+    # component, and os.makedirs("") would raise FileNotFoundError
+    parent_dir = os.path.dirname(comment_file_path)
+    if parent_dir:
+        os.makedirs(parent_dir, exist_ok=True)
+
+    # explicit encoding so the file content does not depend on the platform default
+    with open(comment_file_path, "w", encoding="utf-8") as f:
+        f.write(comment)
+
+    print(f"Comment file contents: {comment_file_path}")
+    print(comment)
+
+
+def add_label(pr_number: str, label: str):
+    """Add ``label`` to the PR via the `gh` CLI, exiting with status 1 on failure."""
+    # run "gh pr edit --add-label <label>"
+    # text=True so captured stderr is a str and prints readably instead of as b'...'
+    result = run(f"gh pr edit {pr_number} --add-label {label}", shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
+    if result.returncode != 0:
+        print(f"Unable to add {label!r} label to PR {pr_number}")
+        print(str(result.stderr))
+        sys.exit(1)
+
+
+def remove_label(pr_number: str, label: str):
+    """Remove ``label`` from the PR via the `gh` CLI, exiting with status 1 on failure."""
+    # run "gh pr edit --remove-label <label>"
+    # text=True so captured stderr is a str and prints readably instead of as b'...'
+    result = run(f"gh pr edit {pr_number} --remove-label {label}", shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
+    if result.returncode != 0:
+        print(f"Unable to remove {label!r} label from PR {pr_number}")
+        print(str(result.stderr))
+        sys.exit(1)
+
+
+def major_version(semver: str) -> int:
+    """Return the first dot-separated component of ``semver`` as an int."""
+    major, _, _ = semver.partition(".")
+    return int(major)
+
+
+def is_breaking_change(new_schema_files: set[str], latest_schema_file: str) -> bool:
+    """Report whether any new schema file has a higher major version than the latest one."""
+    latest_major_version = major_version(get_semver(latest_schema_file))
+    return any(
+        major_version(get_semver(candidate)) > latest_major_version
+        for candidate in new_schema_files
+    )
+
+
+def summarize_schema_files(files: list[str]) -> list[str]:
+    """Return the version string of each schema file, in iteration order."""
+    return list(map(get_semver, files))
+
+
+def is_ci() -> bool:
+    """Report whether the CI environment variable is set (to any value, even empty)."""
+    return os.environ.get("CI") is not None
+
+
+def get_pr_changed_files(pr_number: str) -> list[str]:
+    """Return the paths of the files changed in the PR, as reported by the `gh` CLI.
+
+    Exits with status 1 if the command fails.
+    """
+    result = run(f"gh pr view {pr_number} --json files --jq '.files.[].path'", shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
+    if result.returncode == 0:
+        # one path per line of output
+        return result.stdout.splitlines()
+
+    print("Unable to get list of changed files in PR")
+    print(str(result.stderr))
+    sys.exit(1)
+
+
+def filter_to_schema_files(list_of_files: list[str]) -> list[str]:
+    """Keep only paths of the form "schema/json/schema-<X>.<Y>.<Z>.json", sorted by version."""
+    schema_path = re.compile(r"^schema/json/schema-\d+\.\d+\.\d+\.json$")
+    matching = [path for path in list_of_files if schema_path.match(path)]
+    return sort_json_schema_files(matching)
+
+
+def list_json_schema_files() -> list[str]:
+    """List the versioned schema files under "schema/json", sorted by version.
+
+    Paths are resolved relative to the current working directory.
+    """
+    # The glob "schema-*.json" is looser than a version pattern and would also pick up
+    # non-version names, which the numeric version sort cannot handle. Reuse the strict
+    # filter (which also sorts) so only "schema-<X>.<Y>.<Z>.json" files are returned.
+    return filter_to_schema_files(glob.glob("schema/json/schema-*.json"))
+
+
+def run(command: str, **kwargs) -> subprocess.CompletedProcess:
+    """Execute ``command`` with subprocess.run, or only print it when DRY_RUN is set.
+
+    In dry-run mode a successful CompletedProcess (returncode 0) is returned with no
+    captured output.
+    """
+    if not DRY_RUN:
+        print(f"[RUN] {command}")
+        return subprocess.run(command, **kwargs)
+
+    print(f"[DRY RUN] {command}")
+    return subprocess.CompletedProcess(args=[command], returncode=0)
+
+
+def get_semver(input_file: str) -> str:
+    """Extract the version from a schema path such as "schema/json/schema-1.12.1.json".
+
+    Only the file name is parsed, so hyphens in directory names do not affect the
+    result. Raises IndexError if the file name contains no "-".
+    """
+    file_name = os.path.basename(input_file)
+    # the version sits between the first "-" and the ".json" suffix
+    return file_name.split("-")[1].split(".json")[0]
+
+
+def sort_json_schema_files(files: list[str]) -> list[str]:
+    """Return schema paths ordered by the version embedded in their names.
+
+    Input entries look like "schema/json/schema-1.12.1.json". Ordering is numeric per
+    version component rather than lexical, so "schema/json/schema-1.2.1.json" comes
+    before "schema/json/schema-1.12.1.json". Empty entries are skipped, and each
+    result is rebuilt as "schema/json/schema-<version>.json".
+    """
+    def numeric_components(version: str) -> list[int]:
+        return [int(component) for component in version.split('.')]
+
+    ordered_versions = sorted(
+        (get_semver(path) for path in files if path),
+        key=numeric_components,
+    )
+
+    return [f"schema/json/schema-{version}.json" for version in ordered_versions]
+
+
+# allow for test files that have line-by-line list of files:
+
+# .binny.yaml
+# .github/actions/bootstrap/action.yaml
+# .github/scripts/goreleaser-install.sh
+# .github/workflows/release.yaml
+# .github/workflows/update-bootstrap-tools.yml
+# .github/workflows/update-cpe-dictionary-index.yml
+# .github/workflows/update-stereoscope-release.yml
+# .github/workflows/validations.yaml
+# .gitignore
+# .goreleaser.yaml
+# Makefile
+# Taskfile.yaml
+# schema/cyclonedx/Makefile
+
+if __name__ == "__main__":
+    # Optional positional arguments: two paths, each naming a single file that contains a
+    # line-separated list of files (PR changed files first, merge-base schema files second).
+    # Unless both are supplied, neither is used.
+    if len(sys.argv) > 2:
+        changed_files, merge_base_schema_files = sys.argv[1], sys.argv[2]
+    else:
+        changed_files, merge_base_schema_files = None, None
+
+    main(changed_files, merge_base_schema_files)
+
diff --git a/.github/scripts/labeler_test.py b/.github/scripts/labeler_test.py
@@ -0,0 +1,65 @@
+import unittest
+from unittest.mock import patch
+import subprocess
+
+import labeler
+
+class Labeler(unittest.TestCase):
+    """Unit tests for the helper functions in labeler.py."""
+
+    def test_major_version(self):
+        self.assertEqual(labeler.major_version("1.2.3"), 1)
+        self.assertEqual(labeler.major_version("2.0.0"), 2)
+
+    def test_is_breaking_change(self):
+        new_schema_files = ["schema/json/schema-2.0.0.json"]
+        latest_schema_file = "schema/json/schema-1.2.0.json"
+        self.assertTrue(labeler.is_breaking_change(new_schema_files, latest_schema_file))
+
+        new_schema_files = ["schema/json/schema-1.3.0.json"]
+        latest_schema_file = "schema/json/schema-1.2.0.json"
+        self.assertFalse(labeler.is_breaking_change(new_schema_files, latest_schema_file))
+
+    def test_summarize_schema_files(self):
+        files = ["schema/json/schema-1.0.0.json", "schema/json/schema-2.0.0.json"]
+        expected = ["1.0.0", "2.0.0"]
+        self.assertEqual(labeler.summarize_schema_files(files), expected)
+
+    def test_is_ci(self):
+        # Mock os.environ to simulate CI environment
+        with patch.dict("os.environ", {"CI": "true"}):
+            self.assertTrue(labeler.is_ci())
+
+        # ...and an environment without the variable (clear=True empties it for the duration)
+        with patch.dict("os.environ", {}, clear=True):
+            self.assertFalse(labeler.is_ci())
+
+    def test_get_pr_changed_files(self):
+        expected_command = "gh pr view 123 --json files --jq '.files.[].path'"
+        expected_output = "file1.json\nfile2.json\n"
+
+        # build a real result instance; setting attributes on the CompletedProcess class
+        # itself would leak into every other user of subprocess in the same process
+        completed = subprocess.CompletedProcess(args=[expected_command], returncode=0, stdout=expected_output, stderr="")
+        with patch("labeler.run", return_value=completed) as mock_run:
+            result = labeler.get_pr_changed_files("123")
+            mock_run.assert_called_with(expected_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
+            self.assertEqual(result, ["file1.json", "file2.json"])
+
+    def test_filter_to_schema_files(self):
+        input_files = ["schema/json/schema-1.0.0.json", "not_schema.txt", "schema/json/schema-2.0.0.json"]
+        expected_files = ["schema/json/schema-1.0.0.json", "schema/json/schema-2.0.0.json"]
+        self.assertEqual(labeler.filter_to_schema_files(input_files), expected_files)
+
+        # we should be strict about what files are allowed to be processed
+        input_files = ["schema/json/schema-1.0.0extracontent.json", "schema/json/schema-1.0.0.md", "schema/json/schema-1.0.0.json.extracontent"]
+        expected_files = []
+        self.assertEqual(labeler.filter_to_schema_files(input_files), expected_files)
+
+    def test_get_semver(self):
+        input_file = "schema/json/schema-1.0.0.json"
+        expected_semver = "1.0.0"
+        self.assertEqual(labeler.get_semver(input_file), expected_semver)
+
+    def test_sort_json_schema_files(self):
+        # numeric ordering: 1.2.1 sorts before 1.12.1 even though it is lexically larger
+        files = ["schema/json/schema-1.12.1.json", "schema/json/schema-1.2.1.json"]
+        expected_sorted_files = ["schema/json/schema-1.2.1.json", "schema/json/schema-1.12.1.json"]
+        self.assertEqual(labeler.sort_json_schema_files(files), expected_sorted_files)
+
+
+# allow running this file directly, e.g. `python .github/scripts/labeler_test.py`
+if __name__ == "__main__":
+    unittest.main()
diff --git a/.github/workflows/labeler.yaml b/.github/workflows/labeler.yaml
@@ -0,0 +1,54 @@
+name: "Detect schema changes"
+
+on:
+  # IMPORTANT! This workflow is triggered by the `pull_request_target` event
+  # which means that forked PRs will run with access to secrets from the repo
+  # it's forked from (the "target" repo).
+  #
+  # For this reason we NEVER checkout the code from the pull request
+  # (e.g. "ref: ${{ github.event.pull_request.head.sha }}") to prevent
+  # accidentally running potentially untrusted code.
+  #
+  # By default the checkout will be:
+  #   - GITHUB_SHA: Last commit on the PR base branch
+  #   - GITHUB_REF: PR base branch
+  #
+  # ...unlike a typical PR where:
+  #   - GITHUB_SHA: Last merge commit on the GITHUB_REF branch
+  #   - GITHUB_REF: PR merge branch refs/pull/:prNumber/merge
+  pull_request_target:
+
+env:
+  # note: this is used within hashFiles() so must be within the GITHUB_WORKSPACE path (or will silently fail)
+  CI_COMMENT_FILE: .tmp/labeler-comment.txt
+  # needs to be any string to uniquely identify the comment on a PR across multiple runs
+  COMMENT_HEADER: "label-commentary"
+
+jobs:
+  label:
+    name: "Label changes"
+    runs-on: ubuntu-22.04
+    steps:
+
+      - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 #v4.1.1
+
+      - run: python .github/scripts/labeler.py
+        env:
+          # note: this token has write access to the repo
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          GITHUB_PR_NUMBER: ${{ github.event.number }}
+
+      - name: Delete existing comment
+        if: ${{ hashFiles( env.CI_COMMENT_FILE ) == '' }}
+        uses: marocchino/sticky-pull-request-comment@efaaab3fd41a9c3de579aba759d2552635e590fd #v2.8.0
+        with:
+          header: ${{ env.COMMENT_HEADER }}
+          hide: true
+          hide_classify: "OUTDATED"
+
+      - name: Add comment
+        if: ${{ hashFiles( env.CI_COMMENT_FILE ) != '' }}
+        uses: marocchino/sticky-pull-request-comment@efaaab3fd41a9c3de579aba759d2552635e590fd #v2.8.0
+        with:
+          header: ${{ env.COMMENT_HEADER }}
+          path: ${{ env.CI_COMMENT_FILE }}
diff --git a/.gitignore b/.gitignore
@@ -64,3 +64,8 @@ test/integration/test-fixtures/**/go.sum
 # attestation
 cosign.key
 cosign.pub
+
+# Byte-compiled object files for python
+__pycache__/
+*.py[cod]
+*$py.class
diff --git a/DEVELOPING.md b/DEVELOPING.md
@@ -6,6 +6,7 @@ In order to test and develop in this repo you will need the following dependenci
 - Golang
 - docker
 - make
+- Python (>= 3.9)
 
 ### Docker settings for getting started
 Make sure you've updated your docker settings so the default docker socket path is available.

diff --git a/Makefile b/Makefile
@@ -70,7 +70,7 @@ all: static-analysis test ## Run all linux-based checks (linting, license check,
 static-analysis: check-go-mod-tidy lint check-licenses check-json-schema-drift  ## Run all static analysis checks
 
 .PHONY: test
-test: unit integration validate-cyclonedx-schema benchmark cli ## Run all tests (currently unit, integration, linux compare, and cli tests)
+test: unit integration validate-cyclonedx-schema benchmark test-utils cli ## Run all tests (currently unit, integration, linux compare, and cli tests)
 
 
 ## Bootstrapping targets #################################
@@ -167,6 +167,10 @@ cli: $(SNAPSHOT_DIR)  ## Run CLI tests
 	SYFT_BINARY_LOCATION='$(SNAPSHOT_BIN)' \
 		go test -count=1 -timeout=15m -v ./test/cli
 
+.PHONY: test-utils
+test-utils:
+	python .github/scripts/labeler_test.py
+
 
 ## Benchmark test targets #################################