From 5f4082b42a67c6030ed89721c21519e6106763d8 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 25 Sep 2025 21:42:02 +0000 Subject: [PATCH 1/5] Initial plan From a3061098aa53966260b7a4619c56b6391f76409e Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 25 Sep 2025 21:47:35 +0000 Subject: [PATCH 2/5] Fix dataclass mutable default issue in config.py Co-authored-by: felickz <1760475+felickz@users.noreply.github.com> --- sariftoolkit/config.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sariftoolkit/config.py b/sariftoolkit/config.py index eddce54..c3b767c 100644 --- a/sariftoolkit/config.py +++ b/sariftoolkit/config.py @@ -1,4 +1,4 @@ -from dataclasses import dataclass +from dataclasses import dataclass, field @dataclass @@ -10,13 +10,13 @@ class PluginConfig: @dataclass class Plugins: - relativepaths: PluginConfig = PluginConfig( + relativepaths: PluginConfig = field(default_factory=lambda: PluginConfig( "RelativePaths", "sariftoolkit.plugins.relativepaths" - ) + )) - submodules: PluginConfig = PluginConfig( + submodules: PluginConfig = field(default_factory=lambda: PluginConfig( "Submodules", "sariftoolkit.plugins.submodules" - ) + )) @dataclass @@ -24,7 +24,7 @@ class Config: name: str = "Default Configuration" version: str = "0.0.0" - plugins: Plugins = Plugins() + plugins: Plugins = field(default_factory=Plugins) def load(path: str) -> Config: From 4b64134a21e6fa68b421713ea37219e7757c6471 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 25 Sep 2025 21:53:08 +0000 Subject: [PATCH 3/5] Create sarif-splitter plugin with path and severity-based splitting Co-authored-by: felickz <1760475+felickz@users.noreply.github.com> --- sariftoolkit/config.py | 4 + sariftoolkit/plugins/__init__.py | 1 + sariftoolkit/plugins/splitter.py | 343 +++++++++++++++++++++++++++++++ sariftoolkit/sarif/models.py | 6 + 4 files changed, 354 insertions(+) create mode 100644 sariftoolkit/plugins/splitter.py diff --git a/sariftoolkit/config.py b/sariftoolkit/config.py index c3b767c..e807a50 100644 --- a/sariftoolkit/config.py +++ b/sariftoolkit/config.py @@ -18,6 +18,10 @@ class Plugins: "Submodules", "sariftoolkit.plugins.submodules" )) + splitter: PluginConfig = field(default_factory=lambda: PluginConfig( + "Splitter", "sariftoolkit.plugins.splitter" + )) + @dataclass class Config: diff --git a/sariftoolkit/plugins/__init__.py b/sariftoolkit/plugins/__init__.py index 674f431..8d5a2cc 100644 --- a/sariftoolkit/plugins/__init__.py +++ b/sariftoolkit/plugins/__init__.py @@ -1,2 +1,3 @@ from sariftoolkit.plugins.relativepaths import RelativePaths from sariftoolkit.plugins.submodules import Submodules +from sariftoolkit.plugins.splitter import Splitter diff --git a/sariftoolkit/plugins/splitter.py b/sariftoolkit/plugins/splitter.py new file mode 100644 index 0000000..e84e901 --- /dev/null +++ b/sariftoolkit/plugins/splitter.py @@ -0,0 +1,343 @@ +import os +import copy +import fnmatch +from typing import List, Dict, Any +from dataclasses import dataclass, field +from argparse import ArgumentParser + +from sariftoolkit.plugin import Plugin +from sariftoolkit.sarif.sarif import exportSarif +from sariftoolkit.sarif.models import SarifModel, RunsModel + + +@dataclass +class PathSplitRule: + name: str + patterns: List[str] + + +@dataclass +class SeveritySplitRule: + name: str + severities: List[str] # e.g., ["critical"], ["high", "medium"], ["*"] + + +@dataclass +class Splitter(Plugin): + name: str = "Splitter" + version: str = "1.0.0" + description: str = "SARIF File Splitter by Categories" + + split_by_path: bool = False + split_by_severity: bool = False + path_rules: List[PathSplitRule] = field(default_factory=list) + severity_rules: List[SeveritySplitRule] = field(default_factory=list) + language: str = "unknown" + + def arguments(self, parser: ArgumentParser): + parser.add_argument( + "--split-by-path", + action="store_true", + help="Split SARIF by file path patterns" + ) + parser.add_argument( + "--split-by-severity", + action="store_true", + help="Split SARIF by security severity levels" + ) + parser.add_argument( + "--language", + default="unknown", + help="Programming language for category naming (default: unknown)" + ) + parser.add_argument( + "--path-config", + help="JSON configuration file for path-based splitting rules" + ) + parser.add_argument( + "--severity-config", + help="JSON configuration file for severity-based splitting rules" + ) + + def run(self, arguments, **kwargs): + if not arguments.sarif: + self.logger.error("SARIF file path is required") + return + + self.split_by_path = arguments.split_by_path + self.split_by_severity = arguments.split_by_severity + self.language = arguments.language + + if not self.split_by_path and not self.split_by_severity: + self.logger.error("At least one splitting method must be enabled (--split-by-path or --split-by-severity)") + return + + # Load configuration + if arguments.path_config: + self._load_path_config(arguments.path_config) + else: + self._set_default_path_rules() + + if arguments.severity_config: + self._load_severity_config(arguments.severity_config) + else: + self._set_default_severity_rules() + + # Load SARIF files + sarif_files = self.loadSarif(arguments.sarif) + + for sarif_model, sarif_file_path in sarif_files: + self.logger.info(f"Processing SARIF file: {sarif_file_path}") + + if self.split_by_path: + self._split_by_path_patterns(sarif_model, sarif_file_path, arguments) + + if self.split_by_severity: + self._split_by_severity_levels(sarif_model, sarif_file_path, arguments) + + def _set_default_path_rules(self): + """Set default path-based splitting rules""" + self.path_rules = [ + PathSplitRule(name="Tests", patterns=["**/test/**", "**/tests/**", "**/*test*"]), + PathSplitRule(name="App", patterns=["**/web/**", "**/api/**", "**/src/**", "**/app/**"]) + ] + + def _set_default_severity_rules(self): + """Set default severity-based splitting rules""" + self.severity_rules = [ + SeveritySplitRule(name="Critical", severities=["critical"]), + SeveritySplitRule(name="High-Medium", severities=["high", "medium"]), + SeveritySplitRule(name="Others", severities=["*"]) # Catch-all for remaining + ] + + def _load_path_config(self, config_path: str): + """Load path configuration from JSON file""" + import json + try: + with open(config_path, 'r') as f: + config = json.load(f) + self.path_rules = [ + PathSplitRule(name=rule['name'], patterns=rule['patterns']) + for rule in config.get('path_rules', []) + ] + except Exception as e: + self.logger.error(f"Failed to load path config: {e}") + self._set_default_path_rules() + + def _load_severity_config(self, config_path: str): + """Load severity configuration from JSON file""" + import json + try: + with open(config_path, 'r') as f: + config = json.load(f) + self.severity_rules = [ + SeveritySplitRule(name=rule['name'], severities=rule['severities']) + for rule in config.get('severity_rules', []) + ] + except Exception as e: + self.logger.error(f"Failed to load severity config: {e}") + self._set_default_severity_rules() + + def _split_by_path_patterns(self, sarif: SarifModel, sarif_file_path: str, arguments): + """Split SARIF file by file path patterns""" + self.logger.info("Splitting by path patterns") + + # Create category buckets + path_buckets = {} + unmatched_results = [] + + for run in sarif.runs: + tool = run.tool.driver + self.logger.info(f"Processing tool: {tool.name} ({tool.semanticVersion})") + + # Initialize buckets for this run + for rule in self.path_rules: + category_name = f"/language:{self.language}/category:{rule.name}" + if category_name not in path_buckets: + path_buckets[category_name] = { + 'run': copy.deepcopy(run), + 'results': [] + } + path_buckets[category_name]['run'].results = [] + + # Process each result + for result in run.results: + matched = False + + # Get the primary location URI + if result.locations and len(result.locations) > 0: + location = result.locations[0] + if location.physicalLocation and location.physicalLocation.artifactLocation: + uri = location.physicalLocation.artifactLocation.uri + self.logger.debug(f"Checking URI: {uri} for result {result.ruleId}") + + # Check against path rules + for rule in self.path_rules: + for pattern in rule.patterns: + if fnmatch.fnmatch(uri, pattern): + category_name = f"/language:{self.language}/category:{rule.name}" + path_buckets[category_name]['results'].append(result) + matched = True + self.logger.debug(f"Matched {uri} to category {rule.name}") + break + if matched: + break + + if not matched: + unmatched_results.append(result) + + # Handle unmatched results with fallback category + if unmatched_results: + fallback_category = f"/language:{self.language}/filter:none" + path_buckets[fallback_category] = { + 'run': copy.deepcopy(sarif.runs[0]), + 'results': unmatched_results + } + path_buckets[fallback_category]['run'].results = [] + + # Create and export split SARIF files + self._export_split_files(path_buckets, sarif, sarif_file_path, "path", arguments) + + def _split_by_severity_levels(self, sarif: SarifModel, sarif_file_path: str, arguments): + """Split SARIF file by security severity levels""" + self.logger.info("Splitting by severity levels") + + # Build rule severity map + rule_severity_map = {} + for run in sarif.runs: + for rule in run.tool.driver.rules: + if rule.properties: + # Try different ways to access security-severity property + security_severity = None + + # Check if it's a dictionary-like object + if hasattr(rule.properties, '__dict__'): + props = rule.properties.__dict__ + # Try common variations of the property name + security_severity = (props.get('security-severity') or + props.get('security_severity') or + props.get('securitySeverity')) + + # If it's a dataclass with direct attribute access + if not security_severity: + for attr_name in ['security-severity', 'security_severity', 'securitySeverity']: + try: + security_severity = getattr(rule.properties, attr_name, None) + if security_severity: + break + except: + pass + + if security_severity: + try: + # Convert security-severity to severity category + severity_val = float(security_severity) + if severity_val >= 9.0: + severity = "critical" + elif severity_val >= 7.0: + severity = "high" + elif severity_val >= 4.0: + severity = "medium" + else: + severity = "low" + rule_severity_map[rule.id] = severity + self.logger.debug(f"Rule {rule.id} has security-severity {security_severity} -> {severity}") + except ValueError: + self.logger.warning(f"Invalid security-severity value for rule {rule.id}: {security_severity}") + else: + self.logger.debug(f"No security-severity found for rule {rule.id}") + + # Create severity buckets + severity_buckets = {} + unmatched_results = [] + + for run in sarif.runs: + tool = run.tool.driver + self.logger.info(f"Processing tool: {tool.name} ({tool.semanticVersion})") + + # Initialize severity buckets for this run + for rule in self.severity_rules: + if rule.severities != ["*"]: # Skip catch-all initially + category_name = f"/language:{self.language}/severity:{rule.name}" + if category_name not in severity_buckets: + severity_buckets[category_name] = { + 'run': copy.deepcopy(run), + 'results': [] + } + severity_buckets[category_name]['run'].results = [] + + # Process each result + for result in run.results: + matched = False + result_severity = rule_severity_map.get(result.ruleId) + + if result_severity: + # Check against severity rules (excluding catch-all) + for rule in self.severity_rules: + if rule.severities != ["*"] and result_severity in rule.severities: + category_name = f"/language:{self.language}/severity:{rule.name}" + severity_buckets[category_name]['results'].append(result) + matched = True + self.logger.debug(f"Matched {result.ruleId} with severity {result_severity} to {rule.name}") + break + + if not matched: + unmatched_results.append(result) + + # Handle unmatched results with catch-all category + if unmatched_results: + catch_all_rule = next((rule for rule in self.severity_rules if rule.severities == ["*"]), None) + if catch_all_rule: + category_name = f"/language:{self.language}/severity:{catch_all_rule.name}" + severity_buckets[category_name] = { + 'run': copy.deepcopy(sarif.runs[0]), + 'results': unmatched_results + } + severity_buckets[category_name]['run'].results = [] + + # Create and export split SARIF files + self._export_split_files(severity_buckets, sarif, sarif_file_path, "severity", arguments) + + def _export_split_files(self, buckets: Dict[str, Dict], original_sarif: SarifModel, + original_file_path: str, split_type: str, arguments): + """Export split SARIF files with proper runAutomationDetails""" + + base_name, ext = os.path.splitext(original_file_path) + + for category, bucket in buckets.items(): + if not bucket['results']: + self.logger.info(f"Skipping empty category: {category}") + continue + + # Create new SARIF model + new_sarif = copy.deepcopy(original_sarif) + new_sarif.runs = [bucket['run']] + + # Set results for the run + new_sarif.runs[0].results = bucket['results'] + + # Add runAutomationDetails for GitHub Advanced Security + from sariftoolkit.sarif.models import AutomationDetailsModel + + if not new_sarif.runs[0].automationDetails: + new_sarif.runs[0].automationDetails = AutomationDetailsModel(id=category) + else: + new_sarif.runs[0].automationDetails.id = category + + # Generate output filename + safe_category = category.replace("/", "_").replace(":", "-") + output_file = f"{base_name}-{split_type}-{safe_category}{ext}" + + if arguments.output: + if os.path.isdir(arguments.output): + output_file = os.path.join(arguments.output, os.path.basename(output_file)) + else: + output_dir = os.path.dirname(arguments.output) + if output_dir: + output_file = os.path.join(output_dir, os.path.basename(output_file)) + else: + output_file = os.path.join(os.path.dirname(original_file_path), os.path.basename(output_file)) + + # Export the split SARIF file + exportSarif(output_file, new_sarif) + self.logger.info(f"Created split SARIF: {output_file} with {len(bucket['results'])} results for category: {category}") \ No newline at end of file diff --git a/sariftoolkit/sarif/models.py b/sariftoolkit/sarif/models.py index 8cc304f..5b45cef 100644 --- a/sariftoolkit/sarif/models.py +++ b/sariftoolkit/sarif/models.py @@ -287,6 +287,11 @@ def __getattr__(self, name: str): return super().__getattribute__(name) +@dataclass +class AutomationDetailsModel(BaseModel): + id: str = None + + @dataclass class RunsModel(BaseModel): tool: ToolModel = None @@ -295,6 +300,7 @@ class RunsModel(BaseModel): results: List[ResultsModel] = field(default_factory=list) columnKind: str = None properties: PropertiesModel = None + automationDetails: AutomationDetailsModel = None @dataclass From a8849d3e4b876b20bc369bf68a13df3f392a7001 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 25 Sep 2025 21:59:35 +0000 Subject: [PATCH 4/5] Address code review feedback and complete splitter plugin documentation Co-authored-by: felickz <1760475+felickz@users.noreply.github.com> --- README.md | 4 + examples/splitter-configs/path-rules.json | 59 +++++ examples/splitter-configs/severity-rules.json | 20 ++ sariftoolkit/plugins/splitter.py | 13 +- splitter/README.md | 223 ++++++++++++++++++ 5 files changed, 311 insertions(+), 8 deletions(-) create mode 100644 examples/splitter-configs/path-rules.json create mode 100644 examples/splitter-configs/severity-rules.json create mode 100644 splitter/README.md diff --git a/README.md b/README.md index ab263b9..6c99a75 100644 --- a/README.md +++ b/README.md @@ -26,6 +26,10 @@ Patches SARIF result files from the relative working directory path to the Actio This tools allows users to split up SARIF files that use submodules into multiple SARIF files that are then published to there appropriate repository. +### [SARIF Splitter](./splitter/README.md) + +Splits large SARIF files into smaller, categorized files based on file paths or security severity levels. Helps overcome upload size restrictions and improves organization for GitHub Advanced Security dashboards. + ## Support Please create issues for any feature requests, bugs, or documentation problems. diff --git a/examples/splitter-configs/path-rules.json b/examples/splitter-configs/path-rules.json new file mode 100644 index 0000000..892902a --- /dev/null +++ b/examples/splitter-configs/path-rules.json @@ -0,0 +1,59 @@ +{ + "path_rules": [ + { + "name": "Tests", + "patterns": [ + "**/test/**", + "**/tests/**", + "**/*test*", + "**/spec/**", + "**/*spec*" + ] + }, + { + "name": "Frontend", + "patterns": [ + "**/web/**", + "**/frontend/**", + "**/client/**", + "**/ui/**", + "**/*.js", + "**/*.jsx", + "**/*.ts", + "**/*.tsx", + "**/*.vue", + "**/*.html", + "**/*.css" + ] + }, + { + "name": "Backend", + "patterns": [ + "**/api/**", + "**/server/**", + "**/service/**", + "**/services/**", + "**/backend/**", + "**/*.py", + "**/*.java", + "**/*.cs", + "**/*.go" + ] + }, + { + "name": "Infrastructure", + "patterns": [ + "**/terraform/**", + "**/infra/**", + "**/infrastructure/**", + "**/deploy/**", + "**/deployment/**", + "**/*.tf", + "**/*.yaml", + "**/*.yml", + "**/Dockerfile*", + "**/docker-compose*" + ] + } + ] +} \ No newline at end of file diff --git a/examples/splitter-configs/severity-rules.json b/examples/splitter-configs/severity-rules.json new file mode 100644 index 0000000..e2d5879 --- /dev/null +++ b/examples/splitter-configs/severity-rules.json @@ -0,0 +1,20 @@ +{ + "severity_rules": [ + { + "name": "Critical", + "severities": ["critical"] + }, + { + "name": "High", + "severities": ["high"] + }, + { + "name": "Medium", + "severities": ["medium"] + }, + { + "name": "Low-and-Others", + "severities": ["*"] + } + ] +} \ No newline at end of file diff --git a/sariftoolkit/plugins/splitter.py b/sariftoolkit/plugins/splitter.py index e84e901..053bdf1 100644 --- a/sariftoolkit/plugins/splitter.py +++ b/sariftoolkit/plugins/splitter.py @@ -1,13 +1,14 @@ import os import copy import fnmatch +import json from typing import List, Dict, Any from dataclasses import dataclass, field from argparse import ArgumentParser from sariftoolkit.plugin import Plugin from sariftoolkit.sarif.sarif import exportSarif -from sariftoolkit.sarif.models import SarifModel, RunsModel +from sariftoolkit.sarif.models import SarifModel, RunsModel, AutomationDetailsModel @dataclass @@ -112,7 +113,6 @@ def _set_default_severity_rules(self): def _load_path_config(self, config_path: str): """Load path configuration from JSON file""" - import json try: with open(config_path, 'r') as f: config = json.load(f) @@ -120,13 +120,12 @@ def _load_path_config(self, config_path: str): PathSplitRule(name=rule['name'], patterns=rule['patterns']) for rule in config.get('path_rules', []) ] - except Exception as e: + except (FileNotFoundError, json.JSONDecodeError, KeyError) as e: self.logger.error(f"Failed to load path config: {e}") self._set_default_path_rules() def _load_severity_config(self, config_path: str): """Load severity configuration from JSON file""" - import json try: with open(config_path, 'r') as f: config = json.load(f) @@ -134,7 +133,7 @@ def _load_severity_config(self, config_path: str): SeveritySplitRule(name=rule['name'], severities=rule['severities']) for rule in config.get('severity_rules', []) ] - except Exception as e: + except (FileNotFoundError, json.JSONDecodeError, KeyError) as e: self.logger.error(f"Failed to load severity config: {e}") self._set_default_severity_rules() @@ -225,7 +224,7 @@ def _split_by_severity_levels(self, sarif: SarifModel, sarif_file_path: str, arg security_severity = getattr(rule.properties, attr_name, None) if security_severity: break - except: + except AttributeError: pass if security_severity: @@ -317,8 +316,6 @@ def _export_split_files(self, buckets: Dict[str, Dict], original_sarif: SarifMod new_sarif.runs[0].results = bucket['results'] # Add runAutomationDetails for GitHub Advanced Security - from sariftoolkit.sarif.models import AutomationDetailsModel - if not new_sarif.runs[0].automationDetails: new_sarif.runs[0].automationDetails = AutomationDetailsModel(id=category) else: diff --git a/splitter/README.md b/splitter/README.md new file mode 100644 index 0000000..f1df5fe --- /dev/null +++ b/splitter/README.md @@ -0,0 +1,223 @@ +# sarif-toolkit - Splitter + +SARIF File Splitter Plugin for categorizing alerts into separate SARIF files. + +This plugin allows you to split large SARIF files into smaller, categorized files based on file paths or security severity levels. This can help with: +- Overcoming upload size restrictions in GitHub Advanced Security +- Organizing alerts by application areas (tests, frontend, backend, etc.) +- Prioritizing security reviews by severity levels +- Improving dashboard search and filtering capabilities + +## Features + +- **Path-based splitting**: Split alerts by file path patterns using glob matching +- **Severity-based splitting**: Split alerts by security severity levels (critical, high, medium, low) +- **GitHub Advanced Security integration**: Automatically sets `runAutomationDetails.id` for proper categorization +- **No alert loss**: Unmatched alerts are preserved in fallback categories +- **Configurable rules**: Support for JSON configuration files or sensible defaults +- **Flexible naming**: Category names follow GitHub Advanced Security conventions + +## Usage + +### Actions + +This Action needs to be placed in between the point where SARIF file(s) are created and where they are uploaded. + +**Simple Path-based Usage** + +```yaml +# ... SARIF file has been created +- uses: advanced-security/sarif-toolkit/splitter@main + with: + sarif: 'results.sarif' + split-by-path: true + language: 'javascript' +# ... Split SARIF files are being uploaded +``` + +**Simple Severity-based Usage** + +```yaml +# ... SARIF file has been created +- uses: advanced-security/sarif-toolkit/splitter@main + with: + sarif: 'results.sarif' + split-by-severity: true + language: 'python' +# ... Split SARIF files are being uploaded +``` + +**Advanced Configuration** + +```yaml +- uses: advanced-security/sarif-toolkit/splitter@main + with: + # SARIF File / Directory location + sarif: 'sarif-output.json' + # Output directory for split files + output: './split-results' + # Split by file paths + split-by-path: true + # Split by security severity + split-by-severity: true + # Programming language for categories + language: 'java' + # Custom path configuration + path-config: 'path-rules.json' + # Custom severity configuration + severity-config: 'severity-rules.json' +``` + +### Command Line + +**Basic usage:** + +```bash +# Split by file paths +python -m sariftoolkit --enable-splitter --split-by-path --language python --sarif results.sarif + +# Split by severity levels +python -m sariftoolkit --enable-splitter --split-by-severity --language javascript --sarif results.sarif + +# Split by both methods +python -m sariftoolkit --enable-splitter --split-by-path --split-by-severity --language java --sarif results.sarif --output ./split-output +``` + +**With custom configuration:** + +```bash +python -m sariftoolkit --enable-splitter \ + --split-by-path \ + --split-by-severity \ + --language csharp \ + --sarif results.sarif \ + --path-config examples/splitter-configs/path-rules.json \ + --severity-config examples/splitter-configs/severity-rules.json \ + --output ./categorized-results +``` + +## Configuration + +### Path Rules Configuration + +Path rules use glob patterns to match file paths. Example `path-rules.json`: + +```json +{ + "path_rules": [ + { + "name": "Tests", + "patterns": [ + "**/test/**", + "**/tests/**", + "**/*test*" + ] + }, + { + "name": "Frontend", + "patterns": [ + "**/web/**", + "**/client/**", + "**/*.js", + "**/*.jsx" + ] + }, + { + "name": "Backend", + "patterns": [ + "**/api/**", + "**/server/**", + "**/*.py", + "**/*.java" + ] + } + ] +} +``` + +### Severity Rules Configuration + +Severity rules group security severity levels. Example `severity-rules.json`: + +```json +{ + "severity_rules": [ + { + "name": "Critical", + "severities": ["critical"] + }, + { + "name": "High-Medium", + "severities": ["high", "medium"] + }, + { + "name": "Others", + "severities": ["*"] + } + ] +} +``` + +### Default Rules + +**Default Path Rules:** +- `Tests`: `**/test/**`, `**/tests/**`, `**/*test*` +- `App`: `**/web/**`, `**/api/**`, `**/src/**`, `**/app/**` + +**Default Severity Rules:** +- `Critical`: security-severity ≥ 9.0 +- `High-Medium`: security-severity 4.0-8.9 +- `Others`: All remaining (catch-all) + +## Output Categories + +### Path-based Categories +- `/language:/category:` - For matched path patterns +- `/language:/filter:none` - For unmatched files + +### Severity-based Categories +- `/language:/severity:` - For matched severity levels +- `/language:/severity:Others` - For unmatched severities + +These category names are compatible with GitHub Advanced Security's `runAutomationDetails.id` field for proper dashboard organization. + +## Example Use Cases + +### Large Repository Splitting +```bash +# Split a large SARIF file by application areas +python -m sariftoolkit --enable-splitter \ + --split-by-path \ + --language typescript \ + --sarif large-codeql-results.sarif \ + --output ./team-specific-results +``` + +### Security Triage Workflow +```bash +# Split by severity for security team triage +python -m sariftoolkit --enable-splitter \ + --split-by-severity \ + --language csharp \ + --sarif security-scan.sarif \ + --output ./security-triage +``` + +### Multi-dimensional Splitting +```bash +# Create both path and severity splits +python -m sariftoolkit --enable-splitter \ + --split-by-path \ + --split-by-severity \ + --language python \ + --sarif comprehensive-scan.sarif \ + --output ./organized-results +``` + +## Notes + +- The splitter preserves all SARIF metadata including tool information, rules, and notifications +- No alerts are ever dropped - unmatched alerts go to fallback categories +- Multiple splitting methods can be used simultaneously +- Output filenames follow the pattern: `{original-name}-{method}-{category}.sarif` +- Categories are URL-safe and compatible with GitHub Advanced Security requirements \ No newline at end of file From 128847bfeadbd3d4d494c825ec5959fa014dc188 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 25 Sep 2025 22:18:04 +0000 Subject: [PATCH 5/5] Implement requested feedback: restrict to single splitting method, add unit tests, and summary output Co-authored-by: felickz <1760475+felickz@users.noreply.github.com> --- examples/comprehensive-test.sarif | 268 ++++++++++++++++++++++++++++ sariftoolkit/plugins/splitter.py | 82 ++++++++- tests/__init__.py | 1 + tests/test_splitter.py | 281 ++++++++++++++++++++++++++++++ 4 files changed, 625 insertions(+), 7 deletions(-) create mode 100644 examples/comprehensive-test.sarif create mode 100644 tests/__init__.py create mode 100644 tests/test_splitter.py diff --git a/examples/comprehensive-test.sarif b/examples/comprehensive-test.sarif new file mode 100644 index 0000000..5d4336c --- /dev/null +++ b/examples/comprehensive-test.sarif @@ -0,0 +1,268 @@ +{ + "$schema": "https://raw.githubusercontent.com/oasis-tcs/sarif-spec/master/Schemata/sarif-schema-2.1.0.json", + "version": "2.1.0", + "runs": [ + { + "tool": { + "driver": { + "name": "TestTool", + "semanticVersion": "1.0.0", + "rules": [ + { + "id": "critical-rule-1", + "name": "Critical Security Issue 1", + "properties": { + "security-severity": "9.8" + } + }, + { + "id": "critical-rule-2", + "name": "Critical Security Issue 2", + "properties": { + "security-severity": "9.2" + } + }, + { + "id": "high-rule-1", + "name": "High Security Issue 1", + "properties": { + "security-severity": "8.5" + } + }, + { + "id": "high-rule-2", + "name": "High Security Issue 2", + "properties": { + "security-severity": "7.8" + } + }, + { + "id": "high-rule-3", + "name": "High Security Issue 3", + "properties": { + "security-severity": "7.0" + } + }, + { + "id": "medium-rule-1", + "name": "Medium Security Issue 1", + "properties": { + "security-severity": "6.5" + } + }, + { + "id": "medium-rule-2", + "name": "Medium Security Issue 2", + "properties": { + "security-severity": "5.0" + } + }, + { + "id": "medium-rule-3", + "name": "Medium Security Issue 3", + "properties": { + "security-severity": "4.2" + } + }, + { + "id": "low-rule-1", + "name": "Low Security Issue 1", + "properties": { + "security-severity": "3.5" + } + }, + { + "id": "low-rule-2", + "name": "Low Security Issue 2", + "properties": { + "security-severity": "2.1" + } + } + ] + } + }, + "results": [ + { + "ruleId": "critical-rule-1", + "message": { + "text": "Critical security vulnerability found in authentication module" + }, + "locations": [ + { + "physicalLocation": { + "artifactLocation": { + "uri": "src/auth/login.py" + } + } + } + ] + }, + { + "ruleId": "critical-rule-1", + "message": { + "text": "Critical security vulnerability found in password handling" + }, + "locations": [ + { + "physicalLocation": { + "artifactLocation": { + "uri": "src/auth/password.py" + } + } + } + ] + }, + { + "ruleId": "critical-rule-2", + "message": { + "text": "Critical injection vulnerability" + }, + "locations": [ + { + "physicalLocation": { + "artifactLocation": { + "uri": "src/database/query.py" + } + } + } + ] + }, + { + "ruleId": "high-rule-1", + "message": { + "text": "High severity security issue in API" + }, + "locations": [ + { + "physicalLocation": { + "artifactLocation": { + "uri": "src/api/endpoints.py" + } + } + } + ] + }, + { + "ruleId": "high-rule-2", + "message": { + "text": "High severity validation issue" + }, + "locations": [ + { + "physicalLocation": { + "artifactLocation": { + "uri": "src/validation/input.py" + } + } + } + ] + }, + { + "ruleId": "high-rule-2", + "message": { + "text": "High severity validation issue in forms" + }, + "locations": [ + { + "physicalLocation": { + "artifactLocation": { + "uri": "src/forms/validator.py" + } + } + } + ] + }, + { + "ruleId": "high-rule-3", + "message": { + "text": "High severity crypto issue" + }, + "locations": [ + { + "physicalLocation": { + "artifactLocation": { + "uri": "src/crypto/encryption.py" + } + } + } + ] + }, + { + "ruleId": "medium-rule-1", + "message": { + "text": "Medium severity logging issue" + }, + "locations": [ + { + "physicalLocation": { + "artifactLocation": { + "uri": "src/logging/logger.py" + } + } + } + ] + }, + { + "ruleId": "medium-rule-2", + "message": { + "text": "Medium severity configuration issue" + }, + "locations": [ + { + "physicalLocation": { + "artifactLocation": { + "uri": "src/config/settings.py" + } + } + } + ] + }, + { + "ruleId": "medium-rule-3", + "message": { + "text": "Medium severity file handling issue" + }, + "locations": [ + { + "physicalLocation": { + "artifactLocation": { + "uri": "src/utils/files.py" + } + } + } + ] + }, + { + "ruleId": "low-rule-1", + "message": { + "text": "Low severity code quality issue" + }, + "locations": [ + { + "physicalLocation": { + "artifactLocation": { + "uri": "src/helpers/utils.py" + } + } + } + ] + }, + { + "ruleId": "low-rule-2", + "message": { + "text": "Low severity documentation issue" + }, + "locations": [ + { + "physicalLocation": { + "artifactLocation": { + "uri": "src/docs/readme.py" + } + } + } + ] + } + ] + } + ] +} \ No newline at end of file diff --git a/sariftoolkit/plugins/splitter.py b/sariftoolkit/plugins/splitter.py index 053bdf1..2423b22 100644 --- a/sariftoolkit/plugins/splitter.py +++ b/sariftoolkit/plugins/splitter.py @@ -73,6 +73,10 @@ def run(self, arguments, **kwargs): self.logger.error("At least one splitting method must be enabled (--split-by-path or --split-by-severity)") return + if self.split_by_path and self.split_by_severity: + self.logger.error("Only one splitting method can be enabled at a time. Choose either --split-by-path or --split-by-severity") + return + # Load configuration if arguments.path_config: self._load_path_config(arguments.path_config) @@ -87,14 +91,29 @@ def run(self, arguments, **kwargs): # Load SARIF files sarif_files = self.loadSarif(arguments.sarif) + # Summary tracking + summary_data = [] + for sarif_model, sarif_file_path in sarif_files: self.logger.info(f"Processing SARIF file: {sarif_file_path}") + # Count original alerts + original_alert_count = sum(len(run.results) for run in sarif_model.runs) + if self.split_by_path: - self._split_by_path_patterns(sarif_model, sarif_file_path, arguments) + split_results = self._split_by_path_patterns(sarif_model, sarif_file_path, arguments) + elif self.split_by_severity: + split_results = self._split_by_severity_levels(sarif_model, sarif_file_path, arguments) - if self.split_by_severity: - self._split_by_severity_levels(sarif_model, sarif_file_path, arguments) + # Add to summary + summary_data.append({ + 'original_file': sarif_file_path, + 'original_alerts': original_alert_count, + 'split_results': split_results + }) + + # Print summary table + self._print_summary_table(summary_data) def _set_default_path_rules(self): """Set default path-based splitting rules""" @@ -107,7 +126,9 @@ def _set_default_severity_rules(self): """Set default severity-based splitting rules""" self.severity_rules = [ SeveritySplitRule(name="Critical", severities=["critical"]), - SeveritySplitRule(name="High-Medium", severities=["high", "medium"]), + SeveritySplitRule(name="High", severities=["high"]), + SeveritySplitRule(name="Medium", severities=["medium"]), + SeveritySplitRule(name="Low", severities=["low"]), SeveritySplitRule(name="Others", severities=["*"]) # Catch-all for remaining ] @@ -195,7 +216,7 @@ def _split_by_path_patterns(self, sarif: SarifModel, sarif_file_path: str, argum path_buckets[fallback_category]['run'].results = [] # Create and export split SARIF files - self._export_split_files(path_buckets, sarif, sarif_file_path, "path", arguments) + return self._export_split_files(path_buckets, sarif, sarif_file_path, "path", arguments) def _split_by_severity_levels(self, sarif: SarifModel, sarif_file_path: str, arguments): """Split SARIF file by security severity levels""" @@ -295,13 +316,14 @@ def _split_by_severity_levels(self, sarif: SarifModel, sarif_file_path: str, arg severity_buckets[category_name]['run'].results = [] # Create and export split SARIF files - self._export_split_files(severity_buckets, sarif, sarif_file_path, "severity", arguments) + return self._export_split_files(severity_buckets, sarif, sarif_file_path, "severity", arguments) def _export_split_files(self, buckets: Dict[str, Dict], original_sarif: SarifModel, original_file_path: str, split_type: str, arguments): """Export split SARIF files with proper runAutomationDetails""" base_name, ext = os.path.splitext(original_file_path) + split_results = [] for category, bucket in buckets.items(): if not bucket['results']: @@ -337,4 +359,50 @@ def _export_split_files(self, buckets: Dict[str, Dict], original_sarif: SarifMod # Export the split SARIF file exportSarif(output_file, new_sarif) - self.logger.info(f"Created split SARIF: {output_file} with {len(bucket['results'])} results for category: {category}") \ No newline at end of file + self.logger.info(f"Created split SARIF: {output_file} with {len(bucket['results'])} results for category: {category}") + + # Track for summary + split_results.append({ + 'file': os.path.basename(output_file), + 'alerts': len(bucket['results']), + 'category': category + }) + + return split_results + + def _print_summary_table(self, summary_data): + """Print a summary table of before/after view""" + if not summary_data: + return + + print("\n" + "="*80) + print("SARIF SPLITTING SUMMARY") + print("="*80) + print(f"{'Original SARIF File':<30} {'Original Alerts':<15} {'Split File':<35} {'Category':<30} {'Alerts':<10}") + print("-"*80) + + for data in summary_data: + original_file = os.path.basename(data['original_file']) + original_alerts = data['original_alerts'] + + if data['split_results']: + # Print first split result on same line as original + first_result = data['split_results'][0] + print(f"{original_file:<30} {original_alerts:<15} {first_result['file']:<35} {first_result['category']:<30} {first_result['alerts']:<10}") + + # Print remaining split results on separate lines + for result in data['split_results'][1:]: + print(f"{'':^30} {'':^15} {result['file']:<35} {result['category']:<30} {result['alerts']:<10}") + else: + print(f"{original_file:<30} {original_alerts:<15} {'No splits created':<35} {'N/A':<30} {'0':<10}") + + print("-"*80) + + # Summary totals + total_original = sum(data['original_alerts'] for data in summary_data) + total_split_files = sum(len(data['split_results']) for data in summary_data) + total_split_alerts = sum(sum(result['alerts'] for result in data['split_results']) for data in summary_data) + + print(f"TOTALS: {len(summary_data)} original file(s), {total_original} original alerts") + print(f" {total_split_files} split file(s) created, {total_split_alerts} total alerts distributed") + print("="*80 + "\n") \ No newline at end of file diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..1128409 --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1 @@ +# Tests for sarif-toolkit \ No newline at end of file diff --git a/tests/test_splitter.py b/tests/test_splitter.py new file mode 100644 index 0000000..32fb187 --- /dev/null +++ b/tests/test_splitter.py @@ -0,0 +1,281 @@ +import os +import json +import tempfile +import shutil +import unittest +from unittest.mock import MagicMock + +from sariftoolkit.plugins.splitter import Splitter +from sariftoolkit.sarif.models import ( + SarifModel, RunsModel, ToolModel, DriverModel, RulesModel, + ResultsModel, MessageModel, LocationsModel, PhysicallocationModel, + ArtifactlocationModel, PropertiesModel, AutomationDetailsModel +) + + +class TestSplitter(unittest.TestCase): + def setUp(self): + """Set up test fixtures before each test method.""" + self.test_dir = tempfile.mkdtemp() + self.splitter = Splitter() + + # Mock logger to avoid log output during tests + self.splitter.logger = MagicMock() + + def tearDown(self): + """Clean up after each test method.""" + shutil.rmtree(self.test_dir, ignore_errors=True) + + def _create_test_sarif(self, results_data): + """Create a test SARIF model with specified results data.""" + # Create rules with different security severities + rules = [] + for i, (rule_id, security_severity) in enumerate([ + ("critical-rule", "9.5"), + ("high-rule", "7.8"), + ("medium-rule", "5.2"), + ("low-rule", "2.1") + ]): + # Create properties with security_severity manually + props = PropertiesModel() + props.security_severity = security_severity + + rule = RulesModel( + id=rule_id, + properties=props + ) + rules.append(rule) + + # Create tool and driver + driver = DriverModel( + name="TestTool", + semanticVersion="1.0.0", + rules=rules + ) + tool = ToolModel(driver=driver) + + # Create results + results = [] + for rule_id, file_path in results_data: + result = ResultsModel( + ruleId=rule_id, + message=MessageModel(text=f"Test message for {rule_id}"), + locations=[LocationsModel( + physicalLocation=PhysicallocationModel( + artifactLocation=ArtifactlocationModel(uri=file_path) + ) + )] + ) + results.append(result) + + # Create run and SARIF + run = RunsModel(tool=tool, results=results) + sarif = SarifModel(runs=[run], version="2.1.0") + + return sarif + + def test_severity_splitting_creates_correct_categories(self): + """Test that severity-based splitting creates the expected categories.""" + # Create test SARIF with results for each severity level + results_data = [ + ("critical-rule", "src/critical.py"), + ("critical-rule", "src/critical2.py"), + ("high-rule", "src/high.py"), + ("high-rule", "src/high2.py"), + ("high-rule", "src/high3.py"), + ("medium-rule", "src/medium.py"), + ("low-rule", "src/low.py") + ] + + sarif = self._create_test_sarif(results_data) + + # Create arguments mock + args = MagicMock() + args.output = self.test_dir + + # Set up splitter for severity splitting + self.splitter.split_by_severity = True + self.splitter.language = "python" + self.splitter._set_default_severity_rules() + + # Run severity splitting + split_results = self.splitter._split_by_severity_levels(sarif, "test.sarif", args) + + # Verify the correct number of categories were created + self.assertEqual(len(split_results), 4) # Critical, High, Medium, Low + + # Verify each category has the correct number of alerts + category_counts = {result['category'].split('/')[-1]: result['alerts'] for result in split_results} + + self.assertEqual(category_counts.get('severity:Critical', 0), 2) + self.assertEqual(category_counts.get('severity:High', 0), 3) + self.assertEqual(category_counts.get('severity:Medium', 0), 1) + self.assertEqual(category_counts.get('severity:Low', 0), 1) + + # Verify automation details are set correctly + for result in split_results: + self.assertTrue(result['category'].startswith('/language:python/severity:')) + + def test_path_splitting_creates_correct_categories(self): + """Test that path-based splitting creates the expected categories.""" + # Create test SARIF with results for different path patterns + results_data = [ + ("test-rule", "tests/unit/test_file.py"), + ("test-rule", "test/integration/test_api.py"), + ("app-rule", "src/main.py"), + ("app-rule", "api/handler.py"), + ("other-rule", "config/settings.py") + ] + + sarif = self._create_test_sarif(results_data) + + # Create arguments mock + args = MagicMock() + args.output = self.test_dir + + # Set up splitter for path splitting + self.splitter.split_by_path = True + self.splitter.language = "python" + self.splitter._set_default_path_rules() + + # Run path splitting + split_results = self.splitter._split_by_path_patterns(sarif, "test.sarif", args) + + # Verify categories were created + self.assertGreater(len(split_results), 0) + + # Check that test files are categorized correctly + test_category_found = any('category:Tests' in result['category'] for result in split_results) + app_category_found = any('category:App' in result['category'] for result in split_results) + + self.assertTrue(test_category_found or app_category_found) + + def test_single_splitting_method_restriction(self): + """Test that only one splitting method can be enabled at a time.""" + # Create test arguments + args = MagicMock() + args.sarif = "test.sarif" + args.output = self.test_dir + args.path_config = None + args.severity_config = None + + # Set both splitting methods to True + self.splitter.split_by_path = True + self.splitter.split_by_severity = True + + # Mock loadSarif to avoid file loading + self.splitter.loadSarif = MagicMock(return_value=[]) + + # Run should exit early with error log + self.splitter.run(args) + + # Verify error was logged + self.splitter.logger.error.assert_called_with( + "Only one splitting method can be enabled at a time. Choose either --split-by-path or --split-by-severity" + ) + + def test_no_alerts_are_dropped(self): + """Test that all alerts are preserved during splitting.""" + # Create test SARIF with mixed results + results_data = [ + ("critical-rule", "src/critical.py"), + ("high-rule", "src/high.py"), + ("unknown-rule", "src/unknown.py") # This will not match any severity mapping + ] + + sarif = self._create_test_sarif(results_data) + + # Add an additional result with no rule mapping + unknown_result = ResultsModel( + ruleId="unmapped-rule", + message=MessageModel(text="Test message for unmapped rule"), + locations=[LocationsModel( + physicalLocation=PhysicallocationModel( + artifactLocation=ArtifactlocationModel(uri="src/unmapped.py") + ) + )] + ) + sarif.runs[0].results.append(unknown_result) + + # Create arguments mock + args = MagicMock() + args.output = self.test_dir + + # Set up splitter for severity splitting + self.splitter.split_by_severity = True + self.splitter.language = "python" + self.splitter._set_default_severity_rules() + + # Count original alerts + original_count = len(sarif.runs[0].results) + + # Run severity splitting + split_results = self.splitter._split_by_severity_levels(sarif, "test.sarif", args) + + # Count total split alerts + total_split_alerts = sum(result['alerts'] for result in split_results) + + # Verify no alerts were dropped + self.assertEqual(original_count, total_split_alerts) + + def test_automation_details_are_set_correctly(self): + """Test that runAutomationDetails are set correctly in output files.""" + # Create simple test SARIF + results_data = [("high-rule", "src/test.py")] + sarif = self._create_test_sarif(results_data) + + # Create arguments mock + args = MagicMock() + args.output = self.test_dir + + # Set up splitter + self.splitter.split_by_severity = True + self.splitter.language = "java" # Use different language to test + self.splitter._set_default_severity_rules() + + # Run splitting + split_results = self.splitter._split_by_severity_levels(sarif, "test.sarif", args) + + # Verify automation details format + self.assertEqual(len(split_results), 1) + expected_category = "/language:java/severity:High" + self.assertEqual(split_results[0]['category'], expected_category) + + def test_summary_table_output(self): + """Test that summary table contains correct information.""" + # Create test data + summary_data = [{ + 'original_file': '/path/to/test.sarif', + 'original_alerts': 5, + 'split_results': [ + {'file': 'test-critical.sarif', 'alerts': 2, 'category': '/language:python/severity:Critical'}, + {'file': 'test-high.sarif', 'alerts': 3, 'category': '/language:python/severity:High'} + ] + }] + + # Capture print output + import io + import sys + captured_output = io.StringIO() + sys.stdout = captured_output + + try: + self.splitter._print_summary_table(summary_data) + output = captured_output.getvalue() + + # Verify summary table content + self.assertIn("SARIF SPLITTING SUMMARY", output) + self.assertIn("test.sarif", output) + self.assertIn("5", output) # Original alerts count + self.assertIn("test-critical.sarif", output) + self.assertIn("test-high.sarif", output) + self.assertIn("/language:python/severity:Critical", output) + self.assertIn("/language:python/severity:High", output) + self.assertIn("TOTALS:", output) + + finally: + sys.stdout = sys.__stdout__ + + +if __name__ == '__main__': + unittest.main() \ No newline at end of file