From e8723881311b743f1f8f7ab15e32d2c57616d189 Mon Sep 17 00:00:00 2001 From: Weihang Lo Date: Thu, 9 Apr 2026 14:19:28 -0400 Subject: [PATCH 1/4] test(patch): show trailing garbage fails patch parsing --- src/patch/tests.rs | 144 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 144 insertions(+) diff --git a/src/patch/tests.rs b/src/patch/tests.rs index 235040c2..01734749 100644 --- a/src/patch/tests.rs +++ b/src/patch/tests.rs @@ -1,6 +1,150 @@ use super::error::ParsePatchErrorKind; use super::parse::{parse, parse_bytes}; +#[test] +fn trailing_garbage_after_complete_hunk() { + let s = "\ +--- a/file.txt ++++ b/file.txt +@@ -1 +1 @@ +-old line ++new line +this is trailing garbage +that should be ignored +"; + assert_eq!( + parse(s).unwrap_err().kind, + ParsePatchErrorKind::UnexpectedHunkLine, + ); +} + +#[test] +fn garbage_before_hunk_complete_fails() { + // If hunk line count isn't satisfied, garbage causes error + let s = "\ +--- a/file.txt ++++ b/file.txt +@@ -1,3 +1,3 @@ +-line 1 ++LINE 1 +garbage before hunk complete + line 3 +"; + assert_eq!( + parse(s).unwrap_err().kind, + ParsePatchErrorKind::UnexpectedHunkLine, + ); +} + +#[test] +fn git_headers_after_hunk_ignored() { + // Git extended headers appearing after a complete hunk should be ignored + let s = "\ +--- a/file.txt ++++ b/file.txt +@@ -1 +1 @@ +-old ++new +diff --git a/other.txt b/other.txt +index 1234567..89abcdef 100644 +"; + assert_eq!( + parse(s).unwrap_err().kind, + ParsePatchErrorKind::UnexpectedHunkLine, + ); +} + +/// When splitting multi-patch input by `---/+++` boundaries, trailing +/// `diff --git` lines from the next patch may linger. If the last hunk +/// ends with `\ No newline at end of file`, the parser should still +/// recognize the hunk as complete and ignore the trailing content, +/// as GNU patch does. +/// +/// Pattern first appeared in rust-lang/cargo@b119b891df93f128abef634215cd8f967c3cd120 +/// where HTML files lost their trailing newlines. +#[test] +fn no_newline_at_eof_followed_by_trailing_garbage() { + let s = "\ +--- a/file.html ++++ b/file.html +@@ -1,3 +1,3 @@ +
+-

old

++

new

+
+\\ No newline at end of file +diff --git a/other.html b/other.html +index 1234567..89abcdef 100644 +"; + // no_newline_context is set, so next non-@@ line → ExpectedEndOfHunk + assert_eq!( + parse(s).unwrap_err().kind, + ParsePatchErrorKind::ExpectedEndOfHunk, + ); +} + +#[test] +fn multi_hunk_with_trailing_garbage() { + let s = "\ +--- a/file.txt ++++ b/file.txt +@@ -1 +1 @@ +-a ++A +@@ -5 +5 @@ +-b ++B +some trailing garbage +"; + assert_eq!( + parse(s).unwrap_err().kind, + ParsePatchErrorKind::UnexpectedHunkLine, + ); +} + +#[test] +fn garbage_between_hunks_stops_parsing() { + // GNU patch would try to parse the second @@ as a new patch + // and fail because there's no `---` header. + // + // diffy `Patch` is a single patch parser, so should just ignore everything + // after the first complete hunk when garbage is encountered. + let s = "\ +--- a/file.txt ++++ b/file.txt +@@ -1 +1 @@ +-a ++A +not a hunk line +@@ -5 +5 @@ +-b ++B +"; + assert_eq!( + parse(s).unwrap_err().kind, + ParsePatchErrorKind::UnexpectedHunkLine, + ); +} + +#[test] +fn context_lines_counted_correctly() { + let s = "\ +--- a/file.txt ++++ b/file.txt +@@ -1,4 +1,4 @@ + context 1 +-deleted ++inserted + context 2 + context 3 +trailing garbage +"; + assert_eq!( + parse(s).unwrap_err().kind, + ParsePatchErrorKind::UnexpectedHunkLine, + ); +} + #[test] fn test_escaped_filenames() { // No escaped characters From c83b3d2782cbd564deabad826c4b52847b1c7dd9 Mon Sep 17 00:00:00 2001 From: Weihang Lo Date: Thu, 9 Apr 2026 14:22:54 -0400 Subject: [PATCH 2/4] fix(patch)!: stop parsing at garbage after hunk satisfied After this, we stop parsing at trailing garbage after hunk is satisfied. Hunk is satisfied when line counts from header are satisfied. - `hunk_lines()` now tracks old/new line counts during parsing - Stops at non-hunk line when counts satisfied - Errors if non-hunk line before counts satisfied - Handles trailing garbage after `\ No newline at end of file` marker (pattern first appeared in rust-lang/cargo@b119b891d) This is a preparation for multi-patch parsing where splitting by `---/+++` boundaries may leave trailing `diff --git` lines from the next patch. While this is a breaking change in behavior, it matches GNU patch behavior. --- src/patch/parse.rs | 67 ++++++++++++++++++++++++++++++++++++++-------- src/patch/tests.rs | 44 +++++++++++++----------------- 2 files changed, 75 insertions(+), 36 deletions(-) diff --git a/src/patch/parse.rs b/src/patch/parse.rs index 1176222b..54e8efe3 100644 --- a/src/patch/parse.rs +++ b/src/patch/parse.rs @@ -156,7 +156,10 @@ fn verify_hunks_in_order(hunks: &[Hunk<'_, T>]) -> bool { fn hunks<'a, T: Text + ?Sized>(parser: &mut Parser<'a, T>) -> Result>> { let mut hunks = Vec::new(); - while parser.peek().is_some() { + // Following GNU patch behavior: stop at non-@@ content. + // Any trailing content (including hidden @@ headers) is silently ignored. + // This is more permissive than git apply, which errors on junk between hunks. + while parser.peek().is_some_and(|line| line.starts_with("@@ ")) { hunks.push(hunk(parser)?); } @@ -173,13 +176,7 @@ fn hunk<'a, T: Text + ?Sized>(parser: &mut Parser<'a, T>) -> Result> let header_line = parser.next()?; let (range1, range2, function_context) = hunk_header(header_line).map_err(|e| parser.error_at(e.kind, hunk_start))?; - let lines = hunk_lines(parser)?; - - // check counts of lines to see if they match the ranges in the hunk header - let (len1, len2) = super::hunk_lines_count(&lines); - if len1 != range1.len || len2 != range2.len { - return Err(parser.error_at(ParsePatchErrorKind::HunkMismatch, hunk_start)); - } + let lines = hunk_lines(parser, range1.len, range2.len, hunk_start)?; Ok(Hunk::new(range1, range2, function_context, lines)) } @@ -223,36 +220,61 @@ fn range(s: &T) -> Result { Ok(HunkRange::new(start, len)) } -fn hunk_lines<'a, T: Text + ?Sized>(parser: &mut Parser<'a, T>) -> Result>> { +fn hunk_lines<'a, T: Text + ?Sized>( + parser: &mut Parser<'a, T>, + expected_old: usize, + expected_new: usize, + hunk_start: usize, +) -> Result>> { let mut lines: Vec> = Vec::new(); let mut no_newline_context = false; let mut no_newline_delete = false; let mut no_newline_insert = false; + let mut old_count = 0; + let mut new_count = 0; + while let Some(line) = parser.peek() { + let hunk_complete = old_count >= expected_old && new_count >= expected_new; + let line = if line.starts_with("@") { break; } else if no_newline_context { + if hunk_complete { + break; + } return Err(parser.error(ParsePatchErrorKind::ExpectedEndOfHunk)); } else if let Some(line) = line.strip_prefix(" ") { + if hunk_complete { + break; + } Line::Context(line) } else if line.starts_with("\n") { + if hunk_complete { + break; + } Line::Context(*line) } else if let Some(line) = line.strip_prefix("-") { if no_newline_delete { return Err(parser.error(ParsePatchErrorKind::TooManyDeletedLines)); } + if hunk_complete { + break; + } Line::Delete(line) } else if let Some(line) = line.strip_prefix("+") { if no_newline_insert { return Err(parser.error(ParsePatchErrorKind::TooManyInsertedLines)); } + if hunk_complete { + break; + } Line::Insert(line) } else if line.starts_with(NO_NEWLINE_AT_EOF) { let last_line = lines .pop() .ok_or_else(|| parser.error(ParsePatchErrorKind::UnexpectedNoNewlineMarker))?; - match last_line { + let modified = match last_line { Line::Context(line) => { no_newline_context = true; Line::Context(strip_newline(line)?) @@ -265,15 +287,38 @@ fn hunk_lines<'a, T: Text + ?Sized>(parser: &mut Parser<'a, T>) -> Result { + old_count += 1; + new_count += 1; + } + Line::Delete(_) => { + old_count += 1; + } + Line::Insert(_) => { + new_count += 1; + } + } + lines.push(line); parser.next()?; } + if old_count != expected_old || new_count != expected_new { + return Err(parser.error_at(ParsePatchErrorKind::HunkMismatch, hunk_start)); + } + Ok(lines) } diff --git a/src/patch/tests.rs b/src/patch/tests.rs index 01734749..1d8fbfeb 100644 --- a/src/patch/tests.rs +++ b/src/patch/tests.rs @@ -12,10 +12,10 @@ fn trailing_garbage_after_complete_hunk() { this is trailing garbage that should be ignored "; - assert_eq!( - parse(s).unwrap_err().kind, - ParsePatchErrorKind::UnexpectedHunkLine, - ); + let patch = parse(s).unwrap(); + assert_eq!(patch.hunks().len(), 1); + assert_eq!(patch.hunks()[0].old_range().len(), 1); + assert_eq!(patch.hunks()[0].new_range().len(), 1); } #[test] @@ -48,10 +48,8 @@ fn git_headers_after_hunk_ignored() { diff --git a/other.txt b/other.txt index 1234567..89abcdef 100644 "; - assert_eq!( - parse(s).unwrap_err().kind, - ParsePatchErrorKind::UnexpectedHunkLine, - ); + let patch = parse(s).unwrap(); + assert_eq!(patch.hunks().len(), 1); } /// When splitting multi-patch input by `---/+++` boundaries, trailing @@ -76,11 +74,10 @@ fn no_newline_at_eof_followed_by_trailing_garbage() { diff --git a/other.html b/other.html index 1234567..89abcdef 100644 "; - // no_newline_context is set, so next non-@@ line → ExpectedEndOfHunk - assert_eq!( - parse(s).unwrap_err().kind, - ParsePatchErrorKind::ExpectedEndOfHunk, - ); + let patch = parse(s).unwrap(); + assert_eq!(patch.hunks().len(), 1); + assert_eq!(patch.hunks()[0].old_range().len(), 3); + assert_eq!(patch.hunks()[0].new_range().len(), 3); } #[test] @@ -96,10 +93,8 @@ fn multi_hunk_with_trailing_garbage() { +B some trailing garbage "; - assert_eq!( - parse(s).unwrap_err().kind, - ParsePatchErrorKind::UnexpectedHunkLine, - ); + let patch = parse(s).unwrap(); + assert_eq!(patch.hunks().len(), 2); } #[test] @@ -120,10 +115,9 @@ not a hunk line -b +B "; - assert_eq!( - parse(s).unwrap_err().kind, - ParsePatchErrorKind::UnexpectedHunkLine, - ); + let patch = parse(s).unwrap(); + // Only first hunk is parsed; second @@ is ignored as garbage + assert_eq!(patch.hunks().len(), 1); } #[test] @@ -139,10 +133,10 @@ fn context_lines_counted_correctly() { context 3 trailing garbage "; - assert_eq!( - parse(s).unwrap_err().kind, - ParsePatchErrorKind::UnexpectedHunkLine, - ); + let patch = parse(s).unwrap(); + assert_eq!(patch.hunks().len(), 1); + assert_eq!(patch.hunks()[0].old_range().len(), 4); + assert_eq!(patch.hunks()[0].new_range().len(), 4); } #[test] From 30352b0f6b652e141de9b996ad04c3f2ee1d4a15 Mon Sep 17 00:00:00 2001 From: Weihang Lo Date: Thu, 9 Apr 2026 17:14:00 -0400 Subject: [PATCH 3/4] test(patch): strict mode rejects orphaned hunk headers Tests document the currenet behavior (git-apply incompatible). This will be fixed in the next commit. --- src/patch/tests.rs | 103 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 103 insertions(+) diff --git a/src/patch/tests.rs b/src/patch/tests.rs index 1d8fbfeb..40807775 100644 --- a/src/patch/tests.rs +++ b/src/patch/tests.rs @@ -139,6 +139,109 @@ trailing garbage assert_eq!(patch.hunks()[0].new_range().len(), 4); } +// Strict mode (git-apply behavior): rejects orphaned hunk headers +// hidden behind trailing content, but allows plain trailing junk. +// Currently all use permissive `parse`; the next commit introduces +// `parse_strict` and updates assertions where behavior diverges. +mod strict_mode { + use super::*; + + #[test] + fn trailing_junk_allowed() { + // git apply accepts trailing junk after all hunks + let s = "\ +--- a/file.txt ++++ b/file.txt +@@ -1 +1 @@ +-old ++new +this is trailing garbage +"; + let patch = parse(s).unwrap(); + assert_eq!(patch.hunks().len(), 1); + } + + #[test] + fn trailing_junk_allowed_bytes() { + let s = b"\ +--- a/file.txt ++++ b/file.txt +@@ -1 +1 @@ +-old ++new +this is trailing garbage +"; + let patch = parse_bytes(&s[..]).unwrap(); + assert_eq!(patch.hunks().len(), 1); + } + + #[test] + fn orphaned_hunk_header_after_junk() { + // Junk between hunks hides the second @@ — strict should reject this + // since git apply errors with "patch fragment without header". + // Currently permissive: succeeds with 1 hunk. + let s = "\ +--- a/file.txt ++++ b/file.txt +@@ -1 +1 @@ +-a ++A +not a hunk line +@@ -5 +5 @@ +-b ++B +"; + let patch = parse(s).unwrap(); + assert_eq!(patch.hunks().len(), 1); + } + + #[test] + fn no_junk_parses_normally() { + let s = "\ +--- a/file.txt ++++ b/file.txt +@@ -1 +1 @@ +-old ++new +"; + let patch = parse(s).unwrap(); + assert_eq!(patch.hunks().len(), 1); + } + + #[test] + fn multi_hunk_no_junk() { + let s = "\ +--- a/file.txt ++++ b/file.txt +@@ -1 +1 @@ +-a ++A +@@ -5 +5 @@ +-b ++B +"; + let patch = parse(s).unwrap(); + assert_eq!(patch.hunks().len(), 2); + } + + #[test] + fn garbage_before_hunk_complete_fails() { + let s = "\ +--- a/file.txt ++++ b/file.txt +@@ -1,3 +1,3 @@ +-line 1 ++LINE 1 +garbage before hunk complete + line 3 +"; + assert_eq!( + parse(s).unwrap_err().kind, + ParsePatchErrorKind::UnexpectedHunkLine, + ); + } +} + #[test] fn test_escaped_filenames() { // No escaped characters From 7b7c81cd7d73061ac776b44d518a515ab4b61baa Mon Sep 17 00:00:00 2001 From: Weihang Lo Date: Thu, 9 Apr 2026 17:20:51 -0400 Subject: [PATCH 4/4] feat(patch): strict parsing mode (the `git-apply` behavior) Adds `from_str_strict`/`from_bytes_strict` that reject orphaned hunk headers hidden behind trailing content This matches `git apply` behavior. Plain trailing junk is still accepted. The default `from_str`/`from_bytes` remain permissive (the GNU patch behavior ). --- src/patch/error.rs | 4 ++++ src/patch/mod.rs | 42 ++++++++++++++++++++++++++++++++++++++++++ src/patch/parse.rs | 36 ++++++++++++++++++++++++++++++++++++ src/patch/tests.rs | 23 +++++++++++------------ 4 files changed, 93 insertions(+), 12 deletions(-) diff --git a/src/patch/error.rs b/src/patch/error.rs index 70828629..f55f4969 100644 --- a/src/patch/error.rs +++ b/src/patch/error.rs @@ -107,6 +107,9 @@ pub(crate) enum ParsePatchErrorKind { /// Missing newline at end of line. MissingNewline, + + /// Orphaned hunk header found after trailing content. + OrphanedHunkHeader, } impl fmt::Display for ParsePatchErrorKind { @@ -132,6 +135,7 @@ impl fmt::Display for ParsePatchErrorKind { Self::UnexpectedNoNewlineMarker => "unexpected 'No newline at end of file' line", Self::UnexpectedHunkLine => "unexpected line in hunk body", Self::MissingNewline => "missing newline", + Self::OrphanedHunkHeader => "orphaned hunk header after trailing content", }; write!(f, "{msg}") } diff --git a/src/patch/mod.rs b/src/patch/mod.rs index 3a182506..e228f708 100644 --- a/src/patch/mod.rs +++ b/src/patch/mod.rs @@ -16,6 +16,30 @@ use crate::utils::{byte_needs_quoting, fmt_escaped_byte, write_escaped_byte}; const NO_NEWLINE_AT_EOF: &str = "\\ No newline at end of file"; /// Representation of all the differences between two files +/// +/// # Parsing modes +/// +/// `Patch` provides two parsing modes with different strictness levels, +/// modeled after the behavior of GNU patch and `git apply`: +/// +/// | Scenario | GNU patch | git apply | [`from_str`] | [`from_str_strict`] | +/// |-----------------------------------|-------------|-----------|--------------|---------------------| +/// | Junk after all hunks are complete | Ignores | Ignores | Ignores | Ignores | +/// | Junk between hunks | Ignores[^1] | Errors | Ignores[^1] | Errors | +/// +/// [^1]: "Ignores" here means silently stopping at the junk. +/// Only hunks before it are parsed; later hunks are dropped. +/// +/// [`from_str`] and [`from_bytes`] follow GNU patch behavior, +/// silently ignoring non-patch content after a hunk's line counts are satisfied. +/// +/// [`from_str_strict`] and [`from_bytes_strict`] follow `git apply` behavior, +/// additionally rejecting orphaned hunk headers hidden behind trailing content. +/// +/// [`from_str`]: Patch::from_str +/// [`from_bytes`]: Patch::from_bytes +/// [`from_str_strict`]: Patch::from_str_strict +/// [`from_bytes_strict`]: Patch::from_bytes_strict #[derive(PartialEq, Eq)] pub struct Patch<'a, T: ToOwned + ?Sized> { // TODO GNU patch is able to parse patches without filename headers. @@ -108,6 +132,15 @@ impl<'a> Patch<'a, str> { pub fn from_str(s: &'a str) -> Result, ParsePatchError> { parse::parse(s) } + + /// Parse a `Patch` from a string in strict mode + /// + /// Unlike [`Patch::from_str`], + /// this rejects orphaned hunk headers hidden after trailing content, + /// matching `git apply` behavior. + pub fn from_str_strict(s: &'a str) -> Result, ParsePatchError> { + parse::parse_strict(s) + } } impl<'a> Patch<'a, [u8]> { @@ -115,6 +148,15 @@ impl<'a> Patch<'a, [u8]> { pub fn from_bytes(s: &'a [u8]) -> Result, ParsePatchError> { parse::parse_bytes(s) } + + /// Parse a `Patch` from bytes in strict mode + /// + /// Unlike [`Patch::from_bytes`], + /// this rejects orphaned hunk headers hidden after trailing content, + /// matching `git apply` behavior. + pub fn from_bytes_strict(s: &'a [u8]) -> Result, ParsePatchError> { + parse::parse_bytes_strict(s) + } } impl Clone for Patch<'_, T> { diff --git a/src/patch/parse.rs b/src/patch/parse.rs index 54e8efe3..fda46075 100644 --- a/src/patch/parse.rs +++ b/src/patch/parse.rs @@ -65,6 +65,19 @@ pub fn parse(input: &str) -> Result> { )) } +pub fn parse_strict(input: &str) -> Result> { + let mut parser = Parser::new(input); + let header = patch_header(&mut parser)?; + let hunks = hunks(&mut parser)?; + reject_orphaned_hunk_headers(&mut parser)?; + + Ok(Patch::new( + header.0.map(convert_cow_to_str), + header.1.map(convert_cow_to_str), + hunks, + )) +} + pub fn parse_bytes(input: &[u8]) -> Result> { let mut parser = Parser::new(input); let header = patch_header(&mut parser)?; @@ -73,6 +86,15 @@ pub fn parse_bytes(input: &[u8]) -> Result> { Ok(Patch::new(header.0, header.1, hunks)) } +pub fn parse_bytes_strict(input: &[u8]) -> Result> { + let mut parser = Parser::new(input); + let header = patch_header(&mut parser)?; + let hunks = hunks(&mut parser)?; + reject_orphaned_hunk_headers(&mut parser)?; + + Ok(Patch::new(header.0, header.1, hunks)) +} + // This is only used when the type originated as a utf8 string fn convert_cow_to_str(cow: Cow<'_, [u8]>) -> Cow<'_, str> { match cow { @@ -154,6 +176,20 @@ fn verify_hunks_in_order(hunks: &[Hunk<'_, T>]) -> bool { true } +/// Scans remaining lines for orphaned `@@ ` hunk headers. +/// +/// In strict mode (git-apply behavior), trailing junk is allowed but +/// an `@@ ` line hiding behind that junk indicates a lost hunk. +fn reject_orphaned_hunk_headers(parser: &mut Parser<'_, T>) -> Result<()> { + while let Some(line) = parser.peek() { + if line.starts_with("@@ ") { + return Err(parser.error(ParsePatchErrorKind::OrphanedHunkHeader)); + } + parser.next()?; + } + Ok(()) +} + fn hunks<'a, T: Text + ?Sized>(parser: &mut Parser<'a, T>) -> Result>> { let mut hunks = Vec::new(); // Following GNU patch behavior: stop at non-@@ content. diff --git a/src/patch/tests.rs b/src/patch/tests.rs index 40807775..19671fd5 100644 --- a/src/patch/tests.rs +++ b/src/patch/tests.rs @@ -1,5 +1,5 @@ use super::error::ParsePatchErrorKind; -use super::parse::{parse, parse_bytes}; +use super::parse::{parse, parse_bytes, parse_bytes_strict, parse_strict}; #[test] fn trailing_garbage_after_complete_hunk() { @@ -141,8 +141,6 @@ trailing garbage // Strict mode (git-apply behavior): rejects orphaned hunk headers // hidden behind trailing content, but allows plain trailing junk. -// Currently all use permissive `parse`; the next commit introduces -// `parse_strict` and updates assertions where behavior diverges. mod strict_mode { use super::*; @@ -157,7 +155,7 @@ mod strict_mode { +new this is trailing garbage "; - let patch = parse(s).unwrap(); + let patch = parse_strict(s).unwrap(); assert_eq!(patch.hunks().len(), 1); } @@ -171,15 +169,14 @@ this is trailing garbage +new this is trailing garbage "; - let patch = parse_bytes(&s[..]).unwrap(); + let patch = parse_bytes_strict(&s[..]).unwrap(); assert_eq!(patch.hunks().len(), 1); } #[test] fn orphaned_hunk_header_after_junk() { - // Junk between hunks hides the second @@ — strict should reject this + // Junk between hunks hides the second @@ — strict rejects this // since git apply errors with "patch fragment without header". - // Currently permissive: succeeds with 1 hunk. let s = "\ --- a/file.txt +++ b/file.txt @@ -191,8 +188,10 @@ not a hunk line -b +B "; - let patch = parse(s).unwrap(); - assert_eq!(patch.hunks().len(), 1); + assert_eq!( + parse_strict(s).unwrap_err().kind, + ParsePatchErrorKind::OrphanedHunkHeader, + ); } #[test] @@ -204,7 +203,7 @@ not a hunk line -old +new "; - let patch = parse(s).unwrap(); + let patch = parse_strict(s).unwrap(); assert_eq!(patch.hunks().len(), 1); } @@ -220,7 +219,7 @@ not a hunk line -b +B "; - let patch = parse(s).unwrap(); + let patch = parse_strict(s).unwrap(); assert_eq!(patch.hunks().len(), 2); } @@ -236,7 +235,7 @@ garbage before hunk complete line 3 "; assert_eq!( - parse(s).unwrap_err().kind, + parse_strict(s).unwrap_err().kind, ParsePatchErrorKind::UnexpectedHunkLine, ); }