From 964036a84e5bc63997a7ff09f3422976d60ae0fb Mon Sep 17 00:00:00 2001 From: Zious Date: Wed, 8 Apr 2026 10:38:26 -0500 Subject: [PATCH 1/2] fix: surface non-UTF-8 SNI hostnames instead of dropping MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes #28. extract_sni() previously used String::from_utf8(...).ok() which silently discarded any SNI hostname containing non-UTF-8 bytes. For a forensics tool that's the wrong default — non-UTF-8 SNI is a RFC 6066 §3 protocol violation (HostName MUST be ASCII; internationalized names use A-labels per RFC 5890) and a possible (low-confidence) malware/buggy-client indicator worth surfacing to analysts. Approach -------- - New SniValue enum: Utf8(String) for the clean case, NonUtf8 { lossy, hex } when from_utf8 fails. The lossy form is for human display (U+FFFD replacement); the hex form is the lossless evidence record. - handle_client_hello keys sni_counts on a tagged "" form for the NonUtf8 case rather than the lossy string. This is critical: using the lossy string as a key would let two distinct byte sequences collide if their U+FFFD-replaced forms happen to align — bad for a forensics counter. - Emit one Anomaly / Inconclusive / Low finding per non-UTF-8 SNI, with the lossy form in the summary and the hex in the evidence vector. No MITRE technique attached (T1001.001/.003 don't fit cleanly; matches the existing weak-cipher and SSLv3 findings in this file which also use None). - parse_error_count is intentionally NOT incremented — the TLS record parsed cleanly, only one extension field failed UTF-8 decoding. Tests ----- - New build_client_hello_raw_sni helper that takes &[u8] for the SNI, reusing the existing builder for the rest of the record. - test_non_utf8_sni_emits_finding_and_counts_under_hex_key: builds a ClientHello with [0xff, 0xfe, "a.com"] as the SNI bytes, asserts one finding with the right severity/category/MITRE and verifies sni_counts contains the hex-tagged key. - test_ascii_sni_does_not_emit_non_utf8_finding: regression — clean ASCII hostname must not trip the new finding. Scope intentionally excludes valid-UTF-8-but-non-ASCII SNI (e.g. raw U-labels). Those are also a RFC 6066 violation but are a separate detection worth its own issue. --- src/analyzer/tls.rs | 53 +++++++++++++++++++++++++-- tests/tls_analyzer_tests.rs | 73 ++++++++++++++++++++++++++++++++++++- 2 files changed, 122 insertions(+), 4 deletions(-) diff --git a/src/analyzer/tls.rs b/src/analyzer/tls.rs index b5bd66a..d20e408 100644 --- a/src/analyzer/tls.rs +++ b/src/analyzer/tls.rs @@ -145,13 +145,38 @@ fn compute_ja3s(version: u16, cipher: TlsCipherSuiteID, extensions: &[TlsExtensi bytes_to_hex(Md5::digest(ja3s_str.as_bytes()).as_slice()) } +/// Result of decoding a TLS SNI hostname. +/// +/// RFC 6066 §3 specifies that the `HostName` field is "represented as a byte +/// string using ASCII encoding". Internationalized names use A-labels (RFC 5890), +/// which are also ASCII. A `NonUtf8` SNI is therefore a protocol violation — +/// usually a buggy TLS client, but worth surfacing for forensic review. +enum SniValue { + /// Hostname decoded cleanly as UTF-8 (which includes the RFC-compliant ASCII case). + Utf8(String), + /// Hostname bytes failed UTF-8 decoding. `lossy` is the U+FFFD-replaced form + /// for human display; `hex` is the lossless lowercase hex of the raw bytes. + NonUtf8 { lossy: String, hex: String }, +} + /// Extract SNI hostname from the parsed extension list. -fn extract_sni(extensions: &[TlsExtension<'_>]) -> Option { +/// +/// Returns `None` if no SNI extension is present or the extension's hostname +/// list is empty. Otherwise returns an `SniValue` describing the first hostname +/// (we ignore additional entries — multi-name SNI is rare and treating only the +/// first matches the prior behavior). +fn extract_sni(extensions: &[TlsExtension<'_>]) -> Option { for ext in extensions { if let TlsExtension::SNI(list) = ext && let Some((_, hostname)) = list.first() { - return String::from_utf8(hostname.to_vec()).ok(); + return Some(match std::str::from_utf8(hostname) { + Ok(s) => SniValue::Utf8(s.to_string()), + Err(_) => SniValue::NonUtf8 { + lossy: String::from_utf8_lossy(hostname).into_owned(), + hex: bytes_to_hex(hostname), + }, + }); } } None @@ -276,7 +301,29 @@ impl TlsAnalyzer { // SNI if let Some(sni) = extract_sni(&exts) { - Self::increment(&mut self.sni_counts, sni, MAX_MAP_ENTRIES); + // Choose a map key that preserves uniqueness for non-UTF-8 cases. + // Using `lossy` as a key would collapse distinct byte sequences whose + // U+FFFD replacements happen to align — bad for forensic counting. + let key = match &sni { + SniValue::Utf8(s) => s.clone(), + SniValue::NonUtf8 { hex, .. } => format!(""), + }; + Self::increment(&mut self.sni_counts, key, MAX_MAP_ENTRIES); + + if let SniValue::NonUtf8 { lossy, hex } = sni { + self.all_findings.push(Finding { + category: ThreatCategory::Anomaly, + verdict: Verdict::Inconclusive, + confidence: Confidence::Low, + summary: format!( + "TLS SNI contains non-UTF-8 bytes (RFC 6066 violation): {lossy}" + ), + evidence: vec![format!("hex: {hex}")], + mitre_technique: None, + source_ip: None, + timestamp: None, + }); + } } // JA3 diff --git a/tests/tls_analyzer_tests.rs b/tests/tls_analyzer_tests.rs index 0419962..bf32c88 100644 --- a/tests/tls_analyzer_tests.rs +++ b/tests/tls_analyzer_tests.rs @@ -14,10 +14,15 @@ fn test_flow_key() -> FlowKey { /// Build a minimal TLS ClientHello record with SNI and specified cipher suites. fn build_client_hello(sni: &str, cipher_ids: &[u16]) -> Vec { + build_client_hello_raw_sni(sni.as_bytes(), cipher_ids) +} + +/// Build a minimal TLS ClientHello record with arbitrary raw bytes for the SNI hostname. +/// Used for tests that exercise non-UTF-8 / malformed SNI handling. +fn build_client_hello_raw_sni(sni_bytes: &[u8], cipher_ids: &[u16]) -> Vec { let mut extensions = Vec::new(); // SNI extension (type 0x0000) - let sni_bytes = sni.as_bytes(); let sni_list_len = (3 + sni_bytes.len()) as u16; let sni_ext_len = 2 + sni_list_len; extensions.extend_from_slice(&[0x00, 0x00]); // extension type: server_name @@ -288,3 +293,69 @@ fn test_summarize_output() { assert!(detail.contains_key("cipher_suites")); assert_eq!(detail["parse_errors"], 0); } + +#[test] +fn test_non_utf8_sni_emits_finding_and_counts_under_hex_key() { + let mut analyzer = TlsAnalyzer::new(); + let fk = test_flow_key(); + + // 0xFF / 0xFE are invalid as standalone UTF-8 start bytes — guarantees + // from_utf8 fails. Mix in some ASCII so the lossy form is recognizable. + let raw_sni: &[u8] = &[0xff, 0xfe, b'a', b'.', b'c', b'o', b'm']; + let record = build_client_hello_raw_sni(raw_sni, &[0x1301]); + analyzer.on_data(&fk, Direction::ClientToServer, &record, 0); + + // Parse error counter must NOT be incremented — the record itself parsed, + // only the SNI hostname bytes failed UTF-8 decoding. + assert_eq!(analyzer.parse_error_count(), 0); + + // sni_counts should be keyed on a tagged hex form, not on a lossy string. + // This guarantees distinct byte sequences don't collide. + let expected_key = ""; + assert_eq!( + *analyzer.sni_counts().get(expected_key).unwrap(), + 1, + "expected sni_counts to contain hex-tagged key {expected_key}" + ); + + // Exactly one finding, the non-UTF-8 SNI anomaly. + let findings = analyzer.findings(); + let non_utf8_findings: Vec<_> = findings + .iter() + .filter(|f| f.summary.contains("non-UTF-8 bytes")) + .collect(); + assert_eq!( + non_utf8_findings.len(), + 1, + "expected exactly one non-UTF-8 SNI finding, got: {findings:?}" + ); + let f = non_utf8_findings[0]; + assert_eq!(f.category, wirerust::findings::ThreatCategory::Anomaly); + assert_eq!(f.verdict, wirerust::findings::Verdict::Inconclusive); + assert_eq!(f.confidence, wirerust::findings::Confidence::Low); + assert!(f.summary.contains("RFC 6066")); + // Hex evidence is the lossless representation of the raw bytes. + assert!( + f.evidence.iter().any(|e| e.contains("fffe612e636f6d")), + "expected hex evidence to contain raw byte sequence, got: {:?}", + f.evidence + ); +} + +#[test] +fn test_ascii_sni_does_not_emit_non_utf8_finding() { + // Regression: a normal ASCII hostname must not trip the non-UTF-8 finding. + let mut analyzer = TlsAnalyzer::new(); + let fk = test_flow_key(); + + let record = build_client_hello("example.com", &[0x1301]); + analyzer.on_data(&fk, Direction::ClientToServer, &record, 0); + + assert_eq!(*analyzer.sni_counts().get("example.com").unwrap(), 1); + let non_utf8_findings = analyzer + .findings() + .iter() + .filter(|f| f.summary.contains("non-UTF-8 bytes")) + .count(); + assert_eq!(non_utf8_findings, 0); +} From c8b81ccadea373fccadf03a120196b77121e41d9 Mon Sep 17 00:00:00 2001 From: Zious Date: Wed, 8 Apr 2026 10:54:48 -0500 Subject: [PATCH 2/2] fix: escape control bytes in non-UTF-8 SNI finding summary MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses Copilot review on PR #49. String::from_utf8_lossy only replaces invalid UTF-8 *sequences* with U+FFFD — it preserves valid-but-control bytes like ESC (0x1b) verbatim. Interpolating the lossy form into the finding summary with {} therefore left a terminal injection vector: a malformed SNI like b"\x1b[31mpwnd" would render in red on any terminal showing the finding. Switch the lossy interpolation to {:?} (Debug formatter), which escapes ESC and other control characters as the literal sequence \u{1b}. The hex evidence is unchanged — that remains the lossless record. No behavior change for SNIs whose lossy form is purely printable text. Verified the fix empirically: format!("{:?}", "\u{fffd}\x1b[31mpwnd") contains no raw 0x1b byte in the output, while format!("{}", ...) does. Tests ----- - New test_non_utf8_sni_escapes_control_bytes_in_summary: builds a ClientHello with raw ESC + ANSI CSI in the SNI, asserts the finding summary does NOT contain a literal 0x1b byte but DOES contain the escaped "\u{1b}" sequence, and verifies the hex evidence still carries the original bytes losslessly. --- src/analyzer/tls.rs | 7 +++++- tests/tls_analyzer_tests.rs | 44 +++++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 1 deletion(-) diff --git a/src/analyzer/tls.rs b/src/analyzer/tls.rs index d20e408..b32fcb2 100644 --- a/src/analyzer/tls.rs +++ b/src/analyzer/tls.rs @@ -315,8 +315,13 @@ impl TlsAnalyzer { category: ThreatCategory::Anomaly, verdict: Verdict::Inconclusive, confidence: Confidence::Low, + // Use Debug formatter ({:?}) to escape control bytes (e.g. ESC 0x1b) + // that String::from_utf8_lossy preserves but the analyst's terminal + // would interpret as ANSI control sequences. Without this an attacker + // could craft a malformed SNI like b"\x1b[31m..." that recolors or + // overwrites the rendered finding line. summary: format!( - "TLS SNI contains non-UTF-8 bytes (RFC 6066 violation): {lossy}" + "TLS SNI contains non-UTF-8 bytes (RFC 6066 violation): {lossy:?}" ), evidence: vec![format!("hex: {hex}")], mitre_technique: None, diff --git a/tests/tls_analyzer_tests.rs b/tests/tls_analyzer_tests.rs index bf32c88..76bf2d9 100644 --- a/tests/tls_analyzer_tests.rs +++ b/tests/tls_analyzer_tests.rs @@ -359,3 +359,47 @@ fn test_ascii_sni_does_not_emit_non_utf8_finding() { .count(); assert_eq!(non_utf8_findings, 0); } + +#[test] +fn test_non_utf8_sni_escapes_control_bytes_in_summary() { + // Security regression: a malformed SNI containing raw ESC (0x1b) plus an + // ANSI CSI sequence must NOT propagate the literal control byte into the + // finding summary, where it would be interpreted by an analyst's terminal + // and could recolor or overwrite the rendered report line. + let mut analyzer = TlsAnalyzer::new(); + let fk = test_flow_key(); + + // 0xff makes from_utf8 fail; 0x1b [ 3 1 m is the ANSI "red" CSI sequence; + // "pwnd" is the visible payload an attacker would inject. + let raw_sni: &[u8] = &[0xff, 0x1b, b'[', b'3', b'1', b'm', b'p', b'w', b'n', b'd']; + let record = build_client_hello_raw_sni(raw_sni, &[0x1301]); + analyzer.on_data(&fk, Direction::ClientToServer, &record, 0); + + let findings = analyzer.findings(); + let f = findings + .iter() + .find(|f| f.summary.contains("non-UTF-8 bytes")) + .expect("expected non-UTF-8 SNI finding"); + + // The summary must not contain the raw ESC byte. Debug formatting ({:?}) + // turns 0x1b into the literal escape sequence "\u{1b}" instead. + assert!( + !f.summary.as_bytes().contains(&0x1b), + "summary contains raw ESC byte (terminal injection vector): {:?}", + f.summary + ); + assert!( + f.summary.contains("\\u{1b}"), + "summary should contain escaped ESC sequence \\u{{1b}}, got: {}", + f.summary + ); + + // Hex evidence is unchanged — that's the lossless record. + assert!( + f.evidence + .iter() + .any(|e| e.contains("ff1b5b33316d70776e64")), + "expected raw bytes in hex evidence, got: {:?}", + f.evidence + ); +}