From 964036a84e5bc63997a7ff09f3422976d60ae0fb Mon Sep 17 00:00:00 2001
From: Zious <jaredbrichards@gmail.com>
Date: Wed, 8 Apr 2026 10:38:26 -0500
Subject: [PATCH 1/2] fix: surface non-UTF-8 SNI hostnames instead of dropping
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Closes #28. extract_sni() previously used String::from_utf8(...).ok()
which silently discarded any SNI hostname containing non-UTF-8 bytes.
For a forensics tool that's the wrong default — non-UTF-8 SNI is a
RFC 6066 §3 protocol violation (HostName MUST be ASCII; internationalized
names use A-labels per RFC 5890) and a possible (low-confidence)
malware/buggy-client indicator worth surfacing to analysts.

Approach
--------
- New SniValue enum: Utf8(String) for the clean case, NonUtf8 { lossy,
  hex } when from_utf8 fails. The lossy form is for human display
  (U+FFFD replacement); the hex form is the lossless evidence record.
- handle_client_hello keys sni_counts on a tagged "<non-utf8:HEX>" form
  for the NonUtf8 case rather than the lossy string. This is critical:
  using the lossy string as a key would let two distinct byte sequences
  collide if their U+FFFD-replaced forms happen to align — bad for a
  forensics counter.
- Emit one Anomaly / Inconclusive / Low finding per non-UTF-8 SNI, with
  the lossy form in the summary and the hex in the evidence vector. No
  MITRE technique attached (T1001.001/.003 don't fit cleanly; matches
  the existing weak-cipher and SSLv3 findings in this file which also
  use None).
- parse_error_count is intentionally NOT incremented — the TLS record
  parsed cleanly, only one extension field failed UTF-8 decoding.

Tests
-----
- New build_client_hello_raw_sni helper that takes &[u8] for the SNI,
  reusing the existing builder for the rest of the record.
- test_non_utf8_sni_emits_finding_and_counts_under_hex_key: builds a
  ClientHello with [0xff, 0xfe, "a.com"] as the SNI bytes, asserts
  one finding with the right severity/category/MITRE and verifies
  sni_counts contains the hex-tagged key.
- test_ascii_sni_does_not_emit_non_utf8_finding: regression — clean
  ASCII hostname must not trip the new finding.

Scope intentionally excludes valid-UTF-8-but-non-ASCII SNI (e.g. raw
U-labels). Those are also a RFC 6066 violation but are a separate
detection worth its own issue.
---
 src/analyzer/tls.rs         | 53 +++++++++++++++++++++++++--
 tests/tls_analyzer_tests.rs | 73 ++++++++++++++++++++++++++++++++++++-
 2 files changed, 122 insertions(+), 4 deletions(-)
diff --git a/src/analyzer/tls.rs b/src/analyzer/tls.rs
index b5bd66a..d20e408 100644
--- a/src/analyzer/tls.rs
+++ b/src/analyzer/tls.rs
@@ -145,13 +145,38 @@ fn compute_ja3s(version: u16, cipher: TlsCipherSuiteID, extensions: &[TlsExtensi
     bytes_to_hex(Md5::digest(ja3s_str.as_bytes()).as_slice())
 }
 
+/// Result of decoding a TLS SNI hostname.
+///
+/// RFC 6066 §3 specifies that the `HostName` field is "represented as a byte
+/// string using ASCII encoding". Internationalized names use A-labels (RFC 5890),
+/// which are also ASCII. A `NonUtf8` SNI is therefore a protocol violation —
+/// usually a buggy TLS client, but worth surfacing for forensic review.
+enum SniValue {
+    /// Hostname decoded cleanly as UTF-8 (which includes the RFC-compliant ASCII case).
+    Utf8(String),
+    /// Hostname bytes failed UTF-8 decoding. `lossy` is the U+FFFD-replaced form
+    /// for human display; `hex` is the lossless lowercase hex of the raw bytes.
+    NonUtf8 { lossy: String, hex: String },
+}
+
 /// Extract SNI hostname from the parsed extension list.
-fn extract_sni(extensions: &[TlsExtension<'_>]) -> Option<String> {
+///
+/// Returns `None` if no SNI extension is present or the extension's hostname
+/// list is empty. Otherwise returns an `SniValue` describing the first hostname
+/// (we ignore additional entries — multi-name SNI is rare and treating only the
+/// first matches the prior behavior).
+fn extract_sni(extensions: &[TlsExtension<'_>]) -> Option<SniValue> {
     for ext in extensions {
         if let TlsExtension::SNI(list) = ext
             && let Some((_, hostname)) = list.first()
         {
-            return String::from_utf8(hostname.to_vec()).ok();
+            return Some(match std::str::from_utf8(hostname) {
+                Ok(s) => SniValue::Utf8(s.to_string()),
+                Err(_) => SniValue::NonUtf8 {
+                    lossy: String::from_utf8_lossy(hostname).into_owned(),
+                    hex: bytes_to_hex(hostname),
+                },
+            });
         }
     }
     None
@@ -276,7 +301,29 @@ impl TlsAnalyzer {
 
         // SNI
         if let Some(sni) = extract_sni(&exts) {
-            Self::increment(&mut self.sni_counts, sni, MAX_MAP_ENTRIES);
+            // Choose a map key that preserves uniqueness for non-UTF-8 cases.
+            // Using `lossy` as a key would collapse distinct byte sequences whose
+            // U+FFFD replacements happen to align — bad for forensic counting.
+            let key = match &sni {
+                SniValue::Utf8(s) => s.clone(),
+                SniValue::NonUtf8 { hex, .. } => format!("<non-utf8:{hex}>"),
+            };
+            Self::increment(&mut self.sni_counts, key, MAX_MAP_ENTRIES);
+
+            if let SniValue::NonUtf8 { lossy, hex } = sni {
+                self.all_findings.push(Finding {
+                    category: ThreatCategory::Anomaly,
+                    verdict: Verdict::Inconclusive,
+                    confidence: Confidence::Low,
+                    summary: format!(
+                        "TLS SNI contains non-UTF-8 bytes (RFC 6066 violation): {lossy}"
+                    ),
+                    evidence: vec![format!("hex: {hex}")],
+                    mitre_technique: None,
+                    source_ip: None,
+                    timestamp: None,
+                });
+            }
         }
 
         // JA3
diff --git a/tests/tls_analyzer_tests.rs b/tests/tls_analyzer_tests.rs
index 0419962..bf32c88 100644
--- a/tests/tls_analyzer_tests.rs
+++ b/tests/tls_analyzer_tests.rs
@@ -14,10 +14,15 @@ fn test_flow_key() -> FlowKey {
 
 /// Build a minimal TLS ClientHello record with SNI and specified cipher suites.
 fn build_client_hello(sni: &str, cipher_ids: &[u16]) -> Vec<u8> {
+    build_client_hello_raw_sni(sni.as_bytes(), cipher_ids)
+}
+
+/// Build a minimal TLS ClientHello record with arbitrary raw bytes for the SNI hostname.
+/// Used for tests that exercise non-UTF-8 / malformed SNI handling.
+fn build_client_hello_raw_sni(sni_bytes: &[u8], cipher_ids: &[u16]) -> Vec<u8> {
     let mut extensions = Vec::new();
 
     // SNI extension (type 0x0000)
-    let sni_bytes = sni.as_bytes();
     let sni_list_len = (3 + sni_bytes.len()) as u16;
     let sni_ext_len = 2 + sni_list_len;
     extensions.extend_from_slice(&[0x00, 0x00]); // extension type: server_name
@@ -288,3 +293,69 @@ fn test_summarize_output() {
     assert!(detail.contains_key("cipher_suites"));
     assert_eq!(detail["parse_errors"], 0);
 }
+
+#[test]
+fn test_non_utf8_sni_emits_finding_and_counts_under_hex_key() {
+    let mut analyzer = TlsAnalyzer::new();
+    let fk = test_flow_key();
+
+    // 0xFF / 0xFE are invalid as standalone UTF-8 start bytes — guarantees
+    // from_utf8 fails. Mix in some ASCII so the lossy form is recognizable.
+    let raw_sni: &[u8] = &[0xff, 0xfe, b'a', b'.', b'c', b'o', b'm'];
+    let record = build_client_hello_raw_sni(raw_sni, &[0x1301]);
+    analyzer.on_data(&fk, Direction::ClientToServer, &record, 0);
+
+    // Parse error counter must NOT be incremented — the record itself parsed,
+    // only the SNI hostname bytes failed UTF-8 decoding.
+    assert_eq!(analyzer.parse_error_count(), 0);
+
+    // sni_counts should be keyed on a tagged hex form, not on a lossy string.
+    // This guarantees distinct byte sequences don't collide.
+    let expected_key = "<non-utf8:fffe612e636f6d>";
+    assert_eq!(
+        *analyzer.sni_counts().get(expected_key).unwrap(),
+        1,
+        "expected sni_counts to contain hex-tagged key {expected_key}"
+    );
+
+    // Exactly one finding, the non-UTF-8 SNI anomaly.
+    let findings = analyzer.findings();
+    let non_utf8_findings: Vec<_> = findings
+        .iter()
+        .filter(|f| f.summary.contains("non-UTF-8 bytes"))
+        .collect();
+    assert_eq!(
+        non_utf8_findings.len(),
+        1,
+        "expected exactly one non-UTF-8 SNI finding, got: {findings:?}"
+    );
+    let f = non_utf8_findings[0];
+    assert_eq!(f.category, wirerust::findings::ThreatCategory::Anomaly);
+    assert_eq!(f.verdict, wirerust::findings::Verdict::Inconclusive);
+    assert_eq!(f.confidence, wirerust::findings::Confidence::Low);
+    assert!(f.summary.contains("RFC 6066"));
+    // Hex evidence is the lossless representation of the raw bytes.
+    assert!(
+        f.evidence.iter().any(|e| e.contains("fffe612e636f6d")),
+        "expected hex evidence to contain raw byte sequence, got: {:?}",
+        f.evidence
+    );
+}
+
+#[test]
+fn test_ascii_sni_does_not_emit_non_utf8_finding() {
+    // Regression: a normal ASCII hostname must not trip the non-UTF-8 finding.
+    let mut analyzer = TlsAnalyzer::new();
+    let fk = test_flow_key();
+
+    let record = build_client_hello("example.com", &[0x1301]);
+    analyzer.on_data(&fk, Direction::ClientToServer, &record, 0);
+
+    assert_eq!(*analyzer.sni_counts().get("example.com").unwrap(), 1);
+    let non_utf8_findings = analyzer
+        .findings()
+        .iter()
+        .filter(|f| f.summary.contains("non-UTF-8 bytes"))
+        .count();
+    assert_eq!(non_utf8_findings, 0);
+}

From c8b81ccadea373fccadf03a120196b77121e41d9 Mon Sep 17 00:00:00 2001
From: Zious <jaredbrichards@gmail.com>
Date: Wed, 8 Apr 2026 10:54:48 -0500
Subject: [PATCH 2/2] fix: escape control bytes in non-UTF-8 SNI finding
 summary
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Addresses Copilot review on PR #49. String::from_utf8_lossy only
replaces invalid UTF-8 *sequences* with U+FFFD — it preserves
valid-but-control bytes like ESC (0x1b) verbatim. Interpolating the
lossy form into the finding summary with {} therefore left a terminal
injection vector: a malformed SNI like b"\x1b[31mpwnd" would render
in red on any terminal showing the finding.

Switch the lossy interpolation to {:?} (Debug formatter), which
escapes ESC and other control characters as the literal sequence
\u{1b}. The hex evidence is unchanged — that remains the lossless
record. No behavior change for SNIs whose lossy form is purely
printable text.

Verified the fix empirically: format!("{:?}", "\u{fffd}\x1b[31mpwnd")
contains no raw 0x1b byte in the output, while format!("{}", ...)
does.

Tests
-----
- New test_non_utf8_sni_escapes_control_bytes_in_summary: builds a
  ClientHello with raw ESC + ANSI CSI in the SNI, asserts the finding
  summary does NOT contain a literal 0x1b byte but DOES contain the
  escaped "\u{1b}" sequence, and verifies the hex evidence still
  carries the original bytes losslessly.
---
 src/analyzer/tls.rs         |  7 +++++-
 tests/tls_analyzer_tests.rs | 44 +++++++++++++++++++++++++++++++++++++
 2 files changed, 50 insertions(+), 1 deletion(-)

diff --git a/src/analyzer/tls.rs b/src/analyzer/tls.rs
index d20e408..b32fcb2 100644
--- a/src/analyzer/tls.rs
+++ b/src/analyzer/tls.rs
@@ -315,8 +315,13 @@ impl TlsAnalyzer {
                     category: ThreatCategory::Anomaly,
                     verdict: Verdict::Inconclusive,
                     confidence: Confidence::Low,
+                    // Use Debug formatter ({:?}) to escape control bytes (e.g. ESC 0x1b)
+                    // that String::from_utf8_lossy preserves but the analyst's terminal
+                    // would interpret as ANSI control sequences. Without this an attacker
+                    // could craft a malformed SNI like b"\x1b[31m..." that recolors or
+                    // overwrites the rendered finding line.
                     summary: format!(
-                        "TLS SNI contains non-UTF-8 bytes (RFC 6066 violation): {lossy}"
+                        "TLS SNI contains non-UTF-8 bytes (RFC 6066 violation): {lossy:?}"
                     ),
                     evidence: vec![format!("hex: {hex}")],
                     mitre_technique: None,
diff --git a/tests/tls_analyzer_tests.rs b/tests/tls_analyzer_tests.rs
index bf32c88..76bf2d9 100644
--- a/tests/tls_analyzer_tests.rs
+++ b/tests/tls_analyzer_tests.rs
@@ -359,3 +359,47 @@ fn test_ascii_sni_does_not_emit_non_utf8_finding() {
         .count();
     assert_eq!(non_utf8_findings, 0);
 }
+
+#[test]
+fn test_non_utf8_sni_escapes_control_bytes_in_summary() {
+    // Security regression: a malformed SNI containing raw ESC (0x1b) plus an
+    // ANSI CSI sequence must NOT propagate the literal control byte into the
+    // finding summary, where it would be interpreted by an analyst's terminal
+    // and could recolor or overwrite the rendered report line.
+    let mut analyzer = TlsAnalyzer::new();
+    let fk = test_flow_key();
+
+    // 0xff makes from_utf8 fail; 0x1b [ 3 1 m is the ANSI "red" CSI sequence;
+    // "pwnd" is the visible payload an attacker would inject.
+    let raw_sni: &[u8] = &[0xff, 0x1b, b'[', b'3', b'1', b'm', b'p', b'w', b'n', b'd'];
+    let record = build_client_hello_raw_sni(raw_sni, &[0x1301]);
+    analyzer.on_data(&fk, Direction::ClientToServer, &record, 0);
+
+    let findings = analyzer.findings();
+    let f = findings
+        .iter()
+        .find(|f| f.summary.contains("non-UTF-8 bytes"))
+        .expect("expected non-UTF-8 SNI finding");
+
+    // The summary must not contain the raw ESC byte. Debug formatting ({:?})
+    // turns 0x1b into the literal escape sequence "\u{1b}" instead.
+    assert!(
+        !f.summary.as_bytes().contains(&0x1b),
+        "summary contains raw ESC byte (terminal injection vector): {:?}",
+        f.summary
+    );
+    assert!(
+        f.summary.contains("\\u{1b}"),
+        "summary should contain escaped ESC sequence \\u{{1b}}, got: {}",
+        f.summary
+    );
+
+    // Hex evidence is unchanged — that's the lossless record.
+    assert!(
+        f.evidence
+            .iter()
+            .any(|e| e.contains("ff1b5b33316d70776e64")),
+        "expected raw bytes in hex evidence, got: {:?}",
+        f.evidence
+    );
+}