Zious11 · Zious11 · Apr 8, 2026 · Apr 8, 2026 · Apr 8, 2026
diff --git a/src/analyzer/tls.rs b/src/analyzer/tls.rs
@@ -145,13 +145,38 @@ fn compute_ja3s(version: u16, cipher: TlsCipherSuiteID, extensions: &[TlsExtensi
     bytes_to_hex(Md5::digest(ja3s_str.as_bytes()).as_slice())
 }
 
+/// Result of decoding a TLS SNI hostname.
+///
+/// RFC 6066 §3 specifies that the `HostName` field is "represented as a byte
+/// string using ASCII encoding". Internationalized names use A-labels (RFC 5890),
+/// which are also ASCII. A `NonUtf8` SNI is therefore a protocol violation —
+/// usually a buggy TLS client, but worth surfacing for forensic review.
+enum SniValue {
+    /// Hostname decoded cleanly as UTF-8 (which includes the RFC-compliant ASCII case).
+    Utf8(String),
+    /// Hostname bytes failed UTF-8 decoding. `lossy` is the U+FFFD-replaced form
+    /// for human display; `hex` is the lossless lowercase hex of the raw bytes.
+    NonUtf8 { lossy: String, hex: String },
+}
+
 /// Extract SNI hostname from the parsed extension list.
-fn extract_sni(extensions: &[TlsExtension<'_>]) -> Option<String> {
+///
+/// Returns `None` if no SNI extension is present or the extension's hostname
+/// list is empty. Otherwise returns an `SniValue` describing the first hostname
+/// (we ignore additional entries — multi-name SNI is rare and treating only the
+/// first matches the prior behavior).
+fn extract_sni(extensions: &[TlsExtension<'_>]) -> Option<SniValue> {
     for ext in extensions {
         if let TlsExtension::SNI(list) = ext
             && let Some((_, hostname)) = list.first()
         {
-            return String::from_utf8(hostname.to_vec()).ok();
+            return Some(match std::str::from_utf8(hostname) {
+                Ok(s) => SniValue::Utf8(s.to_string()),
+                Err(_) => SniValue::NonUtf8 {
+                    lossy: String::from_utf8_lossy(hostname).into_owned(),
+                    hex: bytes_to_hex(hostname),
+                },
+            });
         }
     }
     None
@@ -276,7 +301,34 @@ impl TlsAnalyzer {
 
         // SNI
         if let Some(sni) = extract_sni(&exts) {
-            Self::increment(&mut self.sni_counts, sni, MAX_MAP_ENTRIES);
+            // Choose a map key that preserves uniqueness for non-UTF-8 cases.
+            // Using `lossy` as a key would collapse distinct byte sequences whose
+            // U+FFFD replacements happen to align — bad for forensic counting.
+            let key = match &sni {
+                SniValue::Utf8(s) => s.clone(),
+                SniValue::NonUtf8 { hex, .. } => format!("<non-utf8:{hex}>"),
+            };
+            Self::increment(&mut self.sni_counts, key, MAX_MAP_ENTRIES);
+
+            if let SniValue::NonUtf8 { lossy, hex } = sni {
+                self.all_findings.push(Finding {
+                    category: ThreatCategory::Anomaly,
+                    verdict: Verdict::Inconclusive,
+                    confidence: Confidence::Low,
+                    // Use Debug formatter ({:?}) to escape control bytes (e.g. ESC 0x1b)
+                    // that String::from_utf8_lossy preserves but the analyst's terminal
+                    // would interpret as ANSI control sequences. Without this an attacker
+                    // could craft a malformed SNI like b"\x1b[31m..." that recolors or
+                    // overwrites the rendered finding line.
+                    summary: format!(
+                        "TLS SNI contains non-UTF-8 bytes (RFC 6066 violation): {lossy:?}"
+                    ),
+                    evidence: vec![format!("hex: {hex}")],
+                    mitre_technique: None,
+                    source_ip: None,
+                    timestamp: None,
+                });
+            }
         }
 
         // JA3

diff --git a/tests/tls_analyzer_tests.rs b/tests/tls_analyzer_tests.rs
@@ -14,10 +14,15 @@ fn test_flow_key() -> FlowKey {
 
 /// Build a minimal TLS ClientHello record with SNI and specified cipher suites.
 fn build_client_hello(sni: &str, cipher_ids: &[u16]) -> Vec<u8> {
+    build_client_hello_raw_sni(sni.as_bytes(), cipher_ids)
+}
+
+/// Build a minimal TLS ClientHello record with arbitrary raw bytes for the SNI hostname.
+/// Used for tests that exercise non-UTF-8 / malformed SNI handling.
+fn build_client_hello_raw_sni(sni_bytes: &[u8], cipher_ids: &[u16]) -> Vec<u8> {
     let mut extensions = Vec::new();
 
     // SNI extension (type 0x0000)
-    let sni_bytes = sni.as_bytes();
     let sni_list_len = (3 + sni_bytes.len()) as u16;
     let sni_ext_len = 2 + sni_list_len;
     extensions.extend_from_slice(&[0x00, 0x00]); // extension type: server_name
@@ -288,3 +293,113 @@ fn test_summarize_output() {
     assert!(detail.contains_key("cipher_suites"));
     assert_eq!(detail["parse_errors"], 0);
 }
+
+#[test]
+fn test_non_utf8_sni_emits_finding_and_counts_under_hex_key() {
+    let mut analyzer = TlsAnalyzer::new();
+    let fk = test_flow_key();
+
+    // 0xFF / 0xFE are invalid as standalone UTF-8 start bytes — guarantees
+    // from_utf8 fails. Mix in some ASCII so the lossy form is recognizable.
+    let raw_sni: &[u8] = &[0xff, 0xfe, b'a', b'.', b'c', b'o', b'm'];
+    let record = build_client_hello_raw_sni(raw_sni, &[0x1301]);
+    analyzer.on_data(&fk, Direction::ClientToServer, &record, 0);
+
+    // Parse error counter must NOT be incremented — the record itself parsed,
+    // only the SNI hostname bytes failed UTF-8 decoding.
+    assert_eq!(analyzer.parse_error_count(), 0);
+
+    // sni_counts should be keyed on a tagged hex form, not on a lossy string.
+    // This guarantees distinct byte sequences don't collide.
+    let expected_key = "<non-utf8:fffe612e636f6d>";
+    assert_eq!(
+        *analyzer.sni_counts().get(expected_key).unwrap(),
+        1,
+        "expected sni_counts to contain hex-tagged key {expected_key}"
+    );
+
+    // Exactly one finding, the non-UTF-8 SNI anomaly.
+    let findings = analyzer.findings();
+    let non_utf8_findings: Vec<_> = findings
+        .iter()
+        .filter(|f| f.summary.contains("non-UTF-8 bytes"))
+        .collect();
+    assert_eq!(
+        non_utf8_findings.len(),
+        1,
+        "expected exactly one non-UTF-8 SNI finding, got: {findings:?}"
+    );
+    let f = non_utf8_findings[0];
+    assert_eq!(f.category, wirerust::findings::ThreatCategory::Anomaly);
+    assert_eq!(f.verdict, wirerust::findings::Verdict::Inconclusive);
+    assert_eq!(f.confidence, wirerust::findings::Confidence::Low);
+    assert!(f.summary.contains("RFC 6066"));
+    // Hex evidence is the lossless representation of the raw bytes.
+    assert!(
+        f.evidence.iter().any(|e| e.contains("fffe612e636f6d")),
+        "expected hex evidence to contain raw byte sequence, got: {:?}",
+        f.evidence
+    );
+}
+
+#[test]
+fn test_ascii_sni_does_not_emit_non_utf8_finding() {
+    // Regression: a normal ASCII hostname must not trip the non-UTF-8 finding.
+    let mut analyzer = TlsAnalyzer::new();
+    let fk = test_flow_key();
+
+    let record = build_client_hello("example.com", &[0x1301]);
+    analyzer.on_data(&fk, Direction::ClientToServer, &record, 0);
+
+    assert_eq!(*analyzer.sni_counts().get("example.com").unwrap(), 1);
+    let non_utf8_findings = analyzer
+        .findings()
+        .iter()
+        .filter(|f| f.summary.contains("non-UTF-8 bytes"))
+        .count();
+    assert_eq!(non_utf8_findings, 0);
+}
+
+#[test]
+fn test_non_utf8_sni_escapes_control_bytes_in_summary() {
+    // Security regression: a malformed SNI containing raw ESC (0x1b) plus an
+    // ANSI CSI sequence must NOT propagate the literal control byte into the
+    // finding summary, where it would be interpreted by an analyst's terminal
+    // and could recolor or overwrite the rendered report line.
+    let mut analyzer = TlsAnalyzer::new();
+    let fk = test_flow_key();
+
+    // 0xff makes from_utf8 fail; 0x1b [ 3 1 m is the ANSI "red" CSI sequence;
+    // "pwnd" is the visible payload an attacker would inject.
+    let raw_sni: &[u8] = &[0xff, 0x1b, b'[', b'3', b'1', b'm', b'p', b'w', b'n', b'd'];
+    let record = build_client_hello_raw_sni(raw_sni, &[0x1301]);
+    analyzer.on_data(&fk, Direction::ClientToServer, &record, 0);
+
+    let findings = analyzer.findings();
+    let f = findings
+        .iter()
+        .find(|f| f.summary.contains("non-UTF-8 bytes"))
+        .expect("expected non-UTF-8 SNI finding");
+
+    // The summary must not contain the raw ESC byte. Debug formatting ({:?})
+    // turns 0x1b into the literal escape sequence "\u{1b}" instead.
+    assert!(
+        !f.summary.as_bytes().contains(&0x1b),
+        "summary contains raw ESC byte (terminal injection vector): {:?}",
+        f.summary
+    );
+    assert!(
+        f.summary.contains("\\u{1b}"),
+        "summary should contain escaped ESC sequence \\u{{1b}}, got: {}",
+        f.summary
+    );
+
+    // Hex evidence is unchanged — that's the lossless record.
+    assert!(
+        f.evidence
+            .iter()
+            .any(|e| e.contains("ff1b5b33316d70776e64")),
+        "expected raw bytes in hex evidence, got: {:?}",
+        f.evidence
+    );
+}