In [1]:
!pip -q install scikit-learn umap-learn hdbscan

In [2]:
import warnings, numpy as np, pandas as pd
# Install a blanket "ignore" filter: every warning raised after this line is
# suppressed, presumably to keep cell output readable.
# NOTE(review): this also hides deprecation/convergence warnings that can matter
# while debugging — consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.ensemble import IsolationForest
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize

# Optional backends: probe each import once and record the outcome in a flag so
# later cells can pick a fallback instead of failing.
try:
    import umap
except Exception:
    HAS_UMAP = False
else:
    HAS_UMAP = True

try:
    import hdbscan
except Exception:
    HAS_HDBSCAN = False
else:
    HAS_HDBSCAN = True

# Single seeded generator shared by the synthetic-data cells.
rng = np.random.default_rng(42)

2026-02-17 11:33:56.196191: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1771328036.428167      17 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1771328036.495300      17 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1771328037.047204      17 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1771328037.047253      17 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1771328037.047256      17 computation_placer.cc:177] computation placer alr

In [3]:
# Synthetic "binaries": every sample is assigned to one of five families, each of
# which is meant to carry a distinct byte pattern that high-dimensional n-gram
# features can separate.

# family label -> probability of drawing that family
family_weights = {
    "fam_A": 0.22,
    "fam_B": 0.20,
    "fam_C": 0.19,
    "fam_D": 0.21,
    "fam_E": 0.18,
}
families = np.array(list(family_weights.keys()))
p = np.array(list(family_weights.values()))

n = 1200  # number of samples to generate
y = rng.choice(families, size=n, p=p)  # one family label per sample

# Motif injected per family: label -> (start offset, end offset, byte alphabet).
# A one-byte alphabet is written as a constant run and draws no random numbers.
_FAMILY_MOTIFS = {
    "fam_A": (1000, 1200, (0x90,)),                    # NOP-sled like
    "fam_B": (2000, 2200, (0xE8, 0xE9, 0xEB)),         # call/jmp-ish
    "fam_C": (3000, 3200, (0x55, 0x8B, 0xEC)),         # prologue-ish
    "fam_D": (4000, 4200, (0x00, 0xFF)),               # packed-like extremes
    "fam_E": (500, 800, (0xDE, 0xAD, 0xBE, 0xEF)),     # signature-like
}


def make_bytes(label, L=6000, generator=None):
    """Generate one synthetic "binary" as a uint8 array with a family-specific motif.

    The buffer is filled with uniformly random bytes, then the window assigned to
    `label` is overwritten with that family's motif. Labels without a motif entry
    yield the plain random buffer.

    Parameters
    ----------
    label : str
        Family name ("fam_A" ... "fam_E"); compared by its string value.
    L : int, default 6000
        Length of the generated buffer. When the buffer is shorter than the
        family's motif window, the motif is clipped to the part that fits
        instead of raising a shape-mismatch error.
    generator : numpy.random.Generator, optional
        Source of randomness. Defaults to the notebook-level `rng`, which keeps
        the existing call `make_bytes(label)` and its random stream unchanged.

    Returns
    -------
    numpy.ndarray
        1-D array of dtype uint8 and length `L`.
    """
    gen = rng if generator is None else generator
    # Draw order matters for reproducibility: random background first, motif second.
    b = gen.integers(0, 256, size=L, dtype=np.uint8)

    motif = _FAMILY_MOTIFS.get(str(label))
    if motif is None:
        return b

    start, stop, alphabet = motif
    # Size of the window that actually exists in this buffer (0 when L <= start).
    width = b[start:stop].size
    if width == 0:
        return b

    if len(alphabet) == 1:
        b[start:stop] = alphabet[0]
    else:
        b[start:stop] = gen.choice(list(alphabet), size=width).astype(np.uint8)
    return b

# Bytes are rendered as a whitespace-separated "text" stream (00 ff 1a ...) so that
# HashingVectorizer can build n-grams over the tokens.
def bytes_to_token_string(arr):
    """Return the values of `arr` as space-separated, zero-padded lowercase hex tokens.

    Each value is formatted with the "02x" spec, so values below 0x10 get a
    leading zero; an empty input yields an empty string.
    """
    return " ".join(format(value, "02x") for value in arr)

# One token document per sample, generated in label order; map() is lazy, so each
# buffer is created and immediately tokenised before the next one is drawn.
docs = list(map(bytes_to_token_string, map(make_bytes, y)))
ids = np.arange(n)  # sample ids 0..n-1, aligned with docs and y

print("Generated docs:", len(docs))

Generated docs: 1200


In [4]:
# High-dimensional features: hashed byte n-grams.
# Word-level 2- to 4-grams over the hex tokens are hashed into 2**18 (262,144)
# sparse columns; the vectorizer itself applies no normalisation (norm=None) and
# keeps all hashed values non-negative (alternate_sign=False).

vec = HashingVectorizer(
    analyzer="word",   # our "words" are byte tokens like "4f"
    ngram_range=(2,4),
    n_features=2**18,
    norm=None,
    alternate_sign=False,
)

# Hash the documents, then scale every row to unit L2 norm (cosine-friendly).
X = normalize(vec.transform(docs), norm="l2")   # sparse matrix: (n_samples, 262144)

density = X.nnz / (X.shape[0]*X.shape[1])  # fraction of stored (non-zero) entries
print("Feature matrix:", X.shape, "sparsity ~", 1 - density)

Feature matrix: (1200, 262144) sparsity ~ 0.9365361817677815


In [5]:
# Dimensionality reduction for visualization
# Projects the feature matrix X down to two columns and stores them, together with
# the sample ids and family labels, in the `emb` DataFrame.

if HAS_UMAP:
    reducer = umap.UMAP(
        n_neighbors=20,
        min_dist=0.05,
        metric="cosine",
        random_state=42
    )
    X2 = reducer.fit_transform(X)
    dr_name = "UMAP"
else:
    # Fallback when umap could not be imported: TruncatedSVD does not center the
    # data, so it can factorise the sparse matrix directly.
    svd = TruncatedSVD(n_components=2, random_state=42)
    X2 = svd.fit_transform(X)
    dr_name = "TruncatedSVD"

emb = pd.DataFrame({"id": ids, "x": X2[:,0], "y": X2[:,1], "family": y})
print("\nEmbedding head:")
print(emb.head())


Embedding head:
   id          x          y family
0   0   4.297604  15.273597  fam_D
1   1  -0.750693   0.034496  fam_C
2   2  -0.007208  -9.244028  fam_E
3   3   4.664793  16.275764  fam_D
4   4  13.637394  -1.371150  fam_A


In [6]:
# Clustering
# Both branches cluster the 2-D embedding X2 (not the full hashed feature matrix)
# and write the resulting labels into emb["cluster"].

if not HAS_HDBSCAN:
    # Fallback: k-means with one cluster per known family.
    km = KMeans(n_clusters=len(families), n_init="auto", random_state=42)
    clusters = km.fit_predict(X2)
    cl_name = "KMeans(on-2D)"
else:
    # Density-based clustering with euclidean distance on the 2-D points.
    clusterer = hdbscan.HDBSCAN(metric="euclidean", min_cluster_size=30)
    clusters = clusterer.fit_predict(X2)
    cl_name = "HDBSCAN(on-2D)"

emb["cluster"] = clusters
print(f"\nClustering method: {cl_name}")
# Show the sizes of (at most) the ten largest clusters.
print("Cluster counts:\n", emb["cluster"].value_counts().head(10))


Clustering method: HDBSCAN(on-2D)
Cluster counts:
 cluster
0    277
1    256
2    230
3    220
4    217
Name: count, dtype: int64


In [7]:
# Outlier detection (novel binaries / weird packed samples)
# The isolation forest is fitted on a 50-component TruncatedSVD projection of X
# rather than on the 2-D visualisation embedding.

svd_mid = TruncatedSVD(n_components=50, random_state=42)
X50 = svd_mid.fit_transform(X)

iso = IsolationForest(n_estimators=300, contamination=0.03, random_state=42)
# score_samples is lower for more abnormal points; negate it so that
# higher => more anomalous.
out_score = -iso.fit(X50).score_samples(X50)
emb["outlier_score"] = out_score

print("\nTop outliers:")
print(emb.sort_values("outlier_score", ascending=False).head(10)[["id","family","cluster","outlier_score"]])


Top outliers:
        id family  cluster  outlier_score
1076  1076  fam_C        4       0.496949
978    978  fam_C        4       0.495430
433    433  fam_E        3       0.493737
489    489  fam_C        4       0.490849
1111  1111  fam_C        4       0.489304
614    614  fam_B        2       0.489035
380    380  fam_C        4       0.487590
153    153  fam_C        4       0.487313
243    243  fam_C        4       0.487166
259    259  fam_C        4       0.486843


In [8]:
# Nearest neighbors (similarity search)
# Rank all samples by cosine similarity to one query row of the high-dimensional
# matrix X and print the six best matches; the query itself is part of the ranking.

q = 0  # query sample id index
sims = cosine_similarity(X[q], X).ravel()
nn = np.argsort(-sims)[:6]  # indices of the six largest similarities, best first

print(f"\nNearest neighbors for sample {q} (true family={y[q]}):")
for neighbor in nn:
    # Cluster label of the first emb row whose id equals this neighbour.
    neighbor_cluster = emb.loc[emb["id"] == neighbor, "cluster"].iloc[0]
    print(f"  id={neighbor:4d}  family={y[neighbor]:6s}  sim={sims[neighbor]:.4f}  cluster={neighbor_cluster}")


Nearest neighbors for sample 0 (true family=fam_D):
  id=   0  family=fam_D   sim=1.0000  cluster=1
  id= 410  family=fam_D   sim=0.5292  cluster=1
  id=  54  family=fam_D   sim=0.5279  cluster=1
  id= 707  family=fam_D   sim=0.5271  cluster=1
  id= 497  family=fam_D   sim=0.5271  cluster=1
  id= 563  family=fam_D   sim=0.5262  cluster=1
