In [1]:
!pip -q install scikit-learn

In [2]:
import numpy as np, pandas as pd, re
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import top_k_accuracy_score
from sklearn.pipeline import Pipeline

# Seeded NumPy Generator; synth_tokens takes all of its random draws from it.
rng = np.random.default_rng(seed=42)

In [3]:
# Example data generator

# Toy per-class token "motifs": synth_tokens samples from a class's motif list
# part of the time, which is what gives the classifier signal to learn.
motifs = {
    "memcpy": ["rep_movsb", "mov", "lea", "cmp", "jb"],
    "memset": ["rep_stosb", "mov", "xor", "cmp", "jb"],
    "strlen": ["cmp", "je", "inc", "movzx", "test", "jne"],
    "strcmp": ["cmp", "jne", "movzx", "inc", "test", "je"],
    "aes_encrypt": ["xor", "aesenc", "aesenclast", "movdqu", "pxor"],
    "sha256_update": ["ror", "shr", "xor", "add", "and", "or", "rol"],
    "base64_decode": ["shl", "shr", "and", "or", "cmp", "jl", "sub"],
    "parse_json": ["cmp", "je", "jne", "call_parse", "skip_ws", "isdigit"],
    "inflate": ["bitbuf", "shr", "and", "huff", "loop", "len_dist"],
    "crc32": ["xor", "shr", "crc", "table", "and", "cmp"],
    "rsa_verify": ["modexp", "mul", "mont", "cmp", "bn_add", "bn_sub"],
}

# Class labels, in the motif table's insertion order.
names = list(motifs)

def synth_tokens(name, L=120):
    """Build one synthetic token string for the function class ``name``.

    Draws ``L`` tokens. Each one comes from ``motifs[name]`` with probability
    0.35 and from a fixed pool of generic opcodes otherwise. With probability
    0.5 the pair "prologue"/"epilogue" is appended. The tokens are then
    shuffled and joined with single spaces.

    All randomness is taken from the module-level ``rng``.
    """
    generic_ops = [
        "mov", "add", "sub", "xor", "or", "and", "cmp", "test", "jmp", "call",
        "push", "pop", "lea", "shl", "shr", "rol", "ror", "nop",
    ]
    # The coin flip is evaluated before the chosen branch, so the order of
    # draws from `rng` is: random(), then one choice(), per token.
    tokens = [
        rng.choice(motifs[name]) if rng.random() < 0.35 else rng.choice(generic_ops)
        for _ in range(L)
    ]
    # add a little compiler noise
    if rng.random() < 0.5:
        tokens.extend(["prologue", "epilogue"])
    rng.shuffle(tokens)
    return " ".join(tokens)

# Re-create the generator so this cell is idempotent: re-running it on its own
# rebuilds exactly the same corpus instead of continuing from wherever the
# shared `rng` stream was left. In the cells shown nothing draws from `rng`
# before this point, so a fresh Restart & Run All produces the same data as
# before.
rng = np.random.default_rng(42)

n_per = 500  # synthetic samples generated per function name
rows = [
    {"func_id": f"{nm}_{i}", "tokens": synth_tokens(nm), "name": nm}
    for nm in names
    for i in range(n_per)
]
df = pd.DataFrame(rows)

# Sanity checks on the synthetic corpus.
assert len(df) == n_per * len(names), "unexpected number of rows"
assert df["func_id"].is_unique, "func_id values must be unique"

In [4]:
# Train: TF-IDF over tokens + multiclass classifier

X = df["tokens"].values
y = df["name"].values
classes = np.unique(y)  # sorted label set; fixes the column order of proba_oof

pipe = Pipeline([
    # uni- to tri-grams over the token string
    ("tfidf", TfidfVectorizer(ngram_range=(1,3), min_df=2, max_features=250_000)),
    ("clf", LogisticRegression(max_iter=300, n_jobs=-1))
])

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Out-of-fold probabilities: each row is written by the fold in which that
# sample is held out for validation.
proba_oof = np.zeros((len(df), len(classes)), dtype=np.float32)
for fold, (tr, va) in enumerate(skf.split(X, y), 1):
    pipe.fit(X[tr], y[tr])
    fold_proba = pipe.predict_proba(X[va])
    # align columns to 'classes': predict_proba columns follow pipe.classes_
    # (the labels present in this fold's training split), so scatter them into
    # the global label order. A label missing from training keeps probability 0.
    col_idx = np.searchsorted(classes, pipe.classes_)
    proba = np.zeros((len(va), len(classes)), dtype=fold_proba.dtype)
    proba[:, col_idx] = fold_proba
    proba_oof[va] = proba
    print(f"Fold {fold}: top-1={top_k_accuracy_score(y[va], proba, k=1, labels=classes):.4f} "
          f"top-5={top_k_accuracy_score(y[va], proba, k=5, labels=classes):.4f}")

# Every sample must have received a full probability row from some fold.
assert np.allclose(proba_oof.sum(axis=1), 1.0, atol=1e-4), "unfilled out-of-fold rows"

print("\nOverall:",
      "top-1=", top_k_accuracy_score(y, proba_oof, k=1, labels=classes),
      "top-5=", top_k_accuracy_score(y, proba_oof, k=5, labels=classes))

Fold 1: top-1=0.9100 top-5=1.0000
Fold 2: top-1=0.9173 top-5=1.0000
Fold 3: top-1=0.9109 top-5=1.0000
Fold 4: top-1=0.9082 top-5=1.0000
Fold 5: top-1=0.9173 top-5=1.0000

Overall: top-1= 0.9127272727272727 top-5= 1.0


In [5]:
# Inference helper: show top-k predicted names for a function

# Refit on the full dataset: after the cross-validation loop, `pipe` holds only
# the model fitted on the last fold's training split.
pipe.fit(X, y)

def predict_topk(tokens, k=5):
    """Return the ``k`` most probable function names for one token string.

    Parameters
    ----------
    tokens : str
        Token sequence for a single function, passed to ``pipe`` as a
        one-element batch.
    k : int, default 5
        Number of candidates to return. Values larger than the number of
        classes return every class.

    Returns
    -------
    list of (name, probability) tuples, most probable first.

    Raises
    ------
    ValueError
        If ``k`` is negative (a negative slice bound would otherwise silently
        drop entries from the wrong end).
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")
    p = pipe.predict_proba([tokens])[0]
    # Stable sort so tied probabilities come back in a deterministic order.
    idx = np.argsort(-p, kind="stable")[:k]
    # Column j of predict_proba corresponds to pipe.classes_[j]; take labels
    # from the fitted model so they cannot drift from a separately built array.
    labels = pipe.classes_
    return list(zip(labels[idx], p[idx]))

# Smoke test on the first row of the corpus (a sample the model was fitted on).
example = df["tokens"].iloc[0]
print("\nExample top-5:", predict_topk(example, k=5))


Example top-5: [('memcpy', np.float64(0.9132618773486105)), ('sha256_update', np.float64(0.02268139881640454)), ('memset', np.float64(0.011579633303014761)), ('base64_decode', np.float64(0.009804450395873776)), ('crc32', np.float64(0.009677668926683381))]
