In [1]:
!pip -q install scikit-learn

In [2]:
import numpy as np, pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import top_k_accuracy_score

rng = np.random.default_rng(0)

In [3]:
# Example "variables" with context

names = ["i", "j", "idx", "len", "size", "count", "flag", "mask",
         "buf", "src", "dst", "key", "ctx", "path", "mode", "fd", "err", "rc"]

# Simple synthetic contexts (pretend from decompiler token window)
templates = {
    "i":     ["for ( i = 0 ; i < N ; i ++ )", "i = i + 1", "if ( i < limit )"],
    "j":     ["for ( j = 0 ; j < M ; j ++ )", "j = j + 1", "A [ i ] [ j ]"],
    "idx":   ["idx = find ( buf , key )", "buf [ idx ]", "idx < count"],
    "len":   ["memcpy ( dst , src , len )", "len = strlen ( str )", "read ( fd , buf , len )"],
    "size":  ["malloc ( size )", "realloc ( buf , size )", "size = n * sizeof"],
    "count": ["count ++", "if ( count == 0 )", "for ( k < count )"],
    "flag":  ["if ( flag )", "flag = 1", "flag &= mask"],
    "mask":  ["x & mask", "mask = 0xff", "flags | mask"],
    "buf":   ["read ( fd , buf , len )", "buf [ i ]", "memset ( buf , 0 , size )"],
    "src":   ["memcpy ( dst , src , len )", "src [ i ]", "src_ptr"],
    "dst":   ["memcpy ( dst , src , len )", "dst [ i ]", "write ( fd , dst , len )"],
    "key":   ["encrypt ( ctx , key )", "find ( buf , key )", "key_len"],
    "ctx":   ["init ( ctx )", "encrypt ( ctx , key )", "ctx->state"],
    "path":  ["fopen ( path , mode )", "stat ( path , & st )", "path = argv [ 1 ]"],
    "mode":  ["fopen ( path , mode )", "mode = \"rb\"", "chmod ( path , mode )"],
    "fd":    ["fd = open ( path , flags )", "read ( fd , buf , len )", "close ( fd )"],
    "err":   ["err = errno", "if ( err != 0 )", "return err"],
    "rc":    ["rc = func ( )", "if ( rc < 0 )", "return rc"],
}

rows = []
n_per = 400
for nm in names:
    for _ in range(n_per):
        ctx = rng.choice(templates[nm])
        # add some noise tokens
        noise = " ".join(rng.choice(["tmp","v1","v2","sub_401000","(",")",";","=","+","-","&","|"], size=10))
        rows.append({"context": ctx + " " + noise, "name": nm})

df = pd.DataFrame(rows)

X = df["context"].values
y = df["name"].values
classes = np.unique(y)

In [4]:
# train: TF-IDF + Logistic Regression

pipe = Pipeline([
    ("tfidf", TfidfVectorizer(ngram_range=(1,3), min_df=2, max_features=200_000)),
    ("clf", LogisticRegression(max_iter=300, n_jobs=-1))
])

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
proba_oof = np.zeros((len(df), len(classes)), dtype=np.float32)

for fold, (tr, va) in enumerate(skf.split(X, y), 1):
    pipe.fit(X[tr], y[tr])
    proba = pipe.predict_proba(X[va])
    proba_oof[va] = proba
    print(f"Fold {fold}: top1={top_k_accuracy_score(y[va], proba, k=1, labels=classes):.3f} "
          f"top5={top_k_accuracy_score(y[va], proba, k=5, labels=classes):.3f}")

print("\nOverall:",
      "top1=", top_k_accuracy_score(y, proba_oof, k=1, labels=classes),
      "top5=", top_k_accuracy_score(y, proba_oof, k=5, labels=classes))

pipe.fit(X, y)

def infer_var_name(context, k=5):
    p = pipe.predict_proba([context])[0]
    idx = np.argsort(-p)[:k]
    return list(zip(classes[idx], p[idx]))

Fold 1: top1=0.847 top5=1.000
Fold 2: top1=0.849 top5=1.000
Fold 3: top1=0.851 top5=0.999
Fold 4: top1=0.854 top5=0.999
Fold 5: top1=0.852 top5=0.999

Overall: top1= 0.8506944444444444 top5= 0.9993055555555556


In [5]:
print("\nExample inference:", infer_var_name("memcpy ( dst , src , n ) ; n = strlen ( str ) ;", k=5))


Example inference: [('len', np.float64(0.8140948889006431)), ('src', np.float64(0.09325166675385387)), ('dst', np.float64(0.08095851075079097)), ('ctx', np.float64(0.0009454688458727864)), ('err', np.float64(0.000923028202322062))]
