# Binary Code Similarity Detection

In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


# df = pd.read_csv("/kaggle/input/YOUR_DATASET/functions.csv")

df = pd.DataFrame({
    "func_id": ["A1", "A2", "A3", "B1", "B2", "B3"],
    "asm": [
        "push rbp; mov rbp, rsp; mov eax, edi; add eax, esi; pop rbp; ret",
        "push rbp; mov rbp, rsp; cmp edi, 10; jle L1; mov eax, 1; jmp L2; L1: mov eax, 0; L2: pop rbp; ret",
        "push rbp; mov rbp, rsp; mov rax, [rbp-8]; xor rax, rax; mov [rbp-8], rax; pop rbp; ret",
        "mov eax, edi; lea eax, [eax+esi]; ret",
        "cmp edi, 0x0a; jg Lx; xor eax, eax; ret; Lx: mov eax, 1; ret",
        "mov rax, [rbp-0x8]; xor rax, rax; mov [rbp-0x8], rax; ret"
    ]
})

In [2]:
# 2. Normalize assembly

REGEX_REG = re.compile(r"\b(r(?:[abcd]x|[sb]p|[sd]i|[0-9]{1,2})|e(?:[abcd]x|[sb]p|[sd]i)|[abcd][lh]|[sd]il|[sb]pl)\b", re.IGNORECASE)
REGEX_IMM = re.compile(r"(?<![\w])(?:0x[0-9a-f]+|\d+)(?![\w])", re.IGNORECASE)
REGEX_MEM = re.compile(r"\[[^\]]+\]")

def normalize_asm(s):
    s = s.lower()
    s = REGEX_MEM.sub(" MEM ", s)
    s = REGEX_REG.sub(" REG ", s)
    s = REGEX_IMM.sub(" IMM ", s)
    s = re.sub(r"[^a-z0-9_]+", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

df["norm"] = df["asm"].apply(normalize_asm)

In [3]:
# 3. TF-IDF vectorization

vectorizer = TfidfVectorizer(
    analyzer="word",
    ngram_range=(2, 4),
    min_df=1,
    max_df=1.0
)

X = vectorizer.fit_transform(df["norm"])

In [4]:
# 4. Compute similarity matrix

S = cosine_similarity(X, X)

In [5]:
# 5. Build similarity results table

results = []

for i, query_id in enumerate(df["func_id"]):
    sims = S[i].copy()
    sims[i] = -1  # exclude self
    top_idx = np.argsort(-sims)[:3]  # top 3 matches
    
    for rank, j in enumerate(top_idx, 1):
        results.append({
            "query_func": query_id,
            "match_func": df.iloc[j]["func_id"],
            "similarity_score": round(float(sims[j]), 4),
            "rank": rank
        })

results_df = pd.DataFrame(results)

In [6]:
# 6. Show merged results

print("====== Similarity Results ======")
display(results_df.sort_values(["query_func", "rank"]))



Unnamed: 0,query_func,match_func,similarity_score,rank
0,A1,A3,0.2044,1
1,A1,A2,0.0512,2
2,A1,B1,0.0,3
3,A2,A1,0.0512,1
4,A2,A3,0.0469,2
5,A2,B1,0.0,3
6,A3,B3,0.2654,1
7,A3,A1,0.2044,2
8,A3,A2,0.0469,3
9,B1,A1,0.0,1
