In [1]:
from clang import *
from tokenizers import NormalizedString,PreTokenizedString
from tokenizers.models import BPE
from tokenizers import Tokenizer, normalizers
from tokenizers.pre_tokenizers import PreTokenizer
from tokenizers.normalizers import StripAccents,Replace
from tokenizers import processors
from tokenizers.processors import TemplateProcessing
import threading
from typing import List

class MyTokenizer:
    cidx = cindex.Index.create()

    def __init__(self, timeout=5):  # 设置超时时间
        self.timeout = timeout

    def clang_split(self, i: int, normalized_string: NormalizedString) -> List[NormalizedString]:
        """
        使用 Clang 对代码进行分词，增加超时机制
        """
        result = []
        exception = None

        def parse():
            nonlocal result, exception
            try:
                tok = []
                tu = self.cidx.parse(
                    'tmp.c',
                    args=[''],  
                    unsaved_files=[('tmp.c', str(normalized_string.original))],  
                    options=0
                )
                for t in tu.get_tokens(extent=tu.cursor.extent):
                    spelling = t.spelling.strip()
                    if spelling == '':
                        continue
                    tok.append(NormalizedString(spelling))
                result = tok
            except Exception as e:
                exception = e

        # 创建线程
        thread = threading.Thread(target=parse)
        thread.start()
        thread.join(self.timeout)  # 等待超时时间

        if thread.is_alive():  # 超时检查
            print(f"Timeout occurred while parsing: {normalized_string.original[:100]}...")
            thread.join(0)  # 跳过此任务
            return []
        if exception:
            print(f"Error during Clang parsing: {exception}")
            return []

        return result

    def pre_tokenize(self, pretok: PreTokenizedString):
        """
        对预分词字符串进行处理，调用 Clang 分词器
        """
        def preprocess_and_split(i: int, normalized_string: NormalizedString) -> List[NormalizedString]:
            return self.clang_split(i, normalized_string)
        
        pretok.split(preprocess_and_split)

import re
def cleaner(code):
    pat = re.compile(r'(/\*([^*]|(\*+[^*/]))*\*+/)|(//.*)')
    code = re.sub(pat, '', code)
    code = re.sub('\n', '', code)
    code = re.sub('\t', '', code)
    return code

In [None]:
import pandas as pd
dataset = "reveal"
m1 = pd.read_pickle(f'../../data/finetune/{dataset}/{dataset}_train.pkl')
m2 = pd.read_pickle(f'../../data/finetune/{dataset}/{dataset}_val.pkl')
m3 = pd.read_pickle(f'../../data/finetune/{dataset}/{dataset}_test.pkl')

for df in [m1, m2, m3]:
    if "functionSource" in df.columns:
        df["func"] = df["functionSource"].apply(cleaner)
        
    if dataset == "draper":
        df["target"] = df["combine"] * 1

    if "label" in df.columns:
        df["target"] = df["label"]

    if dataset == "mvd":
        df["target"] = df["target"].apply(lambda x: 1 if x != 0 else 0)

m1 = m1[["func", "target"]]
m2 = m2[["func", "target"]]
m3 = m3[["func", "target"]]


In [6]:
m1.head()

Unnamed: 0,func,target
11902,OM_uint32 kg_sync_ccache_name ( krb5_context c...,0
11648,static void write_bootloader ( CPUMIPSState * ...,0
261,static bool search_header ( struct message_sea...,0
4192,static int qemuAgentIOProcessLine ( qemuAgentP...,0
4548,static char * default_opaque_literal_tag ( tvb...,0


In [None]:
from openai import OpenAI
import pandas as pd
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor, as_completed
import torch

tqdm.pandas()

client = OpenAI(api_key="your_api_key", base_url="https://api.deepseek.com")

def detect_vulnerability(code, gpu_id):
    try:
        device = torch.device(f'cuda:{gpu_id}' if torch.cuda.is_available() else 'cpu')
        response = client.chat.completions.create(
            model="deepseek-chat",
            messages=[
                {"role": "system", "content": "You are a cybersecurity expert analyzing code for vulnerabilities. Respond with '1' if vulnerable or '0' if safe."},
                {"role": "user", "content": f"Does this code contain security vulnerabilities? Respond with only '1' for yes or '0' for no:\n\n{code}"}
            ],
            stream=False
        )
        
        result = response.choices[0].message.content.strip()
        return int(result)
    
    except Exception as e:
        print(f"Error analyzing code with DeepSeek: {e}")
        return 0

def calculate_metrics(y_true, y_pred, dataset_name):
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    
    print(f"\nMetrics for {dataset_name}:")
    print(f"Accuracy:  {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall:    {recall:.4f}")
    print(f"F1 Score:  {f1:.4f}")
    
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

def process_single_task(func, gpu_id):
    return detect_vulnerability(func, gpu_id)

def process_dataset(df, name, gpu_count=8):
    results = []
    with ProcessPoolExecutor(max_workers=gpu_count) as executor:
        futures = []
        
        for idx, func in enumerate(tqdm(df['func'], desc=f"Processing {name} set", position=0, leave=True)):
            gpu_id = idx % gpu_count
            futures.append(executor.submit(process_single_task, func, gpu_id))

        for future in as_completed(futures):
            result = future.result()
            results.append(result)

    df['deepseek_prediction'] = results
    
    if 'target' in df.columns:
        metrics = calculate_metrics(df['target'], df['deepseek_prediction'], name)
        df['deepseek_match'] = (df['deepseek_prediction'] == df['target']).astype(int)
    
    return df

def process_datasets_sequentially(datasets):
    results = {}
    for idx, (df, name) in enumerate(zip(datasets, ['test'])):
        print(f"\nProcessing {name} set...")
        results[name] = process_dataset(df, name)
    
    return results

datasets = [m3]
results = process_datasets_sequentially(datasets)

results['test'].to_pickle(f'../../data/finetune/{dataset}/{dataset}_test_with_deepseek.pkl')

print("\n=== Final Results Summary ===")
for name, df in results.items():
    metrics = calculate_metrics(df['target'], df['deepseek_prediction'], name)
    print(f"\n{name.upper()} Set:")
    for metric, value in metrics.items():
        print(f"{metric.capitalize():<10}: {value:.4f}")


Processing train set...


Processing train set: 100%|██████████| 18187/18187 [00:00<00:00, 20059.07it/s]


Error analyzing code with DeepSeek: invalid literal for int() with base 10: '1\n\nThis code contains potential security vulnerabilities, including:\n1. Lack of proper input validation which could lead to injection attacks\n2. Potential memory leaks if error paths are taken (t
Error analyzing code with DeepSeek: Error code: 400 - {'error': {'message': "This model's maximum context length is 65536 tokens. However, you requested 84570 tokens (84570 in the messages, 0 in the completion). Please reduce the length of the messages or completion.", 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_request_error'}}
Error analyzing code with DeepSeek: invalid literal for int() with base 10: "This code appears to be part of the MySQL/MyISAM storage engine's repair functionality. After careful analysis, here are the potential security concerns:\n\n1. The code uses unsafe functions like `pr
Error analyzing code with DeepSeek: invalid literal for int() with base 10: 'This code appears

Processing val set: 100%|██████████| 2273/2273 [00:00<00:00, 27674.21it/s]



Metrics for val:
Accuracy:  0.8284
Precision: 0.1429
Recall:    0.1714
F1 Score:  0.1558

Processing test set...


Processing test set: 100%|██████████| 2274/2274 [00:00<00:00, 23112.42it/s]


Error analyzing code with DeepSeek: Error code: 400 - {'error': {'message': "This model's maximum context length is 65536 tokens. However, you requested 84570 tokens (84570 in the messages, 0 in the completion). Please reduce the length of the messages or completion.", 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_request_error'}}

Metrics for test:
Accuracy:  0.8087
Precision: 0.1161
Recall:    0.1348
F1 Score:  0.1247

=== Final Results Summary ===

Metrics for train:
Accuracy:  0.8183
Precision: 0.1345
Recall:    0.1539
F1 Score:  0.1436

TRAIN Set:
Accuracy  : 0.8183
Precision : 0.1345
Recall    : 0.1539
F1        : 0.1436

Metrics for val:
Accuracy:  0.8284
Precision: 0.1429
Recall:    0.1714
F1 Score:  0.1558

VAL Set:
Accuracy  : 0.8284
Precision : 0.1429
Recall    : 0.1714
F1        : 0.1558

Metrics for test:
Accuracy:  0.8087
Precision: 0.1161
Recall:    0.1348
F1 Score:  0.1247

TEST Set:
Accuracy  : 0.8087
Precision : 0.1161
Recall    : 0.1348
F1        :

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['deepseek_prediction'] = results
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['deepseek_match'] = (df['deepseek_prediction'] == df['target']).astype(int)
  values = np.array([convert(v) for v in values])
