In [None]:
import google.generativeai as genai
import os

genai.configure(api_key="your_api_key", transport="rest")

model = genai.GenerativeModel('gemini-2.0-flash')

response = model.generate_content("你是谁？")
print(response.text)

我是一个大型语言模型，由 Google 训练。



In [1]:
import re
def cleaner(code):
    pat = re.compile(r'(/\*([^*]|(\*+[^*/]))*\*+/)|(//.*)')
    code = re.sub(pat, '', code)
    code = re.sub('\n', '', code)
    code = re.sub('\t', '', code)
    return code

In [None]:
import pandas as pd
dataset = "crossvul"
m1 = pd.read_pickle(f'../../data/finetune/{dataset}/{dataset}_train.pkl')
m2 = pd.read_pickle(f'../../data/finetune/{dataset}/{dataset}_val.pkl')
m3 = pd.read_pickle(f'../../data/finetune/{dataset}/{dataset}_test.pkl')

for df in [m1, m2, m3]:
    if "functionSource" in df.columns:
        df["func"] = df["functionSource"].apply(cleaner)
        
    if dataset == "draper":
        df["target"] = df["combine"] * 1

    if "label" in df.columns:
        df["target"] = df["label"]

    if dataset == "mvd":
        df["target"] = df["target"].apply(lambda x: 1 if x != 0 else 0)

m1 = m1[["func", "target"]]
m2 = m2[["func", "target"]]
m3 = m3[["func", "target"]]

In [None]:
import os
import pandas as pd
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import google.generativeai as genai
from datetime import datetime

genai.configure(api_key="your_api_key", transport="rest")
model = genai.GenerativeModel('gemini-2.0-flash')

def detect_vulnerability(code: str) -> int:
    prompt = (
        "You are a cybersecurity expert analyzing code for vulnerabilities. "
        "Respond with '1' if vulnerable or '0' if safe.\n\n" + code
    )
    try:
        response = model.generate_content(prompt)
        return int(response.text.strip())
    except Exception as e:
        return 0

def calculate_metrics(y_true, y_pred, dataset_name):
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    
    print(f"\nMetrics for {dataset_name}:")
    print(f"Accuracy:  {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall:    {recall:.4f}")
    print(f"F1 Score:  {f1:.4f}")
    
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

def process_dataset_multithread(df: pd.DataFrame, name: str, max_workers: int = 10) -> pd.DataFrame:
    results = [None] * len(df)
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_idx = {
            executor.submit(detect_vulnerability, code): idx
            for idx, code in enumerate(df['func'])
        }
        for future in tqdm(as_completed(future_to_idx),
                           total=len(future_to_idx),
                           desc=f"Processing {name} set",
                           ncols=80):
            idx = future_to_idx[future]
            try:
                results[idx] = future.result()
            except Exception:
                results[idx] = 0

    df['gemini_prediction'] = results
    if 'target' in df.columns:
        metrics = calculate_metrics(df['target'], df['gemini_prediction'], name)
        df['gemini_match'] = (df['gemini_prediction'] == df['target']).astype(int)

    return df

def process_datasets_in_parallel(datasets, names, max_workers: int = 10):
    results = {}
    for df, name in zip(datasets, names):
        print(f"\n--- Start {name} set ---")
        results[name] = process_dataset_multithread(df, name, max_workers=max_workers)
    return results

datasets = [m3]
names = ['test']
results = process_datasets_in_parallel(datasets, names, max_workers=8)
results['test'].to_pickle(f'../../data/finetune/{dataset}/{dataset}_test_with_gemini.pkl')
for name, df in results.items():
    calculate_metrics(df['target'], df['gemini_prediction'], name)


--- Start test set ---


Processing test set: 100%|████████████████| 13305/13305 [19:24<00:00, 11.43it/s]


Metrics for test:
Accuracy:  0.6445
Precision: 0.0788
Recall:    0.5470
F1 Score:  0.1378

Metrics for test:
Accuracy:  0.6445
Precision: 0.0788
Recall:    0.5470
F1 Score:  0.1378



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['gemini_prediction'] = results
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['gemini_match'] = (df['gemini_prediction'] == df['target']).astype(int)
