In [None]:
import google.generativeai as genai
import os

genai.configure(api_key="your_api_key", transport="rest")

model = genai.GenerativeModel('gemini-2.0-flash')

response = model.generate_content("你是谁？")
print(response.text)

models/gemini-1.0-pro-vision-latest
models/gemini-pro-vision
models/gemini-1.5-pro-latest
models/gemini-1.5-pro-001
models/gemini-1.5-pro-002
models/gemini-1.5-pro
models/gemini-1.5-flash-latest
models/gemini-1.5-flash-001
models/gemini-1.5-flash-001-tuning
models/gemini-1.5-flash
models/gemini-1.5-flash-002
models/gemini-1.5-flash-8b
models/gemini-1.5-flash-8b-001
models/gemini-1.5-flash-8b-latest
models/gemini-1.5-flash-8b-exp-0827
models/gemini-1.5-flash-8b-exp-0924
models/gemini-2.5-pro-exp-03-25
models/gemini-2.5-pro-preview-03-25
models/gemini-2.5-flash-preview-04-17
models/gemini-2.0-flash-exp
models/gemini-2.0-flash
models/gemini-2.0-flash-001
models/gemini-2.0-flash-exp-image-generation
models/gemini-2.0-flash-lite-001
models/gemini-2.0-flash-lite
models/gemini-2.0-flash-lite-preview-02-05
models/gemini-2.0-flash-lite-preview
models/gemini-2.0-pro-exp
models/gemini-2.0-pro-exp-02-05
models/gemini-exp-1206
models/gemini-2.0-flash-thinking-exp-01-21
models/gemini-2.0-flash-think

In [2]:
import re
def cleaner(code):
    pat = re.compile(r'(/\*([^*]|(\*+[^*/]))*\*+/)|(//.*)')
    code = re.sub(pat, '', code)
    code = re.sub('\n', '', code)
    code = re.sub('\t', '', code)
    return code

In [None]:
import pandas as pd
dataset = "reveal"
m1 = pd.read_pickle(f'../../data/finetune/{dataset}/{dataset}_train.pkl')
m2 = pd.read_pickle(f'../../data/finetune/{dataset}/{dataset}_val.pkl')
m3 = pd.read_pickle(f'../../data/finetune/{dataset}/{dataset}_test.pkl')

for df in [m1, m2, m3]:
    if "functionSource" in df.columns:
        df["func"] = df["functionSource"].apply(cleaner)
        
    if dataset == "draper":
        df["target"] = df["combine"] * 1

    if "label" in df.columns:
        df["target"] = df["label"]

    if dataset == "mvd":
        df["target"] = df["target"].apply(lambda x: 1 if x != 0 else 0)

m1 = m1[["func", "target"]]
m2 = m2[["func", "target"]]
m3 = m3[["func", "target"]]

In [None]:
from openai import OpenAI
import pandas as pd
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

client = OpenAI(
    api_key="your_api_key",
    base_url="https://api.x.ai/v1",
)

def detect_vulnerability(code):
    try:
        response = client.chat.completions.create(
            model="grok-3-latest",
            messages=[
                {"role": "system", "content": "You are a cybersecurity expert analyzing code for vulnerabilities. Respond with '1' if vulnerable or '0' if safe."},
                {"role": "user", "content": f"Does this code contain security vulnerabilities? Respond with only '1' for yes or '0' for no:\n\n{code}"}
            ],
            stream=False
        )
        return int(response.choices[0].message.content.strip())
    except Exception as e:
        print(f"Error analyzing code: {e}")
        return 0

def calculate_metrics(y_true, y_pred, dataset_name):
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    
    print(f"\nMetrics for {dataset_name}:")
    print(f"Accuracy:  {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall:    {recall:.4f}")
    print(f"F1 Score:  {f1:.4f}")
    
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

def process_dataset_multithread(df: pd.DataFrame, name: str, max_workers: int = 10) -> pd.DataFrame:
    results = [None] * len(df)
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_idx = {
            executor.submit(detect_vulnerability, code): idx
            for idx, code in enumerate(df['func'])
        }
        for future in tqdm(as_completed(future_to_idx),
                           total=len(future_to_idx),
                           desc=f"Processing {name} set",
                           ncols=80):
            idx = future_to_idx[future]
            try:
                results[idx] = future.result()
            except Exception as e:
                results[idx] = 0

    df['grok_prediction'] = results
    if 'target' in df.columns:
        metrics = calculate_metrics(df['target'], df['grok_prediction'], name)
        df['grok_match'] = (df['grok_prediction'] == df['target']).astype(int)

    return df

def process_datasets_in_parallel(datasets, names, max_workers: int = 10):
    results = {}
    for df, name in zip(datasets, names):
        print(f"\n--- Start {name} set ---")
        results[name] = process_dataset_multithread(df, name, max_workers=max_workers)
    return results

datasets = [m3]
names = ['test']
results = process_datasets_in_parallel(datasets, names, max_workers=8)

results['test'].to_pickle(f'../../data/finetune/{dataset}/{dataset}_test_with_grok.pkl')

for name, df in results.items():
    calculate_metrics(df['target'], df['grok_prediction'], name)


--- Start test set ---


Processing test set:   9%|█▊                 | 213/2274 [00:27<03:07, 10.96it/s]

Error analyzing code: invalid literal for int() with base 10: "there! I've taken a look at the code you provided. After analyzing it, I’ve determined that there are potential security vulnerabilities, particularly related to improper error handling and resource 


Processing test set:  11%|██▏                | 259/2274 [00:33<03:41,  9.12it/s]

Error analyzing code: invalid literal for int() with base 10: "there, I'm diving into this code snippet to check for any security vulnerabilities. After a thorough look, I've determined that this code is safe. It appears to be a straightforward function for diss


Processing test set:  15%|██▊                | 338/2274 [00:45<03:50,  8.39it/s]

Error analyzing code: invalid literal for int() with base 10: "there! I've taken a look at the code snippet you provided. Based on my analysis, I'm going to have to say:\n\n1\n\nHere's why I flagged this as vulnerable: The code appears to be manipulating bit pos


Processing test set:  20%|███▊               | 450/2274 [00:57<05:37,  5.40it/s]

Error analyzing code: invalid literal for int() with base 10: 'there, thanks for sharing this code snippet. After taking a look, I’ve determined that this code does have a potential security vulnerability. The function `gcry_pk_lookup_name` uses `_gcry_module_lo


Processing test set:  22%|████               | 493/2274 [01:05<06:23,  4.64it/s]

Error analyzing code: invalid literal for int() with base 10: "there! After analyzing the provided code, I've determined that it does contain potential security vulnerabilities. Therefore, my response is:\n\n1\n\nHere's a quick breakdown of the issues I spotted:


Processing test set:  32%|██████▏            | 738/2274 [01:43<03:15,  7.85it/s]

Error analyzing code: invalid literal for int() with base 10: "there! I'm taking a look at this code snippet from what appears to be a GStreamer ASF demuxer. Let me analyze it for potential security vulnerabilities.\n\nAfter reviewing the code, I can see that it


Processing test set:  41%|███████▋           | 924/2274 [02:17<04:57,  4.54it/s]

Error analyzing code: invalid literal for int() with base 10: "there! I've analyzed the code snippet you provided, which appears to be a register read function for a PXA2xx I2S (Inter-IC Sound) controller in an embedded system or emulator context. Let's break do


Processing test set:  45%|████████          | 1026/2274 [02:36<04:05,  5.09it/s]

Error analyzing code: invalid literal for int() with base 10: "1\n\nExplanation: The provided code is a Wireshark dissector for the H.245 protocol, which is used for multimedia communication control in VoIP and video conferencing systems. While the code itself i


Processing test set:  64%|███████████▍      | 1446/2274 [03:24<03:49,  3.60it/s]

Error analyzing code: invalid literal for int() with base 10: 'Assistant: 1'


Processing test set:  65%|███████████▋      | 1484/2274 [03:33<02:52,  4.59it/s]

Error analyzing code: invalid literal for int() with base 10: "1\n\nExplanation: The provided code, which appears to be a DNS packet dissector (likely from Wireshark), has potential vulnerabilities due to insufficient bounds checking and improper handling of inp


Processing test set:  66%|███████████▊      | 1499/2274 [03:34<01:46,  7.30it/s]

Error analyzing code: invalid literal for int() with base 10: 'there! I\'ve taken a look at the code snippet you provided, which appears to be a search function (`mdb_search`) from an LDAP database backend, likely related to OpenLDAP\'s LMDB (Lightning Memory-Ma


Processing test set:  77%|█████████████▉    | 1756/2274 [04:12<00:55,  9.33it/s]

Error analyzing code: invalid literal for int() with base 10: "there, after analyzing the code you've provided, I'm responding with '1' to indicate that yes, there are potential security vulnerabilities present.\n\n1"


Processing test set:  87%|███████████████▋  | 1981/2274 [04:47<00:47,  6.12it/s]

Error analyzing code: invalid literal for int() with base 10: '"analysis": "The provided code appears to be a packet dissector for RSVP (Resource Reservation Protocol) with 3GPP extensions, likely from a tool like Wireshark. After careful review, several potenti


Processing test set:  92%|████████████████▌ | 2096/2274 [05:12<00:25,  6.89it/s]

Error analyzing code: invalid literal for int() with base 10: '1]'


Processing test set:  99%|█████████████████▊| 2257/2274 [05:35<00:01, 10.60it/s]

Error analyzing code: invalid literal for int() with base 10: "there! I've taken a look at the code snippet you provided, which appears to be a set of macros and inline functions for managing stacks in OpenSSL (likely from the `safestack.h` or similar header fil


Processing test set: 100%|█████████████████▉| 2268/2274 [05:36<00:00, 10.72it/s]

Error analyzing code: invalid literal for int() with base 10: "there! I've taken a look at the code snippet you provided, which appears to be related to initializing a DMA (Direct Memory Access) controller for a SPARC32 system in an emulation environment like QE


Processing test set: 100%|██████████████████| 2274/2274 [05:37<00:00,  6.74it/s]


Metrics for test:
Accuracy:  0.2700
Precision: 0.1054
Recall:    0.8304
F1 Score:  0.1871

Metrics for test:
Accuracy:  0.2700
Precision: 0.1054
Recall:    0.8304
F1 Score:  0.1871



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['grok_prediction'] = results
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['grok_match'] = (df['grok_prediction'] == df['target']).astype(int)
