In [3]:
import ast
import os
import re

import pandas as pd

In [4]:
# Make sure the output folder for the security dataset exists.
# exist_ok=True keeps re-runs of this cell from failing when it is already there.
dataset_dir = "datasets/security"
os.makedirs(dataset_dir, exist_ok=True)
print(f"Created folder: {dataset_dir}")


Created folder: datasets/security


In [5]:
from datasets import load_dataset

# Fetch the defect-detection dataset from Hugging Face
raw_dataset = load_dataset(path="code_x_glue_cc_defect_detection")

# Overview of the available splits
print(raw_dataset)

# Peek at the first training record (rendered as the cell output)
raw_dataset["train"][0]

DatasetDict({
    train: Dataset({
        features: ['id', 'func', 'target', 'project', 'commit_id'],
        num_rows: 21854
    })
    validation: Dataset({
        features: ['id', 'func', 'target', 'project', 'commit_id'],
        num_rows: 2732
    })
    test: Dataset({
        features: ['id', 'func', 'target', 'project', 'commit_id'],
        num_rows: 2732
    })
})


{'id': 0,
 'func': 'static av_cold int vdadec_init(AVCodecContext *avctx)\n\n{\n\n    VDADecoderContext *ctx = avctx->priv_data;\n\n    struct vda_context *vda_ctx = &ctx->vda_ctx;\n\n    OSStatus status;\n\n    int ret;\n\n\n\n    ctx->h264_initialized = 0;\n\n\n\n    /* init pix_fmts of codec */\n\n    if (!ff_h264_vda_decoder.pix_fmts) {\n\n        if (kCFCoreFoundationVersionNumber < kCFCoreFoundationVersionNumber10_7)\n\n            ff_h264_vda_decoder.pix_fmts = vda_pixfmts_prior_10_7;\n\n        else\n\n            ff_h264_vda_decoder.pix_fmts = vda_pixfmts;\n\n    }\n\n\n\n    /* init vda */\n\n    memset(vda_ctx, 0, sizeof(struct vda_context));\n\n    vda_ctx->width = avctx->width;\n\n    vda_ctx->height = avctx->height;\n\n    vda_ctx->format = \'avc1\';\n\n    vda_ctx->use_sync_decoding = 1;\n\n    vda_ctx->use_ref_buffer = 1;\n\n    ctx->pix_fmt = avctx->get_format(avctx, avctx->codec->pix_fmts);\n\n    switch (ctx->pix_fmt) {\n\n    case AV_PIX_FMT_UYVY422:\n\n        vda_

In [6]:
# Materialise each split as a pandas DataFrame
df_train, df_valid, df_test = (
    pd.DataFrame(raw_dataset[split]) for split in ("train", "validation", "test")
)

# Write the untouched splits to disk (no cleaning is applied at this stage)
for raw_frame, csv_stem in ((df_train, "train"), (df_valid, "valid"), (df_test, "test")):
    raw_frame.to_csv(f"datasets/security/{csv_stem}.csv", index=False)

print("Dataset saved as CSV files.")


Dataset saved as CSV files.


In [7]:
# Whitespace tokenizer (input for a CNN or an embedding layer)
def tokenize_func(func_str):
    """Split a function's source text into tokens on runs of whitespace.

    Args:
        func_str: Source code of one function as a string.

    Returns:
        list[str]: The whitespace-separated pieces, in order. Empty or
        whitespace-only input yields an empty list.
    """
    whitespace_tokens = func_str.split()
    return whitespace_tokens

# Tokenize the source text of every split
for split_frame in (df_train, df_valid, df_test):
    split_frame["tokens"] = split_frame["func"].apply(tokenize_func)

# Toy vocabulary for illustration: looking up an unseen token registers it
# with the next free index (the current size of the mapping).
from collections import defaultdict
vocab = defaultdict(lambda: len(vocab))
vocab["<PAD>"]  # first lookup on the fresh mapping, so <PAD> gets index 0

0

In [8]:
# Reserve an id for out-of-vocabulary tokens before any split is encoded.
# vocab is a defaultdict, so this lookup registers "<UNK>" if it is not there yet.
UNK_ID = vocab["<UNK>"]


def encode(tokens, grow=True):
    """Map a list of tokens to integer ids using the shared ``vocab``.

    Args:
        tokens: Iterable of string tokens.
        grow: When True (the default, matching the previous behaviour) unseen
            tokens are added to ``vocab`` and receive a fresh id. When False
            the vocabulary is left untouched and unseen tokens map to
            ``UNK_ID``.

    Returns:
        list[int]: One id per input token, in order.
    """
    if grow:
        # Indexing the defaultdict registers unseen tokens as a side effect.
        return [vocab[token] for token in tokens]
    # .get() never triggers the default factory, so the vocabulary stays fixed.
    return [vocab.get(token, UNK_ID) for token in tokens]


# Only the training split is allowed to extend the vocabulary. Validation and
# test are encoded against the frozen vocabulary so that its size and ids do
# not depend on evaluation data.
df_train["input_ids"] = df_train["tokens"].apply(encode)
df_valid["input_ids"] = df_valid["tokens"].apply(lambda toks: encode(toks, grow=False))
df_test["input_ids"] = df_test["tokens"].apply(lambda toks: encode(toks, grow=False))

In [9]:
# Truncate / pad input_ids to a fixed length (MAX_LEN unless overridden)
MAX_LEN = 512
def pad(seq, max_len=None, pad_id=0):
    """Truncate or right-pad a sequence of ids to exactly ``max_len`` items.

    Args:
        seq: Sliceable sequence of token ids (list, tuple, ...).
        max_len: Target length. ``None`` (default) means the module-level
            ``MAX_LEN``, looked up at call time.
        pad_id: Value appended to sequences shorter than ``max_len``.
            Defaults to 0.

    Returns:
        list: A new list of length ``max_len``; the input is not modified.

    Raises:
        ValueError: If ``max_len`` is negative.
    """
    if max_len is None:
        # Resolved here rather than in the signature so a later change to
        # MAX_LEN is picked up without redefining the function.
        max_len = MAX_LEN
    if max_len < 0:
        raise ValueError("max_len must be non-negative")
    # list(...) makes the concatenation below work for tuples and other
    # sliceable inputs, and guarantees a fresh list is returned.
    clipped = list(seq[:max_len])
    return clipped + [pad_id] * (max_len - len(clipped))

# Bring every split to fixed-length ids, then derive the attention mask:
# 1 where the id is positive, 0 elsewhere (0 is the value pad() appends).
for split_frame in (df_train, df_valid, df_test):
    split_frame["input_ids"] = split_frame["input_ids"].apply(pad)
    split_frame["attention_mask"] = split_frame["input_ids"].apply(
        lambda ids: [int(token_id > 0) for token_id in ids]
    )

In [10]:
# Feature Engineering
# Whole-word matchers for C keywords. A plain substring count would also hit
# identifiers and directives such as "format", "perform", "#ifdef", "diff"
# or "returned".
_LOOP_KEYWORD_RE = re.compile(r"\b(?:for|while)\b")
_IF_KEYWORD_RE = re.compile(r"\bif\b")
_RETURN_KEYWORD_RE = re.compile(r"\breturn\b")


def add_features(df, short_threshold=50):
    """Add hand-crafted numeric features derived from the ``func`` source text.

    The frame is modified in place and also returned, so both
    ``add_features(df)`` and ``df = add_features(df)`` work.

    Columns written:
        func_length   - number of whitespace-separated tokens (taken from the
                        ``tokens`` column when present, else computed from
                        ``func``).
        num_loops     - occurrences of the keywords ``for`` / ``while``.
        has_eval      - 1 if the substring "eval" occurs, else 0.
        has_system    - 1 if the substring "system" or "exec" occurs, else 0.
        num_if        - occurrences of the keyword ``if``.
        num_return    - occurrences of the keyword ``return``.
        uses_pointer  - 1 if a "*" character occurs anywhere, else 0.
        uses_buffer   - 1 if "buffer", "memcpy" or "strcpy" occurs, else 0.
        is_short_func - 1 if func_length < short_threshold, else 0.

    Keyword counts use word boundaries, but the text is not parsed: keywords
    inside comments or string literals are still counted.

    Args:
        df: DataFrame with a string column ``func`` and, optionally, a
            ``tokens`` column holding token lists.
        short_threshold: Token count below which a function is flagged as
            short. Defaults to 50.

    Returns:
        The same DataFrame object, with the feature columns added.
    """
    source = df["func"]

    if "tokens" in df.columns:
        df["func_length"] = df["tokens"].apply(len)
    else:
        # Same whitespace tokenization, computed on the fly.
        df["func_length"] = source.apply(lambda code: len(code.split()))

    df["num_loops"] = source.apply(lambda code: len(_LOOP_KEYWORD_RE.findall(code)))
    # The flags below deliberately stay substring checks (e.g. "exec" also
    # matches "execve", "buffer" also matches "framebuffer").
    df["has_eval"] = source.apply(lambda code: int("eval" in code))
    df["has_system"] = source.apply(lambda code: int("system" in code or "exec" in code))
    df["num_if"] = source.apply(lambda code: len(_IF_KEYWORD_RE.findall(code)))
    df["num_return"] = source.apply(lambda code: len(_RETURN_KEYWORD_RE.findall(code)))
    # NOTE: "*" also appears in multiplications and /* comments */, so this is
    # only a rough indicator of pointer usage.
    df["uses_pointer"] = source.apply(lambda code: int("*" in code))
    df["uses_buffer"] = source.apply(
        lambda code: int("buffer" in code or "memcpy" in code or "strcpy" in code)
    )
    df["is_short_func"] = df["func_length"].apply(lambda n: 1 if n < short_threshold else 0)
    return df

In [11]:
# Derive the hand-crafted feature columns for every split
df_train, df_valid, df_test = map(add_features, (df_train, df_valid, df_test))

In [12]:
# Write each preprocessed split to <pre_dir>/<split>_preprocessed.csv
pre_dir = "datasets/security"
for split_name, split_frame in (("train", df_train), ("valid", df_valid), ("test", df_test)):
    split_frame.to_csv(f"{pre_dir}/{split_name}_preprocessed.csv", index=False)

In [13]:
# Status message shown once the preprocessing cells above have run.
print("Preprocessing complete. Data saved.")

Preprocessing complete. Data saved.


In [1]:
import pandas as pd
import json

# Map each split name to the JSON Lines file that holds it.
files = {
    "train": "train.jsonl",
    "valid": "valid.jsonl",
    "test": "test.jsonl"
}

# Convert every split: read one JSON object per line, write <name>.csv.
for name, path in files.items():
    with open(path, "r", encoding="utf-8") as f:
        # Blank / whitespace-only lines (for example a stray empty line at the
        # end of the file) are skipped; json.loads would raise on them.
        data = [json.loads(line) for line in f if line.strip()]

    df = pd.DataFrame(data)
    df.to_csv(f"{name}.csv", index=False)

print("Conversion complete!")


Conversion complete!
