<a href="https://colab.research.google.com/github/arigig/ceg-afpm/blob/main/codeT5_base_finueTune_with_DiverseVul_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install datasets evaluate transformers[sentencepiece]



In [3]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

raw_datasets = load_dataset("arigos/diversevul")

In [4]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['func', 'target', 'cwe', 'project', 'commit_id', 'hash', 'size', 'message'],
        num_rows: 297442
    })
    test: Dataset({
        features: ['func', 'target', 'cwe', 'project', 'commit_id', 'hash', 'size', 'message'],
        num_rows: 16525
    })
    valid: Dataset({
        features: ['func', 'target', 'cwe', 'project', 'commit_id', 'hash', 'size', 'message'],
        num_rows: 16525
    })
})

In [5]:
# Only taking 10% of whole dataset : train (10%) + test (5%) + validation (5%)
train_test_valid = raw_datasets['train'].train_test_split(test_size=0.1)

In [6]:
train_test_valid

DatasetDict({
    train: Dataset({
        features: ['func', 'target', 'cwe', 'project', 'commit_id', 'hash', 'size', 'message'],
        num_rows: 267697
    })
    test: Dataset({
        features: ['func', 'target', 'cwe', 'project', 'commit_id', 'hash', 'size', 'message'],
        num_rows: 29745
    })
})

In [7]:
# Split the 10% test + valid in half test, half valid
train_data = train_test_valid['test'].train_test_split(test_size=0.5)

In [8]:
train_data

DatasetDict({
    train: Dataset({
        features: ['func', 'target', 'cwe', 'project', 'commit_id', 'hash', 'size', 'message'],
        num_rows: 14872
    })
    test: Dataset({
        features: ['func', 'target', 'cwe', 'project', 'commit_id', 'hash', 'size', 'message'],
        num_rows: 14873
    })
})

In [9]:
valid_test_data = train_data['test'].train_test_split(test_size=0.5)

In [10]:
valid_test_data

DatasetDict({
    train: Dataset({
        features: ['func', 'target', 'cwe', 'project', 'commit_id', 'hash', 'size', 'message'],
        num_rows: 7436
    })
    test: Dataset({
        features: ['func', 'target', 'cwe', 'project', 'commit_id', 'hash', 'size', 'message'],
        num_rows: 7437
    })
})

In [11]:
from datasets import DatasetDict
# gather everyone if you want to have a single DatasetDict
train_test_valid_dataset = DatasetDict({
    'train': train_data['train'],
    'test': valid_test_data['test'],
    'valid': valid_test_data['train']})

In [12]:
train_test_valid_dataset['train'].features

{'func': Value(dtype='string', id=None),
 'target': Value(dtype='int64', id=None),
 'cwe': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'project': Value(dtype='string', id=None),
 'commit_id': Value(dtype='string', id=None),
 'hash': Value(dtype='float64', id=None),
 'size': Value(dtype='int64', id=None),
 'message': Value(dtype='string', id=None)}

In [13]:
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
    return tokenizer( example["message"], truncation=True)


tokenized_datasets = train_test_valid_dataset.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/14872 [00:00<?, ? examples/s]

Map:   0%|          | 0/7437 [00:00<?, ? examples/s]

Map:   0%|          | 0/7436 [00:00<?, ? examples/s]

In [14]:
tokenized_datasets.rename_column('target','labels')


DatasetDict({
    train: Dataset({
        features: ['func', 'labels', 'cwe', 'project', 'commit_id', 'hash', 'size', 'message', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 14872
    })
    test: Dataset({
        features: ['func', 'labels', 'cwe', 'project', 'commit_id', 'hash', 'size', 'message', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 7437
    })
    valid: Dataset({
        features: ['func', 'labels', 'cwe', 'project', 'commit_id', 'hash', 'size', 'message', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 7436
    })
})

In [15]:
samples = tokenized_datasets["train"][:8]
samples = {k: v for k, v in samples.items() if k not in ['func', 'cwe', 'project', 'commit_id', 'hash', 'size', 'message']}
[len(x) for x in samples["input_ids"]]

[34, 150, 6, 22, 284, 261, 512, 67]

In [16]:
batch = data_collator(samples)
{k: v.shape for k, v in batch.items()}

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'target': torch.Size([8]),
 'input_ids': torch.Size([8, 512]),
 'token_type_ids': torch.Size([8, 512]),
 'attention_mask': torch.Size([8, 512])}

In [21]:
from transformers import TrainingArguments

training_args = TrainingArguments("test-trainer")

ImportError: ignored

In [19]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
!pip install accelerate -U
import accelerate
import transformers

transformers.__version__, accelerate.__version__

Collecting accelerate
  Downloading accelerate-0.25.0-py3-none-any.whl (265 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.25.0


('4.35.2', '0.25.0')

In [None]:
from transformers import Trainer

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['valid'],
    data_collator=data_collator,
    tokenizer=tokenizer,
)