# Fine-tuning a model with the Trainer API or Keras

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [13]:
!pip install datasets evaluate transformers[sentencepiece]



In [14]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

# Download the DiverseVul dataset from the Hugging Face Hub.
# NOTE(review): the (disabled) push cell further down uses private=True, so loading
# may require being logged in to the Hub — confirm.
raw_datasets = load_dataset("arigos/diversevul")


In [15]:
# Show the available splits, their columns and row counts.
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['func', 'target', 'cwe', 'project', 'commit_id', 'hash', 'size', 'message'],
        num_rows: 297442
    })
    test: Dataset({
        features: ['func', 'target', 'cwe', 'project', 'commit_id', 'hash', 'size', 'message'],
        num_rows: 16525
    })
    valid: Dataset({
        features: ['func', 'target', 'cwe', 'project', 'commit_id', 'hash', 'size', 'message'],
        num_rows: 16525
    })
})

In [16]:
# 90% train, 10% held out for test + validation.
# A fixed seed makes the shuffle, and therefore the resulting split, reproducible.
train_testvalid = raw_datasets['train'].train_test_split(test_size=0.1, seed=42)


In [17]:
# Split the held-out 10% in half: one half for test, one half for validation.
# A fixed seed keeps the two halves identical across re-runs.
test_valid = train_testvalid['test'].train_test_split(test_size=0.5, seed=42)

In [18]:
# Inspect the two halves of the held-out data.
test_valid

DatasetDict({
    train: Dataset({
        features: ['func', 'target', 'cwe', 'project', 'commit_id', 'hash', 'size', 'message'],
        num_rows: 14872
    })
    test: Dataset({
        features: ['func', 'target', 'cwe', 'project', 'commit_id', 'hash', 'size', 'message'],
        num_rows: 14873
    })
})

In [19]:
# Inspect the 90/10 split.
train_testvalid

DatasetDict({
    train: Dataset({
        features: ['func', 'target', 'cwe', 'project', 'commit_id', 'hash', 'size', 'message'],
        num_rows: 267697
    })
    test: Dataset({
        features: ['func', 'target', 'cwe', 'project', 'commit_id', 'hash', 'size', 'message'],
        num_rows: 29745
    })
})

In [20]:
from datasets import DatasetDict
# Collect the three splits into a single DatasetDict.
# The 'train' half of the held-out data serves as 'valid', the 'test' half as 'test'.
train_test_valid_dataset = DatasetDict(
    dict(
        train=train_testvalid['train'],
        test=test_valid['test'],
        valid=test_valid['train'],
    )
)

In [21]:
# Disabled: pushes the re-split dataset to the Hub. Never hardcode the access token —
# read it from the environment (needs `import os`) and revoke any token that was committed.
#train_test_valid_dataset.push_to_hub("arigos/diversevul", private=True, token=os.environ["HF_TOKEN"])

In [22]:
# Inspect the column schema of the raw training split.
# NOTE: `train_dataset` is rebound further down to the tokenized split with
# 'target' renamed to 'labels'; at this point it is still the raw split.
train_dataset = raw_datasets['train']
train_dataset.features

{'func': Value(dtype='string', id=None),
 'target': Value(dtype='int64', id=None),
 'cwe': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'project': Value(dtype='string', id=None),
 'commit_id': Value(dtype='string', id=None),
 'hash': Value(dtype='float64', id=None),
 'size': Value(dtype='int64', id=None),
 'message': Value(dtype='string', id=None)}

Progress note: everything up to this point is done.

In [23]:
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    """Tokenize a batch of rows as (function source, commit message) pairs.

    `example` is the batch passed in by `Dataset.map(..., batched=True)`; its
    "func" and "message" columns are encoded together as a sequence pair.
    `truncation=True` lets the tokenizer truncate over-long pairs; padding is
    not applied here and is left to the data collator.
    """
    source_code = example["func"]
    commit_message = example["message"]
    return tokenizer(source_code, commit_message, truncation=True)


# The collator pads each batch when it is assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# Tokenize every split; the original columns are kept next to the new ones.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

Map:   0%|          | 0/16525 [00:00<?, ? examples/s]

In [24]:
# Show the splits after tokenization (the tokenizer output columns are appended).
# Uncomment the next line to look at the token ids of the first training row:
#tokenized_datasets['train']['input_ids'][0]
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['func', 'target', 'cwe', 'project', 'commit_id', 'hash', 'size', 'message', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 297442
    })
    test: Dataset({
        features: ['func', 'target', 'cwe', 'project', 'commit_id', 'hash', 'size', 'message', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 16525
    })
    valid: Dataset({
        features: ['func', 'target', 'cwe', 'project', 'commit_id', 'hash', 'size', 'message', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 16525
    })
})

In [25]:
# Take the first 8 training rows and drop the raw columns listed below
# before handing the rows to the collator.
samples = {
    column: values
    for column, values in tokenized_datasets["train"][:8].items()
    if column not in ['func', 'cwe', 'project', 'commit_id', 'hash', 'size', 'message']
}
# Un-padded token count of each sampled row.
list(map(len, samples["input_ids"]))

[214, 512, 512, 512, 354, 512, 512, 64]

In [26]:
# Collate the sampled rows into one padded batch and show each field's shape.
batch = data_collator(samples)
{field: tensor.shape for field, tensor in batch.items()}

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'target': torch.Size([8]),
 'input_ids': torch.Size([8, 512]),
 'token_type_ids': torch.Size([8, 512]),
 'attention_mask': torch.Size([8, 512])}

In [27]:
from transformers import TrainingArguments

# Default hyper-parameters; checkpoints and logs are written under ./test-trainer.
training_args = TrainingArguments(output_dir="test-trainer")

In [28]:
import accelerate
import transformers

# Record the library versions used for this run.
transformers.__version__, accelerate.__version__

('4.35.2', '0.25.0')

In [29]:
from transformers import AutoModelForSequenceClassification

# Load the pretrained encoder with a new two-class classification head
# (the warning printed by this cell reports the head weights as newly initialized).
# NOTE(review): num_labels=2 assumes 'target' is a binary 0/1 label — confirm.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
!pip install accelerate -U



In [41]:
# Rename the 'target' column to 'labels' on the splits handed to the Trainer,
# i.e. dataset.rename_column("old_column_name", "new_column_name").
train_dataset, valid_dataset = (
    tokenized_datasets[split_name].rename_column('target', 'labels')
    for split_name in ("train", "valid")
)

In [42]:
# Confirm that the column is now called 'labels'.
valid_dataset

Dataset({
    features: ['func', 'labels', 'cwe', 'project', 'commit_id', 'hash', 'size', 'message', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 16525
})

In [43]:
from transformers import Trainer

# Wire the model, arguments, data and padding collator into a Trainer.
# No compute_metrics is passed here, so no custom metrics are computed.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=tokenizer,
)

In [None]:
# Fine-tune the model; the training loss is reported periodically in the table below.
trainer.train()

Step,Training Loss
500,0.2602
1000,0.2458
1500,0.2479
2000,0.2534
2500,0.2389
3000,0.2494
3500,0.2523
4000,0.2374
4500,0.2371


In [None]:
# Rename 'target' to 'labels' on the test split, as was done for train and valid.
test_dataset = tokenized_datasets["test"].rename_column('target','labels')
# Run inference on the test split; the result carries the raw model outputs
# (`predictions`) and the reference labels (`label_ids`).
predictions = trainer.predict(test_dataset)
print(predictions.predictions.shape, predictions.label_ids.shape)

In [None]:
import numpy as np

# Predicted class = index of the largest logit along the last axis.
preds = predictions.predictions.argmax(axis=-1)

In [None]:
import evaluate

# Score the test predictions with the GLUE/MRPC metric from `evaluate`.
# NOTE(review): presumably this reports accuracy and F1, which also fit this
# two-class task even though the data is not MRPC — confirm.
metric = evaluate.load("glue", "mrpc")
metric.compute(predictions=preds, references=predictions.label_ids)

In [None]:
def compute_metrics(eval_preds):
    """Turn a (logits, labels) pair into metric scores.

    Args:
        eval_preds: a two-item iterable ``(logits, labels)``.

    Returns:
        Whatever the GLUE/MRPC metric's ``compute`` returns for the argmax
        predictions against ``labels``.
    """
    scorer = evaluate.load("glue", "mrpc")
    raw_logits, references = eval_preds
    # The predicted class is the position of the highest logit.
    predicted_classes = np.argmax(raw_logits, axis=-1)
    return scorer.compute(predictions=predicted_classes, references=references)

In [None]:
# Second run: evaluate at the end of every epoch and report metrics.
training_args = TrainingArguments("test-trainer", evaluation_strategy="epoch")
# Start again from the pretrained checkpoint so this run does not continue
# from the weights fine-tuned above.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

trainer = Trainer(
    model,
    training_args,
    # Use the splits whose 'target' column was renamed to 'labels'; the raw
    # tokenized splits have no 'labels' column. The validation split of this
    # dataset is named 'valid' — there is no 'validation' key.
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)