In [1]:
import json
import numpy as np
from datasets import Dataset

In [2]:
# Instruction Tuning with GPT-4 dataset, Peng et al. 2023
with open('comparison_data_v2.json', 'r') as file:
    data = json.load(file)


In [3]:
# Code Alpaca Dataset, Chaudhary 2023
with open('code_alpaca_20k.json', 'r') as file:
    datacode = json.load(file)

In [4]:
# 52k general prompts in Instruction tuning database
print(len(data))

52001


In [5]:
# 20k coding prompts in Code Alpaca
print(len(datacode))

20022


In [6]:
text = []
labels = []

In [7]:
# Extract the instruction from the Coding dataset, assign a label of 1
# Iterate through each item in the data list
for item in datacode:
    # Concatenate the instruction and input values, separated by a space
    textitem = item['instruction'] + ' ' + item['input']
    # Append the concatenated string to the concatenated_list
    text.append(textitem)
    labels.append(1)


In [8]:
# Extract the instruction from the General prompting dataset, assign a label of 0
for item in data:
    # Extracting the instruction text
    instruction_start = item['user_input'].find('### Instruction:') + len('### Instruction:\n')
    instruction_end = item['user_input'].find('### Input:')
    instruction_text = item['user_input'][instruction_start:].strip().replace("### Input:", "")

    
    scores = [response['score'] for response in item['responses_and_scores']]
    if len(scores) == 3:
        text.append(instruction_text)
        labels.append(0)


In [9]:
data_dict = {
    "text": text,
    "labels": labels,
}

# Convert the data dictionary into a HuggingFace Dataset
hf_dataset = Dataset.from_dict(data_dict)

# Print dataset format
print(hf_dataset)

Dataset({
    features: ['text', 'labels'],
    num_rows: 70020
})


In [10]:
from datasets import DatasetDict

# Convert your Dataset into a DatasetDict
dataset_dict = DatasetDict({"all": hf_dataset})

# Shuffle the dataset
shuffled_dataset = dataset_dict["all"].shuffle(seed=42)  # You can provide a seed for reproducibility

# Splitting the shuffled dataset into train and test
splits = shuffled_dataset.train_test_split(test_size=0.2)  # Adjust the test_size parameter as needed

train_dataset = splits["train"]
test_dataset = splits["test"]
print(train_dataset)
print(test_dataset)
print(train_dataset["labels"][:20])

Dataset({
    features: ['text', 'labels'],
    num_rows: 56016
})
Dataset({
    features: ['text', 'labels'],
    num_rows: 14004
})
[0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1]


In [11]:
from transformers import DistilBertTokenizer

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

tokenized_datasets = train_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/56016 [00:00<?, ? examples/s]

In [12]:
tokenized_datasets_test = test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/14004 [00:00<?, ? examples/s]

In [13]:
print(tokenized_datasets)

Dataset({
    features: ['text', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 56016
})


In [14]:
tokenized_datasets = tokenized_datasets.with_format("torch", columns=["input_ids", "attention_mask", "labels"])

In [15]:
tokenized_datasets_test = tokenized_datasets_test.with_format("torch", columns=["input_ids", "attention_mask", "labels"])


In [16]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_classifier.bias', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
import numpy as np
from datasets import load_metric
 
def compute_metrics(eval_pred):
   load_accuracy = load_metric("accuracy")
   load_f1 = load_metric("f1")
  
   logits, labels = eval_pred
   predictions = np.argmax(logits, axis=-1)
   accuracy = load_accuracy.compute(predictions=predictions, references=labels)["accuracy"]
   f1 = load_f1.compute(predictions=predictions, references=labels)["f1"]
   return {"accuracy": accuracy, "f1": f1}

In [18]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [19]:
from transformers import Trainer, TrainingArguments
import torch

device = torch.device("mps")

# Initialize the model
model.to(device)  # Move the model to the GPU if CUDA is available

training_args = TrainingArguments(
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    evaluation_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    do_train=True,
    do_eval=True,
    output_dir="./results",
    learning_rate=1e-5,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    eval_dataset=tokenized_datasets_test,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.1577,0.154092,0.952299,0.916354
2,0.149,0.150749,0.954941,0.923096
3,0.1068,0.150101,0.954941,0.9227


  load_accuracy = load_metric("accuracy")


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

TrainOutput(global_step=10503, training_loss=0.13874378879700436, metrics={'train_runtime': 7254.1597, 'train_samples_per_second': 23.166, 'train_steps_per_second': 1.448, 'total_flos': 2.226088140934349e+16, 'train_loss': 0.13874378879700436, 'epoch': 3.0})

In [44]:
results = trainer.evaluate()
print(results)

{'eval_loss': 0.041739050298929214}


In [35]:
torch.save(model.state_dict(), "./finetunedcode/model_weights.pth")
tokenizer.save_pretrained("./finetunedcode")


('./finetunedcode/tokenizer_config.json',
 './finetunedcode/special_tokens_map.json',
 './finetunedcode/vocab.txt',
 './finetunedcode/added_tokens.json')

In [36]:
config = model.distilbert.config
config.save_pretrained("./finetunedcode")
