# Imports

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling
import torch
from peft import LoraConfig, PromptTuningConfig, PromptEncoderConfig, PrefixTuningConfig, TaskType, PeftModel, PromptTuningInit, get_peft_model
import pandas as pd
import numpy as np
import datasets
from datasets import load_dataset, load_from_disk, Dataset

# Tokenizer and base model

In [None]:
# Hugging Face Hub id of the base checkpoint (used for the tokenizer and the model).
model_name = "NousResearch/Llama-3.2-1B"

# Hyper-parameters consumed by the cells below.
NUM_EPOCHS = 10  # number of training epochs
NUM_VIRTUAL_TOKENS = 500  # number of trainable virtual tokens in the soft prompt

In [None]:
# Load the tokenizer that belongs to the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the EOS token as the padding token and pad on the right.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
# Load the base causal LM. `trust_remote_code` is deliberately left at its
# default: the evaluation cell loads this same checkpoint without it, so it is
# not needed, and enabling it would allow code from the Hub repository to run.
foundational_model = AutoModelForCausalLM.from_pretrained(model_name)

In [None]:
def tokenizer_function(dataset):
    """Tokenize the ``question`` column of ``dataset`` behind a fixed prompt prefix.

    Args:
        dataset: Mapping-like object whose ``"question"`` entry is an iterable
            of question strings.

    Returns:
        A new ``Dataset`` with three columns: the raw ``question`` text and the
        ``input_ids`` / ``attention_mask`` produced by the module-level
        ``tokenizer`` for the prefixed prompt. No padding or truncation is
        requested here.
    """
    prefix = "Example of a data science interview question: "
    questions = list(dataset["question"])
    # Each prompt is encoded on its own, exactly one tokenizer call per question.
    encodings = [tokenizer(prefix + text) for text in questions]
    return Dataset.from_dict(
        {
            "question": questions,
            "input_ids": [enc["input_ids"] for enc in encodings],
            "attention_mask": [enc["attention_mask"] for enc in encodings],
        }
    )

# Dataset

In [None]:
# Read the CSV, take its "train" split and hold out 10% of the rows as the
# "test" split. No seed is passed to train_test_split.
dataset = (
    load_dataset("csv", data_files="dataset_6.csv")["train"]
    .train_test_split(test_size=0.1)
)
# Preview the first rows of the training split.
df = pd.DataFrame(dataset["train"])
df.head()

Unnamed: 0,question,answer,difficulty,topic
0,What if we set all the weights of a neural net...,If all the weights of a neural network are set...,intermediate,neural networks
1,What is the ROC curve? When to use it?,ROC stands for *Receiver Operating Characteris...,intermediate,classification
2,What is the PR (precision-recall) curve?,A *precision*-*recall curve* (or PR Curve) is ...,intermediate,classification
3,Do we want to have a constant learning rate or...,"Generally, it is recommended to start learning...",intermediate,neural networks
4,How can we use CNN for text classification?,Answer here,advanced,text classification


In [None]:
# Tokenize the training split; `train_sample` is later passed to the trainer.
train_sample = tokenizer_function(dataset["train"])
# Convert to a DataFrame only to display the tokenized rows in the notebook.
df = pd.DataFrame(train_sample)
df

Unnamed: 0,question,input_ids,attention_mask
0,What if we set all the weights of a neural net...,"[128000, 13617, 315, 264, 828, 8198, 7274, 348...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,What is the ROC curve? When to use it?,"[128000, 13617, 315, 264, 828, 8198, 7274, 348...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,What is the PR (precision-recall) curve?,"[128000, 13617, 315, 264, 828, 8198, 7274, 348...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,Do we want to have a constant learning rate or...,"[128000, 13617, 315, 264, 828, 8198, 7274, 348...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,How can we use CNN for text classification?,"[128000, 13617, 315, 264, 828, 8198, 7274, 348...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
...,...,...,...
145,What’s the effect of L2 regularization on the ...,"[128000, 13617, 315, 264, 828, 8198, 7274, 348...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
146,How do we check if a variable follows the norm...,"[128000, 13617, 315, 264, 828, 8198, 7274, 348...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
147,What is TF-IDF? How is it useful for text clas...,"[128000, 13617, 315, 264, 828, 8198, 7274, 348...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
148,Which regularization techniques do you know?,"[128000, 13617, 315, 264, 828, 8198, 7274, 348...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"


In [None]:
# Tokenize the held-out split; `val_sample` is later passed to the trainer as the evaluation set.
val_sample = tokenizer_function(dataset["test"])
# Convert to a DataFrame only to display the tokenized rows in the notebook.
df = pd.DataFrame(val_sample)
df

Unnamed: 0,question,input_ids,attention_mask
0,Is it easy to parallelize training of a random...,"[128000, 13617, 315, 264, 828, 8198, 7274, 348...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,What is the cold start problem?,"[128000, 13617, 315, 264, 828, 8198, 7274, 348...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
2,When do we need to perform feature normalizati...,"[128000, 13617, 315, 264, 828, 8198, 7274, 348...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,How would you evaluate your ranking algorithms...,"[128000, 13617, 315, 264, 828, 8198, 7274, 348...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,How do you approach tuning parameters in XGBoo...,"[128000, 13617, 315, 264, 828, 8198, 7274, 348...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
5,What is backpropagation? How does it work? Why...,"[128000, 13617, 315, 264, 828, 8198, 7274, 348...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
6,Can we use L1 regularization for feature selec...,"[128000, 13617, 315, 264, 828, 8198, 7274, 348...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
7,What is classification? Which models would you...,"[128000, 13617, 315, 264, 828, 8198, 7274, 348...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
8,What is object detection? Do you know any arch...,"[128000, 13617, 315, 264, 828, 8198, 7274, 348...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
9,If a weight for one variable is higher than fo...,"[128000, 13617, 315, 264, 828, 8198, 7274, 348...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


# P-tuning model

In [None]:
# Soft-prompt configuration for the base model.
# NOTE(review): this is a PromptTuningConfig, and the reported trainable
# parameter count (1,024,000 = 500 virtual tokens * 2048 hidden size) matches a
# plain prompt-embedding table. If true P-tuning is intended, PromptEncoderConfig
# is presumably the class to use -- confirm before changing.
generation_config = PromptTuningConfig(
    peft_type="P_TUNING",
    task_type=TaskType.CAUSAL_LM,  # The model will be used to generate text.
    prompt_tuning_init=PromptTuningInit.RANDOM,  # The added virtual tokens are initialized with random numbers.
    num_virtual_tokens=NUM_VIRTUAL_TOKENS,  # Number of virtual tokens to be added and trained.
    tokenizer_name_or_path=model_name,  # The pre-trained model.
)

# Wrap the base model so that only the virtual-token parameters are trainable.
peft_model = get_peft_model(foundational_model, generation_config)
# print_trainable_parameters() prints the summary itself and returns None, so
# wrapping it in print() only adds a stray "None" line to the output.
peft_model.print_trainable_parameters()

trainable params: 1,024,000 || all params: 1,236,838,400 || trainable%: 0.0828
None


In [None]:
# Show the architecture of the PEFT-wrapped model built in this section.
# (`model` is first assigned in the evaluation cell at the end of the notebook.)
peft_model

PeftModelForCausalLM(
  (base_model): LlamaForCausalLM(
    (model): LlamaModel(
      (embed_tokens): Embedding(128256, 2048)
      (layers): ModuleList(
        (0-15): 16 x LlamaDecoderLayer(
          (self_attn): LlamaAttention(
            (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
            (k_proj): Linear(in_features=2048, out_features=512, bias=False)
            (v_proj): Linear(in_features=2048, out_features=512, bias=False)
            (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          )
          (mlp): LlamaMLP(
            (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
            (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
            (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
          (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
  

# Training

In [None]:
def create_training_arguments(path, epochs=6):
    """Build the ``TrainingArguments`` for a fine-tuning run.

    Args:
        path: Output directory where checkpoints are written.
        epochs: Number of training epochs.

    Returns:
        A ``TrainingArguments`` instance that evaluates, saves and logs once
        per epoch and reloads the best checkpoint when training ends.
    """
    return TrainingArguments(
        output_dir=path,
        num_train_epochs=epochs,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        torch_empty_cache_steps=100,
        learning_rate=1e-3,
        max_grad_norm=0.3,
        weight_decay=0.01,
        lr_scheduler_type="cosine",
        warmup_ratio=0.03,
        # Evaluate and checkpoint on the same schedule so that
        # load_best_model_at_end can pick among the saved checkpoints.
        eval_strategy="epoch",
        save_strategy="epoch",
        # Log the training loss every epoch too; with the default step-based
        # logging interval a short run never reaches a logging step and the
        # "Training Loss" column only shows "No log".
        logging_strategy="epoch",
        load_best_model_at_end=True,
        save_safetensors=True,
        save_only_model=True,
    )

In [None]:
# Training configuration for this run; checkpoints are written under output/p-tuning.
training_args = create_training_arguments(path="output/p-tuning", epochs=NUM_EPOCHS)

In [None]:
def create_trainer(model, training_args, train_dataset, val_dataset):
    """Assemble a ``Trainer`` for causal language modeling.

    Args:
        model: Model to train.
        training_args: ``TrainingArguments`` controlling the run.
        train_dataset: Tokenized dataset used for training.
        val_dataset: Tokenized dataset used for evaluation.

    Returns:
        A ``Trainer`` wired with a language-modeling data collator built from
        the module-level ``tokenizer``.
    """
    # mlm=False indicates not to use masked language modeling.
    collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
    return Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=collator,
    )

In [None]:
# Build the trainer for the PEFT-wrapped model and start training.
trainer = create_trainer(
    model=peft_model,
    training_args=training_args,
    train_dataset=train_sample,
    val_dataset=val_sample,
)
trainer.train()

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss
1,No log,4.377367
2,No log,3.361925
3,No log,3.073158
4,No log,2.891921
5,No log,2.764492
6,No log,2.683021
7,No log,2.634903
8,No log,2.606918
9,No log,2.597763
10,No log,2.596462


TrainOutput(global_step=380, training_loss=2.91003128854852, metrics={'train_runtime': 465.1755, 'train_samples_per_second': 3.225, 'train_steps_per_second': 0.817, 'total_flos': 235096530321408.0, 'train_loss': 2.91003128854852, 'epoch': 10.0})

# Inference, question generation

In [None]:
def generate_new_question(model, prompt_text="Example of a data science interview question: "):
    """Sample one question from ``model`` and clean up the decoded text.

    Args:
        model: Model exposing ``generate`` and ``parameters`` (the base model
            or a PEFT-wrapped one).
        prompt_text: Prompt fed to the model; its length is used to cut the
            prompt off the front of the decoded output.

    Returns:
        The generated text up to and including the first ``"?"``, with a
        leading list marker (``"<digit>. "``, ``"<digit>) "`` or ``"1 "``)
        removed and surrounding whitespace stripped. An empty string is
        returned when the generation contains no ``"?"``.
    """
    # Run on whatever device the model lives on instead of assuming CUDA.
    device = next(model.parameters()).device
    inputs = tokenizer(prompt_text, return_tensors="pt")
    outputs = model.generate(
        input_ids=inputs["input_ids"].to(device),
        attention_mask=inputs["attention_mask"].to(device),
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
        min_new_tokens=5,
        max_new_tokens=50,
        stop_strings=["None", "question:", "Question:", "Answer:"],
        tokenizer=tokenizer,  # passed alongside stop_strings, as in the original call
        repetition_penalty=2.0,
        early_stopping=True,
        do_sample=True,
        num_beams=3,
        temperature=1.5,
        top_p=0.75,
        min_p=0.1,
    )
    text_output = tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0]

    # Drop the prompt, then keep everything up to the first question mark.
    text_output = text_output[len(prompt_text):]
    # find() returns -1 when there is no "?", so the slice is [:0] and yields "".
    text_output = text_output[:text_output.find("?") + 1]

    # Strip a leading enumeration marker such as "3. ", "3) " or "1 ".
    if len(text_output) > 3:
        starts_with_digit = text_output[0] in "0123456789"
        if starts_with_digit and text_output[1:3] in (". ", ") "):
            text_output = text_output[3:]
        elif text_output[:2] == "1 ":
            text_output = text_output[2:]
    return text_output.strip()

# Text generation for evaluation

In [None]:
import os

# checkpoints: [38, 76, 114, 152, 190, 228, 266, 304, 342, 380]
# The range below currently selects only the final checkpoint (380).

# open() does not create missing directories, so make sure the output folder
# exists before spending time on generation.
eval_dir = "output/evaluation/p-tuning"
os.makedirs(eval_dir, exist_ok=True)

# Use the GPU when one is available instead of failing on CPU-only machines.
device = "cuda" if torch.cuda.is_available() else "cpu"

for num in range(380, 381, 38):
    load_path = "output/p-tuning/checkpoint-" + str(num)

    # Reload a fresh base model and attach the adapter saved at this checkpoint.
    model_name = 'NousResearch/Llama-3.2-1B'
    base_model = AutoModelForCausalLM.from_pretrained(model_name)
    model = PeftModel.from_pretrained(base_model, load_path)
    model = model.to(device)

    # Sample 100 generations; the set drops exact duplicates, and very short
    # outputs (under 10 characters or under 3 words) are discarded.
    questions = set()
    for _ in range(100):
        question = generate_new_question(model)
        words = question.split()
        if len(question) >= 10 and len(words) >= 3:
            questions.add(question)

    # Append mode is kept, so re-running this cell adds to an existing file.
    # An explicit encoding avoids platform-dependent failures on non-ASCII text.
    out_path = eval_dir + '/checkpoint-' + str(num) + '.txt'
    with open(out_path, 'a', encoding='utf-8') as file:
        file.writelines(question + '\n' for question in questions)