# Imports

In [None]:
import os

import datasets
import numpy as np
import pandas as pd
import torch
from datasets import load_dataset, load_from_disk, Dataset
from peft import LoraConfig, PromptTuningConfig, PromptEncoderConfig, PrefixTuningConfig, TaskType, PeftModel, PromptTuningInit, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

# Configuration, tokenizer and base model

In [None]:
# Hugging Face Hub id of the base causal language model.
model_name = "NousResearch/Llama-3.2-1B"
# Number of virtual tokens handed to the prefix-tuning configuration.
NUM_VIRTUAL_TOKENS = 88
# Number of epochs handed to the training arguments.
NUM_EPOCHS = 10

In [None]:
# Load the tokenizer that belongs to the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token for padding and pad on the right when batching.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
# The checkpoint resolves to the library's built-in Llama implementation, so
# `trust_remote_code` is not required; leaving it off avoids executing any
# code shipped with the Hub repository and matches how the model is reloaded
# for evaluation further down.
foundational_model = AutoModelForCausalLM.from_pretrained(model_name)

In [None]:
def tokenizer_function(dataset):
    """Tokenize every question of `dataset` behind a fixed instruction prefix.

    Args:
        dataset: Mapping-like object whose "question" entry is an iterable of
            question strings.

    Returns:
        A `Dataset` with the columns "question" (the raw text), "input_ids"
        and "attention_mask" (the tokenizer output for prefix + question).
    """
    prefix = "Example of a data science interview question: "
    columns = {"question": [], "input_ids": [], "attention_mask": []}
    for text in dataset["question"]:
        # Each prompt is encoded on its own, without padding or truncation arguments.
        encoded = tokenizer(prefix + text)
        columns["question"].append(text)
        columns["input_ids"].append(encoded["input_ids"])
        columns["attention_mask"].append(encoded["attention_mask"])
    return Dataset.from_dict(columns)

# Dataset

In [None]:
# Fixed seed so the train/validation split is the same on every run.
SPLIT_SEED = 42

# Read the CSV; the rows are exposed under the "train" key.
raw_dataset = load_dataset("csv", data_files="dataset_6.csv")
# Hold out 10% of the rows. `dataset` then has a "train" and a "test" split.
# Keeping the raw load under its own name makes this cell safe to re-run.
dataset = raw_dataset["train"].train_test_split(test_size=0.1, seed=SPLIT_SEED)
df = pd.DataFrame(dataset['train'])
df.head()

Unnamed: 0,question,answer,difficulty,topic
0,What are augmentations? Why do we need them?,Augmentations are an artifical way of expandin...,beginner,neural networks
1,What’s the difference between random forest an...,1. Random Forests builds each tree independent...,intermediate,feature selection
2,How large should be N for our bag of words whe...,Answer here,intermediate,text classification
3,What is the normal equation?,Normal equations are equations obtained by set...,intermediate,supervised learning
4,What if we want to build a model for predictin...,"Data is not normal. Specially, real-world data...",intermediate,supervised learning


In [None]:
# Tokenize the training split and show the result as a DataFrame.
train_sample = tokenizer_function(dataset["train"])
df = pd.DataFrame(train_sample)
df

Unnamed: 0,question,input_ids,attention_mask
0,What are augmentations? Why do we need them?,"[128000, 13617, 315, 264, 828, 8198, 7274, 348...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,What’s the difference between random forest an...,"[128000, 13617, 315, 264, 828, 8198, 7274, 348...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,How large should be N for our bag of words whe...,"[128000, 13617, 315, 264, 828, 8198, 7274, 348...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,What is the normal equation?,"[128000, 13617, 315, 264, 828, 8198, 7274, 348...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
4,What if we want to build a model for predictin...,"[128000, 13617, 315, 264, 828, 8198, 7274, 348...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
...,...,...,...
145,Why do we need to split our data into three pa...,"[128000, 13617, 315, 264, 828, 8198, 7274, 348...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
146,What is precision and recall at k?,"[128000, 13617, 315, 264, 828, 8198, 7274, 348...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
147,What kind of augmentations do you know?,"[128000, 13617, 315, 264, 828, 8198, 7274, 348...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
148,What’s the difference between L2 and L1 regula...,"[128000, 13617, 315, 264, 828, 8198, 7274, 348...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [None]:
# Tokenize the held-out "test" split (used as the validation set) and show it.
val_sample = tokenizer_function(dataset["test"])
df = pd.DataFrame(val_sample)
df

Unnamed: 0,question,input_ids,attention_mask
0,What is TF-IDF? How is it useful for text clas...,"[128000, 13617, 315, 264, 828, 8198, 7274, 348...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,What methods for solving linear regression do ...,"[128000, 13617, 315, 264, 828, 8198, 7274, 348...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,How do we know how many trees we need in rando...,"[128000, 13617, 315, 264, 828, 8198, 7274, 348...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,How would you evaluate your ranking algorithms...,"[128000, 13617, 315, 264, 828, 8198, 7274, 348...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,What is mean average precision at k?,"[128000, 13617, 315, 264, 828, 8198, 7274, 348...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
5,What is a time series?,"[128000, 13617, 315, 264, 828, 8198, 7274, 348...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
6,Which feature selection techniques do you know?,"[128000, 13617, 315, 264, 828, 8198, 7274, 348...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
7,You have a series with a variable “y” and a se...,"[128000, 13617, 315, 264, 828, 8198, 7274, 348...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
8,What is the area under the PR curve? Is it a u...,"[128000, 13617, 315, 264, 828, 8198, 7274, 348...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
9,Can we have both L1 and L2 regularization comp...,"[128000, 13617, 315, 264, 828, 8198, 7274, 348...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


# Prefix tuning model

In [None]:
# Prefix-tuning configuration for the causal language model.
generation_config = PrefixTuningConfig(
    peft_type="PREFIX_TUNING",
    task_type=TaskType.CAUSAL_LM,  # This type indicates the model will generate text.
    num_virtual_tokens=NUM_VIRTUAL_TOKENS,  # Number of virtual tokens to be added and trained.
)

# Wrap the base model with the prefix-tuning adapter.
peft_model = get_peft_model(foundational_model, generation_config)
# `print_trainable_parameters()` writes its summary itself and returns None,
# so wrapping it in print() only adds a stray "None" line to the output.
peft_model.print_trainable_parameters()

trainable params: 1,441,792 || all params: 1,237,256,192 || trainable%: 0.1165
None


In [None]:
# Display the module tree of the wrapped model.
peft_model

PeftModelForCausalLM(
  (base_model): LlamaForCausalLM(
    (model): LlamaModel(
      (embed_tokens): Embedding(128256, 2048)
      (layers): ModuleList(
        (0-15): 16 x LlamaDecoderLayer(
          (self_attn): LlamaAttention(
            (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
            (k_proj): Linear(in_features=2048, out_features=512, bias=False)
            (v_proj): Linear(in_features=2048, out_features=512, bias=False)
            (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          )
          (mlp): LlamaMLP(
            (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
            (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
            (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
          (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
  

# Training

In [None]:
def create_training_arguments(path, epochs=6):
    """Build the `TrainingArguments` for a fine-tuning run.

    Args:
        path: Output directory passed as `output_dir` (checkpoints go here).
        epochs: Number of training epochs.

    Returns:
        The configured `TrainingArguments` instance.
    """
    return TrainingArguments(
        output_dir=path,
        num_train_epochs=epochs,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        torch_empty_cache_steps=100,
        learning_rate=1e-3,
        max_grad_norm=0.3,
        weight_decay=0.01,
        lr_scheduler_type="cosine",
        warmup_ratio=0.03,
        # Log once per epoch so a training loss is reported next to every
        # validation loss. With the default step-based interval a short run
        # never reaches a logging step and the table only shows "No log".
        logging_strategy="epoch",
        # Evaluate and checkpoint at the end of each epoch; the two strategies
        # are kept identical because the best checkpoint is reloaded at the end.
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        save_safetensors=True,
        save_only_model=True,
    )

In [None]:
# Training arguments for the prefix-tuning run; checkpoints are written under "output/prefix".
training_args = create_training_arguments("output/prefix", NUM_EPOCHS)

In [None]:
def create_trainer(model, training_args, train_dataset, val_dataset):
    """Assemble a `Trainer` for causal language modelling.

    Args:
        model: Model to train.
        training_args: `TrainingArguments` controlling the run.
        train_dataset: Dataset used for training.
        val_dataset: Dataset used for evaluation.

    Returns:
        The configured `Trainer`.
    """
    # mlm=False indicates not to use masked language modeling.
    collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
    return Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=collator,
    )

In [None]:
# Build the trainer for the prefix-tuned model and start training.
trainer = create_trainer(peft_model, training_args, train_sample, val_sample)
trainer.train()

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss
1,No log,4.757418
2,No log,3.960869
3,No log,3.715076
4,No log,3.527806
5,No log,3.427896
6,No log,3.354516
7,No log,3.333267
8,No log,3.306718
9,No log,3.291918
10,No log,3.291041


TrainOutput(global_step=380, training_loss=3.4148382889597038, metrics={'train_runtime': 34.2445, 'train_samples_per_second': 43.803, 'train_steps_per_second': 11.097, 'total_flos': 235108208074752.0, 'train_loss': 3.4148382889597038, 'epoch': 10.0})

# Inference, question generation

In [None]:
def generate_new_question(model, prompt_text = "Example of a data science interview question: "):
    """Sample one interview question from `model`.

    Args:
        model: Model exposing `generate()`; inputs are moved to whichever
            device its parameters live on.
        prompt_text: Prompt the generation is conditioned on.

    Returns:
        The generated text after the prompt, cut after its first "?" and with
        a leading list number removed. Empty when no "?" was generated.
    """
    # Follow the model's device instead of assuming a CUDA device exists.
    device = next(model.parameters()).device
    inputs = tokenizer(prompt_text, return_tensors="pt").to(device)
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
        min_new_tokens=5,
        max_new_tokens=50,
        stop_strings=["None", "question:", "Question:", "Answer:"],
        tokenizer=tokenizer,
        repetition_penalty=2.0,
        early_stopping=True,
        do_sample=True,
        num_beams=5,
        temperature=1.75,
        top_p=0.5,
        min_p=0.05,
    )
    # Drop the prompt by token count rather than by character offset: the
    # decoded text is not guaranteed to reproduce the prompt character for
    # character, which would shift a character-based cut.
    prompt_length = inputs["input_ids"].shape[1]
    text_output = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    return _clean_generated_question(text_output)


def _clean_generated_question(text):
    """Trim generated text to its first question and strip a leading list number."""
    # Keep everything up to and including the first "?". When there is none,
    # find() returns -1 and the slice is empty, so the candidate ends up blank.
    text = text[:text.find("?") + 1]
    if len(text) > 3:
        if text[0] in "0123456789" and text[1:3] in (". ", ") "):
            # Leading "N. " or "N) " enumeration marker.
            text = text[3:]
        elif text[:2] == "1 ":
            # Bare leading "1 " marker.
            text = text[2:]
    return text.strip()

# Text generation for evaluation

In [None]:
# checkpoints: [38, 76, 114, 152, 190, 228, 266, 304, 342, 380]
NUM_GENERATION_ATTEMPTS = 100  # generations sampled per checkpoint
MIN_QUESTION_CHARS = 10        # shorter outputs are discarded
MIN_QUESTION_WORDS = 3         # outputs with fewer words are discarded

model_name = 'NousResearch/Llama-3.2-1B'
# Use the GPU when one is available instead of assuming CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Make sure the output directory exists before the first file is opened.
eval_dir = "output/evaluation/prefix"
os.makedirs(eval_dir, exist_ok=True)

# range(380, 381, 38) visits only checkpoint 380; use range(38, 381, 38)
# to evaluate every checkpoint listed above.
for num in range(380, 381, 38):
    load_path = "output/prefix/checkpoint-" + str(num)

    # A fresh base model is loaded for each checkpoint before the adapter is attached.
    base_model = AutoModelForCausalLM.from_pretrained(model_name)
    model = PeftModel.from_pretrained(base_model, load_path)
    model = model.to(device)

    # A set removes duplicate generations within this run.
    questions = set()
    for i in range(NUM_GENERATION_ATTEMPTS):
        question = generate_new_question(model)
        words = question.split()
        if len(question) >= MIN_QUESTION_CHARS and len(words) >= MIN_QUESTION_WORDS:
            questions.add(question)

    # Append mode keeps results from earlier runs. An explicit encoding keeps
    # non-ASCII characters intact regardless of the platform default, and
    # sorting makes the written order deterministic.
    out_path = os.path.join(eval_dir, 'checkpoint-' + str(num) + '.txt')
    with open(out_path, 'a', encoding='utf-8') as file:
        for question in sorted(questions):
            file.write(question + '\n')