# Imports

In [None]:
import torch
from peft import LoraConfig, PromptEncoderConfig, PrefixTuningConfig, TaskType, PeftModel
import datasets
from datasets import load_dataset, load_from_disk, Dataset
from transformers import Trainer, TrainingArguments, AutoTokenizer
import numpy as np
import matplotlib.pyplot as plt
from transformers import AutoModelForCausalLM
from peft import get_peft_model
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
import pandas as pd

# Dataset

In [None]:
# Load the question/answer CSV; its rows are accessed through the 'train' split.
dataset = load_dataset("csv", data_files="dataset_6.csv")
# Pandas view of the full dataset for quick inspection in the notebook.
df = pd.DataFrame(dataset['train'])
df.head()

Unnamed: 0,question,answer,difficulty,topic
0,What is supervised machine learning?,Supervised learning is a type of machine learn...,beginner,supervised learning
1,What is regression? Which models can you use t...,Regression is a part of supervised ML. Regress...,beginner,supervised learning
2,What is linear regression? When do we use it?,Linear regression is a model that assumes a li...,beginner,supervised learning
3,What are the main assumptions of linear regres...,There are several assumptions of linear regres...,intermediate,supervised learning
4,What’s the normal distribution? Why do we care...,The normal distribution is a continuous probab...,beginner,supervised learning


## Difficulty datasets

In [None]:
# Number of questions per difficulty level in the current `df`.
df["difficulty"].value_counts()

difficulty
intermediate    114
beginner         40
advanced         13
Name: count, dtype: int64

In [None]:
# Split the full dataset into one DatasetDict per difficulty level.
# Each row is visited exactly once; indexing dataset['train'][i] for every
# field re-materialised the whole row several times per iteration and left a
# stray notebook-level loop index behind.
beginner_dataset_list = []
intermediate_dataset_list = []
advanced_dataset_list = []
rows_by_difficulty = {
    "beginner": beginner_dataset_list,
    "intermediate": intermediate_dataset_list,
    "advanced": advanced_dataset_list,
}
for row in dataset['train']:
    bucket = rows_by_difficulty.get(row['difficulty'])
    if bucket is None:
        # Rows with any other (or missing) difficulty label are ignored.
        continue
    bucket.append({'question': row['question'], 'answer': row['answer'], 'difficulty': row['difficulty']})

# Each subset is exposed under a single 'train' split.
beginner_dataset = datasets.DatasetDict({'train': Dataset.from_list(beginner_dataset_list)})
intermediate_dataset = datasets.DatasetDict({'train': Dataset.from_list(intermediate_dataset_list)})
advanced_dataset = datasets.DatasetDict({'train': Dataset.from_list(advanced_dataset_list)})

In [None]:
# Preview only. A dedicated name is used so the full-dataset `df`, which the
# "Topic datasets" cells read again, is not replaced by this frame that has no
# 'topic' column.
beginner_df = pd.DataFrame(beginner_dataset['train'])
beginner_df.head()

Unnamed: 0,question,answer,difficulty
0,What is supervised machine learning?,Supervised learning is a type of machine learn...,beginner
1,What is regression? Which models can you use t...,Regression is a part of supervised ML. Regress...,beginner
2,What is linear regression? When do we use it?,Linear regression is a model that assumes a li...,beginner
3,What’s the normal distribution? Why do we care...,The normal distribution is a continuous probab...,beginner
4,Which metrics for evaluating regression models...,1. Mean Squared Error(MSE)\n2. Root Mean Squar...,beginner


In [None]:
# Preview only. A dedicated name is used so the full-dataset `df`, which the
# "Topic datasets" cells read again, is not replaced by this frame that has no
# 'topic' column.
intermediate_df = pd.DataFrame(intermediate_dataset['train'])
intermediate_df.head()

Unnamed: 0,question,answer,difficulty
0,What are the main assumptions of linear regres...,There are several assumptions of linear regres...,intermediate
1,How do we check if a variable follows the norm...,1. Plot a histogram out of the sampled data. I...,intermediate
2,What if we want to build a model for predictin...,"Data is not normal. Specially, real-world data...",intermediate
3,What methods for solving linear regression do ...,"To solve linear regression, you need to find t...",intermediate
4,What is gradient descent? How does it work?,Gradient descent is an algorithm that uses cal...,intermediate


In [None]:
# Preview only. A dedicated name is used so the full-dataset `df`, which the
# "Topic datasets" cells read again, is not replaced by this frame that has no
# 'topic' column.
advanced_df = pd.DataFrame(advanced_dataset['train'])
advanced_df.head()

Unnamed: 0,question,answer,difficulty
0,"What if instead of finding the best split, we ...",Answer here,advanced
1,Are there any differences between continuous a...,Answer here,advanced
2,How do you approach tuning parameters in XGBoo...,"Depending upon the dataset, parameter tuning c...",advanced
3,Are CNNs resistant to rotations? What happens ...,CNNs are not resistant to rotation by design. ...,advanced
4,What kind of CNN architectures for classificat...,Image Classification\n* Inception v3\n* Xcepti...,advanced


## Topic datasets

In [None]:
# Count questions per topic. The frame is rebuilt from the full dataset so this
# cell does not depend on whatever `df` was last bound to (the difficulty
# previews have no 'topic' column).
df = pd.DataFrame(dataset['train'])
df["topic"].value_counts()

topic
neural networks          34
feature selection        30
classification           21
unsupervised learning    21
supervised learning      20
text classification      14
regularization           13
recommender systems       7
time series               7
Name: count, dtype: int64

In [None]:
# Topic names ordered by frequency, taken from the full dataset rather than
# from whatever `df` currently holds.
topics = list(pd.DataFrame(dataset['train'])["topic"].value_counts().index)

In [None]:
# Build one DataFrame of (question, topic) rows per topic.
# The dataset is converted to pandas once and filtered per topic, instead of
# re-scanning and re-indexing every row for every topic and round-tripping
# through a temporary Dataset. No notebook-level `df` is overwritten.
all_rows = pd.DataFrame(dataset['train'])[['question', 'topic']]
topic_dfs = {
    topic_name: all_rows[all_rows['topic'] == topic_name].reset_index(drop=True)
    for topic_name in topics
}
print(len(topic_dfs))

9


In [None]:
# Spot-check the rows collected for one topic.
topic_dfs['time series'].head()

Unnamed: 0,question,topic
0,What is a time series?,time series
1,How is time series different from the usual re...,time series
2,Which models do you know for solving time seri...,time series
3,"If there’s a trend in our series, how we can r...",time series
4,You have a series with only one variable “y” m...,time series


## Train-test split

In [None]:
# Preview a 90/10 train/test split of the beginner subset.
# NOTE(review): no seed is passed, so the split presumably changes between
# runs; `train_test_dataset` is re-assigned by later cells.
train_test_dataset = beginner_dataset["train"].train_test_split(test_size=0.1)

print(train_test_dataset)

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'difficulty'],
        num_rows: 36
    })
    test: Dataset({
        features: ['question', 'answer', 'difficulty'],
        num_rows: 4
    })
})


In [None]:
# Preview a 90/10 train/test split of the intermediate subset.
# NOTE(review): no seed is passed, so the split presumably changes between
# runs; `train_test_dataset` is re-assigned by later cells.
train_test_dataset = intermediate_dataset["train"].train_test_split(test_size=0.1)

print(train_test_dataset)

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'difficulty'],
        num_rows: 102
    })
    test: Dataset({
        features: ['question', 'answer', 'difficulty'],
        num_rows: 12
    })
})


In [None]:
# Preview a 90/10 train/test split of the advanced subset.
# NOTE(review): no seed is passed, so the split presumably changes between
# runs; `train_test_dataset` is re-assigned by later cells.
train_test_dataset = advanced_dataset["train"].train_test_split(test_size=0.1)

print(train_test_dataset)

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'difficulty'],
        num_rows: 11
    })
    test: Dataset({
        features: ['question', 'answer', 'difficulty'],
        num_rows: 2
    })
})


# LoRA model

In [None]:
# LoRA adapter settings for causal language modelling: rank 16, alpha 32,
# no dropout, no bias training, targeting 'all-linear' modules.
lora_config = LoraConfig(task_type=TaskType.CAUSAL_LM, inference_mode=False, r=16, lora_alpha=32, lora_dropout=0, bias="none", target_modules='all-linear')

# Base causal LM that the adapters are attached to.
model_name = 'NousResearch/Llama-3.2-1B'
base_model = AutoModelForCausalLM.from_pretrained(model_name, low_cpu_mem_usage=True)

# Wrap the base model with the LoRA adapters and report the trainable share.
lora_model = get_peft_model(base_model, lora_config)
lora_model.print_trainable_parameters()

trainable params: 11,272,192 || all params: 1,247,086,592 || trainable%: 0.9039


In [None]:
# Display the module tree of the PEFT-wrapped model.
lora_model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 2048)
        (layers): ModuleList(
          (0-15): 16 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear(
                (b

# Tokenizer

In [None]:
# Tokenizer belonging to the same checkpoint as the base model.
model_name = 'NousResearch/Llama-3.2-1B'
# NOTE(review): `padding` and `truncation` are normally arguments of the
# tokenizer call rather than of from_pretrained — confirm they have any effect.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code = True, padding=True, truncation=True)
# Reuse the end-of-sequence token for padding and pad on the right.
# NOTE(review): with pad == EOS, a collator that masks padding may also mask
# the real EOS token from the loss — verify the model still learns to stop.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Training

## Training the general model

In [None]:
# 90/10 train/test split of the full dataset for the general model.
# NOTE(review): no seed is passed, so the split presumably changes between runs.
train_test_dataset = dataset["train"].train_test_split(test_size=0.1)

print(train_test_dataset)

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'difficulty', 'topic'],
        num_rows: 150
    })
    test: Dataset({
        features: ['question', 'answer', 'difficulty', 'topic'],
        num_rows: 17
    })
})


In [None]:
# Hyper-parameters for fine-tuning the general (all questions) LoRA adapter.
training_args = TrainingArguments(
    output_dir="output/lora",
    num_train_epochs=10,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    torch_empty_cache_steps=100,
    learning_rate=1e-3,
    max_grad_norm=0.3,
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    # Log, evaluate and checkpoint once per epoch. Without an explicit logging
    # strategy the default step interval is longer than this whole run, which
    # is why the results table reported "No log" for the training loss.
    logging_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",
    # Reload the checkpoint with the best evaluation metric when training ends.
    load_best_model_at_end=True,
    save_safetensors=True,
    save_only_model=True,
)

# NOTE(review): every training cell re-uses this same `lora_model` object, so
# each run appears to continue from the adapter weights left by the previous
# one — confirm this is intended, otherwise rebuild the PEFT model first.
model = lora_model

def formatting_func(example):
    """Turn dataset rows into prompt-prefixed training texts.

    Args:
        example: Mapping with a 'question' entry that is either a single
            string (one row) or a list of strings (a batch of rows).

    Returns:
        A list with one formatted text per question. Any other 'question'
        type yields an empty list.
    """
    prefix = "Example of a data science interview question: "
    questions = example['question']

    # isinstance (rather than an exact type() comparison) also accepts
    # str/list subclasses.
    if isinstance(questions, str):
        return [f"{prefix}{questions}"]
    if isinstance(questions, list):
        return [f"{prefix}{question}" for question in questions]
    return []

# The completion-only collator receives " question:" as its response template;
# per TRL's documented behaviour the loss should then cover only the text after
# that marker (the question itself), not the fixed prompt prefix.
response_template = " question:"
collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)

# Supervised fine-tuning on the train split, evaluated on the held-out split.
# NOTE(review): `model` is already PEFT-wrapped and `peft_config` is passed as
# well — confirm the installed TRL version does not wrap the model twice.
trainer = SFTTrainer(
    model = model,
    train_dataset = train_test_dataset["train"],
    eval_dataset = train_test_dataset["test"],
    peft_config = lora_config,
    formatting_func = formatting_func,
    data_collator=collator,
    #max_seq_length = max_seq_length,
    args = training_args,
    processing_class = tokenizer,
    #packing = packing,
)

# Run training; checkpoints are written under training_args.output_dir.
trainer.train()

Applying formatting function to train dataset:   0%|          | 0/150 [00:00<?, ? examples/s]

Converting train dataset to ChatML:   0%|          | 0/150 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/150 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/150 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/150 [00:00<?, ? examples/s]

Applying formatting function to eval dataset:   0%|          | 0/17 [00:00<?, ? examples/s]

Converting eval dataset to ChatML:   0%|          | 0/17 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/17 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/17 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/17 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss
1,No log,2.188293
2,No log,2.253862
3,No log,2.535734
4,No log,2.648258
5,No log,2.672976
6,No log,2.970395
7,No log,3.040295
8,No log,3.140867
9,No log,3.220491
10,No log,3.231928


TrainOutput(global_step=50, training_loss=0.8935701751708984, metrics={'train_runtime': 39.7386, 'train_samples_per_second': 37.747, 'train_steps_per_second': 1.258, 'total_flos': 344255018582016.0, 'train_loss': 0.8935701751708984})

## Training the difficulty models

### Beginner difficulty

In [None]:
# 90/10 train/test split of the beginner subset used for the training below.
# NOTE(review): no seed is passed, so the split presumably changes between runs.
train_test_dataset = beginner_dataset["train"].train_test_split(test_size=0.1)

print(train_test_dataset)

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'difficulty'],
        num_rows: 36
    })
    test: Dataset({
        features: ['question', 'answer', 'difficulty'],
        num_rows: 4
    })
})


In [None]:
# Hyper-parameters for fine-tuning the beginner-difficulty LoRA adapter.
training_args = TrainingArguments(
    output_dir="output/beginner/lora",
    num_train_epochs=10,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    torch_empty_cache_steps=100,
    learning_rate=1e-3,
    max_grad_norm=0.3,
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    # Log, evaluate and checkpoint once per epoch. Without an explicit logging
    # strategy the default step interval is longer than this whole run, which
    # is why the results table reported "No log" for the training loss.
    logging_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",
    # Reload the checkpoint with the best evaluation metric when training ends.
    load_best_model_at_end=True,
    save_safetensors=True,
    save_only_model=True,
)

# NOTE(review): this is the same `lora_model` object used by the other training
# cells, so training appears to continue from whatever adapter weights the
# previous run left behind — confirm this is intended.
model = lora_model

def formatting_func(example):
    """Turn dataset rows into prompt-prefixed training texts.

    Args:
        example: Mapping with a 'question' entry that is either a single
            string (one row) or a list of strings (a batch of rows).

    Returns:
        A list with one formatted text per question. Any other 'question'
        type yields an empty list.
    """
    prefix = "Example of a data science interview question: "
    questions = example['question']

    # isinstance (rather than an exact type() comparison) also accepts
    # str/list subclasses.
    if isinstance(questions, str):
        return [f"{prefix}{questions}"]
    if isinstance(questions, list):
        return [f"{prefix}{question}" for question in questions]
    return []

# The completion-only collator receives " question:" as its response template;
# per TRL's documented behaviour the loss should then cover only the text after
# that marker (the question itself), not the fixed prompt prefix.
response_template = " question:"
collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)

# Supervised fine-tuning on the beginner train split, evaluated on its test split.
# NOTE(review): `model` is already PEFT-wrapped and `peft_config` is passed as
# well — confirm the installed TRL version does not wrap the model twice.
trainer = SFTTrainer(
    model = model,
    train_dataset = train_test_dataset["train"],
    eval_dataset = train_test_dataset["test"],
    peft_config = lora_config,
    formatting_func = formatting_func,
    data_collator=collator,
    #max_seq_length = max_seq_length,
    args = training_args,
    processing_class = tokenizer,
    #packing = packing,
)

# Run training; checkpoints are written under training_args.output_dir.
trainer.train()

Applying formatting function to train dataset:   0%|          | 0/36 [00:00<?, ? examples/s]

Converting train dataset to ChatML:   0%|          | 0/36 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/36 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/36 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/36 [00:00<?, ? examples/s]

Applying formatting function to eval dataset:   0%|          | 0/4 [00:00<?, ? examples/s]

Converting eval dataset to ChatML:   0%|          | 0/4 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/4 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/4 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/4 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss
1,No log,1.911035
2,No log,2.640229
3,No log,2.000848
4,No log,2.204309
5,No log,2.141031
6,No log,2.2565
7,No log,2.283427
8,No log,2.282562
9,No log,2.343166
10,No log,2.353372


TrainOutput(global_step=20, training_loss=1.1467519760131837, metrics={'train_runtime': 14.8377, 'train_samples_per_second': 24.262, 'train_steps_per_second': 1.348, 'total_flos': 58025552510976.0, 'train_loss': 1.1467519760131837})

### Intermediate difficulty

In [None]:
# 90/10 train/test split of the intermediate subset used for the training below.
# NOTE(review): no seed is passed, so the split presumably changes between runs.
train_test_dataset = intermediate_dataset["train"].train_test_split(test_size=0.1)

print(train_test_dataset)

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'difficulty'],
        num_rows: 102
    })
    test: Dataset({
        features: ['question', 'answer', 'difficulty'],
        num_rows: 12
    })
})


In [None]:
# Hyper-parameters for fine-tuning the intermediate-difficulty LoRA adapter.
training_args = TrainingArguments(
    output_dir="output/intermediate/lora",
    num_train_epochs=10,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    torch_empty_cache_steps=100,
    learning_rate=1e-3,
    max_grad_norm=0.3,
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    # Log, evaluate and checkpoint once per epoch. Without an explicit logging
    # strategy the default step interval is longer than this whole run, which
    # is why the results table reported "No log" for the training loss.
    logging_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",
    # Reload the checkpoint with the best evaluation metric when training ends.
    load_best_model_at_end=True,
    save_safetensors=True,
    save_only_model=True,
)

# NOTE(review): this is the same `lora_model` object used by the other training
# cells, so training appears to continue from whatever adapter weights the
# previous run left behind — confirm this is intended.
model = lora_model

def formatting_func(example):
    """Turn dataset rows into prompt-prefixed training texts.

    Args:
        example: Mapping with a 'question' entry that is either a single
            string (one row) or a list of strings (a batch of rows).

    Returns:
        A list with one formatted text per question. Any other 'question'
        type yields an empty list.
    """
    prefix = "Example of a data science interview question: "
    questions = example['question']

    # isinstance (rather than an exact type() comparison) also accepts
    # str/list subclasses.
    if isinstance(questions, str):
        return [f"{prefix}{questions}"]
    if isinstance(questions, list):
        return [f"{prefix}{question}" for question in questions]
    return []

# The completion-only collator receives " question:" as its response template;
# per TRL's documented behaviour the loss should then cover only the text after
# that marker (the question itself), not the fixed prompt prefix.
response_template = " question:"
collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)

# Supervised fine-tuning on the intermediate train split, evaluated on its test split.
# NOTE(review): `model` is already PEFT-wrapped and `peft_config` is passed as
# well — confirm the installed TRL version does not wrap the model twice.
trainer = SFTTrainer(
    model = model,
    train_dataset = train_test_dataset["train"],
    eval_dataset = train_test_dataset["test"],
    peft_config = lora_config,
    formatting_func = formatting_func,
    data_collator=collator,
    #max_seq_length = max_seq_length,
    args = training_args,
    processing_class = tokenizer,
    #packing = packing,
)

# Run training; checkpoints are written under training_args.output_dir.
trainer.train()

Applying formatting function to train dataset:   0%|          | 0/102 [00:00<?, ? examples/s]

Converting train dataset to ChatML:   0%|          | 0/102 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/102 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/102 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/102 [00:00<?, ? examples/s]

Applying formatting function to eval dataset:   0%|          | 0/12 [00:00<?, ? examples/s]

Converting eval dataset to ChatML:   0%|          | 0/12 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/12 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/12 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/12 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss
1,No log,2.29987
2,No log,2.284327
3,No log,2.145491
4,No log,2.296995
5,No log,2.533541
6,No log,2.60253
7,No log,2.683346
8,No log,2.791818
9,No log,2.819408
10,No log,2.824565


TrainOutput(global_step=40, training_loss=0.8879347801208496, metrics={'train_runtime': 38.5336, 'train_samples_per_second': 26.47, 'train_steps_per_second': 1.038, 'total_flos': 239012826537984.0, 'train_loss': 0.8879347801208496})

### Advanced difficulty

In [None]:
# 90/10 train/test split of the advanced subset used for the training below.
# NOTE(review): no seed is passed, so the split presumably changes between runs.
train_test_dataset = advanced_dataset["train"].train_test_split(test_size=0.1)

print(train_test_dataset)

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'difficulty'],
        num_rows: 11
    })
    test: Dataset({
        features: ['question', 'answer', 'difficulty'],
        num_rows: 2
    })
})


In [None]:
# Hyper-parameters for fine-tuning the advanced-difficulty LoRA adapter.
training_args = TrainingArguments(
    output_dir="output/advanced/lora",
    num_train_epochs=10,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    torch_empty_cache_steps=100,
    learning_rate=1e-3,
    max_grad_norm=0.3,
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    # Log, evaluate and checkpoint once per epoch. Without an explicit logging
    # strategy the default step interval is longer than this whole run, which
    # is why the results table reported "No log" for the training loss.
    logging_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",
    # Reload the checkpoint with the best evaluation metric when training ends.
    load_best_model_at_end=True,
    save_safetensors=True,
    save_only_model=True,
)

# NOTE(review): this is the same `lora_model` object used by the other training
# cells, so training appears to continue from whatever adapter weights the
# previous run left behind — confirm this is intended.
model = lora_model

def formatting_func(example):
    """Turn dataset rows into prompt-prefixed training texts.

    Args:
        example: Mapping with a 'question' entry that is either a single
            string (one row) or a list of strings (a batch of rows).

    Returns:
        A list with one formatted text per question. Any other 'question'
        type yields an empty list.
    """
    prefix = "Example of a data science interview question: "
    questions = example['question']

    # isinstance (rather than an exact type() comparison) also accepts
    # str/list subclasses.
    if isinstance(questions, str):
        return [f"{prefix}{questions}"]
    if isinstance(questions, list):
        return [f"{prefix}{question}" for question in questions]
    return []

# The completion-only collator receives " question:" as its response template;
# per TRL's documented behaviour the loss should then cover only the text after
# that marker (the question itself), not the fixed prompt prefix.
response_template = " question:"
collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)

# Supervised fine-tuning on the advanced train split, evaluated on its test split.
# NOTE(review): `model` is already PEFT-wrapped and `peft_config` is passed as
# well — confirm the installed TRL version does not wrap the model twice.
trainer = SFTTrainer(
    model = model,
    train_dataset = train_test_dataset["train"],
    eval_dataset = train_test_dataset["test"],
    peft_config = lora_config,
    formatting_func = formatting_func,
    data_collator=collator,
    #max_seq_length = max_seq_length,
    args = training_args,
    processing_class = tokenizer,
    #packing = packing,
)

# Run training; checkpoints are written under training_args.output_dir.
trainer.train()

Applying formatting function to train dataset:   0%|          | 0/6 [00:00<?, ? examples/s]

Converting train dataset to ChatML:   0%|          | 0/6 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/6 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/6 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/6 [00:00<?, ? examples/s]

Applying formatting function to eval dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Converting eval dataset to ChatML:   0%|          | 0/1 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss
1,No log,2.830185
2,No log,6.077318
3,No log,5.643332
4,No log,4.815195
5,No log,4.231895
6,No log,4.514734
7,No log,4.391691
8,No log,4.248922
9,No log,4.323319
10,No log,4.369726



Internal Error - We're working hard to fix this as soon as possible! - silently ignoring the lookup for the file config.json in NousResearch/Llama-3.2-1B.

Internal Error - We're working hard to fix this as soon as possible! - silently ignoring the lookup for the file config.json in NousResearch/Llama-3.2-1B.

Internal Error - We're working hard to fix this as soon as possible! - silently ignoring the lookup for the file config.json in NousResearch/Llama-3.2-1B.

Internal Error - We're working hard to fix this as soon as possible! - silently ignoring the lookup for the file config.json in NousResearch/Llama-3.2-1B.

Internal Error - We're working hard to fix this as soon as possible! - silently ignoring the lookup for the file config.json in NousResearch/Llama-3.2-1B.

Internal Error - We're working hard to fix this as soon as possible! - silently ignoring the lookup for the file config.json in NousResearch/Llama-3.2-1B.

Internal Error - We're working hard to fix this as soon as poss

TrainOutput(global_step=10, training_loss=0.7272938251495361, metrics={'train_runtime': 34.7511, 'train_samples_per_second': 1.727, 'train_steps_per_second': 0.288, 'total_flos': 15238795345920.0, 'train_loss': 0.7272938251495361})

## Training of topic models

In [None]:
# Position in `topics` of the topic to train on in the cells below.
i = 8  # index from 0 to 8

In [None]:
# Resolve the selected index to its topic name and display it.
topic = topics[i]
topic

'time series'

In [None]:
# Wrap the selected topic's questions as single-field records, expose them as
# a 'train' split, then carve out a 90/10 train/test split for fine-tuning.
topic_list = [{'question': q} for q in topic_dfs[topic]['question']]
topic_dataset = datasets.DatasetDict({'train': Dataset.from_list(topic_list)})

train_test_dataset = topic_dataset["train"].train_test_split(test_size=0.1)

print(train_test_dataset)

DatasetDict({
    train: Dataset({
        features: ['question'],
        num_rows: 6
    })
    test: Dataset({
        features: ['question'],
        num_rows: 1
    })
})


In [None]:
# Hyper-parameters for fine-tuning the LoRA adapter of the selected topic;
# checkpoints go to a per-topic output directory.
training_args = TrainingArguments(
    output_dir="output/topic/" + topic,
    num_train_epochs=5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    torch_empty_cache_steps=100,
    learning_rate=1e-3,
    max_grad_norm=0.3,
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    # Log, evaluate and checkpoint once per epoch. Without an explicit logging
    # strategy the default step interval is longer than this whole run, which
    # is why the results table reported "No log" for the training loss.
    logging_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",
    # Reload the checkpoint with the best evaluation metric when training ends.
    load_best_model_at_end=True,
    save_safetensors=True,
    save_only_model=True,
)

# NOTE(review): this is the same `lora_model` object used by the other training
# cells, so training appears to continue from whatever adapter weights the
# previous run left behind — confirm this is intended.
model = lora_model

def formatting_func(example):
    """Turn dataset rows into prompt-prefixed training texts.

    Args:
        example: Mapping with a 'question' entry that is either a single
            string (one row) or a list of strings (a batch of rows).

    Returns:
        A list with one formatted text per question. Any other 'question'
        type yields an empty list.
    """
    prefix = "Example of a data science interview question: "
    questions = example['question']

    # isinstance (rather than an exact type() comparison) also accepts
    # str/list subclasses.
    if isinstance(questions, str):
        return [f"{prefix}{questions}"]
    if isinstance(questions, list):
        return [f"{prefix}{question}" for question in questions]
    return []

# The completion-only collator receives " question:" as its response template;
# per TRL's documented behaviour the loss should then cover only the text after
# that marker (the question itself), not the fixed prompt prefix.
response_template = " question:"
collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)

# Supervised fine-tuning on the topic's train split, evaluated on its test split.
# NOTE(review): `model` is already PEFT-wrapped and `peft_config` is passed as
# well — confirm the installed TRL version does not wrap the model twice.
trainer = SFTTrainer(
    model = model,
    train_dataset = train_test_dataset["train"],
    eval_dataset = train_test_dataset["test"],
    peft_config = lora_config,
    formatting_func = formatting_func,
    data_collator=collator,
    #max_seq_length = max_seq_length,
    args = training_args,
    processing_class = tokenizer,
    #packing = packing,
)

# Run training; checkpoints are written under training_args.output_dir.
trainer.train()

Applying formatting function to train dataset:   0%|          | 0/6 [00:00<?, ? examples/s]

Converting train dataset to ChatML:   0%|          | 0/6 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/6 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/6 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/6 [00:00<?, ? examples/s]

Applying formatting function to eval dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Converting eval dataset to ChatML:   0%|          | 0/1 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss
1,No log,2.830185
2,No log,6.077318
3,No log,5.499052
4,No log,5.404317
5,No log,5.177464



Internal Error - We're working hard to fix this as soon as possible! - silently ignoring the lookup for the file config.json in NousResearch/Llama-3.2-1B.

Internal Error - We're working hard to fix this as soon as possible! - silently ignoring the lookup for the file config.json in NousResearch/Llama-3.2-1B.

Internal Error - We're working hard to fix this as soon as possible! - silently ignoring the lookup for the file config.json in NousResearch/Llama-3.2-1B.

Internal Error - We're working hard to fix this as soon as possible! - silently ignoring the lookup for the file config.json in NousResearch/Llama-3.2-1B.

Internal Error - We're working hard to fix this as soon as possible! - silently ignoring the lookup for the file config.json in NousResearch/Llama-3.2-1B.


TrainOutput(global_step=5, training_loss=1.232447052001953, metrics={'train_runtime': 25.3998, 'train_samples_per_second': 1.181, 'train_steps_per_second': 0.197, 'total_flos': 7619397672960.0, 'train_loss': 1.232447052001953})

# Inference, question generation

In [None]:
def _strip_list_marker(text):
    """Remove a leading enumeration marker ("3. ", "12) " or "1 ") from ``text``."""
    # Very short strings are left untouched so a bare marker is never turned into "".
    if len(text) <= 3:
        return text
    digit_count = len(text) - len(text.lstrip("0123456789"))
    if digit_count and text[digit_count:digit_count + 2] in (". ", ") "):
        return text[digit_count + 2:]
    if text.startswith("1 "):
        return text[2:]
    return text


def _clean_generated_question(text):
    """Cut ``text`` after its first "?" and drop list numbering and outer whitespace.

    Returns "" when ``text`` contains no question mark.
    """
    question_end = text.find("?")
    if question_end == -1:
        return ""
    # Strip leading whitespace first so the enumeration check sees the marker itself.
    question = text[:question_end + 1].lstrip()
    return _strip_list_marker(question).strip()


def generate_new_question(model, prompt_text = "Example of a data science interview question: "):
    """Sample one question from ``model`` as a continuation of ``prompt_text``.

    Uses the module-level ``tokenizer`` for encoding and decoding.

    Args:
        model: Causal language model exposing ``generate`` and ``device``.
        prompt_text: Prompt the model continues; it is not part of the result.

    Returns:
        The generated text up to and including its first "?", with a leading
        enumeration marker and surrounding whitespace removed, or "" when the
        continuation contains no "?".
    """
    # Put the inputs on the model's own device instead of assuming CUDA.
    inputs = tokenizer(prompt_text, return_tensors="pt").to(model.device)
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
        min_new_tokens=5,
        max_new_tokens=50,
        stop_strings=["None", "question:", "Question:", "Answer:"],
        tokenizer=tokenizer,
        repetition_penalty=2.0,
        early_stopping=True,
        do_sample=True,
        num_beams=3,
        temperature=1.5,
        top_p=0.75,
        min_p=0.1,
    )
    # Decode only the newly generated tokens; slicing the decoded string by
    # len(prompt_text) breaks if decoding does not reproduce the prompt exactly.
    prompt_length = inputs["input_ids"].shape[1]
    continuation = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    return _clean_generated_question(continuation)

# Text generation for evaluation

## General model

In [None]:
# Sample questions from each general-model LoRA checkpoint (5, 10, ..., 50) and
# append the distinct, long-enough ones to one text file per checkpoint.
model_name = 'NousResearch/Llama-3.2-1B'
for num in range(5, 51, 5):
    load_path = f"output/lora/checkpoint-{num}"

    # Load a fresh base model per checkpoint and fold the adapter into it.
    base_model = AutoModelForCausalLM.from_pretrained(model_name)
    model = PeftModel.from_pretrained(base_model, load_path)
    model = model.merge_and_unload()
    model = model.to("cuda")

    questions = set()
    for _ in range(100):
        question = generate_new_question(model)
        # Keep only outputs of at least 10 characters and 3 words.
        if len(question) >= 10 and len(question.split()) >= 3:
            questions.add(question)

    # Explicit UTF-8 so non-ASCII characters in generated text are written
    # the same way regardless of the platform's default encoding.
    with open(f'output/evaluation/lora/checkpoint-{num}.txt', 'a', encoding='utf-8') as file:
        for question in questions:
            file.write(question + '\n')

## Difficulty models

In [None]:
# beginner
# Sample questions from each beginner LoRA checkpoint (2, 4, ..., 20) and append
# the distinct, long-enough ones to one text file per checkpoint.
model_name = 'NousResearch/Llama-3.2-1B'
for num in range(2, 21, 2):
    load_path = f"output/beginner/lora/checkpoint-{num}"

    # Load a fresh base model per checkpoint and fold the adapter into it.
    base_model = AutoModelForCausalLM.from_pretrained(model_name)
    model = PeftModel.from_pretrained(base_model, load_path)
    model = model.merge_and_unload()
    model = model.to("cuda")

    questions = set()
    for _ in range(100):
        question = generate_new_question(model)
        # Keep only outputs of at least 10 characters and 3 words.
        if len(question) >= 10 and len(question.split()) >= 3:
            questions.add(question)

    # Explicit UTF-8 so non-ASCII characters in generated text are written
    # the same way regardless of the platform's default encoding.
    with open(f'output/evaluation/beginner/lora/checkpoint-{num}.txt', 'a', encoding='utf-8') as file:
        for question in questions:
            file.write(question + '\n')

In [None]:
# intermediate
# Sample questions from each intermediate LoRA checkpoint (4, 8, ..., 40) and
# append the distinct, long-enough ones to one text file per checkpoint.
model_name = 'NousResearch/Llama-3.2-1B'
for num in range(4, 41, 4):
    load_path = f"output/intermediate/lora/checkpoint-{num}"

    # Load a fresh base model per checkpoint and fold the adapter into it.
    base_model = AutoModelForCausalLM.from_pretrained(model_name)
    model = PeftModel.from_pretrained(base_model, load_path)
    model = model.merge_and_unload()
    model = model.to("cuda")

    questions = set()
    for _ in range(100):
        question = generate_new_question(model)
        # Keep only outputs of at least 10 characters and 3 words.
        if len(question) >= 10 and len(question.split()) >= 3:
            questions.add(question)

    # Explicit UTF-8 so non-ASCII characters in generated text are written
    # the same way regardless of the platform's default encoding.
    with open(f'output/evaluation/intermediate/lora/checkpoint-{num}.txt', 'a', encoding='utf-8') as file:
        for question in questions:
            file.write(question + '\n')

In [None]:
# advanced
# Sample questions from each advanced LoRA checkpoint (1, 2, ..., 10) and append
# the distinct, long-enough ones to one text file per checkpoint.
model_name = 'NousResearch/Llama-3.2-1B'
for num in range(1, 11, 1):
    load_path = f"output/advanced/lora/checkpoint-{num}"

    # Load a fresh base model per checkpoint and fold the adapter into it.
    base_model = AutoModelForCausalLM.from_pretrained(model_name)
    model = PeftModel.from_pretrained(base_model, load_path)
    model = model.merge_and_unload()
    model = model.to("cuda")

    questions = set()
    for _ in range(100):
        question = generate_new_question(model)
        # Keep only outputs of at least 10 characters and 3 words.
        if len(question) >= 10 and len(question.split()) >= 3:
            questions.add(question)

    # Explicit UTF-8 so non-ASCII characters in generated text are written
    # the same way regardless of the platform's default encoding.
    with open(f'output/evaluation/advanced/lora/checkpoint-{num}.txt', 'a', encoding='utf-8') as file:
        for question in questions:
            file.write(question + '\n')

## Topic models

In [None]:
# Topic names; each one is also used as a directory name under output/topic/.
topic_list = [
    'supervised learning',
    'classification',
    'regularization',
    'feature selection',
    'neural networks',
    'text classification',
    'unsupervised learning',
    'recommender systems',
    'time series',
]

In [None]:
# For every topic, sample questions from its checkpoints 1..5 and append the
# distinct, long-enough ones to one text file per (topic, checkpoint) pair.
model_name = 'NousResearch/Llama-3.2-1B'
for topic in topic_list:
    for num in range(1, 6, 1):
        load_path = f"output/topic/{topic}/checkpoint-{num}"

        # Load a fresh base model per checkpoint and fold the adapter into it.
        base_model = AutoModelForCausalLM.from_pretrained(model_name)
        model = PeftModel.from_pretrained(base_model, load_path)
        model = model.merge_and_unload()
        model = model.to("cuda")

        questions = set()
        for _ in range(100):
            question = generate_new_question(model)
            # Keep only outputs of at least 10 characters and 3 words.
            if len(question) >= 10 and len(question.split()) >= 3:
                questions.add(question)

        # Explicit UTF-8 so non-ASCII characters in generated text are written
        # the same way regardless of the platform's default encoding.
        with open(f'output/evaluation/topic/{topic}/checkpoint-{num}.txt', 'a', encoding='utf-8') as file:
            for question in questions:
                file.write(question + '\n')

# Text generation for creating the extended datasets

## Questions generated with general model

In [None]:
# Generate extra questions with the general model's chosen checkpoint and
# append the distinct, long-enough ones to questions/general.txt.
num = 15
load_path = f"output/lora/checkpoint-{num}"

model_name = 'NousResearch/Llama-3.2-1B'
base_model = AutoModelForCausalLM.from_pretrained(model_name)
model = PeftModel.from_pretrained(base_model, load_path)
model = model.merge_and_unload()
model = model.to("cuda")

questions = set()
for _ in range(100):
    question = generate_new_question(model)
    # Keep only outputs of at least 10 characters and 3 words.
    if len(question) >= 10 and len(question.split()) >= 3:
        questions.add(question)

# Explicit UTF-8 so non-ASCII characters in generated text are written the
# same way regardless of the platform's default encoding.
with open('questions/general.txt', 'a', encoding='utf-8') as file:
    for question in questions:
        file.write(question + '\n')

## Difficulty extended dataset

In [None]:
# beginner
# Generate extra questions with the beginner model's chosen checkpoint and
# append the distinct, long-enough ones to questions/beginner.txt.
num = 6
load_path = f"output/beginner/lora/checkpoint-{num}"

model_name = 'NousResearch/Llama-3.2-1B'
base_model = AutoModelForCausalLM.from_pretrained(model_name)
model = PeftModel.from_pretrained(base_model, load_path)
model = model.merge_and_unload()
model = model.to("cuda")

questions = set()
for _ in range(100):
    question = generate_new_question(model)
    # Keep only outputs of at least 10 characters and 3 words.
    if len(question) >= 10 and len(question.split()) >= 3:
        questions.add(question)

# Explicit UTF-8 so non-ASCII characters in generated text are written the
# same way regardless of the platform's default encoding.
with open('questions/beginner.txt', 'a', encoding='utf-8') as file:
    for question in questions:
        file.write(question + '\n')

In [None]:
# intermediate
# Generate extra questions with the intermediate model's chosen checkpoint and
# append the distinct, long-enough ones to questions/intermediate.txt.
num = 16
load_path = f"output/intermediate/lora/checkpoint-{num}"

model_name = 'NousResearch/Llama-3.2-1B'
base_model = AutoModelForCausalLM.from_pretrained(model_name)
model = PeftModel.from_pretrained(base_model, load_path)
model = model.merge_and_unload()
model = model.to("cuda")

questions = set()
for _ in range(100):
    question = generate_new_question(model)
    # Keep only outputs of at least 10 characters and 3 words.
    if len(question) >= 10 and len(question.split()) >= 3:
        questions.add(question)

# Explicit UTF-8 so non-ASCII characters in generated text are written the
# same way regardless of the platform's default encoding.
with open('questions/intermediate.txt', 'a', encoding='utf-8') as file:
    for question in questions:
        file.write(question + '\n')

In [None]:
# advanced
# Generate extra questions with the advanced model's chosen checkpoint (500
# samples here) and append the distinct, long-enough ones to questions/advanced.txt.
num = 2
load_path = f"output/advanced/lora/checkpoint-{num}"

model_name = 'NousResearch/Llama-3.2-1B'
base_model = AutoModelForCausalLM.from_pretrained(model_name)
model = PeftModel.from_pretrained(base_model, load_path)
model = model.merge_and_unload()
model = model.to("cuda")

questions = set()
for _ in range(500):
    question = generate_new_question(model)
    # Keep only outputs of at least 10 characters and 3 words.
    if len(question) >= 10 and len(question.split()) >= 3:
        questions.add(question)

# Explicit UTF-8 so non-ASCII characters in generated text are written the
# same way regardless of the platform's default encoding.
with open('questions/advanced.txt', 'a', encoding='utf-8') as file:
    for question in questions:
        file.write(question + '\n')

## Topic extended dataset

In [None]:
# Generate extra questions for one topic (selected by index `i`) and append
# the distinct, long-enough ones to questions/topic/<topic>.txt.
i = 8 # from 0 to 8
# NOTE(review): `topics` is not defined in this part of the notebook;
# presumably it holds the same names as `topic_list` -- confirm before running.
topic = topics[i]
num = 1
load_path = f"output/topic/{topic}/checkpoint-{num}"

model_name = 'NousResearch/Llama-3.2-1B'
base_model = AutoModelForCausalLM.from_pretrained(model_name)
model = PeftModel.from_pretrained(base_model, load_path)
model = model.merge_and_unload()
model = model.to("cuda")

questions = set()
# Throwaway loop variable so the topic index `i` chosen above is not overwritten.
for _ in range(100):
    question = generate_new_question(model)
    # Keep only outputs of at least 10 characters and 3 words.
    if len(question) >= 10 and len(question.split()) >= 3:
        questions.add(question)

# Explicit UTF-8 so non-ASCII characters in generated text are written the
# same way regardless of the platform's default encoding.
with open(f'questions/topic/{topic}.txt', 'a', encoding='utf-8') as file:
    for question in questions:
        file.write(question + '\n')

# Random questions generated with non-fine-tuned LLaMA

## Questions for the evaluation of the data science question classification model

In [None]:
# Prompt the base model (no adapter loaded) until 167 distinct non-empty
# questions are collected; each accepted output is printed with a running count.
model_name = 'NousResearch/Llama-3.2-1B'
model = AutoModelForCausalLM.from_pretrained(model_name).to("cuda")
prompt_text = "A random question: "
questions = set()
i = 1
while len(questions) < 167:
    text = generate_new_question(model, prompt_text)
    if not text:
        continue
    questions.add(text)
    print(i, text)
    i += 1

1 what is the best way to make a list of books you’ve read?
2 What is the difference between a man and a woman?
3 What do you think is the biggest problem facing the U.S. today?
4 What is the difference between a normal distribution and a beta distribution?
5 What was the first thing you did when you got home after being out of town for a week?
6 What is the difference between a person and a machine?
7 What do you do for a living?
8 What do you think of this song?
9 What do you get when you cross a human with a dog?
10 How do we know that the earth is round?
11 What is the difference between a "non-profit" and a "tax-exempt" organization?
12 What’s the most important thing you can do to be a better leader?
13 How many of you have been to a restaurant that has an amazing menu but the food is not good?
14 what's the name of the guy in the picture above?
15 how do you feel about the idea of a "free" university?
16 Do you know of anyone who has had a positive pregnancy test and then later 

In [None]:
# Put the collected question strings into a single-column table.
df_questions = pd.DataFrame({'question': list(questions)})
df_questions

Unnamed: 0,question
0,"What is the difference between a ""pocket"" and ..."
1,I was wondering if anyone knows of any way tha...
2,How do you like to spend your time when you’re...
3,What would you like to see on this site?
4,I am interested in finding out if there is a w...
...,...
162,What is the name of this type of tree?
163,"2+2=3\nThe answer to this question is simple, ..."
164,What do you think is the most important thing ...
165,Do you know of anyone who has had a positive p...


In [None]:
# Write the table as tab-separated UTF-8 text with a header row and no index column.
df_questions.to_csv(
    'question_dataset.csv',
    sep='\t',
    index=False,
    header=True,
    encoding='utf-8',
)

In [None]:
# The file is written tab-separated and UTF-8 encoded, so it must be read the
# same way; the default comma separator would split questions containing commas.
df = pd.read_csv("question_dataset.csv", sep='\t', encoding='utf-8')
df.head()

Unnamed: 0,question
0,"What is the difference between a ""pocket"" and ..."
1,I was wondering if anyone knows of any way tha...
2,How do you like to spend your time when you’re...
3,What would you like to see on this site?
4,I am interested in finding out if there is a w...


## Generation of additional non-data science related questions for the merged dataset

In [None]:
# Prompt the base model (no adapter loaded) until 500 distinct questions longer
# than 10 characters are collected; each accepted output is printed with a running count.
model_name = 'NousResearch/Llama-3.2-1B'
model = AutoModelForCausalLM.from_pretrained(model_name).to("cuda")
prompt_text = "A random question: "
questions = set()
i = 1
while len(questions) < 500:
    text = generate_new_question(model, prompt_text)
    if len(text) <= 10:
        continue
    questions.add(text)
    print(i, text)
    i += 1

1 What is the best way to start a blog?
2 Is it possible to make a non-convex function of two variables convex?
3 How does one know that a given set of integers has the property that for every integer $n$, there is an integer $m$ such that $n = m^2$?
4 What is the name of the game you're playing right now?
5 What was the last book you read?
6 Is it possible for a person to be born with a congenital heart defect and survive?
7 What is your favorite thing about being a writer?
8 how do you keep your car from rusting?
9 What would you do if you won the lottery?
10 What’s your favorite place in the world?
11 What is the difference between a “differential” and a “derivative”?
12 Why don’t we see more of these in the news?
13 If you could only eat one kind of food for the rest of your life, what would it be?
14 What's the best way to get a good night's sleep?
15 What’s the first thing that comes to your mind when you hear the word “poker?
16 what is the best way to start a blog?
17 What is t

In [None]:
# Put the collected question strings into a single-column table.
df_questions = pd.DataFrame({'question': list(questions)})
df_questions

Unnamed: 0,question
0,What is your favorite place to eat in Chicago?
1,How do you know when you’ve hit your limit on ...
2,What are your top 5 favorite books?
3,Is there a way to change the color of the back...
4,"2+2=4, but why?"
...,...
495,What is your favorite type of music to listen to?
496,What is the best way to clean your car?
497,What is your favorite place to eat in the city?
498,what do you think is the best way to go about ...


In [None]:
# Write the table as tab-separated UTF-8 text with a header row and no index column.
df_questions.to_csv(
    'non_data_science_question_dataset.csv',
    sep='\t',
    index=False,
    header=True,
    encoding='utf-8',
)

In [None]:
# The file is written tab-separated and UTF-8 encoded, so it must be read the
# same way; the default comma separator would split questions containing commas.
df = pd.read_csv("non_data_science_question_dataset.csv", sep='\t', encoding='utf-8')
df

Unnamed: 0,question
0,What is your favorite place to eat in Chicago?
1,How do you know when you’ve hit your limit on ...
2,What are your top 5 favorite books?
3,Is there a way to change the color of the back...
4,2+2=4 but why?
...,...
495,What is your favorite type of music to listen to?
496,What is the best way to clean your car?
497,What is your favorite place to eat in the city?
498,what do you think is the best way to go about ...


In [None]:
# Prompt the base model (no adapter loaded) until 30 distinct questions longer
# than 10 characters are collected; each accepted output is printed with a running count.
model_name = 'NousResearch/Llama-3.2-1B'
model = AutoModelForCausalLM.from_pretrained(model_name).to("cuda")
prompt_text = "A random question: "
questions = set()
i = 1
while len(questions) < 30:
    text = generate_new_question(model, prompt_text)
    if len(text) <= 10:
        continue
    questions.add(text)
    print(i, text)
    i += 1

1 Is it possible to get the same result with a simple loop like this?
2 What do you think of the new "Machete" movie?
3 What do you think of this?
4 What would be the best way to find a specific record in a database?
5 Why do people not seem to be able to understand the concept of a fixed point?
6 What is the name of the book you are reading right now?
7 What is the name of the book you are reading at the moment?
8 Do you have any idea what's going to happen in this year?
9 Is there a way to change the width of a column in a table?
10 Which of the following is not a feature of schizophrenia?
11 is it possible to have a group with more than 2 members?
12 What is your favorite piece of clothing?
13 How do you know when to quit?
14 What is the purpose of a non-mathematical definition?
15 Do you have any tips, tricks, or suggestions for getting the most out of your time in Paris?
16 What’s the best way to get a copy of a CD or DVD?
17 Do you have any tips on how to find a good place to sta

In [None]:
# Prompt the base model (no adapter loaded) until 10 distinct questions longer
# than 10 characters are collected; each accepted output is printed with a running count.
model_name = 'NousResearch/Llama-3.2-1B'
model = AutoModelForCausalLM.from_pretrained(model_name).to("cuda")
prompt_text = "A random question: "
questions = set()
i = 1
while len(questions) < 10:
    text = generate_new_question(model, prompt_text)
    if len(text) <= 10:
        continue
    questions.add(text)
    print(i, text)
    i += 1

1 Do you know any good books about how to write and publish a book?
2 What do you like to do in your free time?
3 How many of you have been in a situation where you were the only one who knew what was going on?
4 What do you think of the new iPhone?
5 2/3 of the voters in a certain country are women. What is the probability that a randomly selected voter is a woman?
6 What is the difference between a "user" and a "customer"?
7 1/4 of a litre of milk contains 8% fat. How many litres of milk must be added to increase the fat content to 10%?
8 How did you get started in photography?
9 How did you get started in photography, and what is your favorite thing about it?
10 How do you know if someone is lying to you?
