# TODOs

- Figure out why resumption is not working
- Verify that the loss function is implemented correctly
- Record / plot the loss better, and make it easy to run training in the background
- [X] Research more QLoRA
  - [ ] make sure that QLoRA is actually happening..
  - RTFP
  - [X] Find good solid example code with transformers
  - Determine QLoRA parameters to experiment with and how
    - datatype
    - what bits
    - r
    - dropout
    - target_modules
    - lora alpha
- Determine training parameters to experiment with
  - AdamW RTFP
  - tune adamW starting learning rate?
  - Batch size. Should it vary over time?
  - 
- get complicated and weird
  - explore mining hard negatives
  - explore synthesizing hard negatives
  - synthesize more varied data
  - generated data off of a target domain (easy-on fine tuning)
- benchmark
  - add inference code path
  - figure out how to run the model against an MTEB benchmark
- update code to be able to run it distributed in the cloud?
  - related to deepspeed and accelerate
    - https://arxiv.org/abs/2104.07857
    - https://huggingface.co/docs/accelerate/usage_guides/deepspeed
- [ ] get a bigger graphics card

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# Random seed for this notebook; it is passed to the train/test split so the
# split is reproducible across runs.
seed = 42

In [3]:
from transformers import LlamaModel, AutoTokenizer, BitsAndBytesConfig
from datasets import load_dataset
import torch

# Checkpoint shared by the model weights and the tokenizer.
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-intermediate-step-1195k-token-2.5T"

# 4-bit NF4 quantization with double quantization and a bfloat16 compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# NOTE(review): torch_dtype is float16 while the quantization compute dtype
# above is bfloat16 -- confirm whether the two are meant to differ.
model = LlamaModel.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    torch_dtype=torch.float16,
    device_map="auto",
)

# Source triples; the tokenization step reads the "query", "pos" and "neg"
# columns of this dataset.
base_dataset = load_dataset(
    "andersonbcdefg/synthetic_tuples_gpt35_turbo",
    split="train",
)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
from peft import TaskType


# Consider experimenting with adding a dedicated pad token instead, e.g.:
# tokenizer.add_special_tokens({'pad_token': '[PAD]'})
# For now the tokenizer's unknown token doubles as the padding token.
tokenizer.pad_token = tokenizer.unk_token
tokenizer.pad_token_id = tokenizer.unk_token_id

# Tokenize the dataset
def tokenize_function(examples, max_len=128):
    """Tokenize the query / positive / negative texts of a batch of examples.

    Sequences are truncated to ``max_len`` tokens but deliberately NOT padded
    here: padding at dataset-build time pads every row to the longest row of
    the whole map batch and stores those pad tokens on disk. Padding is left
    to the data collator, which pads each training batch to its own longest
    sequence.

    Args:
        examples: Batch mapping with "query", "pos" and "neg" columns, each a
            list of strings (as passed by ``Dataset.map(..., batched=True)``).
        max_len: Maximum number of tokens kept per text.  # TODO - reconsider

    Returns:
        Dict with ``input_ids_<field>`` and ``attention_mask_<field>`` entries
        for each of the three fields; every entry is a list of variable-length
        lists of ints.

    NOTE(review): when the map call uses an explicit ``cache_file_name`` and
    that file already exists, the old output is presumably reused -- delete
    the cache file after changing this function (confirm against the
    ``datasets`` documentation).
    """
    tokenized = {}
    for field in ("query", "pos", "neg"):
        encoded = tokenizer(
            examples[field],
            padding=False,
            truncation=True,
            max_length=max_len,
        )
        tokenized["input_ids_" + field] = encoded["input_ids"]
        tokenized["attention_mask_" + field] = encoded["attention_mask"]
    return tokenized


import os

# Explicit on-disk location for the tokenized dataset.
# NOTE(review): with an explicit cache_file_name, an existing file is
# presumably reused as-is; remove it after changing tokenize_function.
tokenized_cache_path = "./cache/tokenized_datasets"

# Create the cache directory up front: depending on the installed `datasets`
# version, map() may not create it and would then fail on a fresh checkout.
os.makedirs(os.path.dirname(tokenized_cache_path), exist_ok=True)

dataset = base_dataset.map(
    tokenize_function,
    batched=True,
    cache_file_name=tokenized_cache_path,
)

# Hold out 20% of the examples for evaluation; seeded so the split is stable.
train_test_split = dataset.train_test_split(test_size=0.2, seed=seed)

# Access the new train and test datasets
train_dataset = train_test_split["train"]
eval_dataset = train_test_split["test"]

In [5]:
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model

# LoRA adapter settings for feature-extraction (embedding) training.
# NOTE(review): "k_proj" is not among the target modules -- confirm that this
# is intentional.
peft_config = LoraConfig(
    task_type=TaskType.FEATURE_EXTRACTION,
    inference_mode=False,
    r=8,
    lora_alpha=64,
    lora_dropout=0.1,
    target_modules=["q_proj", "v_proj", "o_proj", "down_proj", "up_proj", "gate_proj"],
)

# Prepare the quantized base model for k-bit training (with gradient
# checkpointing enabled), then wrap it with the LoRA adapters.
model = get_peft_model(
    prepare_model_for_kbit_training(model, use_gradient_checkpointing=True),
    peft_config,
)

In [6]:
from transformers import Trainer, TrainingArguments, IntervalStrategy
import torch
from torch.nn import functional as F
from torch.utils.data import DataLoader
from datasets import Dataset


class TinyEmbedTrainer(Trainer):
    """
    Hugging Face ``Trainer`` subclass for contrastive embedding training.

    Every example carries three tokenized texts (query, positive, negative)
    stored under ``input_ids_<field>`` / ``attention_mask_<field>`` keys. This
    subclass:
    - collates those triples into right-padded tensors, one pair per field;
    - embeds each text as the L2-normalized hidden state of its last
      non-padding token;
    - trains with a softmax loss over the positive and the single explicit
      negative of each query.
    """

    def __init__(
        self,
        model,
        args: TrainingArguments,
        train_dataset: Dataset,
        eval_dataset: Dataset,
        tokenizer: AutoTokenizer,
    ):
        """
        Args:
            model: Model to train; called with ``input_ids``,
                ``attention_mask`` and ``output_hidden_states``.
            args: Training arguments. ``remove_unused_columns`` is forced to
                ``False`` on this object (it is mutated in place).
            train_dataset: Tokenized training triples.
            eval_dataset: Tokenized evaluation triples.
            tokenizer: Tokenizer whose ``pad_token_id`` is used for padding.
        """
        # The dataset columns (input_ids_query, ...) do not match the model's
        # forward() signature, so the Trainer must not drop them.
        # Consider reworking the model's signature to conform to training
        # expectations instead.
        args.remove_unused_columns = False
        super().__init__(
            model,
            args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            tokenizer=tokenizer,
            data_collator=self._map_collate_fn,
        )

    @staticmethod
    def _as_list(values):
        """Return ``values`` as a plain Python list (tensors/arrays included)."""
        return values.tolist() if hasattr(values, "tolist") else list(values)

    def _map_collate_fn(self, batch):
        """
        Turn a ``List[Dict[str, sequence]]`` into a ``Dict[str, torch.Tensor]``.

        For each of the three fields, the padding stored in the dataset (if
        any) is stripped using the attention mask, and the remaining tokens
        are right-padded to the longest sequence of that field in this batch.
        Selecting tokens through the mask, rather than slicing a prefix,
        keeps the real tokens intact whichever side the dataset was padded on.

        Returns:
            Dict with ``attention_mask_<field>`` and ``input_ids_<field>``
            integer tensors of shape (batch, longest sequence in the batch).
        """
        pad_token_id = self.tokenizer.pad_token_id

        def get_field(field: str):
            rows = []
            for item in batch:
                token_ids = self._as_list(item["input_ids_" + field])
                mask = self._as_list(item["attention_mask_" + field])
                rows.append([tok for tok, keep in zip(token_ids, mask) if keep])

            max_len = max(len(row) for row in rows)
            attention_mask = [
                [1] * len(row) + [0] * (max_len - len(row)) for row in rows
            ]
            input_ids = [
                row + [pad_token_id] * (max_len - len(row)) for row in rows
            ]
            return {
                ("attention_mask_" + field): torch.tensor(attention_mask, dtype=torch.long),
                ("input_ids_" + field): torch.tensor(input_ids, dtype=torch.long),
            }

        return {
            **get_field("query"), **get_field("pos"), **get_field("neg")
        }

    def _get_embeddings(self, model, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        """
        Embed each sequence as the L2-normalized final-layer hidden state of
        its last non-padding token.

        The last-token position is derived as ``attention_mask.sum(dim=1) - 1``,
        which is only correct for right-padded batches (what the collate
        function of this class produces).
        """
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True)
        hidden_states = outputs.hidden_states[-1]  # Get the last hidden states
        # Build both index tensors on the device of the hidden states so the
        # gather below never mixes devices.
        device = hidden_states.device
        last_token_indices = (attention_mask.sum(dim=1) - 1).to(device)
        batch_indices = torch.arange(hidden_states.size(0), device=device)
        embeddings = hidden_states[batch_indices, last_token_indices, :]
        # Normalize the embeddings
        return F.normalize(embeddings, p=2, dim=1)

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """
        Contrastive loss over (query, positive, negative) triples.

        Per triple the loss is ``-log(e^p / (e^p + e^n))``, where ``p`` and
        ``n`` are the cosine similarities of the query to the positive and to
        the negative. No temperature scaling and no in-batch negatives are
        applied.

        Args:
            model: Model forwarded to ``_get_embeddings``.
            inputs: Collated batch with ``input_ids_<field>`` and
                ``attention_mask_<field>`` tensors.
            return_outputs: When true, return ``(loss, outputs)``, where
                ``outputs`` is a dict holding the three embedding tensors.
            num_items_in_batch: Accepted and ignored, so that this override
                stays call-compatible with Trainer versions that pass it.

        Returns:
            The mean loss, or ``(loss, outputs)`` if ``return_outputs`` is set.

        NOTE(review): running ``evaluate()`` may additionally need a
        ``prediction_step`` override, since the batch keys do not match the
        model's forward() signature -- confirm before enabling evaluation.
        """
        query_embeddings = self._get_embeddings(model, inputs["input_ids_query"], inputs["attention_mask_query"])
        pos_embeddings = self._get_embeddings(model, inputs["input_ids_pos"], inputs["attention_mask_pos"])
        neg_embeddings = self._get_embeddings(model, inputs["input_ids_neg"], inputs["attention_mask_neg"])

        pos_similarity = torch.sum(query_embeddings * pos_embeddings, dim=1)
        neg_similarity = torch.sum(query_embeddings * neg_embeddings, dim=1)
        # -log(e^p / (e^p + e^n)) == log(1 + e^(n - p)) == softplus(n - p);
        # the softplus form avoids the explicit exp/log round trip.
        loss = F.softplus(neg_similarity - pos_similarity).mean()

        if return_outputs:
            outputs = {
                "query_embeddings": query_embeddings,
                "pos_embeddings": pos_embeddings,
                "neg_embeddings": neg_embeddings,
            }
            return loss, outputs
        return loss



In [8]:

import os

from transformers.trainer_utils import get_last_checkpoint

# Directory that receives checkpoints (one every `save_steps` steps).
output_dir = "./results"

# Training arguments
training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    logging_dir="./logs",
    log_level="info",
    logging_strategy=IntervalStrategy.STEPS,
    gradient_checkpointing=True,
    logging_steps=5,
    save_steps=100,
)

trainer = TinyEmbedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)

# Resume from the newest checkpoint when one exists, otherwise start fresh.
# Passing resume_from_checkpoint=True unconditionally raises when output_dir
# holds no checkpoint yet (e.g. on the very first run).
# NOTE(review): the captured log of a resumed run reports "batch size has been
# adjusted to: 1" although per_device_train_batch_size is 16 -- presumably a
# batch size stored in the checkpoint's trainer state is being restored;
# verify before trusting resumed runs.
last_checkpoint = get_last_checkpoint(output_dir) if os.path.isdir(output_dir) else None
trainer.train(resume_from_checkpoint=last_checkpoint)

Loading model from ./results/checkpoint-1300.


***** Running training *****
  Num examples = 163,636
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Training with DataParallel so batch size has been adjusted to: 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 490,908
  Number of trainable parameters = 5,902,336
  Continuing training from checkpoint, will skip to saved global_step
  Continuing training from epoch 0
  Continuing training from global step 1300
  Will skip the first 0 epochs then the first 1300 batches in the first epoch.


Step,Training Loss
1350,0.2801
1400,0.3498


Saving model checkpoint to ./results/tmp-checkpoint-1400
tokenizer config file saved in ./results/tmp-checkpoint-1400/tokenizer_config.json
Special tokens file saved in ./results/tmp-checkpoint-1400/special_tokens_map.json
