In [None]:
!pip install huggingface_hub -q -U
from huggingface_hub import notebook_login

notebook_login()

In [None]:
cache_dir='.'

In [None]:
!pip install wandb -q -U
import wandb
wandb.login()

In [None]:
import locale
def getpreferredencoding(do_setlocale = True):
    """Report "UTF-8" as the preferred encoding, whatever the locale says.

    The notebook assigns this function over ``locale.getpreferredencoding``
    right after defining it. ``do_setlocale`` is accepted only so the
    signature stays call-compatible; it is never read.
    """
    forced_encoding = "UTF-8"
    return forced_encoding
locale.getpreferredencoding = getpreferredencoding

In [None]:
!python -m pip install --upgrade pip -q -U
!pip install -q datasets
!pip install -q -U scipy

In [None]:
model_id = "mistralai/Mistral-7B-Instruct-v0.2"

In [None]:
local_path = model_id
local_save_path = f"{cache_dir}/{local_path}"

In [None]:
!pip install git+https://github.com/huggingface/transformers.git -q -U
!pip install -q -U bitsandbytes
!pip install -q -U peft
!pip install -q -U accelerate

In [None]:
from huggingface_hub import snapshot_download
import os

def download_model_repo(repo_id, local_dir):
    """Download a whole Hugging Face Hub repository into ``local_dir``.

    Args:
        repo_id: Hub repository identifier (the notebook passes ``model_id``).
        local_dir: Destination directory. The same value is also handed to
            ``snapshot_download`` as its ``cache_dir``.

    Returns:
        The path reported by ``snapshot_download``. Earlier revisions of this
        helper discarded it, so callers could only read it from the printout.
    """
    # NOTE(review): recent huggingface_hub releases appear to deprecate
    # `local_dir_use_symlinks` - confirm against the installed version before
    # relying on it.
    repo_path = snapshot_download(repo_id=repo_id,
                                  cache_dir=local_dir,
                                  local_dir=local_dir,
                                  local_dir_use_symlinks=False)

    print(f"Repository downloaded to: {repo_path}")
    return repo_path

def main():
    """Download the repository named by the global ``model_id`` into ``local_save_path``."""
    download_model_repo(model_id, local_save_path)
    # Emit a blank line after the download message.
    print()

if __name__ == "__main__":
    main()

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
# Quantization settings: 4-bit NF4 weights with double quantization enabled
# and bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)
# Load the causal LM from the locally downloaded snapshot.
# NOTE(review): `torch_dtype` is float16 while the quantization compute dtype
# above is bfloat16 - confirm the mix is intentional.
model = AutoModelForCausalLM.from_pretrained(
    local_save_path,
    quantization_config=bnb_config,
    device_map='auto',
    torch_dtype=torch.float16,
    cache_dir=cache_dir)
# The tokenizer is requested by `model_id` rather than `local_save_path`.
# With cache_dir='.', both strings name the same relative directory, so this
# presumably resolves to the local snapshot - verify if cache_dir changes.
tokenizer = AutoTokenizer.from_pretrained(model_id,use_fast=True)

In [None]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.

    Args:
        model: Any object exposing ``named_parameters()`` whose parameters
            provide ``numel()`` and ``requires_grad``.

    The percentage is reported as 0.0 for a model without parameters instead
    of raising ``ZeroDivisionError``.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # Guard the ratio: an empty model would otherwise divide by zero.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}"
    )

In [None]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Prepare the quantized base model for training exactly once. The cell used
# to repeat these two calls after building the LoRA config, which redid the
# same preparation on an already-prepared model.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

# LoRA adapters are attached to the attention projections only; the
# commented entries are kept as a record of other candidates.
config = LoraConfig(
    r=128,
    lora_alpha=128,
    target_modules=[
              "self_attn.q_proj",
              "self_attn.k_proj",
              "self_attn.v_proj",
              "self_attn.o_proj",
              # "self_attn.rotary_emb.inv_freq",
              # "mlp.gate_proj",
              # "mlp.up_proj",
              # "mlp.down_proj",
              # "input_layernorm.weight",
              # "post_attention_layernorm.weight",
              # "model.norm.weight",
              # "lm_head.weight"
              ],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)

# Wrap the prepared base model with the adapters and report how many
# parameters will actually be trained.
ft_model = get_peft_model(
    model,
    config,
)
print_trainable_parameters(ft_model)


In [None]:
# Choose the padding token by preference: '<pad>', then '<unk>', and only
# if neither is in the vocabulary fall back to the EOS token.
known_tokens = tokenizer.get_vocab()
for candidate, notice in (
    ('<pad>', '<pad> token is in the tokenizer. Using <pad> for pad'),
    ('<unk>', '<unk> token is in the tokenizer. Using unk for pad'),
):
    if candidate in known_tokens:
        print(notice)
        tokenizer.pad_token = candidate
        break
else:
    # Neither dedicated token exists: reuse EOS for padding.
    print(f'Using EOS token, {tokenizer.eos_token}, for padding')
    tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Copy the tokenizer's pad token id onto the model object and onto its config.
model.pad_token_id = tokenizer.pad_token_id
model.config.pad_token_id = tokenizer.pad_token_id

# Sanity check on the attribute assigned two lines above.
assert model.pad_token_id == tokenizer.pad_token_id, "The model's pad token ID does not match the tokenizer's pad token ID!"

print('Tokenizer pad token ID:', tokenizer.pad_token_id)
print('Model pad token ID:', model.pad_token_id)
print('Model config pad token ID:', model.config.pad_token_id)
# NOTE(review): `vocab_size` may not count tokens added after pretraining;
# `len(tokenizer)` presumably would - confirm which figure is wanted here.
print('Number of tokens now in tokenizer:', tokenizer.vocab_size)

In [None]:
model.generation_config.do_sample = False
model.generation_config.temperature = 1.0
model.generation_config.top_p = 1.0

In [None]:
from transformers import TextStreamer
from peft import PeftModel

system_prompt = 'You are a helpful assistant. You provide succinct answers.'

# # For Mistral instruct
# system_prompt = ''

# Define a stream
def stream(user_prompt, model_type, adapter_model):
    """Generate a reply to ``user_prompt`` and stream it to stdout.

    Args:
        user_prompt: Question text; surrounding whitespace is stripped.
        model_type: Which model answers:
            ``'base'`` uses the global ``model``;
            ``'fine-tuned'`` loads the adapter at ``adapter_model`` on top of
            the global ``model`` (this happens on every call);
            ``'model_to_push'`` uses the global ``model_to_push``, which is
            only defined by a later notebook cell.
        adapter_model: Adapter path or id; only read for ``'fine-tuned'``.

    Raises:
        ValueError: If ``model_type`` is not one of the three values above.
            Earlier revisions printed a message and called ``exit()``, which
            terminates the notebook kernel instead of reporting the mistake.
    """
    if model_type == 'base':
        eval_model = model
    elif model_type == 'fine-tuned':
        eval_model = PeftModel.from_pretrained(
            model,
            adapter_model,
        )
    elif model_type == 'model_to_push':
        eval_model = model_to_push
    else:
        raise ValueError(
            f"Unknown model_type {model_type!r}; expected 'base', 'fine-tuned' or 'model_to_push'"
        )

    # Generation benefits from the KV cache; the training cell switches it
    # off again on the fine-tuning model.
    eval_model.config.use_cache = True

    B_INST, E_INST = "[INST]", "[/INST]"
    # NOTE(review): the <<SYS>> markers look like the Llama-2 chat convention;
    # the notebook's own hint suggests empty B_SYS/E_SYS (and an empty system
    # prompt) for Mistral instruct - confirm which format is intended.
    B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"

    # Optional fixed context inserted ahead of the user's question.
    added_prompt = ''

    # Chat model prompt with system message
    prompt = f"{B_INST} {B_SYS}{system_prompt.strip()}{E_SYS}{added_prompt}{user_prompt.strip()} {E_INST}\n\n"

    inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

    streamer = TextStreamer(tokenizer)

    # generate() returns the token ids as usual; the streamer additionally
    # prints the text to stdout as it is produced, so the result is dropped.
    _ = eval_model.generate(**inputs, streamer=streamer, max_new_tokens=500)

def evaluation(model_type, adapter_model=''):
    """Stream the model's answer to each evaluation question, then print the reference answer.

    Args:
        model_type: Forwarded to ``stream`` to select which model answers.
        adapter_model: Forwarded to ``stream``; defaults to an empty string.
    """
    # The list literal was previously flush-left, leaving the `def` without a
    # body and making the whole cell a syntax error.
    questions = [
        "evaluation question"
    ]

    answers = [
        "evalauation answer"
    ]

    for question, answer in zip(questions, answers):
        stream(question, model_type, adapter_model)
        print("Correct Answer:", answer)
        print('\n\n')

In [None]:
evaluation("base")


In [None]:
data_length = 1000

from datasets import load_dataset

data = load_dataset("link_to_dataset")

In [None]:
# NOTE(review): the label says "First row" but index 11 of the train split is
# printed - confirm whether the index or the label is the intended one.
print("First row of train:", data['train'][11])
print("First row of test:", data['test'][0])

In [None]:
from torch.utils.data import DataLoader, Dataset
import torch

class TextDataset(Dataset):
    """Tokenized prompt+response rows with next-token labels and a response-only loss mask.

    Args:
        encodings: Mapping of field name to a 2-D tensor with one row per
            example; it must contain ``"input_ids"``.
        response_lengths: Per-example count of response tokens.
        input_lengths: Per-example count of tokens preceding the response.
            Positions are counted from index 0 of each row, so rows are
            expected to start with real tokens (padding at the end).
        eos_token_id: Token id written as the label that follows the last
            response token. Defaults to 2, the value that was previously
            hard-coded.

    Every returned item holds the encodings' fields plus:
        ``labels``: ``input_ids`` shifted left by one position, with
            ``eos_token_id`` at the last response position.
        ``loss_mask``: 1 at the positions whose label is a response token or
            that final EOS, 0 elsewhere.
    """

    def __init__(self, encodings, response_lengths, input_lengths, eos_token_id=2):
        self.encodings = encodings
        self.response_lengths = response_lengths
        self.input_lengths = input_lengths
        self.eos_token_id = eos_token_id

    def _build_item(self, idx):
        """Build the item dict for one integer-like row index."""
        item = {key: val[idx].clone().detach() for key, val in self.encodings.items()}
        response_start_position = self.input_lengths[idx]
        response_end_position = response_start_position + self.response_lengths[idx]

        input_ids = item["input_ids"]

        # Labels are the inputs shifted left by one; the final slot keeps its
        # original value because it has no successor.
        labels = input_ids.clone()
        labels[:-1] = input_ids[1:]
        # The position holding the last response token must predict EOS.
        labels[response_end_position - 1] = self.eos_token_id

        # Mark the response span, shift it left to line up with the shifted
        # labels, then re-enable the EOS position the shift dropped.
        loss_mask = torch.zeros_like(input_ids)
        loss_mask[response_start_position:response_end_position] = 1
        loss_mask = torch.cat([loss_mask[1:], loss_mask.new_zeros(1)])
        loss_mask[response_end_position - 1] = 1

        item["labels"] = labels
        item["loss_mask"] = loss_mask
        return item

    def __getitem__(self, idx):
        """Return one item, or a stacked batch when ``idx`` is a list, tuple or index tensor.

        The sequence form used to fail because the mask and label arithmetic
        was written for a single row; it now builds each row separately and
        stacks the results.

        Raises:
            IndexError: If an empty sequence of indices is given.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()
        if isinstance(idx, (list, tuple)):
            if len(idx) == 0:
                raise IndexError("TextDataset received an empty sequence of indices")
            rows = [self._build_item(i) for i in idx]
            return {key: torch.stack([row[key] for row in rows]) for key in rows[0]}
        return self._build_item(idx)

    def __len__(self):
        """Number of examples, taken from the ``input_ids`` rows."""
        return len(self.encodings["input_ids"])


In [None]:
def prepare_dataset(dataset, tokenizer, max_length=None):
    """Format prompt/completion rows, tokenize them, and wrap them in a ``TextDataset``.

    Args:
        dataset: Object with a ``map`` method whose rows expose ``'prompt'``
            and ``'completion'`` strings.
        tokenizer: Tokenizer used both for the padded batch encoding and for
            the per-row length measurements.
        max_length: Truncation length. When omitted, the notebook-level
            ``data_length`` is used, as before.

    Returns:
        A ``TextDataset`` built from the padded encodings plus the
        per-example response and prompt token counts.
    """
    if max_length is None:
        max_length = data_length

    B_INST, E_INST = "[INST]", "[/INST]"

    # NOTE(review): unlike the inference prompt built in `stream`, no system
    # block is included here - confirm the mismatch is intended.
    formatted_dataset = dataset.map(
        lambda x: {
            "input_text": f"{B_INST} {x['prompt'].strip()} {E_INST}\n\n{x['completion'].strip()}",
            "response_text": f"{x['completion'].strip()}",
        }
    )

    # Read the formatted rows once instead of once per tokenizer pass.
    input_texts = []
    response_texts = []
    for dialogue in formatted_dataset:
        input_texts.append(dialogue["input_text"])
        response_texts.append(dialogue["response_text"])

    # The response positions computed below are counted from index 0 of each
    # row, so padding must go on the right or the loss mask would land on the
    # wrong tokens. Force that here and restore the caller's setting after.
    previous_padding_side = tokenizer.padding_side
    tokenizer.padding_side = "right"
    try:
        encodings = tokenizer(input_texts, truncation=True, padding=True, max_length=max_length, return_tensors='pt', add_special_tokens=True)
    finally:
        tokenizer.padding_side = previous_padding_side

    # Response length: no padding and no special tokens.
    response_lengths = [
        len(tokenizer.encode(text, truncation=True, max_length=max_length, padding=False, add_special_tokens=False))
        for text in response_texts
    ]

    # Full length: no padding, special tokens included. The prompt length is
    # whatever remains after subtracting the response length.
    total_lengths = [
        len(tokenizer.encode(text, truncation=True, max_length=max_length, padding=False, add_special_tokens=True))
        for text in input_texts
    ]
    input_lengths = [total_length - response_length for total_length, response_length in zip(total_lengths, response_lengths)]

    return TextDataset(encodings, response_lengths, input_lengths)

In [None]:
train_dataset = prepare_dataset(data['train'], tokenizer)
test_dataset = prepare_dataset(data['test'], tokenizer)

In [None]:
epochs = 3

In [None]:
save_dir = cache_dir + f'/results/{model_id}_{epochs}_epochs_allModules_{data_length}_length_qa_openaidata'
print(f'save_dir = {save_dir}')

In [None]:
import torch.nn as nn
import os
import transformers

# Custom callback to log metrics and save checkpoints
class LoggingCallback(transformers.TrainerCallback):
    """Trainer callback that appends training and evaluation losses to a text file.

    Args:
        log_file_path: File opened in append mode on every log event.
        save_dir: Stored on the instance; not read by this class.
    """

    def __init__(self, log_file_path, save_dir):
        self.log_file_path = log_file_path
        self.save_dir = save_dir

    def on_log(self, args, state, control, model=None, logs=None, **kwargs):
        """Append one line for each of ``loss`` / ``eval_loss`` present in ``logs``.

        ``logs`` defaults to ``None``; the membership tests below would raise
        ``TypeError`` in that case, so it is skipped explicitly.
        """
        if logs is None:
            return
        # Leaving the `with` block closes the file and flushes it, so no
        # explicit flush is needed.
        with open(self.log_file_path, 'a') as f:
            if 'loss' in logs:
                f.write(f"Step: {state.global_step}, Training Loss: {logs['loss']}\n")
            if 'eval_loss' in logs:
                f.write(f"Step: {state.global_step}, Eval Loss: {logs['eval_loss']}\n")

# Log file path
log_file_path = os.path.join(cache_dir, "training_logs.txt")

# Create an instance of the custom callback
logging_callback = LoggingCallback(log_file_path, save_dir)


In [None]:
class CustomTrainer(transformers.Trainer):
    """Trainer whose loss is token-level cross-entropy averaged over ``loss_mask`` positions.

    Batches are expected to carry ``labels`` and ``loss_mask`` next to the
    model inputs. The dataloader overrides build plain ``DataLoader`` objects
    around ``self.data_collator`` - presumably so the extra ``loss_mask``
    field reaches ``compute_loss`` untouched (confirm).
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the masked cross-entropy loss for one batch.

        Args:
            model: Model to run the forward pass on.
            inputs: Batch dict; ``labels`` and ``loss_mask`` are popped from
                it and the remainder is passed to the model.
            return_outputs: When true, return ``(loss, outputs)``.
            num_items_in_batch: Accepted because newer ``Trainer`` versions
                pass it to ``compute_loss``; it is not used, since the loss is
                normalised by the mask total instead.

        Returns:
            The scalar loss, or ``(loss, outputs)`` when ``return_outputs``.
        """
        labels = inputs.pop("labels")
        loss_mask = inputs.pop("loss_mask")

        # Forward pass
        outputs = model(**inputs)
        logits = outputs.logits

        if torch.isnan(logits).any():
            print("NaN detected in logits")
            print(logits)

        # Per-token loss. The class count is read from the logits themselves
        # rather than from the model config.
        loss_fct = nn.CrossEntropyLoss(reduction='none')
        losses = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))

        # Back to [batch_size, seq_length] so the mask lines up.
        losses = losses.view(labels.shape)
        masked_loss = losses * loss_mask

        if torch.isnan(losses).any():
            print("NaN detected in losses")

        mask_total = loss_mask.sum()
        if mask_total == 0:
            print("Sum of loss_mask is zero")
            # Every term is multiplied by zero, so the sum is 0 while staying
            # attached to the graph. A fresh integer tensor would carry no
            # gradient and break backward().
            loss = masked_loss.sum()
        else:
            loss = masked_loss.sum() / (mask_total + 1e-9)

        return (loss, outputs) if return_outputs else loss

    def get_train_dataloader(self):
        """Build the training ``DataLoader`` from the trainer's dataset, collator and args."""
        train_dataset = self.train_dataset

        dataloader_params = {
            "batch_size": self.args.train_batch_size,
            "collate_fn": self.data_collator,
            "num_workers": self.args.dataloader_num_workers,
            "pin_memory": self.args.dataloader_pin_memory,
        }

        # Samplers and drop_last only apply to map-style datasets.
        if not isinstance(train_dataset, torch.utils.data.IterableDataset):
            dataloader_params["sampler"] = self._get_train_sampler()
            dataloader_params["drop_last"] = self.args.dataloader_drop_last

        return DataLoader(train_dataset, **dataloader_params)

    def get_eval_dataloader(self, eval_dataset=None):
        """Build the evaluation ``DataLoader``.

        Args:
            eval_dataset: Dataset to evaluate; falls back to
                ``self.eval_dataset`` when ``None``.

        Raises:
            ValueError: If neither the argument nor the trainer supplies one.
        """
        eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset
        if eval_dataset is None:
            raise ValueError("Trainer: evaluation requires an eval_dataset.")

        dataloader_params = {
            "batch_size": self.args.eval_batch_size,
            "collate_fn": self.data_collator,
            "num_workers": self.args.dataloader_num_workers,
            "pin_memory": self.args.dataloader_pin_memory,
        }

        # Evaluation never drops the final partial batch.
        if not isinstance(eval_dataset, torch.utils.data.IterableDataset):
            dataloader_params["sampler"] = self._get_eval_sampler(eval_dataset)
            dataloader_params["drop_last"] = False
        return DataLoader(eval_dataset, **dataloader_params)


In [None]:
class CustomDataCollator:
    """Stack per-example tensors into batch tensors, one per expected field."""

    def __init__(self, tokenizer):
        # Kept on the instance; __call__ does not read it.
        self.tokenizer = tokenizer

    def __call__(self, batch):
        """Return a dict with each of the four fields stacked along a new first axis."""
        field_names = ('input_ids', 'attention_mask', 'labels', 'loss_mask')
        return {
            name: torch.stack([example[name] for example in batch])
            for name in field_names
        }

data_collator = CustomDataCollator(tokenizer)

In [None]:
# Build the trainer around the LoRA-wrapped model, the prepared train/test
# datasets, the custom collator and the file-logging callback.
trainer = CustomTrainer(
    model=ft_model,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    args=transformers.TrainingArguments(
        # max_steps=3,
        num_train_epochs=epochs,
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        gradient_accumulation_steps=1,
        # NOTE(review): newer transformers releases appear to rename this
        # argument to `eval_strategy` - confirm against the installed version.
        evaluation_strategy="steps",
        max_grad_norm=1,
        warmup_ratio=0.1,
        # Presumably a fraction of the total training steps because the value
        # is below 1 - confirm in the TrainingArguments documentation.
        eval_steps=0.2,
        learning_rate=1e-4,
        # NOTE(review): the quantization config uses bfloat16 as compute dtype
        # while training runs in fp16 - confirm the combination is intended.
        fp16=True,
        logging_steps=1,
        output_dir=save_dir,
        optim="adamw_torch",
        # lr_scheduler_type='cosine',
        lr_scheduler_type='constant',
        save_steps=0.2, #not supported for 4-bit models
        hub_private_repo=True
    ),
    data_collator=data_collator,
    callbacks=[logging_callback],
)
# Turn the KV cache off on the model being trained (the inference helper
# switches it back on for generation).
ft_model.config.use_cache = False

In [None]:
trainer.train()


In [None]:
import matplotlib.pyplot as plt

# Split the trainer's log history into (step, value) pairs for the training
# loss and the evaluation loss; an entry may contribute to either series.
log_history = trainer.state.log_history
train_points = [(record['step'], record['loss']) for record in log_history if 'loss' in record]
eval_points = [(record['step'], record['eval_loss']) for record in log_history if 'eval_loss' in record]

train_steps = [step for step, _ in train_points]
train_losses = [value for _, value in train_points]
eval_steps = [step for step, _ in eval_points]
eval_losses = [value for _, value in eval_points]

# Draw both curves against the global step.
plt.plot(train_steps, train_losses, label='Train Loss')
plt.plot(eval_steps, eval_losses, label='Eval Loss')
plt.xlabel('Steps')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
# NOTE(review): the checkpoint number is hard-coded; it has to match a
# directory the training run actually wrote under `save_dir` - confirm.
adapter_dir = save_dir + '/checkpoint-54'
print(f'Running evaluation using the adapter at: {adapter_dir}\n\n')
# Evaluate with the adapter from that checkpoint loaded on top of the base model.
evaluation("fine-tuned", adapter_dir)

In [None]:
# NOTE(review): hard-coded checkpoint number, and a different one from the
# checkpoint used by the evaluation cell (54) - confirm both exist.
adapter_to_push = save_dir + '/checkpoint-1028'

print(f'Defining the parameters to push adapters for checkpoint: {adapter_to_push}\n\n')

# Extract the last portion of the base_model
model_name = model_id.split("/")[-1]
fine_tuned_tag = 'sft-test-push'

# Define the save and push paths
# The Hub namespace is fixed in these strings; change it to push elsewhere.
adapter_model_name = f"Augustya07/{model_name}-{fine_tuned_tag}-adapters"
new_model = f"Augustya07/{model_name}-{fine_tuned_tag}"
print(f"Setting up for pushing to repos:\n- {adapter_model_name}\n- {new_model}")

In [None]:
# Record the Hub id as the base model's name instead of the local path it
# was loaded from.
model.config._name_or_path = model_id

# Load the chosen checkpoint's adapter on top of the base model. The
# inference helper reads this global for model_type 'model_to_push'.
model_to_push = PeftModel.from_pretrained(
    model,
    adapter_to_push,
)

In [None]:
# Save a local copy under the adapter repo name with a '-local' suffix.
local_adapter_model = adapter_model_name + '-local'
# NOTE(review): `token=True` is passed to a local save - confirm it is needed here.
model_to_push.save_pretrained(local_adapter_model, token=True)

In [None]:
# Upload to the adapter repo using the stored login token, in safetensors format.
model_to_push.push_to_hub(adapter_model_name, token=True, safe_serialization=True)