In [None]:
!pip install -q transformers einops accelerate langchain bitsandbytes peft trl datasets
!huggingface-cli login

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
# Mount Google Drive: the dataset, checkpoints and saved model all live under it.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Load Dataset

In [None]:
import json
from datasets import load_dataset
# Path of the JSON file with the fine-tuning examples (despite the variable
# name, this is a file, not a directory).
dataset_dir_name = '/content/drive/MyDrive/paper2slides/finetune_dataset.json'

# Read the whole file as the "train" split.
dataset = load_dataset(
    "json",
    data_files=dataset_dir_name,
    split="train",
)

## Adding prompt to dataset

In [None]:
# Prompt template for topic labelling + summarization.
#
# NOTE(review): `system_prompt` already opens an `[INST]` block and
# `main_prompt` opens a second one before the first is closed; the task text
# also contains typos ("summmary", "Make sure you to", "that the you").
# The strings are left exactly as they are because they are training data:
# any correction has to be mirrored in every prompt used at inference time.

system_prompt = """
<s>[INST] <<SYS>>
You are a helpful, respectful and honest assistant for labeling topics and generating summaries.
<</SYS>>
"""
main_prompt = """
[INST]
I have a topic that contains the following documents:
{documents}

Based on the information about the topic above, you have two tasks.
Task-1: Please create a short label of this topic. Make sure you to only return the label and nothing more.
Task-2: Please create a short summmary of this topic describing the steps in the documents. Make sure that the you do not report more than six sentences in the list. Make sure to report the summary in a list of sentences. Make sure that each sentence does not exceed 10 words. Make sure to only return the list of sentences and nothing more.

Put this data into a JSON list with keys "label" and "summary".
[/INST]
"""
# Full template: system part followed by the instruction part.
prompt = "".join((system_prompt, main_prompt))


def add_prefix(example):
    """Build the supervised training string for one dataset record.

    The record's ``documents`` value is substituted into ``prompt``; the
    expected answer is appended as a JSON object whose ``"label"`` is the
    record's ``title`` and whose ``"summary"`` is the record's ``summary``,
    followed by the literal end marker ``" </s>"``.

    Args:
        example: mapping with the keys ``documents``, ``title`` and ``summary``.

    Returns:
        The same ``example`` object, with the result stored under ``"text"``.
    """
    rendered_prompt = prompt.format(documents=example['documents'])
    target_json = json.dumps({"label": example['title'], "summary": example['summary']})
    example["text"] = "".join((rendered_prompt, target_json, " </s>"))
    return example
# Add the "text" column (prompt + JSON completion) to every record.
dataset = dataset.map(
    add_prefix,
)

Map:   0%|          | 0/920 [00:00<?, ? examples/s]

## Loading libraries for fine tuning

In [None]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)

from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from trl import SFTTrainer
import torch



## Loading base model and tokenizer

In [None]:
# Name under which the fine-tuned model is saved locally and published
new_model = "llama-2-7b-chat-paper-to-slides"

# Hugging Face hub id of the pretrained chat model used as the starting point
base_model = "meta-llama/Llama-2-7b-chat-hf"


In [None]:
# 4-bit NF4 quantization with float16 as the compute dtype; double
# quantization is switched off.
compute_dtype = torch.float16

quant_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    load_in_4bit=True,
)

# Load the Llama 2 base model with that quantization, mapping the whole model
# to device 0.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    device_map={"": 0},
    quantization_config=quant_config,
)
# Config tweaks for training: KV cache off, pretraining_tp set to 1.
model.config.pretraining_tp = 1
model.config.use_cache = False

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Tokenizer of the base model, padded on the right with the EOS token.
# NOTE(review): with pad_token == eos_token a collator that masks padding may
# also mask the real end-of-sequence token in the labels — confirm the model
# still learns to stop.
tokenizer = AutoTokenizer.from_pretrained(
    base_model,
    trust_remote_code=True,
)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

## Set fine tuning parameters

In [None]:
# LoRA adapter settings for causal-LM fine-tuning (rank 64, alpha 16,
# dropout 0.1, no bias terms trained).
peft_params = LoraConfig(
    task_type="CAUSAL_LM",
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

# Training parameters
# NOTE(review): with lr_scheduler_type="constant" the warmup_ratio below is
# probably ignored ("constant_with_warmup" is the scheduler that applies a
# warmup phase) — confirm which behaviour is intended.
training_params = TrainingArguments(
    # where checkpoints and logs go, and how often
    output_dir="/content/drive/MyDrive/paper2slides/results",
    report_to="tensorboard",
    save_steps=25,
    logging_steps=25,
    # run length and batching
    num_train_epochs=2,
    max_steps=-1,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=1,
    group_by_length=True,
    # optimizer and learning-rate schedule
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    lr_scheduler_type="constant",
    warmup_ratio=0.03,
    weight_decay=0.001,
    max_grad_norm=0.3,
    # mixed-precision flags are both left off
    fp16=False,
    bf16=False,
)

In [None]:
# Prepare the quantized base model for training and attach the LoRA adapters.
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_params)


# Model fine tuning
# `model` is already a PEFT model here, so `peft_config` is deliberately NOT
# passed to the trainer as well: giving both is redundant, and how TRL treats
# that combination depends on the installed version.
# NOTE(review): max_seq_length=None leaves the limit to the TRL default —
# confirm long documents are not truncated before the JSON completion.
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=None,
    tokenizer=tokenizer,
    args=training_params,
    packing=False,
)

## Start training

In [None]:
import os
from transformers.trainer_utils import get_last_checkpoint

# Resume from the newest `checkpoint-*` folder in the output directory when one
# exists; otherwise train from scratch. The previous hard-coded
# ".../results/checkpoint-850" path only worked on a Drive that already held
# that exact checkpoint.
checkpoint_root = training_params.output_dir
last_checkpoint = (
    get_last_checkpoint(checkpoint_root) if os.path.isdir(checkpoint_root) else None
)
trainer.train(resume_from_checkpoint=last_checkpoint)

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
875,1.049
900,1.0911




TrainOutput(global_step=920, training_loss=0.07894699677177097, metrics={'train_runtime': 1042.3983, 'train_samples_per_second': 1.765, 'train_steps_per_second': 0.883, 'total_flos': 4.132859302163251e+16, 'train_loss': 0.07894699677177097, 'epoch': 2.0})

In [None]:
# Saving Model — three copies are written:
# 1) via the trainer, to its default save location,
trainer.save_model()
# 2) to a local folder named after the new model,
trainer.model.save_pretrained(new_model)
# 3) to a folder of the same name inside the project directory on Drive.
model_dir_name = f"/content/drive/MyDrive/paper2slides/{new_model}"
trainer.model.save_pretrained(model_dir_name)

## Save combined LoRA + base model

In [None]:
from peft import PeftModel

# Reload the base model in FP16 (no quantization config) on device 0.
# NOTE(review): the 4-bit training model is presumably still referenced by
# `trainer` at this point — confirm both fit in GPU memory together.
fp16_load_kwargs = dict(
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map={"": 0},
)
base_model1 = AutoModelForCausalLM.from_pretrained(base_model, **fp16_load_kwargs)

# Load the adapter saved in the local `new_model` folder on top of the FP16
# base model and merge the LoRA weights into it.
model = PeftModel.from_pretrained(base_model1, new_model).merge_and_unload()

# Fresh tokenizer from the base model, configured like the training one, so it
# can be saved alongside the merged model.
tokenizer = AutoTokenizer.from_pretrained(
    base_model,
    trust_remote_code=True,
)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
!huggingface-cli login
model.push_to_hub("abhi757/"+new_model, use_temp_dir=False)
tokenizer.push_to_hub("abhi757/"+new_model, use_temp_dir=False)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


model-00002-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/3.59G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/abhi757/llama-2-7b-chat-paper-to-slides/commit/d85ac8e62b0d8ba00dab50f97ed8c51527f84d58', commit_message='Upload tokenizer', commit_description='', oid='d85ac8e62b0d8ba00dab50f97ed8c51527f84d58', pr_url=None, pr_revision=None, pr_num=None)