<a href="https://colab.research.google.com/github/Yash2G3/Hyperverge_project-Fine-tuning-LLMs-/blob/main/Llama2_Fine_Tuning_using_LORA_Python.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'coding-data:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F4609245%2F7858108%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240610%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240610T131159Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D32e2c93bb23bd4b4d00ecc5af3ca0bc01ef5afd589465a445082dcf832d7330c79e6c15241552b4656c24ed2b57103203d889af72d0a3bc77b509dfa9201e2f9c20937975af17e2d50efbdfea9bd965cfbed4e8f57e4c8b67ebb5c881756ae7eb8f92b878e25d29be37213b6de807a3e6b654901a8814c978ba80e0eb256d10d5312e4f91795d3f395b77f070df32fbfd06bab7243417496fa5d2d2d235429d7edb461f0a667cdfba4ce40f626479f7837af07ffb73e41e2e24180eb7dd46847543172c595eb01be75a0b6bf9e83cb6f0356366e9b49bc0175c6465a1c4bb4884b052509eb7a6d90c9601d18b4d92bed459dcc2a22b401ca83a7ca4cd78257de'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    directory, download_url_encoded = data_source_mapping.split(':')
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / int(total_length))
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            if filename.endswith('.zip'):
              with ZipFile(tfile) as zfile:
                zfile.extractall(destination_path)
            else:
              with tarfile.open(tfile.name) as tarfile:
                tarfile.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


# <b><span style='color:#F1A424'>|</span> Install Libraries</b><a class='anchor' id='install_libraries'></a> [↑](#top)

***

Install all the required libraries for this notebook.

In [1]:
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.5/92.5 MB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m69.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.4/77.4 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m67.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.1/542.1 kB[0m [31m52.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m56.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━

# <b><span style='color:#F1A424'>|</span> Import Libraries</b><a class='anchor' id='import_libraries'></a> [↑](#top)

***

Import all the required libraries for this notebook.

In [2]:
# Import necessary libraries
import pandas as pd
from tqdm import tqdm

import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

# <b><span style='color:#F1A424'>|</span> Load custom dataset</b><a class='anchor' id='load_data'></a> [↑](#top)

***

Custom dataset is used in this notebook. You can use any data but dataset should contain two columns with name 'prompt' and 'response'. The prompt column should contain the input text.

In [4]:
import sklearn
from sklearn.model_selection import train_test_split
from datasets import Dataset
import pandas as pd
df = pd.read_csv("data_leetcode_problems.csv")
df.drop('Unnamed: 0',axis = 1,inplace=True)
train_df, eval_df = train_test_split(df, test_size = 0.15, random_state = 42)

train_ds = Dataset.from_pandas(train_df, split="train")
eval_ds = Dataset.from_pandas(eval_df, split="test")

from transformers import AutoTokenizer
model_name_or_path = "NousResearch/llama-2-7b-chat-hf"
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
train_ds = train_ds.flatten()
type(train_ds)
# def preprocess_function(examples):
#     if "instruction" in examples:  # Check if the column exists
#         return tokenizer([" ".join(x) for x in examples["instruction"]])
#     else:
#     # Handle the case where the column is missing (e.g., raise an error or use a default value)
#         raise ValueError("Instruction column not found in the dataset")
#         return tokenizer([" ".join(x) for x in train_ds["output"]])
# tokenizer_train_ds = train_ds.map(
#     preprocess_function,
#     batched=True,
#     num_proc=4,
#     remove_columns=train_ds.column_names,
# )
# tokenizer_eval_ds = eval_ds.map(
#     preprocess_function,
#     batched=True,
#     num_proc=4,
#     remove_columns=train_ds.column_names,
# )

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

In [13]:
train_df.reset_index(inplace = True)

In [14]:
train_df

Unnamed: 0,index,instruction,input,output,text
0,1832,Create a solution in python for the input asked.,We will use a variation of Dijkstra's algorith...,```python\nimport heapq\nfrom collections impo...,"Below is an instruction that describes a task,..."
1,231,Create a solution in python for the input asked.,The algorithm uses a recursive approach to fin...,```python\ndef findStrobogrammatic(n):\n de...,"Below is an instruction that describes a task,..."
2,457,Create a solution in python for the input asked.,"1. Initialize a 2-dimensional DP table dp, whe...","```python\ndef findMaxForm(strs, m, n):\n d...","Below is an instruction that describes a task,..."
3,135,Create a solution in python for the input asked.,The algorithm utilizes the XOR bitwise operati...,```python\ndef singleNumber(nums):\n result...,"Below is an instruction that describes a task,..."
4,25,Create a solution in python for the input asked.,We use two pointers `i` and `j`. `i` starts at...,```python\ndef removeDuplicates(nums):\n if...,"Below is an instruction that describes a task,..."
...,...,...,...,...,...
2000,1638,Create a solution in python for the input asked.,The algorithm calculates the total amount of m...,```python\ndef total_money(n):\n weeks = n ...,"Below is an instruction that describes a task,..."
2001,1095,Create a solution in python for the input asked.,We iterate through each puzzle in the given pu...,"```python\ndef find_num_of_valid_words(words, ...","Below is an instruction that describes a task,..."
2002,1130,Create a solution in python for the input asked.,1. Sort the intervals by their starting points...,```python\ndef remove_covered_intervals(interv...,"Below is an instruction that describes a task,..."
2003,1294,Create a solution in python for the input asked.,We create an array called `counter` to keep tr...,```python\ndef minNumberOfFrogs(croakOfFrogs: ...,"Below is an instruction that describes a task,..."


In [15]:
input_lengths = train_df['input'].apply(len)
output_lengths = train_df['output'].apply(len)

# Compute average lengths
average_input_length = input_lengths.mean()
average_output_length = output_lengths.mean()

print(f"Average input length: {average_input_length}")
print(f"Average output length: {average_output_length}")

Average input length: 724.8608478802993
Average output length: 448.8837905236908


In [24]:
# Assuming the dataset has columns 'input' and 'output'
train_inputs = train_df['input'].tolist()
train_outputs = train_df['output'].tolist()

eval_inputs = eval_df['input'].tolist()
eval_outputs = eval_df['output'].tolist()

In [16]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, inputs, outputs, tokenizer, max_length=600):
        self.inputs = inputs
        self.outputs = outputs
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        input_text = self.inputs[idx]
        output_text = self.outputs[idx]
        encodings = self.tokenizer(
            input_text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        target_encodings = self.tokenizer(
            output_text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        encodings['labels'] = target_encodings['input_ids']
        return {key: val.squeeze() for key, val in encodings.items()}

In [25]:
train_dataset = CustomDataset(train_inputs, train_outputs, tokenizer)
eval_dataset = CustomDataset(eval_inputs, eval_outputs, tokenizer)

In [None]:
# block_size = 128
# def group_texts(examples):
#     # Concatenate all texts.
#     concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
#     total_length = len(concatenated_examples[list(examples.keys())[0]])
#     # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
#     # customize this part to your needs.
#     if total_length >= block_size:
#         total_length = (total_length // block_size) * block_size
#     # Split by chunks of block_size.
#     result = {
#         k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
#         for k, t in concatenated_examples.items()
#     }
#     result["labels"] = result["input_ids"].copy()
#     return result
# code_data =  tokenizer_train_ds.map(group_texts, batched=True, num_proc=4)
# eval_data = tokenizer_eval_ds.map(group_texts, batched=True, num_proc=4)

        

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

        

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

# <b><span style='color:#F1A424'>|</span> Configuration</b><a class='anchor' id='configuration'></a> [↑](#top)

***

Central repository for this notebook's hyperparameters.

In [22]:
model_name = "NousResearch/llama-2-7b-chat-hf"
dataset_name = "/content/train.jsonl"
new_model = "llama-2-7b-custom"
lora_r = 64
lora_alpha = 16
lora_dropout = 0.1
use_4bit = True
bnb_4bit_compute_dtype = "float16"
bnb_4bit_quant_type = "nf4"
use_nested_quant = False
output_dir = "./results"
num_train_epochs = 1  # Reduced epochs
fp16 = False
bf16 = False
per_device_train_batch_size = 32  # Increased batch size
per_device_eval_batch_size = 32
gradient_accumulation_steps = 2  # Adjust gradient accumulation
gradient_checkpointing = True
max_grad_norm = 0.2
learning_rate = 3e-4
weight_decay = 0.001
optim = "paged_adamw_32bit"
lr_scheduler_type = "constant"
max_steps = 200
warmup_ratio = 0.03
group_by_length = True
save_steps = 15
logging_steps = 5
max_seq_length = None
packing = False
device_map = {"": 0, "1": 1}  # Utilize both GPUs

# <b><span style='color:#F1A424'>|</span> Configuration of Quantization and LORA parameters</b><a class='anchor' id='configure_parameters'></a> [↑](#top)

***

As model size is big it is loaded in 4 bit.

In [None]:
!pip install accelerate
!pip install -i https://test.pypi.org/simple/ bitsandbytes

Looking in indexes: https://test.pypi.org/simple/


In [23]:
# Configure quantization parameters
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Load pre-trained model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
model.config.use_cache = False
model.config.pretraining_tp = 1

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Configure LoRA-specific parameters
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)



config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

# <b><span style='color:#F1A424'>|</span> Training</b><a class='anchor' id='training'></a> [↑](#top)

***


In [27]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model  # Ensure get_peft_model is imported
from transformers import BitsAndBytesConfig
from datasets import load_dataset


model = get_peft_model(model, peft_config)

model.train()



for param in model.parameters():
    if param.dtype in [torch.float16, torch.float32, torch.float64, torch.complex64, torch.complex128]:
        param.requires_grad = True



# Training arguments
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    gradient_checkpointing=gradient_checkpointing,
    fp16=False,
    bf16=False,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    logging_dir=f"{output_dir}/logs",
    optim=optim,
    lr_scheduler_type=lr_scheduler_type,
    max_grad_norm=max_grad_norm,
    warmup_ratio=warmup_ratio,
    logging_steps=logging_steps,
    save_steps=save_steps,
    max_steps=max_steps,
    group_by_length=group_by_length,
    evaluation_strategy="steps",
    eval_steps = 15,
)

# Trainer instantiation
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)

# Train the model
trainer.train()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


OutOfMemoryError: CUDA out of memory. Tried to allocate 2.29 GiB. GPU 

# <b><span style='color:#F1A424'>|</span> Testing</b><a class='anchor' id='testing'></a> [↑](#top)

***

Testing on test data

In [None]:
# Suppress logging messages to avoid unnecessary output
logging.set_verbosity(logging.CRITICAL)

# Create text generation pipelines using the specified model and tokenizer
# Define two pipelines with different maximum lengths
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=250)
pipe2 = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=500)

# Initialize an empty list to store generated text
generated_text = []

# Iterate over the test data
for i in tqdm(range(len(final_test_data))):
    # Extract the prompt from the test data
    prompt = final_test_data['prompt'].iloc[i]

    # Attempt to generate text using the first pipeline with a max length of 250
    try:
        result = pipe(prompt)
        # Append the generated text to the list, extracting the relevant part after '[/INST]'
        generated_text.append(result[0]['generated_text'].split('[/INST]')[1])
    except:
        # If an exception occurs, try the second pipeline with a max length of 500
        try:
            result = pipe2(prompt)
            # Append the generated text to the list, extracting the relevant part after '[/INST]'
            generated_text.append(result[0]['generated_text'].split('[/INST]')[1])
        except:
            # If both pipelines fail, append a default placeholder text
            generated_text.append("ABCD1234@#")

# The 'generated_text' list now contains the generated text for each prompt in the test data

In [None]:
# Assign the generated text to a new column 'generated_text' in the 'final_test_data' DataFrame
final_test_data['generated_text'] = generated_text

# Reset the index of the DataFrame for a cleaner representation in the CSV file
final_test_data = final_test_data.reset_index(drop=True)

# Save the DataFrame to a CSV file at the specified path
final_test_data.to_csv('/content/drive/MyDrive/llama2_finetune_output_1128.csv', index=False)

# <b><span style='color:#F1A424'>|</span> Saving Model for inference</b><a class='anchor' id='save_model'></a> [↑](#top)

***


In [None]:
# Set the path where the merged model will be saved
model_path = "/content/drive/MyDrive/llama-2-7b-custom"

# Reload the base model in FP16 and configure settings
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
)

# Instantiate a PeftModel using the base model and the new model
model = PeftModel.from_pretrained(base_model, new_model)  # Combine the base model and the fine-tuned weights

# Merge the base model with LoRA weights and unload unnecessary parts
model = model.merge_and_unload()  # Finalize the model by merging and unloading any redundant components

# Reload the tokenizer to save it
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer