## **Installing Libraries**

In [None]:
# Mount Google Drive so the CSV inputs and saved artifacts used later in this
# notebook are reachable under /content/drive.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
!pip install -q accelerate==0.21.0
!pip install peft==0.4.0
!pip install bitsandbytes==0.40.2
!pip install transformers==4.31.0
!pip install trl==0.4.7
!pip install datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting peft==0.4.0
  Downloading peft-0.4.0-py3-none-any.whl (72 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: peft
Successfully installed peft-0.4.0
Collecting bitsandbytes==0.40.2
  Downloading bitsandbytes-0.40.2-py3-none-any.whl (92.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.5/92.5 MB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.40.2
Collecting transformers==4.31.0
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m24.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.31.0)
  Downloading token

In [None]:
# Install torch, torchvision and torchaudio from the PyTorch cu118 wheel index.
# NOTE(review): no versions are pinned here, and this runs after the pinned
# libraries in the previous cell were installed; confirm the resulting torch
# version is the one intended for bitsandbytes/transformers.
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

Looking in indexes: https://download.pytorch.org/whl/cu118


## **Importing Libraries**

In [None]:
import os
import torch
from datasets import Dataset, load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer
from google.colab import files
import pandas as pd

## **Loading Dataset and Analysis**

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# Read the training question/answer pairs from Google Drive and preview them.
training_csv_path = "/content/drive/MyDrive/Colab Notebooks/Masters Thesis - Project/Extracted Data in CSV/training_data.csv"
dataset = pd.read_csv(training_csv_path)
print(dataset.shape)  # (rows, columns) of the loaded frame
dataset.head()

(444, 3)


Unnamed: 0.1,Unnamed: 0,Question,Answer
0,0,Literature on Cardiac amyloidosis. Please let...,Cardiac amyloidosis is a disorder caused by de...
1,1,Migraine seems to be a spectrum of conditions ...,There is no specific cure for migraine headach...
2,2,DO I USE PYRIDOXINE TABLETS EVEN IF IM PREGNANT?,"Before taking pyridoxine, tell your doc..."
3,3,i have lymphoma what causes cramp after chemo ...,Muscle cramps are common and often occur when ...
4,4,I wonder of new research and testing on macula...,These resources address the diagnosis or manag...


In [None]:
# Wrap every question/answer pair in the [INST] ... [/INST] prompt template:
# the question goes between the instruction markers, the answer follows, and
# the sequence is closed with </s>.
instruction_part = '<s>[INST] ' + dataset['Question'] + ' [/INST] '
dataset['text'] = instruction_part + dataset['Answer'] + ' </s>'

# Only the assembled "text" column is kept; from here on `dataset` is a
# Hugging Face Dataset rather than a pandas DataFrame.
text_only_frame = dataset[['text']]
dataset = Dataset.from_pandas(text_only_frame)
dataset

Dataset({
    features: ['text'],
    num_rows: 444
})

In [None]:
# Central configuration cell: every tunable value used by the model-loading and
# training cell is defined here.

# Base model identifier from Hugging Face
model_name = "NousResearch/Llama-2-7b-chat-hf"

# Directory passed to TrainingArguments(output_dir=...). It was previously never
# defined in the notebook, so a fresh "Restart & Run All" failed with NameError.
output_dir = "./results"

# LoRA settings (forwarded to LoraConfig as r / lora_alpha / lora_dropout)
lora_r = 64
lora_alpha = 16
lora_dropout = 0.1

# 4-bit quantization settings (forwarded to BitsAndBytesConfig)
use_4bit = True
bnb_4bit_compute_dtype = "float16"  # name of a torch dtype, resolved with getattr(torch, ...)
bnb_4bit_quant_type = "nf4"
use_nested_quant = False  # passed as bnb_4bit_use_double_quant

# Training settings (forwarded to TrainingArguments)
num_train_epochs = 3
fp16 = False
bf16 = False
per_device_train_batch_size = 4
# NOTE(review): per_device_eval_batch_size and gradient_checkpointing are defined
# here but are not passed to TrainingArguments in the training cell below.
per_device_eval_batch_size = 4
gradient_accumulation_steps = 1
gradient_checkpointing = True
max_grad_norm = 0.3
learning_rate = 2e-4
weight_decay = 0.001
optim = "paged_adamw_32bit"
lr_scheduler_type = "cosine"
max_steps = -1
warmup_ratio = 0.03
group_by_length = True
save_steps = 0
logging_steps = 25

# Supervised fine-tuning (SFT) trainer settings
max_seq_length = None  # Max sequence length; None leaves the choice to SFTTrainer
packing = False  # Whether to pack short sequences together (disabled)
device_map = {"": 0}  # Load the whole model on device 0

In [None]:
# This cell loads the 4-bit quantized base model and its tokenizer, builds the
# LoRA and training configurations from the values in the config cell, creates
# the SFTTrainer and runs training. Statement order matters: the model and
# tokenizer must exist before the trainer is constructed.

# Resolve the compute dtype name (a string such as "float16") to the torch dtype object
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

# Configuring the 4-bit quantization and compute precision for the model
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# When computing in float16 with 4-bit loading, print a hint if the GPU reports a
# compute-capability major version of 8 or higher (per the original author, such
# GPUs support bfloat16). This only prints; it does not change any setting.
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Accelerate training with bf16=True")
        print("=" * 80)

# Loading the specified model with the above quantization configuration
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
# Turn off the generation cache on the model config for training
model.config.use_cache = False
# NOTE(review): presumably disables the tensor-parallel emulation path in the Llama implementation; confirm
model.config.pretraining_tp = 1

# Initializing the tokenizer for the model and setting padding configurations
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token  # Reuse the end-of-sequence token as the pad token
tokenizer.padding_side = "right"  # Pad on the right (original author: avoids issues during training)

# Configuring the LoRA adapter parameters for causal-LM fine-tuning
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)

# Defining the training parameters (directory, epochs, batch size, optimizer settings, etc.).
# output_dir is not assigned in this cell; it must already be defined when this cell runs.
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,  # Grouping samples of similar length into batches
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard"  # Reporting to TensorBoard for monitoring
)

# Setting up the fine-tuning trainer with the model, dataset, tokenizer, and training arguments
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    dataset_text_field="text",  # Dataset column that holds the formatted training text
    max_seq_length=max_seq_length,  # Taken from the config cell (None there)
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,  # Packing flag from the config cell (False there, i.e. packing is disabled)
)

# Starting the training process
trainer.train()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/179 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]



Map:   0%|          | 0/444 [00:00<?, ? examples/s]

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
25,2.1116
50,1.7756
75,1.6957
100,1.7144
125,1.6301
150,1.6376
175,1.6141
200,1.5318
225,1.5428
250,1.522


TrainOutput(global_step=333, training_loss=1.6394649740453955, metrics={'train_runtime': 1813.1606, 'train_samples_per_second': 0.735, 'train_steps_per_second': 0.184, 'total_flos': 1.03969342513152e+16, 'train_loss': 1.6394649740453955, 'epoch': 3.0})

In [None]:
# Persist the trained model and the tokenizer to Google Drive.
# The last expression is the tokenizer save call, so its returned file paths
# are shown as the cell output.
model_save_dir = "/content/drive/MyDrive/Colab Notebooks/Masters Thesis - Project/Llama-2-7b-healthcare_question_answer-finetune"
tokenizer_save_dir = "/content/drive/MyDrive/Colab Notebooks/Masters Thesis - Project/healthcare_question_answer-finetune_tockenizer"

trainer.model.save_pretrained(model_save_dir)
trainer.tokenizer.save_pretrained(tokenizer_save_dir)

('/content/drive/MyDrive/Colab Notebooks/Masters Thesis - Project/healthcare_question_answer-finetune_tockenizer/tokenizer_config.json',
 '/content/drive/MyDrive/Colab Notebooks/Masters Thesis - Project/healthcare_question_answer-finetune_tockenizer/special_tokens_map.json',
 '/content/drive/MyDrive/Colab Notebooks/Masters Thesis - Project/healthcare_question_answer-finetune_tockenizer/tokenizer.model',
 '/content/drive/MyDrive/Colab Notebooks/Masters Thesis - Project/healthcare_question_answer-finetune_tockenizer/added_tokens.json',
 '/content/drive/MyDrive/Colab Notebooks/Masters Thesis - Project/healthcare_question_answer-finetune_tockenizer/tokenizer.json')

## **Prediction on Test Dataset**

In [None]:
# Read the held-out test questions from Google Drive.
test_csv_path = "/content/drive/MyDrive/Colab Notebooks/Masters Thesis - Project/Extracted Data in CSV/test_data.csv"
test_data = pd.read_csv(test_csv_path)

In [None]:
# Generate an answer for every test question with the fine-tuned model and
# store the results in predictions_df (also written to Drive as a CSV).

# NOTE(review): max_length=300 bounds prompt tokens and generated tokens
# together, so a very long question leaves little or no room for an answer;
# consider max_new_tokens if that matters for the evaluation.
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=300)

columns = ['Index', 'Question', 'Predicted Answer']
predictions_path = '/content/drive/MyDrive/Colab Notebooks/Masters Thesis - Project/predicted_answers.csv'
prompt_end_marker = "[/INST]"

# Start from an empty frame so the final .head() still works if test_data has no rows.
predictions_df = pd.DataFrame(columns=columns)
prediction_rows = []

for index, row in test_data.iterrows():
    question = row["Question"]

    # Same [INST] ... [/INST] template that was used to build the training text.
    result = pipe(f"<s>[INST] {question} [/INST]")

    # Split only at the first marker (the end of our own prompt). Splitting on
    # every occurrence would cut the answer short whenever the generated text
    # itself contains another "[/INST]".
    generated_text = result[0]['generated_text'].split(prompt_end_marker, 1)[1]

    # Collect plain rows and build the frame from them instead of enlarging a
    # DataFrame one .loc assignment at a time.
    prediction_rows.append([index, question, generated_text])
    predictions_df = pd.DataFrame(prediction_rows, columns=columns)

    # The CSV is rewritten after every question, presumably so that partial
    # results survive an interrupted Colab session.
    predictions_df.to_csv(predictions_path)

    print(index, "Done")

predictions_df.head()

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


0 Done




1 Done




2 Done




3 Done




4 Done




5 Done




6 Done




7 Done




8 Done




9 Done




10 Done




11 Done




12 Done




13 Done




14 Done




15 Done




16 Done




17 Done




18 Done




19 Done




20 Done




21 Done




22 Done




23 Done




24 Done




25 Done


