# Install the neessary packages

In [None]:
!pip install transformers
!pip install datasets
!pip install accelerate
!pip install peft
!pip install bitsandbytes
!pip install sentencePiece

# 1. Clean the Data 
1. Please Download the data from the github and save it on your local drive.
2. At the end of this step, data should be single JSON file in the following format.
```
            json_entry = {
                'instruction': 'What is diabetes?',
                'input': '',
                'output': 'Diabetes is ...'
            }
```
3. Save the Json file in your computer.

In [None]:
base_directory='/content/drive/MyDrive/Colab Notebooks/transformer_learn/'

In [None]:
import xmltodict
import json
import glob
import os

def convert_xml_to_json(xml_file):
    with open(xml_file, 'r', encoding='utf-8') as f:
        xml_data = f.read()

    xml_dict = xmltodict.parse(xml_data)

    # Check if 'Document' key is in the xml_dict and its value is not None
    if 'Document' not in xml_dict or xml_dict['Document'] is None or 'QAPairs' not in xml_dict['Document'] or xml_dict['Document']['QAPairs'] is None:
        print(f"Missing or invalid 'Document' or 'QAPairs' key in {xml_file}")
        return []

    questions = xml_dict['Document']['QAPairs']['QAPair']

    # Ensure questions is a list
    if not isinstance(questions, list):
        questions = [questions]

    json_data = []

    for question in questions:
        if question['Answer'] and question['Answer'].strip():
            json_entry = {
                'instruction': question['Question']['#text'],
                'input': '',
                'output': question['Answer']
            }
            json_data.append(json_entry)

    return json_data


# Replace "files" with the actual path to your "files" directory if needed
files_path = base_directory+"dataset"

# Get all topic directories inside the "files" directory
topic_directories = [d for d in os.listdir(files_path) if os.path.isdir(os.path.join(files_path, d))]

combined_json_data = []

for topic_directory in topic_directories:
    # Get all XML files inside the topic directory
    xml_files = glob.glob(os.path.join(files_path, topic_directory, "*.xml"))

    for xml_file in xml_files:
        combined_json_data.extend(convert_xml_to_json(xml_file))

# Save combined JSON data into a single file
with open(base_directory+ 'alpaca_data.json', 'w', encoding='utf-8') as f:
    json.dump(combined_json_data, f, indent=4)


#2. Declare the Model and Tokenizer
We will utilize the lama-7b-hf model created by Meta. To obtain the model weights from Meta, you must submit a request through https://ai.facebook.com/blog/large-language-model-llama-meta-ai/. However, the Llama model's weights were inadvertently leaked and incorporated into Hugging Face's decapoda-research/llama-7b-hf. As a result, we will employ the Llama model from decapoda-research rather than requesting the weights from Meta and waiting.

In [2]:
import torch
from transformers import LlamaTokenizer, LlamaForCausalLM

BASE_MODEL = "decapoda-research/llama-7b-hf"
 
model = LlamaForCausalLM.from_pretrained(
    BASE_MODEL,
    load_in_8bit=True,
    device_map="auto",
)
 
tokenizer = LlamaTokenizer.from_pretrained(BASE_MODEL)
 
tokenizer.pad_token_id = 0
tokenizer.padding_side = "left"




Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /usr/local/lib/python3.9/dist-packages/bitsandbytes/libbitsandbytes_cuda118.so
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 8.0
CUDA SETUP: Detected CUDA version 118
CUDA SETUP: Loading binary /usr/local/lib/python3.9/dist-packages/bitsandbytes/libbitsandbytes_cuda118.so...


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
Either way, this might cause trouble in the future:
If you get `CUDA error: invalid device function` errors, the above might be the cause and the solution is to make sure only one ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] in the paths that we search based on your env.
  warn(msg)


Loading checkpoint shards:   0%|          | 0/33 [00:00<?, ?it/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'LLaMATokenizer'. 
The class this function is called from is 'LlamaTokenizer'.


#3. Data Preprocessing
1. We allocate 90% of the data for training and 10% for validation purposes.
2. The generate_prompt function establishes the prompt format. Reference: https://github.com/tloen/alpaca-lora
  * Here, Instruction ==> Question, Input ==> Context, Output ==> Answer
  * If there is context, the prompt will have three keys: [Instruction, Input,Output ]
  * If there is no context, the prompt will have two keys: [Instruction,Output ]
3. We create both training and validation datasets.
4. Initially, we generate a prompt and subsequently tokenize it.
5. The training process requires input_ids and attention_mask. It is not necessary to explicitly define the label.
6. This step should produce training and validation dataset with format:
```
Dataset({
    features: ['instruction', 'input', 'output', 'input_ids', 'attention_mask'],
    num_rows: 14762
})
```

In [3]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from torch.utils.data import DataLoader

train_data = load_dataset("json", data_files=base_directory+"alpaca_data.json", split="train[:90%]")
valid_data = load_dataset("json", data_files=base_directory+"alpaca_data.json", split="train[90%:]")

def generate_prompt(data_point):
    if data_point["input"]:
        return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{data_point["instruction"]}

### Input:
{data_point["input"]}

### Response:
{data_point["output"]}"""
    else:
        return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{data_point["instruction"]}

### Response:
{data_point["output"]}"""

train_data1 = load_dataset("json", data_files=base_directory+"alpaca_data.json", split="train[:90%]")
valid_data2 = load_dataset("json", data_files=base_directory+"alpaca_data.json", split="train[90%:]")
data_train = train_data1.shuffle().map(
    lambda data_point: tokenizer(
        generate_prompt(data_point),
        truncation=True,
        max_length=1000,
        padding="max_length",
    )
)
data_valid = valid_data2.shuffle().map(
    lambda data_point: tokenizer(
        generate_prompt(data_point),
        truncation=True,
        max_length=1000,
        padding="max_length",
    )
)



Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-0b9353c21df78eea/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-0b9353c21df78eea/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e. Subsequent calls will reuse this data.




Map:   0%|          | 0/14762 [00:00<?, ? examples/s]

Map:   0%|          | 0/1640 [00:00<?, ? examples/s]

# Model Training With PEFT

### Delaring Lora Variables

In [5]:
LORA_R = 8 #lora dimension
LORA_ALPHA = 16 #(`float`): The alpha parameter for Lora scaling
LORA_DROPOUT= 0.05
# this defines what parameters need to be trained
LORA_TARGET_MODULES = [
    "q_proj",
    "v_proj",
]
 
BATCH_SIZE = 128
MICRO_BATCH_SIZE = 4
GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE
LEARNING_RATE = 4e-4
TRAIN_STEPS = 50
OUTPUT_DIR = base_directory

The Below output shows that we are only training 0.06 percentage of parameter, which will higly spped-up fine-tunning process

In [6]:
from peft import LoraConfig, get_peft_model, get_peft_model_state_dict, prepare_model_for_int8_training

model = prepare_model_for_int8_training(model)
config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    target_modules=LORA_TARGET_MODULES,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, config)
model.print_trainable_parameters()

trainable params: 4194304 || all params: 6742609920 || trainable%: 0.06220594176090199


In [7]:
import transformers
training_arguments = transformers.TrainingArguments(
    per_device_train_batch_size=MICRO_BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    warmup_steps=10,
    max_steps=TRAIN_STEPS,
    learning_rate=LEARNING_RATE,
    fp16=True,
    logging_steps=10,
    optim="adamw_torch",
    evaluation_strategy="steps",
    save_strategy="steps",
    eval_steps=50,
    save_steps=50,
    output_dir=OUTPUT_DIR,
    save_total_limit=3,
    load_best_model_at_end=True,
    report_to="tensorboard"
)

In [1]:
## Train and save the chatbot

In [None]:

trainer = transformers.Trainer(
    model=model,
    train_dataset=data_train,
    eval_dataset=data_valid,
    args=training_arguments,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
model.config.use_cache = False
trainer.train(resume_from_checkpoint=False)

model.save_pretrained(base_directory+"chatbot")

One crucial aspect of PET is that it only stores the newly trained parameters, rather than the entire model. As a result, the size of our chatbot remains relatively small, amounting to just a few megabytes.

# Loading Model for inference

Process 

In [1]:
from transformers import AutoModel
from peft import PeftModel, PeftConfig
import torch
from transformers import LlamaTokenizer, LlamaForCausalLM


base_directory='/Users/premtimsina/Documents/bpbbook/chapter2_huggingFace/'
base_directory='/content/drive/MyDrive/Colab Notebooks/transformer_learn/'
BASE_MODEL = "decapoda-research/llama-7b-hf"

# this is PEFT model specific to chatbot 
peft_model_id = base_directory+"chatbot"
config = PeftConfig.from_pretrained(peft_model_id)
# this is base model which is llama-7b-hf
model = LlamaForCausalLM.from_pretrained(BASE_MODEL)

# we are creating entire model combining base model and peft model
model = PeftModel.from_pretrained(model, peft_model_id)
tokenizer = LlamaTokenizer.from_pretrained(BASE_MODEL)

model = model.to("cuda")
model.eval()



Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /usr/local/lib/python3.9/dist-packages/bitsandbytes/libbitsandbytes_cuda118.so
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 8.0
CUDA SETUP: Detected CUDA version 118
CUDA SETUP: Loading binary /usr/local/lib/python3.9/dist-packages/bitsandbytes/libbitsandbytes_cuda118.so...


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
Either way, this might cause trouble in the future:
If you get `CUDA error: invalid device function` errors, the above might be the cause and the solution is to make sure only one ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] in the paths that we search based on your env.
  warn(msg)


Loading checkpoint shards:   0%|          | 0/33 [00:00<?, ?it/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'LLaMATokenizer'. 
The class this function is called from is 'LlamaTokenizer'.


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 4096, padding_idx=31999)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): Linear(
                in_features=4096, out_features=4096, bias=False
                (lora_dropout): Dropout(p=0.05, inplace=False)
                (lora_A): Linear(in_features=4096, out_features=8, bias=False)
                (lora_B): Linear(in_features=8, out_features=4096, bias=False)
              )
              (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
              (v_proj): Linear(
                in_features=4096, out_features=4096, bias=False
                (lora_dropout): Dropout(p=0.05, inplace=False)
                (lora_A): Linear(in_features=4096, out_features=8, bias=False)
                (lora_B): Linear(in_features=8, out_features=40

# Inference Function

In [4]:
DEVICE='cuda'

In [16]:
import textwrap
from peft import PeftModel
from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig
from transformers.generation.utils import GreedySearchDecoderOnlyOutput

def ask_ai_doctor(instruction: str, model: PeftModel) -> str:
    PROMPT_TEMPLATE = f"""
    Below is an instruction that describes a task. Write a response that appropriately completes the request.
     
    ### Instruction:
    [INSTRUCTION]
     
    ### Response:
    """
    
    # Replace [INSTRUCTION] in the template with the given instruction
    prompt = PROMPT_TEMPLATE.replace("[INSTRUCTION]", instruction)
    
    # Generate an encoded input tensor from the prompt
    encoding = tokenizer(prompt, return_tensors="pt")
    input_ids = encoding["input_ids"].to(DEVICE)

    # Set generation configuration
    '''
    controls various aspects of the text generation process.
    temperature: This parameter (set to 0.1) controls the randomness of the generated text. lower value more determenistic; higher value more random
    top_p: This parameter (set to 0.75) is also called nucleus sampling. In our case, the model will only consider tokens that make up the top 75% of probabilities for the next word
    repetition_penalty: This parameter (set to 1.1) is used to penalize repetitions in the generated text. A value greater than 1 helps to reduce the frequency of repeated phrases
    '''
    generation_config = GenerationConfig(
        temperature=0.1,
        top_p=0.75,
        repetition_penalty=1.1,
    )
    
    # Generate the response using the model and the configuration

    with torch.inference_mode():
        response = model.generate(
            input_ids=input_ids,
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=True,
            max_new_tokens=250,
        )
    
    # Decode the response and extract the relevant part
    decoded_output = tokenizer.decode(response.sequences[0])
    formatted_response = decoded_output.split("### Response:")[1].strip()
    
    # Wrap and return the formatted response
    return "\n".join(textwrap.wrap(formatted_response))


In [17]:
print(ask_ai_doctor('What are symptoms of Cirrhosis?', model))

The following list of signs and symptoms may be associated with
cirrhosis.  Some people with cirrhosis do not have any of these
symptoms.   If you are concerned about how your general health is
affected by cirrhosis, talk to your doctor or nurse practitioner.
Signs and Symptoms of Cirrhosis   ------------------------   Abdominal
swelling (ascites)   Bleeding problems   Blurred vision   Breath odor
Confusion   Constipation   Difficulty concentrating   Dizziness
Fatigue   Fluid retention   Gallstones   Gout   Hair loss   Headache
Itching   Jaundice   Liver cancer   Memory loss   Muscle weakness
Nausea   Neuropathy   Night sweats   Pain in the upper right abdomen
Poor appetite   Skin itching   Sleepiness   Stomach pain   Swollen
legs and feet   Tiredness   Weight gain   Yellow skin and eyes
------------------------   How common are these symptoms?   These
symptoms can occur at different times


In [18]:
from huggingface_hub import notebook_login

In [19]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [20]:
model.push_to_hub("prem-timsina/alpaca-ai-doctor", use_auth_token=True)

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.bin:   0%|          | 0.00/16.8M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/prem-timsina/alpaca-ai-doctor/commit/be92fbce9e96a94dc5e8d3cbaed190acc7c03462', commit_message='Upload model', commit_description='', oid='be92fbce9e96a94dc5e8d3cbaed190acc7c03462', pr_url=None, pr_revision=None, pr_num=None)