# Large Language Model (LLM)

**Importing libraries for data analysis and scaling**

In [3]:
! pip install datasets
! pip install accelerate
! pip install bitsandbytes
! pip install peft
! pip install evaluate
! pip install trl
! pip install rouge_score



In [4]:
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, pipeline
from datasets import Dataset
from peft import LoraConfig, TaskType, get_peft_model
from huggingface_hub import notebook_login
import evaluate
from trl import SFTTrainer
import requests
import json

**Entrenamiento de un modelo de lenguaje grande (LLM)**

In [5]:
# Check if CUDA (GPU) is available
if torch.cuda.is_available():
    # Get the number of available CUDA devices
    num_devices = torch.cuda.device_count()
    print("Number of available CUDA devices:", num_devices)

    # Iterate over CUDA devices and print their indices and names
    for i in range(num_devices):
        print("GPU index", i, ":", torch.cuda.get_device_name(i))
else:
    print("CUDA is not available. CPU will be used.")

Number of available CUDA devices: 1
GPU index 0 : Tesla T4


In [6]:
#notebook_login()

In [7]:
# Base model
# Llama-2-7b-hf model architecture: It is an auto-regressive language model that uses an optimized transformer architecture.
# It is possible to the official Meta Llama-2 model from Hugging Face, but you have to apply and wait a couple of days for confirmation.
# Model: https://huggingface.co/meta-llama/Llama-2-7b-hf
# Paper: https://arxiv.org/abs/2307.09288
# More info: https://llama.meta.com/llama2/

#model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")

In [8]:
# Base model
# Model: https://huggingface.co/meta-llama/Meta-Llama-3-8B

In [9]:
# Dataset
# Train
url_train = "https://raw.githubusercontent.com/architkaila/Fine-Tuning-LLMs-for-Medical-Entity-Extraction/main/data/entity_extraction/entity-extraction-train-data.json"
response_train = requests.get(url_train)
if response_train.status_code == 200:
    data_train = response_train.json()
    print("Training OK")
else:
    print("Error obtaining training data:", response_train.status_code)

Training OK


In [10]:
data_train[0]

{'input': "Robert Johnson\nrobert.johnson@email.com\n789 Maple Lane, Chicago, IL 60601\n555-234-5678, United States\n\nRelationship to XYZ Pharma Inc.: Patient\nReason for contacting: Adverse Event\n\nMessage: I've been on Onglyza for a while, and I've noticed that I'm experiencing frequent painful urination. Is this a known side effect?",
 'output': '{"drug_name": "Onglyza", "adverse_events": ["painful urination"]}'}

In [11]:
# Test
url_test = "https://raw.githubusercontent.com/architkaila/Fine-Tuning-LLMs-for-Medical-Entity-Extraction/main/data/entity_extraction/entity-extraction-test-data.json"
response_test = requests.get(url_test)
if response_test.status_code == 200:
    data_test = response_test.json()
    print("Test OK")
else:
    print("Error obtaining test data:", response_test.status_code)

Test OK


In [12]:
data_test[0]

{'input': "Natalie Cooper,\nncooper@example.com\n6789 Birch Street, Denver, CO 80203,\n303-555-6543, United States\n\nRelationship to XYZ Pharma Inc.: Patient\nReason for contacting: Adverse Event\n\nMessage: Hi, after starting Abilify for bipolar I disorder, I've noticed that I am experiencing nausea and vomiting. Are these typical reactions? Best, Natalie Cooper",
 'output': '{"drug_name": "Abilify", "adverse_events": ["nausea", "vomiting"]}'}

```
---- LlaMa2 datasets ----
https://gpus.llm-utils.org/llama-2-prompt-template/
Note that this only applies to the llama 2 chat models. The base models have no prompt structure, they’re raw non-instruct tuned models.

<s>[INST] {user_message_1} [/INST] {model_reply_1}</s>

---- Alpaca datasets ----
### Instruction:
(Instruction Text)

### Input:
(Auxiliary Input Text)

### Response:
(Desired Response Text)

---- Vicuna datasets ----
Vicuna datasets
### Human:
(Question Text)
### Assistant:
(Response Text)

---- Mistral datasets ----
<s>[INST] Instruction [/INST] Model answer</s>

---- Gemma ----
```

In [13]:
# Without a system message
# <s>[INST] {user_message_1} [/INST] {model_reply_1}</s>

formatted_data_train = []

for item in data_train:
    input_text = item["input"]
    output_text = item["output"]

    formatted_input = f"<s>[INST] {input_text}[/INST]"

    formatted_output = output_text.replace('\"', '').replace('{', '').replace('}', '')

    formatted_data_train.append({'text':formatted_input + formatted_output + '</s>'})

In [14]:
formatted_data_train[0]

{'text': "<s>[INST] Robert Johnson\nrobert.johnson@email.com\n789 Maple Lane, Chicago, IL 60601\n555-234-5678, United States\n\nRelationship to XYZ Pharma Inc.: Patient\nReason for contacting: Adverse Event\n\nMessage: I've been on Onglyza for a while, and I've noticed that I'm experiencing frequent painful urination. Is this a known side effect?[/INST]drug_name: Onglyza, adverse_events: [painful urination]</s>"}

In [15]:
dataset_train = Dataset.from_list(formatted_data_train)

In [16]:
dataset_train

Dataset({
    features: ['text'],
    num_rows: 700
})

In [17]:
dataset_train[0]

{'text': "<s>[INST] Robert Johnson\nrobert.johnson@email.com\n789 Maple Lane, Chicago, IL 60601\n555-234-5678, United States\n\nRelationship to XYZ Pharma Inc.: Patient\nReason for contacting: Adverse Event\n\nMessage: I've been on Onglyza for a while, and I've noticed that I'm experiencing frequent painful urination. Is this a known side effect?[/INST]drug_name: Onglyza, adverse_events: [painful urination]</s>"}

In [18]:
formatted_data_test = []

for item in data_test:
    input_text = item["input"]
    output_text = item["output"]

    formatted_input = f"<s>[INST] {input_text}[/INST]"

    formatted_output = output_text.replace('\"', '').replace('{', '').replace('}', '')

    formatted_data_test.append({'text':formatted_input + formatted_output + '</s>'})

In [19]:
formatted_data_test[0]

{'text': "<s>[INST] Natalie Cooper,\nncooper@example.com\n6789 Birch Street, Denver, CO 80203,\n303-555-6543, United States\n\nRelationship to XYZ Pharma Inc.: Patient\nReason for contacting: Adverse Event\n\nMessage: Hi, after starting Abilify for bipolar I disorder, I've noticed that I am experiencing nausea and vomiting. Are these typical reactions? Best, Natalie Cooper[/INST]drug_name: Abilify, adverse_events: [nausea, vomiting]</s>"}

In [20]:
dataset_test = Dataset.from_list(formatted_data_test) # HuggingFace Dataset

In [21]:
dataset_test

Dataset({
    features: ['text'],
    num_rows: 59
})

In [22]:
dataset_test[0]

{'text': "<s>[INST] Natalie Cooper,\nncooper@example.com\n6789 Birch Street, Denver, CO 80203,\n303-555-6543, United States\n\nRelationship to XYZ Pharma Inc.: Patient\nReason for contacting: Adverse Event\n\nMessage: Hi, after starting Abilify for bipolar I disorder, I've noticed that I am experiencing nausea and vomiting. Are these typical reactions? Best, Natalie Cooper[/INST]drug_name: Abilify, adverse_events: [nausea, vomiting]</s>"}

In [23]:
tokenizer = AutoTokenizer.from_pretrained("NousResearch/Llama-2-7b-chat-hf") # Load tokenizer
tokenizer.pad_token = tokenizer.eos_token #  Padding token of the tokenizer to be the same as the end-of-sequence (eos) token
tokenizer.padding_side = "right" # Padding should be added to the right side of the input sequences

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

In [24]:
# Base model
# We will use NousResearch's Llama-2-7b-chat-hf as a base model, which is the same as the original, but easily accessible.
# Model: https://huggingface.co/NousResearch/Llama-2-7b-chat-hf

# Create quantization config
# https://huggingface.co/docs/transformers/main_classes/quantization
# Quantization techniques reduce memory and computational costs
# by representing weights and activations with lower-precision data types
quantization_config = BitsAndBytesConfig(
    Load_in_4bit=True, # This flag is used to enable 4-bit quantization
    bnb_4bit_compute_dtype=torch.float16, # This sets the computational type: once the weights are loaded in 4-bit, the computations will be performed using 16-bit floating-point precision.
    bnb_4bit_quant_type="nf4" # This sets the quantization data type
)

model = AutoModelForCausalLM.from_pretrained("NousResearch/Llama-2-7b-chat-hf", # Load model
                                             quantization_config= quantization_config, # Quantification configuration
                                             device_map=0 # device_map = 0 means put the whole model on GPU 0; device_map="auto" compute the most optimized `device_map` automatically
)

Unused kwargs: ['Load_in_4bit']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

In [25]:
print(model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear4bit(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNor

**Zero-shot**

In [26]:
print(data_test[1]['input'])

Mia Garcia
mia.garcia@email.com
321 Magnolia Drive, Dallas, TX 75201
555-890-1234, United States

Relationship to XYZ Pharma Inc.: Patient
Reason for contacting: Adverse Event

Message: I experienced a feeling of light-headedness and near-fainting after taking Staxyn for my erectile dysfunction. Is this a common side effect, and should I be worried?


In [27]:
# Run text generation pipeline with our model
prompt = data_test[1]['input']
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


<s>[INST] Mia Garcia
mia.garcia@email.com
321 Magnolia Drive, Dallas, TX 75201
555-890-1234, United States

Relationship to XYZ Pharma Inc.: Patient
Reason for contacting: Adverse Event

Message: I experienced a feeling of light-headedness and near-fainting after taking Staxyn for my erectile dysfunction. Is this a common side effect, and should I be worried? [/INST]  Subject: Adverse Event Report - Light-Headedness and Near-Fainting after Taking Staxyn

Dear Mia Garcia,

Thank you for reaching out to XYZ Pharma Inc. regarding your recent experience with Staxyn. We take all adverse events seriously and appreciate your vig


In [28]:
print(data_test[2]['input'])

Brandon Lee,
blee@example.com
3333 Pine Road, Hilltown, MA 02108,
617-555-3333, United States

Relationship to XYZ Pharma Inc.: Patient
Reason for contacting: Adverse Event

Message: Since I started on Byetta, I've noticed an increase in thirst and dry mouth. Is this related to the medication? Best, Brandon Lee


In [29]:
# Run text generation pipeline with our model
prompt = data_test[2]['input']
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

<s>[INST] Brandon Lee,
blee@example.com
3333 Pine Road, Hilltown, MA 02108,
617-555-3333, United States

Relationship to XYZ Pharma Inc.: Patient
Reason for contacting: Adverse Event

Message: Since I started on Byetta, I've noticed an increase in thirst and dry mouth. Is this related to the medication? Best, Brandon Lee [/INST]  Sure, here is a sample email that you can use as a template:

Subject: Adverse Event Report - Byetta

Dear XYZ Pharma Inc.,

My name is Brandon Lee, and I am writing to report an adverse event related to my use of Byetta. I have been taking Byetta for the past [insert time frame], and I have


**One-shot**

In [30]:
prompt = dataset_test[0]['text'] + '\n' + f"<s>[INST] {data_test[1]['input']} [/INST]"
print(prompt)

<s>[INST] Natalie Cooper,
ncooper@example.com
6789 Birch Street, Denver, CO 80203,
303-555-6543, United States

Relationship to XYZ Pharma Inc.: Patient
Reason for contacting: Adverse Event

Message: Hi, after starting Abilify for bipolar I disorder, I've noticed that I am experiencing nausea and vomiting. Are these typical reactions? Best, Natalie Cooper[/INST]drug_name: Abilify, adverse_events: [nausea, vomiting]</s>
<s>[INST] Mia Garcia
mia.garcia@email.com
321 Magnolia Drive, Dallas, TX 75201
555-890-1234, United States

Relationship to XYZ Pharma Inc.: Patient
Reason for contacting: Adverse Event

Message: I experienced a feeling of light-headedness and near-fainting after taking Staxyn for my erectile dysfunction. Is this a common side effect, and should I be worried? [/INST]


In [31]:
# Run text generation pipeline with our next model
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=350)
result = pipe(prompt)
print(result[0]['generated_text'])

<s>[INST] Natalie Cooper,
ncooper@example.com
6789 Birch Street, Denver, CO 80203,
303-555-6543, United States

Relationship to XYZ Pharma Inc.: Patient
Reason for contacting: Adverse Event

Message: Hi, after starting Abilify for bipolar I disorder, I've noticed that I am experiencing nausea and vomiting. Are these typical reactions? Best, Natalie Cooper[/INST]drug_name: Abilify, adverse_events: [nausea, vomiting]</s>
<s>[INST] Mia Garcia
mia.garcia@email.com
321 Magnolia Drive, Dallas, TX 75201
555-890-1234, United States

Relationship to XYZ Pharma Inc.: Patient
Reason for contacting: Adverse Event

Message: I experienced a feeling of light-headedness and near-fainting after taking Staxyn for my erectile dysfunction. Is this a common side effect, and should I be worried? [/INST]  drug_name: Staxyn, adverse_events: [lightheadedness, near-fainting]</s>

Here is the information you requested:

drug_name: Staxyn
adverse_events: lightheadedness, near-fainting




**Entrenamiento**

In [32]:
# Create LoRA config
# More info in https://huggingface.co/docs/peft/main/en/conceptual_guides/lora
peft_config = LoraConfig(
    r=8, # The rank of the update matrices, expressed in int. Lower rank results in smaller update matrices with fewer trainable parameters.
    target_modules=["g_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"], # The modules to apply the LoRA update matrices.
    bias="none", # Specifies if the bias parameters should be trained.
    task_type = TaskType.CAUSAL_LM
)


In [33]:
# Subset of the arguments thath we use to the training.
# https://huggingface.co/docs/transformers/main_classes/trainer

training_params = TrainingArguments(
    output_dir="./results", # where the model's checkpoints and predictions will be stored
    num_train_epochs=1, # number of epochs
    per_device_train_batch_size=4, # batch size for training
    gradient_accumulation_steps=1, # # Number of update steps to accumulate the gradients for
    optim="paged_adamw_32bit", # AdamW optimizer
    save_steps=25, # save checkpoint every 25 update steps
    logging_steps=25, # logs every 25 update steps
    learning_rate=2e-4, # initial learning rate
    weight_decay=0.001, # weight decay to apply to all layers except bias/LayerNorm weights
    fp16=False,
    bf16=False,
    max_grad_norm=0.3, # maximum gradient normal (gradient clipping)
    max_steps=-1, # number of training steps (if not -1 overrides num_train_epochs)
    warmup_ratio=0.03, # ratio of steps for a linear warmup (from 0 to learning rate)
    group_by_length=True, # group sequences into batches with same length
    lr_scheduler_type="constant", # learning rate schedule
    report_to="tensorboard"
)

In [34]:
# Set supervised fine-tuning parameters
max_seq_length = None
packing = False

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset_train,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_params,
    packing=packing,
)

# Train model
trainer.train()


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/700 [00:00<?, ? examples/s]

Step,Training Loss
25,1.3511
50,0.6205
75,0.5067
100,0.4828
125,0.4506
150,0.4714
175,0.4302


TrainOutput(global_step=175, training_loss=0.6161933735438756, metrics={'train_runtime': 310.8233, 'train_samples_per_second': 2.252, 'train_steps_per_second': 0.563, 'total_flos': 4234638168686592.0, 'train_loss': 0.6161933735438756, 'epoch': 1.0})

In [35]:
# Run text generation pipeline with our next model
prompt = data_test[1]['input']
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

<s>[INST] Mia Garcia
mia.garcia@email.com
321 Magnolia Drive, Dallas, TX 75201
555-890-1234, United States

Relationship to XYZ Pharma Inc.: Patient
Reason for contacting: Adverse Event

Message: I experienced a feeling of light-headedness and near-fainting after taking Staxyn for my erectile dysfunction. Is this a common side effect, and should I be worried? [/INST]drug_name: Staxyn, adverse_events: [light-headedness, near-fainting]</s>


In [36]:
# Run text generation pipeline with our next model
prompt = data_test[12]['input']
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

<s>[INST] Daniel Wilson
daniel.wilson@example.com
112 Pine Avenue, Atlanta, GA 30301
4045554321, United States

Relationship to XYZ Pharma Inc.: Patient
Reason for contacting: Adverse Event

Message: I took Nexium for acid reflux, and I had a headache and stomach pain. Could this be due to the medication? [/INST]drug_name: Nexium, adverse_events: [headache, stomach pain]</s>


**Evaluación**

In [37]:
# Evaluate the Model Quantitatively
rouge = evaluate.load('rouge') # https://en.wikipedia.org/wiki/ROUGE_(metric)

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [38]:
input = []

for d in data_test:
  input.append(f"<s>[INST] {d['input']} [/INST]")

In [39]:
output = dataset_test['text']

In [40]:
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=250)

output_model = []

for i in input:
  output_model.append(pipe(i))

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [41]:
print(output[10])

<s>[INST] William Harris
william.harris@example.com
890 Oak Road, San Francisco, CA 94101
4155558765, United States

Relationship to XYZ Pharma Inc.: Patient
Reason for contacting: Adverse Event

Message: I received Neupogen and had trouble breathing and fever. Are these common side effects of the medication?[/INST]drug_name: Neupogen, adverse_events: [trouble breathing, fever]</s>


In [42]:
output_model__ = []

for ii in output_model:
  output_model__.append(ii[0]['generated_text'])

print(output_model__[10])

<s>[INST] William Harris
william.harris@example.com
890 Oak Road, San Francisco, CA 94101
4155558765, United States

Relationship to XYZ Pharma Inc.: Patient
Reason for contacting: Adverse Event

Message: I received Neupogen and had trouble breathing and fever. Are these common side effects of the medication? [/INST]drug_name: Neupogen, adverse_events: [trouble breathing, fever]</s>


In [43]:
rouge_results = rouge.compute(
    predictions=output_model__,
    references=output,
    use_aggregator=True, # Scores are averaged over all examples
    use_stemmer=True, # Stemmer will be used during the computation of the ROUGE scores (stemmer reduces words to their root form, which can help in matching similar words)
)

In [44]:
print(rouge_results)

{'rouge1': 0.9806333269273773, 'rouge2': 0.9705528045530336, 'rougeL': 0.9791167329530824, 'rougeLsum': 0.9785161517372469}


In [45]:
torch.cuda.empty_cache()

In [46]:
total_mem = torch.cuda.get_device_properties(0).total_memory