In [1]:
%%capture
import torch
major_version, minor_version = torch.cuda.get_device_capability()
# Must install separately since Colab has torch 2.2.1, which breaks packages
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
if major_version >= 8:
    # Use this for new GPUs like Ampere, Hopper GPUs (RTX 30xx, RTX 40xx, A100, H100, L40)
    !pip install --no-deps packaging ninja einops flash-attn xformers trl peft accelerate bitsandbytes
else:
    # Use this for older GPUs (V100, Tesla T4, RTX 20xx)
    !pip install --no-deps xformers trl peft accelerate bitsandbytes
pass

In [4]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
    "unsloth/llama-2-7b-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit",
    "unsloth/gemma-7b-it-bnb-4bit", # Instruct version of Gemma 7b
    "unsloth/gemma-2b-bnb-4bit",
    "unsloth/gemma-2b-it-bnb-4bit", # Instruct version of Gemma 2b
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/gemma-7b-bnb-4bit", # Choose ANY! eg teknium/OpenHermes-2.5-Mistral-7B
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

==((====))==  Unsloth: Fast Gemma patching release 2024.4
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. Xformers = 0.0.25.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [4]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2024.4 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [11]:
import pandas as pd

df = pd.read_csv("/content/ds.csv")
df.head()

Unnamed: 0,raw,formatted
0,Tab IDA 625mg (21)\r\n1+1+1\r\nTab lora 10mg (...,"{""medicines"": [{""type"": ""Tab"", ""name"": ""IDA 62..."
1,Tab IDA 625mg (21)\r\n1+1+1\r\nTab lora 10mg (...,"{""medicines"": [{""type"": ""Tab"", ""name"": ""IDA 62..."
2,mfelulena saline (1oooml)\r\nTaba saline (1ooo...,"{""medicines"": [\r\n {""type"": ""Saline"", ""name""..."
3,Tab IDA 625mg (21)\r\n1+1+1\r\nNapa 500mg(10)\...,"{""medicines"": [\r\n {""type"": ""Tab"", ""name"": ""..."
4,Tab Napa 500mg (20mg)\r\n2+2+2\r\nap ometid 20...,"{""medicines"": [\r\n {""type"": ""Tab"", ""name"": ""..."


In [17]:
from datasets import Dataset
train_dataset = Dataset.from_pandas(df)
print(train_dataset)
#data = {"train": train_dataset}
#print(data["train"])

Dataset({
    features: ['raw', 'formatted'],
    num_rows: 15
})
Dataset({
    features: ['raw', 'formatted'],
    num_rows: 15
})


In [23]:
alpaca_prompt = """Bellow there is a medical raw text comes from OCR. There have misspellings and unorganization. Organize the accordingly.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):

    instruction = """This contains Medicine Type (which is either Tab or Tablet, Cap or Capsule, Inj or Injection, Drop or Drops, Saline, in some cased there might have spelling mistakes like Taba or Cp with Tab or Cap or other misspells with Drops and Saline and correct the accordingly), Medicine Name (which is only start with an alphabet, but can not start with a number, do not include any nosily element as medicine name which may come from OCR missreading), Medicine power (such as *mg or **ml, ** means a number), Intake Frequency (where mention frequency like 1+1+1 or any other combination it may appears, but always check there should not be any abrupt number such as 8+1+0 or 5+0+1 or something appear like this, if you do not found anything just update it with "Not Found"), Frequency Days (which some times appears some time not, if appeare intake time will be indicated by ** inside a () this like bracket or ** with text "Days" or "Day", where ** must be a number, if you do not found anything just update it with "Not Found"). If there contain any unwanted or noisy data just remove it. For any case if you can not extract Medicine Type or Medicine Name for any entity then do not fill Medicine Dose, Intake Frequency, Frequency Days with gurbage value.

    Here all ** means number.

    Organize this data according to this format
    Medicine Type:
    Medicine Name:
    Medicine Dose:
    Intake Frequency:
    Frequency Days:

    For example
    Medicine Type: Tab (Tablet)
    Medicine Name: Napa
    Medicine Dose: 500mg
    Intake Frequency: 1+1+1
    Frequency Days 10
    """
    instructions = instruction
    inputs       = examples["raw"]
    outputs      = examples["formatted"]
    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }
pass

In [24]:
from datasets import load_dataset
#dataset = load_dataset("yahma/alpaca-cleaned", split = "train")
dataset = train_dataset.map(formatting_prompts_func, batched = True,)

Map:   0%|          | 0/15 [00:00<?, ? examples/s]

In [39]:
from trl import SFTTrainer
from transformers import TrainingArguments

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 250,
        learning_rate = 2e-4,
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 5,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)

  self.pid = os.fork()


Map (num_proc=2):   0%|          | 0/15 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [40]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 15 | Num Epochs = 125
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 250
 "-____-"     Number of trainable parameters = 50,003,968


Step,Training Loss
5,0.0259
10,0.0472
15,0.0455
20,0.0355
25,0.0201
30,0.0248
35,0.0247
40,0.0292
45,0.0251
50,0.023


### **LoRa Model Saving**

In [None]:
model.save_pretrained("Gemma_7B_lora_model") # Local saving

### **LoRa Inference**

In [1]:
max_seq_length = 2048
dtype = None
load_in_4bit = True

from unsloth import FastLanguageModel
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "/content/Gemma_7B_lora_model", # YOUR MODEL YOU USED FOR TRAINING
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,)

FastLanguageModel.for_inference(model) # Enable native 2x faster inference

==((====))==  Unsloth: Fast Gemma patching release 2024.4
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. Xformers = 0.0.25.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
Gemma's activation function should be approximate GeLU and not exact GeLU.
Changing the activation function to `gelu_pytorch_tanh`.if you want to use the legacy `gelu`, edit the `model.config` to set `hidden_activation=gelu`   instead of `hidden_act`. See https://github.com/huggingface/transformers/pull/29402 for more details.
Unsloth 2024.4 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [2]:
alpaca_prompt = """Bellow there is a medical raw text comes from OCR. There have misspellings and unorganization. Organize the accordingly.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

In [6]:
# alpaca_prompt = Copied from above
inputs = tokenizer(
[
    alpaca_prompt.format(
        """This contains Medicine Type (which is either Tab or Tablet, Cap or Capsule, Inj or Injection, Drop or Drops, Saline, in some cased there might have spelling mistakes like Taba or Cp with Tab or Cap or other misspells with Drops and Saline and correct the accordingly), Medicine Name (which is only start with an alphabet, but can not start with a number, do not include any nosily element as medicine name which may come from OCR missreading), Medicine power (such as *mg or **ml, ** means a number), Intake Frequency (where mention frequency like 1+1+1 or any other combination it may appears, but always check there should not be any abrupt number such as 8+1+0 or 5+0+1 or something appear like this, if you do not found anything just update it with "Not Found"), Frequency Days (which some times appears some time not, if appeare intake time will be indicated by ** inside a () this like bracket or ** with text "Days" or "Day", where ** must be a number, if you do not found anything just update it with "Not Found"). If there contain any unwanted or noisy data just remove it. For any case if you can not extract Medicine Type or Medicine Name for any entity then do not fill Medicine Dose, Intake Frequency, Frequency Days with gurbage value.

    Here all ** means number.

    Organize this data according to this format
    Medicine Type:
    Medicine Name:
    Medicine Dose:
    Intake Frequency:
    Frequency Days:

    For example
    Medicine Type: Tab (Tablet)
    Medicine Name: Napa
    Medicine Dose: 500mg
    Intake Frequency: 1+1+1
    Frequency Days 10""", # instruction

        """Tab.felepin
Tab. Anlapin
Anlepin 6mg
3+0+0
0+0+0 10
manulol
H. Fixamzol
Tab. Fixazgol
0+0+0+1""", # input

        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 640, use_cache = True)
output = str(tokenizer.batch_decode(outputs))
print(output)
#output.split("### Response:")[-1]

['<bos>Bellow there is a medical raw text comes from OCR. There have misspellings and unorganization. Organize the accordingly. \n\n### Instruction:\nThis contains Medicine Type (which is either Tab or Tablet, Cap or Capsule, Inj or Injection, Drop or Drops, Saline, in some cased there might have spelling mistakes like Taba or Cp with Tab or Cap or other misspells with Drops and Saline and correct the accordingly), Medicine Name (which is only start with an alphabet, but can not start with a number, do not include any nosily element as medicine name which may come from OCR missreading), Medicine power (such as *mg or **ml, ** means a number), Intake Frequency (where mention frequency like 1+1+1 or any other combination it may appears, but always check there should not be any abrupt number such as 8+1+0 or 5+0+1 or something appear like this, if you do not found anything just update it with "Not Found"), Frequency Days (which some times appears some time not, if appeare intake time will

### **Save 16bit Full Model with Tokenizer**

In [51]:
model.save_pretrained_merged("model", tokenizer, save_method = "merged_16bit",)

Unsloth: You have 1 CPUs. Using `safe_serialization` is 10x slower.
We shall switch to Pytorch saving, which will take 3 minutes and not 30 minutes.
To force `safe_serialization`, set it to `None` instead.
Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 5.6G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 4.09 out of 12.67 RAM for saving.


 39%|███▉      | 11/28 [00:01<00:01, 13.11it/s]We will save to Disk and not RAM now.
100%|██████████| 28/28 [01:17<00:00,  2.77s/it]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Unsloth: Saving model/pytorch_model-00001-of-00004.bin...
Unsloth: Saving model/pytorch_model-00002-of-00004.bin...
Unsloth: Saving model/pytorch_model-00003-of-00004.bin...
Unsloth: Saving model/pytorch_model-00004-of-00004.bin...
Done.
