# Preparation of the Jupyter notebook kernel

conda create --name llama python=3.10
conda activate llama
pip install git+https://github.com/huggingface/transformers.git
pip install git+https://github.com/huggingface/peft.git
pip install git+https://github.com/huggingface/accelerate.git
pip install -q -U trl
pip install -U datasets
pip install -U bitsandbytes
pip install -U einops
pip install -U wandb
pip install --user ipykernel
python -m ipykernel install --user --name=llama

You can now open Jupyter Notebook and select the "llama" kernel to run this notebook.

In [1]:
# The following command checks how many GPUs are available for use on the current system
!nvidia-smi

Sun Apr  7 15:04:50 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100 80G...  On   | 00000000:17:00.0 Off |                    0 |
| N/A   32C    P0    55W / 300W |      0MiB / 81920MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

## Preprocessing

The purpose of this step is to take the original file downloaded from the HPO website and convert it into a simple JSON file that can be used for LLaMA fine-tuning later on.

In [1]:
import json
# Load the HPO ontology export.
# NOTE(review): this is an absolute, machine-specific path; the recorded output
# of this cell shows a FileNotFoundError for it. Point it at the local copy of
# hp.json before running.
# The encoding is given explicitly so the read does not depend on the
# platform's default locale encoding.
with open('/home/files/hp.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Map each HPO ID to the node's primary label ("lbl").
hpo_id_to_disease = {}

for item in data["graphs"][0]["nodes"]:
    # Nodes lacking either an ID or a label are ignored.
    if "id" in item and "lbl" in item:
        # The ID is the last "/"-separated segment of the node id, with "_"
        # replaced by ":" (so a trailing "HP_0000118" becomes "HP:0000118").
        hpo_id = item["id"].split("/")[-1].replace("_", ":")
        hpo_id_to_disease[hpo_id] = item["lbl"]

FileNotFoundError: [Errno 2] No such file or directory: '/home/files/hp.json'

In [3]:
# Map each HPO ID to its de-duplicated list of synonyms.
hpo_id_to_synonyms = {}

for item in data["graphs"][0]["nodes"]:
    # Only nodes that carry both an ID and a label are considered.
    if "id" not in item or "lbl" not in item:
        continue
    # Nodes without a "meta" section, or without synonyms in it, are skipped.
    if "meta" not in item or "synonyms" not in item["meta"]:
        continue

    hpo_id = item["id"].split("/")[-1].replace("_", ":")
    synonyms = [synonym["val"] for synonym in item["meta"]["synonyms"]]

    # De-duplicate while keeping the file order. list(set(...)) would yield an
    # order that changes between interpreter runs (string hashing is
    # randomized), and the later train/test split slices this list by
    # position, so the split would not be reproducible.
    hpo_id_to_synonyms[hpo_id] = list(dict.fromkeys(synonyms))



In [9]:
import random
random.seed(10)

# For each lowercase letter (and the space bar), the keys that sit next to it
# on a QWERTY keyboard. A key never lists itself, so every substitution
# actually changes the character.
_NEARBY_KEYS = {
    'a': ['q','w','s','x','z'],
    'b': ['v','g','h','n'],
    'c': ['x','d','f','v'],
    'd': ['s','e','r','f','c','x'],
    'e': ['w','s','d','r'],
    'f': ['d','r','t','g','v','c'],
    'g': ['f','t','y','h','b','v'],
    'h': ['g','y','u','j','n','b'],
    'i': ['u','j','k','o'],
    'j': ['h','u','i','k','n','m'],
    'k': ['j','i','o','l','m'],
    'l': ['k','o','p'],
    'm': ['n','j','k','l'],
    'n': ['b','h','j','m'],
    'o': ['i','k','l','p'],
    'p': ['o','l'],
    'q': ['w','a','s'],
    'r': ['e','d','f','t'],
    's': ['w','e','d','x','z','a'],
    't': ['r','f','g','y'],
    'u': ['y','h','j','i'],
    'v': ['c','f','g','b'],
    'w': ['q','a','s','e'],
    'x': ['z','s','d','c'],
    'y': ['t','g','h','u'],
    'z': ['a','s','x'],
    ' ': ['c','v','b','n','m']
}


def generate_typo1(message, nchar=1):
    """Return a copy of ``message`` with keyboard-neighbour typos inserted.

    Args:
        message: The text to corrupt.
        nchar: Requested number of characters to replace. With the default of
            1 exactly one character is replaced. For values above 1 the count
            is capped at 20% of the message length (rounded), but never drops
            below one. Values below 1 are treated as 1.

    Returns:
        A string of the same length as ``message``. The selected characters are
        replaced by a neighbouring key, and the upper/lower-case pattern of the
        original is re-applied position by position. Characters with no entry
        in the neighbour table (digits, punctuation, ...) are never selected;
        if the message has no eligible character it is returned unchanged.

    Uses the module-level ``random`` generator, so seed it for reproducible
    output.
    """
    typo_prob = 0.2  # fraction (out of 1.0) of characters that may become typos

    chars = list(message)

    # Work out how many characters to replace; always at least one.
    nchar = max(1, nchar)
    if nchar > 1:
        n_chars_to_flip = min(nchar, max(1, round(len(chars) * typo_prob)))
    else:
        n_chars_to_flip = 1

    # Remember which positions were upper-case, then work in lower-case so the
    # neighbour table (lower-case keys only) can be used for lookups.
    capitalization = [ch.isupper() for ch in chars]
    chars = [ch.lower() for ch in chars]

    # Only characters that have neighbours can receive a typo. Sampling without
    # replacement guarantees distinct positions, so the requested number of
    # typos is really produced (up to the number of eligible characters).
    eligible = [i for i, ch in enumerate(chars) if ch in _NEARBY_KEYS]
    for pos in random.sample(eligible, min(n_chars_to_flip, len(eligible))):
        chars[pos] = random.choice(_NEARBY_KEYS[chars[pos]])

    # Re-apply the original capitalization and rebuild the string.
    return ''.join(
        ch.upper() if was_upper else ch
        for ch, was_upper in zip(chars, capitalization)
    )

In [10]:
# Prompt/label records: one training set and three held-out evaluation sets.
train_data = []      # clean terms plus single-character typos of them
test_synonyms = []   # held-out synonyms (first half of each synonym list)
test_typos = []      # single-character typos
test_complex = []    # typos generated with nchar=3


def _hpo_example(term, hpo_id):
    """Build one prompt/label record for ``term``."""
    return {'input': f'The Human Phenotype Ontology term {term} is identified by the HPO ID ', 'output': hpo_id}


def _add_term_with_typos(term, hpo_id):
    """Add ``term`` and its typo variants to the training and typo test sets.

    The order of the generate_typo1 calls is significant: it fixes which draw
    from the seeded random generator ends up in which set.
    """
    train_data.append(_hpo_example(term, hpo_id))
    # Two single-character typos for training ...
    train_data.append(_hpo_example(generate_typo1(term), hpo_id))
    train_data.append(_hpo_example(generate_typo1(term), hpo_id))
    # ... one for the typo test set ...
    test_typos.append(_hpo_example(generate_typo1(term), hpo_id))
    # ... and one multi-character typo for the complex test set.
    test_complex.append(_hpo_example(generate_typo1(term, 3), hpo_id))


# Primary labels: every term goes through the full clean + typo treatment.
for hpo_id, disease_name in hpo_id_to_disease.items():
    _add_term_with_typos(disease_name, hpo_id)

# Synonyms: the first half of each list is held out for testing, the second
# half is treated like a primary label.
for hpo_id, synonyms in hpo_id_to_synonyms.items():
    split_index = len(synonyms) // 2

    test_synonyms.extend(
        _hpo_example(synonym, hpo_id) for synonym in synonyms[:split_index]
    )

    for synonym in synonyms[split_index:]:
        _add_term_with_typos(synonym, hpo_id)


In [14]:
# NOTE(review): `hpo_data` and `test_data` are not defined by any earlier cell
# in this notebook, so this cell fails on a fresh kernel — confirm which
# datasets were intended here.

# Collect the (input, output) pairs present in hpo_data.
pairs_hpo_data = {(item['input'], item['output']) for item in hpo_data}

# Remove items from test_data whose 'input' AND 'output' both match the same
# item in hpo_data. Comparing pairs avoids dropping an item just because its
# input matches one hpo_data record and its output matches a different one.
test_data = [
    item for item in test_data
    if (item['input'], item['output']) not in pairs_hpo_data
]


In [17]:
# Write each generated subset to its own JSON file.
for filename, records in (
    ("train_part.json", train_data),
    ("test_part.json", test_synonyms),
    ("typo_part.json", test_typos),
    ("complextypo_part.json", test_complex),
):
    with open(filename, "w") as f:
        json.dump(records, f)

## Setup

In [18]:
import pandas as pd
import transformers
import textwrap
from transformers import LlamaTokenizer, LlamaForCausalLM
import os
import sys
from typing import List

from peft import (
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
    prepare_model_for_int8_training,
)

# import fire
import torch
from datasets import load_dataset
import pandas as pd

import json

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DEVICE

  from .autonotebook import tqdm as notebook_tqdm


'cuda'

## Model load & data load

In [19]:
# Placeholder: replace with the location of the base model checkpoint.
BASE_MODEL = "directory_of_base_model"

# Keyword arguments for loading the base model with 8-bit weights.
model_load_kwargs = dict(
    load_in_8bit=True,
    torch_dtype=torch.float16,
    device_map='auto',
)
model = LlamaForCausalLM.from_pretrained(BASE_MODEL, **model_load_kwargs)

tokenizer = LlamaTokenizer.from_pretrained(BASE_MODEL)

# Pad with id 0 (unk) so that padding stays distinct from the EOS token,
# and pad on the left-hand side.
tokenizer.pad_token_id = 0
tokenizer.padding_side = "left"


Loading checkpoint shards: 100%|██████████| 2/2 [00:05<00:00,  2.53s/it]
You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565, and set the legacy attribute accordingly.


In [20]:
#extract the data from the JSON file 
train_data = load_dataset("json", data_files="train_part.json")

Downloading data files: 100%|██████████| 1/1 [00:00<00:00, 11554.56it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 157.07it/s]
Generating train split: 120444 examples [00:00, 376857.04 examples/s]


In [23]:
train_list = train_data['train']
train_list

Dataset({
    features: ['output', 'input'],
    num_rows: 120444
})

In [24]:
# Maximum number of tokens per example; passed as max_length (with truncation)
# when prompts are tokenized.
CUTOFF_LEN = 512

In [26]:
#tokenize the data for llama model to read 
def tokenize(prompt, add_eos_token=True):
    """Tokenize ``prompt`` into a training record.

    Uses the module-level ``tokenizer`` with truncation at ``CUTOFF_LEN`` and
    no padding.

    Args:
        prompt: Text to tokenize.
        add_eos_token: When true, an EOS token is appended unless the sequence
            already ends with one or has already reached ``CUTOFF_LEN`` tokens.

    Returns:
        The tokenizer's result with ``input_ids`` and ``attention_mask``
        (extended by the EOS token where applicable) and a ``labels`` entry
        that is a copy of ``input_ids``.
    """
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=CUTOFF_LEN,
        padding=False,
        return_tensors=None,
    )
    input_ids = result["input_ids"]

    # An empty sequence has no last token to inspect; treat it as "does not
    # end with EOS" instead of indexing into it.
    ends_with_eos = bool(input_ids) and input_ids[-1] == tokenizer.eos_token_id
    if add_eos_token and not ends_with_eos and len(input_ids) < CUTOFF_LEN:
        input_ids.append(tokenizer.eos_token_id)
        result["attention_mask"].append(1)

    # Labels are an independent copy so later edits to one list do not leak
    # into the other.
    result["labels"] = input_ids.copy()

    return result

def generate_and_tokenize_prompt(data_point):
    """Build the prompt for one dataset record and return its tokenized form.

    NOTE(review): ``generate_prompt1`` is not defined in any cell of this
    notebook, so this function depends on state from elsewhere — confirm
    where it comes from.
    """
    return tokenize(generate_prompt1(data_point))

In [27]:
tokenized_train_list = (train_list.map(generate_and_tokenize_prompt))

Map: 100%|██████████| 120444/120444 [00:20<00:00, 5976.10 examples/s]


In [28]:
len(tokenized_train_list)

120444

## Alpaca LoRa

In [30]:
# LoRA adapter settings; these are the values handed to LoraConfig.
# The commented-out lines record previously tried values.
#LORA_R = 8
LORA_R = 32
#LORA_ALPHA = 16
LORA_ALPHA = 16
LORA_DROPOUT= 0.05
LORA_TARGET_MODULES = [
    "q_proj",
    "v_proj",
]

# Batch settings. MICRO_BATCH_SIZE is used as the per-device batch size; with
# BATCH_SIZE equal to MICRO_BATCH_SIZE the accumulation step count below
# evaluates to 1.
#BATCH_SIZE = 128
#MICRO_BATCH_SIZE = 128
BATCH_SIZE = 256
MICRO_BATCH_SIZE = 256
GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE
LEARNING_RATE = 3e-4
# NOTE(review): TRAIN_STEPS is only mentioned in a commented-out max_steps
# argument of the training arguments — confirm whether it is still needed.
TRAIN_STEPS = 100
# Directory used for training checkpoints and the final saved model.
OUTPUT_DIR = "/home/model_output/"

In [31]:
# Describe the LoRA adapters, then prepare the 8-bit base model for training
# and wrap it with those adapters.
# NOTE(review): newer peft releases provide prepare_model_for_kbit_training as
# the replacement for prepare_model_for_int8_training — confirm against the
# installed peft version.
config = LoraConfig(
    task_type="CAUSAL_LM",
    bias="none",
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    target_modules=LORA_TARGET_MODULES,
)
model = get_peft_model(prepare_model_for_int8_training(model), config)
# model.print_trainable_parameters()



In [32]:
def print_trainable_parameters(model):
    """Print the trainable and total parameter counts of ``model``.

    Iterates ``model.named_parameters()``, sums ``numel()`` over all
    parameters and over those with ``requires_grad`` set, and prints both
    counts together with the trainable share as a percentage.
    """
    sizes = [
        (param.numel(), param.requires_grad)
        for _, param in model.named_parameters()
    ]
    all_param = sum(size for size, _ in sizes)
    trainable_params = sum(size for size, trainable in sizes if trainable)
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [33]:
print_trainable_parameters(model)

trainable params: 16777216 || all params: 6755192832 || trainable%: 0.24836028248556738


In [34]:
# Verify the parameter datatypes: tally the number of elements per dtype and
# print each dtype with its element count and its share of the total.
dtypes = {}
for _, param in model.named_parameters():
    dtypes[param.dtype] = dtypes.get(param.dtype, 0) + param.numel()

total = sum(dtypes.values())
for dtype, count in dtypes.items():
    print(dtype, count, count / total)

torch.float32 279187456 0.04132930960570986
torch.int8 6476005376 0.9586706903942901


## Training

In [35]:
import os
# Set WANDB_DISABLED before the training arguments are created.
# NOTE(review): the recorded output of the training-arguments cell warns that
# this environment variable is deprecated in favour of `report_to` — consider
# migrating.
os.environ["WANDB_DISABLED"] = "true"

In [36]:
# Training arguments for the Trainer.
training_arguments = transformers.TrainingArguments(
    output_dir=OUTPUT_DIR,
    # Batching and optimisation
    per_device_train_batch_size=MICRO_BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    num_train_epochs=100,
    warmup_steps=2,
    learning_rate=LEARNING_RATE,
    optim="adamw_torch",
    fp16=True,
    # Logging and checkpointing
    logging_steps=5,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=40,
    load_best_model_at_end=False,
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [37]:
# Collator that pads each batch (to a multiple of 8) and returns PyTorch tensors.
data_collator = transformers.DataCollatorForSeq2Seq(
    tokenizer,
    padding=True,
    pad_to_multiple_of=8,
    return_tensors="pt",
)

In [38]:
# Assemble the trainer from the LoRA-wrapped model, the tokenized training
# set, the training arguments and the padding collator. No evaluation set is
# supplied.
trainer = transformers.Trainer(
    model=model,
    args=training_arguments,
    train_dataset=tokenized_train_list,
    data_collator=data_collator,
)
# Turn off use_cache on the model config before training starts.
model.config.use_cache = False

In [None]:
# Run the fine-tuning loop, then save the resulting model to OUTPUT_DIR.
trainer.train()
model.save_pretrained(OUTPUT_DIR)



Step,Training Loss
11405,0.48
11410,0.4841
11415,0.4755
11420,0.4963
11425,0.4856
11430,0.4827
11435,0.4843
11440,0.4834
11445,0.477
11450,0.4873


