In [1]:
from datasets import load_dataset

# Load the JSONL file
data = load_dataset('json', data_files='instruction_training.jsonl')

# Check the data format
print(data['train'][0])  # Example entry

{'instruction': 'Begin the interview.', 'input': "The Clinician's Name is Brittany Quinn, and the date of the appointment is August 15", 'output': 'Good day, I am Dr. Brittany Quinn’s Interview Assistant and I was asked to connect with you before your appointment on August 15 at the medical clinic. My goal is to gather some background information for the doctor. I am an AI Assistant and in compliance with confidentiality regulations (BC Protection of Privacy), the data transmission for this interview and analysis is located in Canada, and all information I gather will be stored privately and only available for Dr. Brittany Quinn to review. Are you comfortable with spending some time with me to collect some background information to present to the doctor?'}


In [2]:
data

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 194
    })
})

In [3]:
def formatting_func(example):
  if example.get("input", "") != "":
      input_prompt = (f"Below is an instruction that describes a task, paired with an input that provides further context. "
      "Write a response that appropriately completes the request.\n\n"
      "### Instruction:\n"
      f"{example['instruction']}\n\n"
      f"### Input: \n"
      f"{example['input']}\n\n"
      f"### Response: \n"
      f"{example['output']}")

  else:
    input_prompt = (f"Below is an instruction that describes a task. "
      "Write a response that appropriately completes the request.\n\n"
      "### Instruction:\n"
      f"{example['instruction']}\n\n"
      f"### Response:\n"
      f"{example['output']}")

  return {"text" : input_prompt}

In [4]:
formatted_dataset = data.map(formatting_func)

In [5]:
print(formatted_dataset["train"][0]["text"])

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Begin the interview.

### Input: 
The Clinician's Name is Brittany Quinn, and the date of the appointment is August 15

### Response: 
Good day, I am Dr. Brittany Quinn’s Interview Assistant and I was asked to connect with you before your appointment on August 15 at the medical clinic. My goal is to gather some background information for the doctor. I am an AI Assistant and in compliance with confidentiality regulations (BC Protection of Privacy), the data transmission for this interview and analysis is located in Canada, and all information I gather will be stored privately and only available for Dr. Brittany Quinn to review. Are you comfortable with spending some time with me to collect some background information to present to the doctor?


In [6]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model

# modelpath="meta-llama/Llama-3.1-8B"
modelpath="localhost:11435/llama3.1:8b"

In [8]:
# model1 = OllamaLLM(base_url="localhost:11435", 
#                   model="llama3.1:70b-instruct-q4_0", 
#                   temperature=0.9, 
#                   num_ctx = 6144,
#                   top_k = 40,
#                   top_p = 0.9)

In [9]:
# Load 4-bit quantized model
model = AutoModelForCausalLM.from_pretrained(
    modelpath,    
    device_map="auto",
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_quant_type="nf4",
    ),
    torch_dtype=torch.bfloat16,
)

OSError: Incorrect path_or_model_id: 'localhost:11435/llama3.1:8b'. Please provide either the path to a local folder or the repo_id of a model on the Hub.

In [None]:
# Load (slow) Tokenizer, fast tokenizer sometimes ignores added tokens
tokenizer = AutoTokenizer.from_pretrained(modelpath, use_fast=False)   

# Add tokens <|im_start|> and <|im_end|>, latter is special eos token 
tokenizer.pad_token = "</s>"
tokenizer.add_tokens(["<|im_start|>"])
tokenizer.add_special_tokens(dict(eos_token="<|im_end|>"))
model.resize_token_embeddings(len(tokenizer))
model.config.eos_token_id = tokenizer.eos_token_id

In [None]:
# Add LoRA adapters to model
model = prepare_model_for_kbit_training(model)
config = LoraConfig(
    r=64, 
    lora_alpha=16, 
    target_modules = ['q_proj', 'k_proj', 'down_proj', 'v_proj', 'gate_proj', 'o_proj', 'up_proj'],
    lora_dropout=0.1, 
    bias="none", 
    modules_to_save = ["lm_head", "embed_tokens"],		# needed because we added new tokens to tokenizer/model
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, config)
model.config.use_cache = False