In [1]:
from datasets import load_dataset
import torch
import accelerate
import bitsandbytes as bnb
from transformers import BitsAndBytesConfig
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig
from peft import prepare_model_for_kbit_training, get_peft_model
from trl import SFTTrainer
from transformers import TrainingArguments
# download dataset
torch.cuda.empty_cache()

dataset_dir = "//home////Repository//AI_Coach//INSTRUCT//Data_Finetune_Mistral.jsonl"
dataset = load_dataset('json', data_files=dataset_dir, split='train')

def format_instruction(sample):
    return f"""You are a personal assistant . Help users to learn  fundamentals and techniques. Recommend ideal  exercises to help users to improve their  skills.
        ### Instruction:{sample["instruction"]} ### Response:"""
dataset[10]

  from .autonotebook import tqdm as notebook_tqdm
2024-01-15 11:31:30.138331: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-15 11:31:30.138359: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-15 11:31:30.138980: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-15 11:31:30.142995: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


{'instruction': 'I want a fun and interactive  game. Any recommendations?',
 'response': 'Sounds like you need to play Splash from  It is a fun game where you putt to pop virtual bubbles.'}

In [10]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# base model id to fine-tune
model_id = "mistralai/Mistral-7B-v0.1"

# load model 
model = AutoModelForCausalLM.from_pretrained(
    model_id, 
    quantization_config=bnb_config, 
    use_cache=False, 
    device_map="auto"
)
model.config.pretraining_tp = 1

# load tokenizer, pad short samples with end of sentence token
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

Loading checkpoint shards: 100%|██████████| 2/2 [00:06<00:00,  3.48s/it]


In [12]:
# LoRA config based on QLoRA paper
peft_config = LoraConfig(
    r=32,
    lora_alpha=64,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
    ],
    bias="none",
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
)

In [5]:
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)

In [13]:
#TRAINING
new_model = "mistral-7b-golf-assistant5_pretrained"
model_args = TrainingArguments(
    output_dir="mistral-7b-golf-assistant5_pretrained",
    num_train_epochs=4,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    gradient_checkpointing=True,
    optim="paged_adamw_32bit",
    logging_steps=10,
    save_strategy="epoch",
    learning_rate=1e-4,
    bf16=True,
    tf32=True,
    max_grad_norm=0.3,
    warmup_ratio=0.03,
    lr_scheduler_type="constant",
    disable_tqdm=False
)

# Supervised Fine-Tuning Trainer
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    max_seq_length=1024,
    tokenizer=tokenizer,
    packing=True,
    formatting_func=format_instruction,
    args=model_args,
)

# train
trainer.train()
#trainer.save_model()
trainer.model.save_pretrained(new_model)

You are using 8-bit optimizers with a version of `bitsandbytes` < 0.41.1. It is recommended to update your version as a major bug has been fixed in 8-bit optimizers.
  0%|          | 0/152 [00:00<?, ?it/s]You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  7%|▋         | 10/152 [02:09<28:31, 12.05s/it]

{'loss': 0.5474, 'learning_rate': 0.0001, 'epoch': 3.06}


  7%|▋         | 10/152 [02:13<31:42, 13.40s/it]


{'train_runtime': 134.0045, 'train_samples_per_second': 9.164, 'train_steps_per_second': 1.134, 'train_loss': 0.5473888397216797, 'epoch': 3.06}


In [7]:
from huggingface_hub import login
login("")

# push model and tokenizer to HF hub under your username
trainer.model.push_to_hub("mistral-7b-golf-assistant5")
tokenizer.push_to_hub("mistral-7b-golf-assistant5")

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home//.cache/huggingface/token
Login successful


adapter_model.safetensors: 100%|██████████| 865M/865M [01:19<00:00, 10.9MB/s] 
tokenizer.model: 100%|██████████| 493k/493k [00:00<00:00, 1.95MB/s]


CommitInfo(commit_url='https://huggingface.co/berkouille/mistral-7b-golf-assistant5/commit/a7894c9f2b6a9f8cc6f0b85e1f25bcb98378818a', commit_message='Upload tokenizer', commit_description='', oid='a7894c9f2b6a9f8cc6f0b85e1f25bcb98378818a', pr_url=None, pr_revision=None, pr_num=None)

In [8]:

trainer.model.save_pretrained("mistral-7b-golf-assistant5_pretrained")

In [10]:
model.config.use_cache = True
model.eval()

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MistralForCausalLM(
      (model): MistralModel(
        (embed_tokens): Embedding(32000, 4096)
        (layers): ModuleList(
          (0-31): 32 x MistralDecoderLayer(
            (self_attn): MistralAttention(
              (q_proj): Linear4bit(
                in_features=4096, out_features=4096, bias=False
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=32, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=32, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): Linear4bit(
                in_features=4096, out_features=1024, bias=False
     

In [4]:
from peft import LoraConfig, PeftModel

torch.cuda.empty_cache()
base_model = "mistralai/Mistral-7B-v0.1"


base_model_reload = AutoModelForCausalLM.from_pretrained(
    base_model, 
    quantization_config=bnb_config, 
    use_cache=False, 
    device_map="auto"
)

"""# Reload the base model
base_model_reload = AutoModelForCausalLM.from_pretrained(
    base_model, low_cpu_mem_usage=True,
    return_dict=True,torch_dtype=torch.bfloat16,
    device_map= {"": 1})"""
model = PeftModel.from_pretrained(base_model_reload, "mistral-7b-golf-assistant4")
model = model.merge_and_unload()

# Reload tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading checkpoint shards: 100%|██████████| 2/2 [00:07<00:00,  3.55s/it]


ValueError: Cannot merge LORA layers when the model is loaded in 8-bit mode

In [5]:
#trainer.save_model()

# login to HF hub
from huggingface_hub import login
login("")

# push model and tokenizer to HF hub under your username
trainer.model.push_to_hub("mistral-golf-assistant4")
tokenizer.push_to_hub("mistral-golf-assistant4")

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home//.cache/huggingface/token
Login successful


NameError: name 'trainer' is not defined

: 

In [4]:
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer
import transformers
from datasets import load_dataset
# fine-tuned model id
model_id = "mistral-7b-golf-assistant"
device = "cuda" if torch.cuda.is_available() else "cpu"

bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    load_in_4bit=True,
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,
    device_map='auto'
)

tokenizer = AutoTokenizer.from_pretrained(model)

from random import randrange

def format_instruction(sample):
    return f"""You are a personal assistant . Help users to learn  fundamentals and techniques. Recommend ideal  exercises to help users to improve their  skills.
        ### Instruction:
        {sample["instruction"]}

        ### Response:

    """

import time
while True:
    time.sleep(5)
    sample = dataset[randrange(len(dataset))]

    # create prompt for inference
    prompt = format_instruction(sample)

    if prompt == "exit":
        break
    model_input = tokenizer(prompt, return_tensors="pt", truncation=True).to(device)
    generated_ids = model.generate(**model_input, max_new_tokens=200, do_sample=True, temperature=0.7,eos_token_id=eos_token_id)
            
    response_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    output = response_text[len(response_text):]
    print(f"Prompt: \n{prompt}\n")
    print(f"Instruction: \n{sample['instruction']}\n")
    print(f"Ground truth: \n{sample['response']}\n")
    print(f"Generated output: \n{output}\n\n\n")

  warn(
Loading checkpoint shards: 100%|██████████| 2/2 [00:07<00:00,  3.57s/it]


ValueError: The version of PEFT you are using is not compatible, please use a version that is greater than 0.5.0

In [1]:
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer
import torch
from datasets import load_dataset

dataset_dir = "//home////Repository//AI_Coach//INSTRUCT//Data_Finetune_Mistral.jsonl"
dataset = load_dataset('json', data_files=dataset_dir, split='train')

# fine-tuned model id
model_id = "mistral-7b-golf-assistant3/checkpoint-9/"
device = "cuda" if torch.cuda.is_available() else "cpu"

model = AutoPeftModelForCausalLM.from_pretrained(
    model_id,
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
    load_in_4bit=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
from random import randrange

def format_instruction(sample):
    return f"""You are a personal assistant . Help users to learn  fundamentals and techniques. Recommend ideal  exercises to help users to improve their  skills.
        ### Instruction:{sample} ### Response:"""

import torch
import csv
device = "cuda" if torch.cuda.is_available() else "cpu"
import time


def call_inference(sample):

    #sample = dataset[randrange(len(dataset))]
    prompt = format_instruction(sample)
    input_ids = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids.to(device)
        
    with torch.inference_mode():
        outputs = model.generate(
            input_ids=input_ids, 
            max_new_tokens=200, 
            do_sample=True, 
            top_p=0.9,
            temperature=0.7
        )

    outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    print("full output :", outputs)
    output = outputs[0][len(prompt):]
    print("output :", output)

    #if output is not empty
    if output:
        if "### Instruction:" in output:
            output = output.split("### Instruction:")[0].strip()
            print("out 2 :" , output)

        instruction = sample
        output = output
        groud_truth = ""#sample['response']

        print(f"Instruction: \n{instruction}\n\n")
        #print(f"Ground truth: \n{groud_truth}\n")
        print(f"Generated output: \n{output}\n\n\n")   
    else:
        print("I am sorry, I did not understand that. Could you please rephrase your question?")
    return instruction, output, groud_truth


while True:
    time.sleep(5)
    sample = dataset[randrange(len(dataset))]
    instruction, output, groud_truth = call_inference(sample)
    with open('output.csv', 'a', newline='') as file:
        writer = csv.writer(file)
        writer.writerow([instruction, output, groud_truth])


  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 2/2 [00:05<00:00,  2.83s/it]
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
2024-01-15 11:29:30.969665: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-15 11:29:30.969695: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-15 11:29:30.970276: E external/local_xla/xla/stream_executor

full output : ["You are a personal assistant . Help users to learn  fundamentals and techniques. Recommend ideal  exercises to help users to improve their  skills.\n        ### Instruction:{'instruction': 'What is the name for the sand traps that are placed throughout a golf course?', 'response': 'The sand traps placed throughout a golf course are known as bunkers.'} ### Response:\n        Sand traps are known as bunkers.\n    "]
output : 
        Sand traps are known as bunkers.
    
Instruction: 
{'instruction': 'What is the name for the sand traps that are placed throughout a golf course?', 'response': 'The sand traps placed throughout a golf course are known as bunkers.'}


Generated output: 

        Sand traps are known as bunkers.
    





The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


full output : ["You are a personal assistant . Help users to learn  fundamentals and techniques. Recommend ideal  exercises to help users to improve their  skills.\n        ### Instruction:{'instruction': 'Explain How can golfers determine the correct amount of break for their putts?', 'response': 'To determine the correct amount of break for their putts, golfers can follow these steps: \\n\\n 1. 1. Activate the contour lines and break percentage  from  tablet. \\n\\n 2. 2. Choose a starting position that has break and activate the Base Line feature to visualize the straight line between the ball and the hole. \\n\\n 3. 3. Activate the Ideal Line and appreciate the difference between the Base Line and the Ideal Line to get an idea of how much break is needed. \\n\\n4. Look at the break percentage and try to memorize how much the ball breaks on a specific slope.'} ### Response:\n        ### Instruction:{'instruction': 'Explain How can golfers determine the correct amount of break for th

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


full output : ["You are a personal assistant . Help users to learn  fundamentals and techniques. Recommend ideal  exercises to help users to improve their  skills.\n        ### Instruction:{'instruction': 'what are the main aspects or fundamentals of  ?', 'response': 'This is a good question. Reading greens, speed and the line can be considered as the 3 main aspects of . In addition to that; Posture, grip and stroke technique are also very important to perform well in .'} ### Response:\n        ### Instruction:{'instruction': 'what are the main aspects or fundamentals of  ?', 'response': 'This is a good question. Reading greens, speed and the line can be considered as the 3 main aspects of . In addition to that; Posture, grip and stroke technique are also very important to perform well in .'} ### Response:\n    "]
output : 
        ### Instruction:{'instruction': 'what are the main aspects or fundamentals of  ?', 'response': 'This is a good question. Reading greens, speed and the line 

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


full output : ["You are a personal assistant . Help users to learn  fundamentals and techniques. Recommend ideal  exercises to help users to improve their  skills.\n        ### Instruction:{'instruction': 'what is the Hole entry point?', 'response': 'The Hole Entry Point. It is the spot where the ball will fall into the hole.'} ### Response:\n        The Hole Entry Point is the spot where the ball will fall into the hole.\n    "]
output : 
        The Hole Entry Point is the spot where the ball will fall into the hole.
    
Instruction: 
{'instruction': 'what is the Hole entry point?', 'response': 'The Hole Entry Point. It is the spot where the ball will fall into the hole.'}


Generated output: 

        The Hole Entry Point is the spot where the ball will fall into the hole.
    





The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


full output : ["You are a personal assistant . Help users to learn  fundamentals and techniques. Recommend ideal  exercises to help users to improve their  skills.\n        ### Instruction:{'instruction': 'which club is better for ? recommend me a club for .', 'response': 'When deciding on a putter, important aspects to think about include length, grip, head design, hosel configuration, and alignment aids. Experiment with various putters to discover one that aligns with your posture, stroke style, and instills confidence in your alignment.'} ### Response:\n        ### Instruction:\n        ### Response:\n    "]
output : 
        ### Instruction:
        ### Response:
    
out 2 : 
Instruction: 
{'instruction': 'which club is better for ? recommend me a club for .', 'response': 'When deciding on a putter, important aspects to think about include length, grip, head design, hosel configuration, and alignment aids. Experiment with various putters to discover one that aligns with your postu

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


full output : ["You are a personal assistant . Help users to learn  fundamentals and techniques. Recommend ideal  exercises to help users to improve their  skills.\n        ### Instruction:{'instruction': 'How can  help me improve my  alignment?', 'response': '  offers visual aids to assist with alignment, allowing you to ensure that your putter face is square to the target and enhancing your overall aim.'} ### Response:\n         is a  practice platform that helps you to improve your  skills. It provides visual aids to assist with alignment and allows you to ensure that your putter face is square to the target.\n    "]
output : 
         is a  practice platform that helps you to improve your  skills. It provides visual aids to assist with alignment and allows you to ensure that your putter face is square to the target.
    
Instruction: 
{'instruction': 'How can  help me improve my  alignment?', 'response': '  offers visual aids to assist with alignment, allowing you to ensure that yo

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


KeyboardInterrupt: 

: 

In [2]:
# inference neuralwork


from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer
import torch
# fine-tuned model id
model_id = "/home//Repository/AI_Coach/INSTRUCT/mistral-7b-golf-assistant5"

# load base LLM model, LoRA params and tokenizer
model = AutoPeftModelForCausalLM.from_pretrained(
    model_id,
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
    load_in_4bit=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

from random import randrange

def format_instruction(sample):
    return f"""You are a personal assistant . Help users to learn  fundamentals and techniques. Recommend ideal  exercises to help users to improve their  skills.
        ### Instruction:{sample["instruction"]} ### Response:"""

# select random sample
sample = dataset[randrange(len(dataset))]

# create prompt for inference
prompt = format_instruction(sample)
print(prompt)



device = "cuda" if torch.cuda.is_available() else "cpu"
# tokenize input text
input_ids = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids.to(device)
    
# inference, 5 outfit combinations make up around 700-750 tokens
with torch.inference_mode():
    outputs = model.generate(
        input_ids=input_ids, 
        max_new_tokens=200, 
        do_sample=True, 
        top_p=0.9,
        temperature=0.7
    )

# decode token ids to text
outputs = outputs.detach().cpu().numpy()
outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)

# outputs is a list of length num_prompts
# parse the completed part
output = outputs[0][len(prompt):]

print(f"Instruction: \n{sample['instruction']}\n")
print(f"Ground truth: \n{sample['response']}\n")
print(f"Generated output: \n{output}\n\n\n")

Loading checkpoint shards: 100%|██████████| 2/2 [00:05<00:00,  2.90s/it]
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


You are a personal assistant . Help users to learn  fundamentals and techniques. Recommend ideal  exercises to help users to improve their  skills.
        ### Instruction:What is the impact of  on a golfer's performance? ### Response:
Instruction: 
What is the impact of  on a golfer's performance?

Ground truth: 
This is a nice question.  can significantly improve a golfer's performance by providing real-time feedback, visual aids, and a variety of drills to enhance different aspects of their  game.

Generated output: 






: 