In [None]:
pip install --upgrade pip

In [2]:
!pip install -q -U datasets accelerate bitsandbytes trl peft evaluate git+https://github.com/huggingface/transformers

In [3]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
from trl import setup_chat_format, SFTTrainer
from peft import LoraConfig
import torch

from peft import get_peft_model

2024-05-08 19:37:23.278274: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-08 19:37:23.278396: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-08 19:37:23.423766: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [None]:
#Peft
peft_config = LoraConfig(
        lora_alpha=64, #128
        lora_dropout=0.05,
        r=8, #
        bias="none",
        target_modules=["q_proj","k_proj","v_proj"], #all_linear
        task_type="CAUSAL_LM",
)

In [5]:
quantization_config = BitsAndBytesConfig(
    load_in_8bit=True,
    bnb_8bit_use_double_quant=True,
    bnb_8bit_quant_type="int8",
    bnb_8bit_compute_dtype=torch.int8
)

Unused kwargs: ['bnb_8bit_use_double_quant', 'bnb_8bit_quant_type', 'bnb_8bit_compute_dtype']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


In [None]:
# If 8 bit not works 4 bit quantization
'''quantization_config = BitsAndBytesConfig(load_in_4bit=True, 
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype= torch.bfloat16 #but should be set to the optimal BFloat16 for newer hardware supporting it to achieve the best performance.
)'''

In [None]:
# Huggingface Model ID
model_id = "Trendyol/Trendyol-LLM-7b-base-v1.0"

# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    quantization_config=quantization_config
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.padding_side = 'right' # to prevent warnings

In [6]:
#will added our translated data
dataset =  load_dataset("oguuzhansahin/chatdoctor-translated", split="train")

Downloading data:   0%|          | 0.00/15.9M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/13293 [00:00<?, ? examples/s]

In [None]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

peft_model = get_peft_model(model, peft_config)
print_trainable_parameters(peft_model)

In [None]:
system_message = "Sen hastalara yardım eden Sohbet Doktorusun. Hastaların şikayetlerini dinleyip onlara çözüm öner."

def create_conversation(sample):

    return {
    "messages": [
      {"role": "system", "content": system_message},
      {"role": "user", "content": sample["input"]},
      {"role": "assistant", "content": sample["output"]}
    ]
  }


In [None]:
#??
#dataset = dataset.train_test_split(test_size = 0.02)
dataset = dataset.map(create_conversation, batched=False)

In [None]:
model, tokenizer = setup_chat_format(model, tokenizer)

In [None]:
from huggingface_hub import login
hf_token = "your token"
login(token = hf_token,add_to_git_credential = True)

In [None]:
args = TrainingArguments(
    output_dir="selincildam/medical-chatbot-turkish", # directory to save and repository id
    num_train_epochs=5,                     # number of training epochs
    per_device_train_batch_size=4,          # batch size per device during training
    #gradient_accumulation_steps=2,          # number of steps before performing a backward/update pass
    #gradient_checkpointing=True,            # use gradient checkpointing to save memory
    optim="adamw_torch_fused",              # use fused adamw optimizer
    logging_steps=20,                       # log every 10 steps
    save_strategy="epoch",                  # save checkpoint every epoch
    learning_rate=2e-4,                     # learning rate, based on QLoRA paper
    #bf16=True,                              # use bfloat16 precision
    #tf32=True,                              # use tf32 precision
    max_grad_norm=0.3,                      # max gradient norm based on QLoRA paper
    warmup_ratio=0.03,                      # warmup ratio based on QLoRA paper
    lr_scheduler_type="constant",           # use constant learning rate scheduler
    push_to_hub=True,                       # push model to hub
    max_steps= 100
#   report_to="tensorboard",                # report metrics to tensorboard
)

max_seq_length = 512 # max sequence length for model and packing of the dataset

trainer = SFTTrainer(
    model=model,
    args=args,
    train_dataset=dataset["train"],
    peft_config=peft_config,
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    packing=True,
    dataset_kwargs={
        "add_special_tokens": False,  # We template with special tokens
        "append_concat_token": False, # No need to add additional separator token
    }
)


trainer.train()
trainer.save_model()

In [None]:


api_token = "your api token"
model_name = "medical-chatbot-turkish"
model.push_to_hub(model_name)



# Inference

### Load your finetuned model and generate model outputs

In [None]:
from peft import AutoPeftModelForCausalLM
from transformers import GenerationConfig
from transformers import AutoTokenizer

import time

In [None]:
model_name = "ceydabasoglu/medical-chatbot-turkish"
tokenizer = AutoTokenizer.from_pretrained("ceydabasoglu/medical-chatbot-turkish")

input_message = {'input' :'Sürekli baş ağrısı neden olur?',
                 'output':'' }

input_message = input_message.map(create_conversation)

#inputs = tokenizer("""###Human: Sürekli baş ağrısı neden olur? ###Assistant: """, return_tensors="pt").to("cuda") #write here your input

inputs = tokenizer(input_message, return_tensors="pt").to("cuda") #write here your input

In [None]:
model = AutoPeftModelForCausalLM.from_pretrained(
    model_name, #write model name
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map="cuda")

In [None]:
generation_config = GenerationConfig(
    do_sample=True,
    top_k=1,
    temperature=0.1,
    max_new_tokens=100,
    pad_token_id=tokenizer.eos_token_id
)

In [None]:
st_time = time.time()
outputs = model.generate(**inputs, generation_config=generation_config)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
resp_time = (time.time() - st_time) / 60
print(response)
print(f"Response time:{resp_time} dk")

## Make GGUF

In [None]:
'''from huggingface_hub import snapshot_download

model_id = "ceydabasoglu/medical-chatbot-turkish"
snapshot_download(repo_id = model_id, 
                  local_dir="myllama-hf",
                  local_dir_use_symlinks=False, 
                  revision="main")'''

In [None]:
'''$ git clone https://github.com/ggerganov/llama.cpp.git

$ pip install -r llama.cpp/requirements.txt

$ python llama.cpp/convert.py myllama-hf \
  --outfile medical-chatbot-turkish-v0.1.gguf \
  --outtype q4_0''' #q8_0 olabilir

In [None]:
'''from huggingface_hub import HfApi
api = HfApi()

model_id = "ceydabasoglu/medical-chatbot-turkish-v0.1.gguf"
api.create_repo(model_id, exist_ok=True, repo_type="model")
api.upload_file(
    repo_id=model_id,
    path_or_fileobj="medical-chatbot-turkish-v0.1.gguf",
    path_in_repo="medical-chatbot-turkish-v0.1.gguf"
)'''