In [None]:
!pip install transformers
!pip install peft
!pip install trl
!pip install bitsandbytes
!pip install scipy

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.13.0->peft)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.13.0->peft)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.13.0->peft)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting

In [None]:
import os

import transformers
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    set_seed,
    BitsAndBytesConfig,
    Trainer,
    TrainingArguments,
    HfArgumentParser
)
from datasets import load_dataset
import torch

import bitsandbytes as bnb
from huggingface_hub import login, HfFolder

from trl import SFTTrainer

from peft import LoraConfig, get_peft_model, PeftConfig, PeftModel, prepare_model_for_kbit_training

In [None]:
# Load the "medalpaca/medical_meadow_wikidoc" dataset used for fine-tuning.
dataset = load_dataset("medalpaca/medical_meadow_wikidoc")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

medical_meadow_wikidoc.json:   0%|          | 0.00/10.6M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [None]:
# Inspect the train split (its columns and row count).
dataset['train']

Dataset({
    features: ['input', 'output', 'instruction'],
    num_rows: 10000
})

In [None]:
# Identifier of the base checkpoint that is loaded for both the model and the tokenizer.
model_name = "Qwen/Qwen2.5-1.5B-Instruct"

In [None]:
# Quantization settings passed to `from_pretrained` when loading the model:
# 4-bit NF4 weights, double quantization, bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

In [None]:
# Load the base causal LM with the quantization config defined above;
# device placement is delegated to the library via device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
)

config.json:   0%|          | 0.00/660 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

4-bit quantization compresses the original 16-bit or 32-bit model weights down to just 4 bits, shrinking the model to roughly 1/4 (from 16-bit) or 1/8 (from 32-bit) of its original size.



In [None]:
# Prepare the quantized model for k-bit training; gradient checkpointing is left disabled.
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=False)

In [None]:
# Tokenizer that belongs to the base checkpoint.
# NOTE(review): `max_length=256` is probably not a recognized loading option and may be
# silently ignored. If a 256-token cap is intended, confirm whether `model_max_length`
# or explicit truncation at call time is what is wanted.
tokenizer=AutoTokenizer.from_pretrained(model_name, max_length=256)

tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

In [None]:
# LoRA hyperparameters. The trailing commented numbers are presumably values tried earlier.
lora_alpha = 32 #16
lora_dropout = 0.05 #0.1
lora_rank = 32 #64

# Adapter configuration for causal language modeling; no bias terms are trained.
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_rank,
    bias="none",
    task_type="CAUSAL_LM"
)

Using LoRA is like plugging a USB stick with new instructions into a computer: it enables efficient task-specific learning without modifying the base model (the computer).

In [None]:
# Wrap the prepared base model with the LoRA adapters described by `peft_config`.
peft_model = get_peft_model(model, peft_config)

In [None]:
def format_chat(example):
    """Render one dataset row as a chat-formatted training string.

    A system/user/assistant conversation is built from the row's ``input``
    and ``output`` fields, rendered to plain text with the tokenizer's chat
    template (no trailing generation prompt), and stored under ``text``.

    Args:
        example: A dataset row providing ``input`` and ``output`` keys.

    Returns:
        The same row object, now carrying the extra ``text`` key.
    """
    system_turn = {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."}
    user_turn = {"role": "user", "content": example['input']}
    assistant_turn = {"role": "assistant", "content": example['output']}

    rendered = tokenizer.apply_chat_template(
        [system_turn, user_turn, assistant_turn],
        tokenize=False,
        add_generation_prompt=False,
    )
    example['text'] = rendered
    return example

# Apply `format_chat` to every row of the train split, adding the rendered `text` column.
formatted_dataset = dataset['train'].map(format_chat)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [None]:
# Checkpoint directory, named after the base model that is actually fine-tuned
# (Qwen2.5-1.5B-Instruct); the previous "3b" label did not match the loaded model.
output_dir = "qwen2.5-1.5b-instruct"
per_device_train_batch_size = 1
gradient_accumulation_steps = 1
optim = "paged_adamw_32bit"
save_steps = 180  # equals max_steps, so a checkpoint is written only at the final step
logging_steps = 10
learning_rate = 2e-4
max_grad_norm = 0.3
max_steps = 180 #100 #500
warmup_ratio = 0.03
lr_scheduler_type = "cosine" #"constant"

# NOTE(review): `fp16=True` is used here while the quantization config computes in
# bfloat16 — confirm the mixed-precision mode matches the target GPU.
training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    fp16=True,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=True,
    lr_scheduler_type=lr_scheduler_type,
    push_to_hub=False,
    report_to='none'
)

Fine-tune the LoRA-based model using Hugging Face’s SFTTrainer with memory-efficient settings.

In [None]:
# `peft_model` already carries the LoRA adapters (it was produced by
# `get_peft_model`), so no `peft_config` is passed here. Handing the trainer both
# an already-wrapped PeftModel and a config is redundant and, depending on the
# installed TRL version, may cause the adapters to be merged or re-created.
trainer = SFTTrainer(
    model=peft_model,
    train_dataset=formatted_dataset,
    args=training_arguments
)

Converting train dataset to ChatML:   0%|          | 0/10000 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/10000 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/10000 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/10000 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
# Run fine-tuning with the training arguments configured above.
trainer.train()

Step,Training Loss
10,2.1635
20,2.2446
30,2.0267
40,1.8006
50,1.7181
60,1.8968
70,1.9636
80,1.9363
90,1.7282
100,1.5837


TrainOutput(global_step=180, training_loss=1.8232672161526151, metrics={'train_runtime': 61.8738, 'train_samples_per_second': 2.909, 'train_steps_per_second': 2.909, 'total_flos': 358668225328128.0, 'train_loss': 1.8232672161526151})

In [None]:
#model = model.merge_and_unload()

In [None]:
# Save the trained model from the trainer to a Google Drive path.
# NOTE(review): assumes Drive is mounted at /content/drive — no mount call is visible here; confirm.
trainer.model.save_pretrained("/content/drive/MyDrive/chatbot_demo")

In [None]:
# Reload the fine-tuned weights saved above, using the same quantization config.
# This rebinds `model`, which the generation cells below use.
model = AutoModelForCausalLM.from_pretrained(
    "/content/drive/MyDrive/chatbot_demo",
    quantization_config=bnb_config,
    device_map="auto",
)

In [None]:
from sentence_transformers import SentenceTransformer

英文： Load a lightweight pre-trained model to create sentence embeddings (vectors).
中文： 加载一个轻量级的句向量模型（MiniLM），用于把句子转成向量表示。

In [None]:
# 1. Load the pretrained sentence-embedding model
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Example sentences to encode
sentences = [
    "The weather is lovely today.",
    "It's so sunny outside!",
    "He drove to the stadium.",
]

# 2. Calculate embeddings by calling model.encode()
embeddings = embedding_model.encode(sentences)
print(embeddings.shape)
# Expected shape: (3, 384) — one 384-dimensional vector per sentence

# 3. Calculate the pairwise similarities between all sentence embeddings
similarities = embedding_model.similarity(embeddings, embeddings)
print(similarities)

(3, 384)
tensor([[1.0000, 0.6660, 0.1046],
        [0.6660, 1.0000, 0.1411],
        [0.1046, 0.1411, 1.0000]])


创建我的知识库

In [None]:
# Knowledge base for retrieval: a list of three passages (abstract excerpts), one string each.
information = ["""Methods: In total, 30 English and Spanish-speaking primary care patients with either type 1 or type 2 diabetes will receive screening for DD during clinical visits and subsequent support from an artificial intelligence (AI)-based health care chatbot with interactive tailored messaging. In addition, the use of electronic consultation with a specialist or referral to a behavioral health provider could occur depending on the severity and source of DD. The use of electronic consultations allows providers convenient and timely asynchronous access to a range of specialty care providers. Health outcomes will be measured through changes in validated screening measures for DD, depression, and anxiety. Digital outcomes will be measured through surveys assessing user experience with technology and system usability, and by system performance data. Qualitative data on acceptability and satisfaction with the clinical workflows and technological interventions will be collected through interviews with patients and clinical providers. Descriptive statistics will summarize quantitative outcome measures and responses to closed-ended survey items, and rapid thematic and content analysis will be conducted on open-ended survey and interview data.

Results: Workflows for screening and treating DD have been approved and clinical staff have received training on the process. Electronic surveys for screening measure collection have been created. Data from visit screeners will be entered into the electronic medical record during the medical appointment. Recruitment will begin late June-July 2024.""",
               """Background A stroke frequently results in impaired performance of activities of daily life. Many of these are highly dependent on effective coordination between the two arms. In the context of bimanual movements, cyclic rhythmical bilateral arm coordination patterns can be classified into two fundamental modes: in-phase (bilateral homologous muscles contract simultaneously) and anti-phase (bilateral muscles contract alternately) movements. We aimed to investigate how patients with left (LHS) and right (RHS) hemispheric stroke are differentially affected in both individual- limb control and inter-limb coordination during bilateral movements. Methods We used kinematic measurements to assess bilateral coordination abilities of 18 chronic hemiparetic stroke patients (9 LHS; 9 RHS) and 18 age- and sex-matched controls.""",
               """Abstract
Stem cell therapy has been considered a promising strategy in the management of both type I and type II diabetes mellitus (DM) because of its immunomodulatory and regenerative capability to restore the beta cell number and function. Various modalities of cellular therapy like transplantation of pancreatic islet cells, transplantation of pancreatic ductal stem cells, and mesenchymal stromal cell transplantation have been tried, and the modality is undergoing rapid advancements that may become the reality in the near future. In the course of its evolution, it is essential to have a comprehensive summary of the progress for a greater capacity to refine our future directives. With technological developments like data mining, graphic drawing, and information analytics combined with computational statistics"""
]

In [None]:
# Embed the knowledge-base passages (not the demo `sentences`), so that row i of
# `embeddings` corresponds to `information[i]` when ranking passages against a query.
embeddings = embedding_model.encode(information)
print(embeddings.shape)

(3, 384)


In [None]:
# Example user question used to exercise retrieval.
query = "what are the effects of a stroke"

In [None]:
# Embed the query with the same model that produced `embeddings`.
embed_query = embedding_model.encode(query)

In [None]:
# Similarity between the query embedding and every row of `embeddings`, converted to a NumPy array.
query_similarity = embedding_model.similarity(embed_query, embeddings).numpy()

In [None]:
import numpy as np
# Position of the highest similarity score in the first (only) row, i.e. the best-matching entry.
idx_max = np.argmax(query_similarity[0])

In [None]:
# Show the knowledge-base passage selected for the query.
print(information[idx_max])

Abstract
Stem cell therapy has been considered a promising strategy in the management of both type I and type II diabetes mellitus (DM) because of its immunomodulatory and regenerative capability to restore the beta cell number and function. Various modalities of cellular therapy like transplantation of pancreatic islet cells, transplantation of pancreatic ductal stem cells, and mesenchymal stromal cell transplantation have been tried, and the modality is undergoing rapid advancements that may become the reality in the near future. In the course of its evolution, it is essential to have a comprehensive summary of the progress for a greater capacity to refine our future directives. With technological developments like data mining, graphic drawing, and information analytics combined with computational statistics


In [None]:
# Build the prompt from the user's `query` and the single passage retrieved above
# (`information[idx_max]`). Interpolating the whole `information` list would put the
# Python list repr of every passage into the prompt and bypass retrieval entirely.
prompt = f"Using the following information, answer the user's query:\n\nquery: {query}?\n\ninformation:\n{information[idx_max]}"
messages = [
    {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
    {"role": "user", "content": prompt}
]
# Render the conversation as text, ending with the assistant generation prompt.
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=512
)
# Drop the prompt tokens from each output sequence so only the new tokens are decoded.
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

print(response)

The patient will have difficulty performing tasks requiring the use of both hands such as dressing, eating, writing or other fine motor skills. The most common symptoms include numbness, weakness, slurred speech, visual disturbances, and loss of balance.


In [None]:
from rank_bm25 import BM25Okapi
# Tokenize each passage by lower-casing and splitting on whitespace
# (punctuation stays attached to the neighbouring word).
tokenized_docs = [doc.lower().split() for doc in information]

# Create the BM25 object over the tokenized passages
bm25 = BM25Okapi(tokenized_docs)

In [None]:
def rag(query):
    """Answer ``query`` with retrieval-augmented generation.

    Every passage in ``information`` is scored against the query with BM25,
    the best-scoring passage is placed in the prompt, and the chat model
    generates an answer from it.

    Args:
        query: The user's question as a string.

    Returns:
        The generated answer text with the prompt tokens removed. The answer
        is returned rather than printed, so ``print(rag(...))`` shows it once
        instead of printing the answer followed by ``None``.
    """
    # Tokenize the query the same way the BM25 corpus was tokenized.
    bm25_scores = bm25.get_scores(query.lower().split())
    relevant_info = information[int(np.argmax(bm25_scores))]

    prompt = f"Using the following information, answer the user's query:\n\nquery: {query}?\n\ninformation:\n{relevant_info}"
    messages = [
        {"role": "system", "content": "You are an expert medical professional, specializing in answering patient questions."},
        {"role": "user", "content": prompt}
    ]
    # Render the conversation as text, ending with the assistant generation prompt.
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=512
    )
    # Drop the prompt tokens from each output sequence so only the new tokens are decoded.
    generated_ids = [
        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]

    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return response

Retrieve Relevant Information: Use BM25 or other retrieval algorithms to find the most relevant passages from the existing knowledge base (information).

Construct Prompt: Combine the user's query with the relevant passage to form a clear prompt.

Generate Answer: Use the Fine-tuned LLM model to generate a response based on the relevant information.

Output Answer: Print the final generated text answer.

In [None]:
# Ask the RAG pipeline a sample question.
print(rag("what is stem cell therapy?"))

The main goal of stem cell therapy is to regenerate or replace lost insulin producing cells in patients with Type 1 Diabetes. Stem cell therapy involves injecting healthy stem cells into the body to produce new insulin-producing cells called beta cells.
In general, there are two types of stem cell therapies used to treat people with type 2 diabetes - hematopoietic stem cell transplant and umbilical cord blood stem cell therapy. These treatments use different kinds of stem cells from different parts of the human body to create new cells that can help fight disease.
None
