# DeepSeek LoRA Fine-tuning

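- **Model:** DeepSeek-R1-Distill-Qwen-1.5B
- **Framework:** Hugging Face Transformers + PEFT + TRL for model loading, LoRA, and training (MindSpore and MindNLP are also installed and imported)
- **Dataset:** AI Medical Chatbot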


## Install Dependencies


In [None]:
%pip install -q mindspore==2.3.1
%pip install -q mindnlp
%pip install -q transformers==4.44.0
%pip install -q peft==0.12.0
%pip install -q datasets==2.19.0
%pip install -q accelerate==0.30.0
%pip install -q trl==0.8.6
%pip install -q sentencepiece
%pip install -q protobuf

## Import Libraries


In [None]:
import os
import mindspore
import mindnlp
from mindspore import context
import pandas as pd
from datasets import Dataset, load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    BitsAndBytesConfig
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer

# MindSpore runtime settings: PyNative execution mode, GPU device target.
mindspore_context_options = {
    "mode": context.PYNATIVE_MODE,
    "device_target": "GPU",
}
context.set_context(**mindspore_context_options)


## Load Dataset


In [None]:
# Read the Kaggle CSV with pandas, then wrap the frame as a `datasets.Dataset`.
csv_path = '/kaggle/input/ai-medical-chatbot/ai_medical_chatbot.csv'
df = pd.read_csv(csv_path)
dataset = Dataset.from_pandas(df)

# Quick sanity check: row count and the first record.
num_examples = len(dataset)
print(f"Loaded {num_examples} examples")
print(dataset[0])


In [None]:
# Alternative data sources (all disabled). Each snippet assigns `dataset`, so
# uncommenting one replaces the dataset loaded in the previous cell.
# df = pd.read_csv('/kaggle/input/gym-exercise-data/gym_exercise_data.csv')
# dataset = Dataset.from_pandas(df)

# df = pd.read_csv('/kaggle/input/yoga-poses-dataset/yoga_poses.csv')
# dataset = Dataset.from_pandas(df)

# dataset = load_dataset("BI55/MedText", split="train")


## Format Data


In [None]:
def format_instruction(example):
    """Render one dataset row as a single instruction-style training prompt.

    The question/answer pair is looked up under the first matching key pair,
    tried in this order: ("question", "answer"), ("instruction", "output"),
    ("input", "output"). When none match, the row's first two columns are
    used as question and answer respectively.

    Args:
        example: Mapping of column name to value for one row.

    Returns:
        A dict with a single "text" key holding the formatted prompt.
    """
    known_schemas = (
        ("question", "answer"),
        ("instruction", "output"),
        ("input", "output"),
    )

    for prompt_key, reply_key in known_schemas:
        if prompt_key in example and reply_key in example:
            question = example[prompt_key]
            answer = example[reply_key]
            break
    else:
        # Unknown schema: fall back to positional columns.
        column_names = list(example.keys())
        question = example[column_names[0]]
        answer = example[column_names[1]]

    prompt_lines = [
        "### Instruction:",
        "You are a medical and sports health assistant. Answer the following question accurately and helpfully.",
        "",
        "### Question:",
        f"{question}",
        "",
        "### Answer:",
        f"{answer}",
    ]
    return {"text": "\n".join(prompt_lines)}

# Apply the prompt template to every row and drop all original columns,
# leaving only the generated "text" column.
original_columns = dataset.column_names
formatted_dataset = dataset.map(
    format_instruction,
    remove_columns=original_columns,
)
print(f"Formatted {len(formatted_dataset)} examples")


In [None]:
def _take_first(split, limit):
    """Return the first `limit` rows of `split`; return `split` unchanged when
    `limit` is falsy or the split is not larger than `limit`."""
    if limit and len(split) > limit:
        return split.select(range(limit))
    return split


# 90/10 train/eval split with a fixed seed.
train_test_split = formatted_dataset.train_test_split(test_size=0.1, seed=42)

# Caps on the number of rows kept from each split.
MAX_TRAIN_SAMPLES = 1000
MAX_EVAL_SAMPLES = 100

train_dataset = _take_first(train_test_split['train'], MAX_TRAIN_SAMPLES)
eval_dataset = _take_first(train_test_split['test'], MAX_EVAL_SAMPLES)

print(f"Train: {len(train_dataset)}, Eval: {len(eval_dataset)}")


## Load Model


In [None]:
MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

# 4-bit NF4 quantization with double quantization.
# The compute dtype is passed by name: transformers' BitsAndBytesConfig accepts
# only a string or a torch.dtype for `bnb_4bit_compute_dtype`, so a MindSpore
# dtype object (mindspore.float16) is rejected with a ValueError.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
# Use the EOS token for padding and pad on the right-hand side.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"


In [None]:
# Keyword options for loading the base model: quantized per `bnb_config`,
# with automatic device placement.
model_load_options = dict(
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, **model_load_options)


## Configure LoRA


In [None]:
# Prepare the quantized model for k-bit training before adapters are attached.
model = prepare_model_for_kbit_training(model)

# Names of the projection sub-modules that receive LoRA adapters.
lora_target_modules = [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
]

lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=lora_target_modules,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the model with the adapters and report the trainable parameter count.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


## Training Configuration


In [None]:
training_args = TrainingArguments(
    # Where checkpoints are written, and how many are kept.
    output_dir="./deepseek-mindnlp-medical",
    save_strategy="steps",
    save_steps=100,
    save_total_limit=2,
    load_best_model_at_end=True,
    # Batching: 4 per device with 4 accumulation steps.
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    group_by_length=True,
    # Optimisation schedule.
    num_train_epochs=3,
    learning_rate=2e-4,
    warmup_steps=50,
    lr_scheduler_type="cosine",
    # Precision / memory.
    fp16=False,
    gradient_checkpointing=True,
    # Logging and evaluation cadence.
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=50,
    report_to="none",
)


## Initialize Trainer


In [None]:
trainer = SFTTrainer(
    # Model, tokenizer and adapter configuration.
    model=model,
    tokenizer=tokenizer,
    peft_config=lora_config,
    # Training hyper-parameters defined in the previous section.
    args=training_args,
    # Data: the "text" column of each split, capped at 512 tokens, no packing.
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    dataset_text_field="text",
    max_seq_length=512,
    packing=False,
)


## Train


In [None]:
# Run fine-tuning. Kept as the cell's last bare expression so the notebook
# displays the value returned by `train()`.
trainer.train()


## Save Model


In [None]:
# Persist the trained model first, then the tokenizer, into one directory.
output_dir = "./deepseek-mindnlp-medical-final"
for artifact in (trainer.model, tokenizer):
    artifact.save_pretrained(output_dir)
print(f"Saved to: {output_dir}")


## Test Model


In [None]:
def generate_answer(question, max_length=256):
    """Generate an answer to `question` with the notebook's `model` and `tokenizer`.

    Args:
        question: Question text inserted into the same prompt template that
            `format_instruction` uses for training.
        max_length: Maximum number of new tokens to generate (forwarded to
            `generate` as `max_new_tokens`).

    Returns:
        The stripped text that follows the first "### Answer:" marker in the
        decoded output (up to the next such marker, if the model emits one),
        or the whole decoded output when the marker is absent.
    """
    prompt = f"""### Instruction:
You are a medical and sports health assistant. Answer the following question accurately and helpfully.

### Question:
{question}

### Answer:
"""

    # The model is placed by device_map="auto", while the tokenizer returns CPU
    # tensors; move the prompt tensors to the model's device before generating.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Make sure the model is in inference mode so LoRA dropout is not applied
    # while sampling.
    model.eval()

    outputs = model.generate(
        **inputs,
        max_new_tokens=max_length,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )

    # The decoded sequence contains the prompt followed by the completion.
    full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    if "### Answer:" in full_response:
        answer = full_response.split("### Answer:")[1].strip()
    else:
        answer = full_response

    return answer


In [None]:
# Smoke-test prompts for the fine-tuned model.
test_questions = [
    "What are the health benefits of regular exercise?",
    "How many hours of sleep do adults need per night?",
    "What is a normal resting heart rate?",
    "How can I improve my sleep quality?",
    "What is BMI and why does it matter?",
]

for question in test_questions:
    # Print the question before generating so it appears ahead of the answer.
    print(f"Q: {question}")
    answer = generate_answer(question)
    print(f"A: {answer}\n")


In [None]:
import shutil

# Copy the saved model files into Kaggle's working directory.
kaggle_output = "/kaggle/working/deepseek-mindnlp-medical"
os.makedirs(kaggle_output, exist_ok=True)

for entry_name in os.listdir(output_dir):
    src = os.path.join(output_dir, entry_name)
    dst = os.path.join(kaggle_output, entry_name)
    # `isfile` lives in `os.path`; `os.isfile` does not exist and raises
    # AttributeError. Only regular files are copied; sub-directories are skipped.
    if os.path.isfile(src):
        shutil.copy2(src, dst)

print(f"Exported to: {kaggle_output}")
