# Fine-tuning Qwen2 with DLF Data

In [None]:
!python -m pip install -U pai
!python -m pip install openai


在此示例中，我们将使用自定义脚本，基于 PAI ModelGallery 提供的训练镜像和推理服务配置，使用 Paimon 数据表完成 Qwen2 模型的微调，并将微调后的模型部署到 PAI-EAS。


In [2]:
from pai.model import ModelTrainingRecipe, RegisteredModel


# Look up the Qwen2 model published in the PAI ModelGallery.
qwen_model = RegisteredModel(
    model_name="qwen2-0.5b-instruct",
    model_provider="pai",
)

# Reuse the fine-tuning configuration (ModelTrainingRecipe) that PAI presets for this model.
model_training_recipe = qwen_model.training_recipe(method="Standard")

# Show the training image and the base-model location; both are reused by later cells.
print(
    model_training_recipe.image_uri,
    model_training_recipe.model_uri,
    sep="\n",
)

dsw-registry-vpc.cn-hangzhou.cr.aliyuncs.com/pai-training-algorithm/llm_deepspeed_peft:v0.0.6
oss://pai-quickstart-cn-hangzhou.oss-cn-hangzhou-internal.aliyuncs.com/modelscope/models/qwen2-0.5b-instruct/main/


使用自定义脚本提交一个Qwen2的微调训练任务，具体微调代码请见`./src`目录下的`train.py`文件

In [3]:
from pai.estimator import Estimator


# Describe the fine-tuning job as a custom-script training job: the code in ./src
# is submitted and started with `python train.py`.
est = Estimator(
    base_job_name="qwen2_finetune",
    # Reuse the Qwen2 training image provided by PAI.
    image_uri=model_training_recipe.image_uri,
    instance_type="ecs.gn7i-c8g1.2xlarge",
    source_dir="./src",
    command="python train.py",
    # Extra Python packages for the job (presumably installed before the command
    # runs -- confirm against the SDK documentation).
    requirements=[
        "transformers>=4.42.3",
        "tokenizers>=0.19.1",
        "trl>=0.9.6",
        "oss2",  # use oss2 to download dataset
    ],
    hyperparameters={"batch_size": 1},
    environments={},
)

# Pass the PAI-provided Qwen2 base model to the job under the "model" input key.
# wait=False: do not block here; a later cell waits on the job explicitly.
training_job = est.fit(
    inputs={"model": model_training_recipe.model_uri},
    wait=False,
)

Uploading file: /var/folders/jn/9tcbd4h56z5g3wbbd5sms38m0000gp/T/tmpbarx4bdm/source.tar.gz: 100%|██████████| 1.81k/1.81k [00:00<00:00, 8.93kB/s]


View the job detail by accessing the console URI: https://pai.console.aliyun.com/?regionId=cn-hangzhou&workspaceId=58670#/training/jobs/trainvqnkee3bi2e


等待训练作业结束

In [4]:
# Wait for the submitted training job, with log display enabled (show_logs=True).
training_job.wait(show_logs=True)

TrainingJob launch starting
NPP_VERSION=12.1.0.4
SHELL=/bin/bash
PAI_HPS={"batch_size":"1"}
PET_NNODES=1
KUBERNETES_SERVICE_PORT_HTTPS=443
NVIDIA_VISIBLE_DEVICES=0
DALI_BUILD=7922358
DSW_98084_SERVICE_PORT=80
KUBERNETES_SERVICE_PORT=6443
PYTHONUNBUFFERED=1
PAI_HPS_BATCH_SIZE=1
PAI_OUTPUT_TENSORBOARD=/ml/output/tensorboard/
CUSOLVER_VERSION=11.4.5.107
CLUSTER_NAME=asi_cn-hangzhou_pai_k01
CUBLAS_VERSION=12.1.3.1
DSW_98084_SERVICE_PORT_HTTP_DSW_98084=80
KUBERNETES_CONTAINER_RESOURCE_GPU=1
HOSTNAME=trainvqnkee3bi2e-master-0
PYVER=3.10
PET_NODE_RANK=0
MASTER_PORT=23456
DSW_98084_PORT_22_TCP_PORT=22
SCRAPE_PROMETHEUS_METRICS=yes
CUFFT_VERSION=11.0.2.54
NVIDIA_REQUIRE_CUDA=cuda>=9.0
DSW_107274_PORT_80_TCP_PROTO=tcp
DSW_96358_PORT_80_TCP_ADDR=10.192.28.168
DSW_98084_PORT_80_TCP_PORT=80
CUDA_CACHE_DISABLE=1
DSW_96358_PORT_22_TCP_ADDR=10.192.28.168
TENSORBOARD_PORT=6006
POD_NAME=trainvqnkee3bi2e-master-0
DSW_107274_PORT_80_TCP_PORT=80
_CUDA_COMPAT_STATUS=CUDA Driver OK
PAI_OUTPUT_MODEL=/ml/outpu



UnretryableException: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))

查看模型输出路径

In [None]:
# Resolve where the training job wrote its "model" output.
model_data = training_job.output_path("model")

print(model_data)

训练产出的模型，可以使用预置的Qwen2的模型部署配置，直接部署到PAI-EAS，创建一个推理服务

In [None]:
from pai.model import Model
from pai.common.utils import random_str


# Build the model to deploy from the fine-tuned weights written by the training job.
# `model_data` is the job's "model" output path resolved in the previous cell.
# Use it rather than `model_training_recipe.model_data()`: the recipe was only used
# to look up the training image and base model, while the training itself was
# submitted through `est`, so the job output is the fine-tuned model.
m = Model(
    model_data=model_data,
    # Reuse the preset Qwen2 inference-service configuration from PAI.
    inference_spec=qwen_model.inference_spec,
)

# Create the PAI-EAS inference service; the random suffix from random_str(6) keeps
# the service name from colliding with earlier runs.
predictor = m.deploy(service_name="qwen2_finetune_{}".format(random_str(6)))

print(predictor.internet_endpoint)
# Uncomment to also show the service access token (treat it as a secret and
# clear the cell output before sharing the notebook).
# print(predictor.access_token)

部署的推理服务支持用OpenAI API进行调用

In [None]:
# Obtain a client for the deployed service from the predictor. Note that `openai`
# here is the object returned by `predictor.openai()`, not the `openai` module.
openai = predictor.openai()


# Send one chat-completion request to the service.
resp = openai.chat.completions.create(
    model="default",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        # Prompt typo fixed: the duplicated character in "细菌性性感冒" was removed.
        {"role": "user", "content": "如何分辨是病毒性感冒还是细菌性感冒"},
    ],
    max_tokens=2048,
)

print(resp)

完成后删除推理服务

In [None]:
# Clean up: delete the inference service created above once it is no longer needed.
predictor.delete_service()