# finetune.yaml
# OOS template header: the template format version and a human-readable
# description (in Chinese: "run the specified fine-tuning script as an
# O&M operation").
FormatVersion: 'OOS-2019-06-01'
Description: '执行指定微调脚本运维操作'
# Inputs of the template. Each parameter is referenced from the tasks via
# '{{ name }}' substitution.
Parameters:
  # Region in which the target ECS instances are looked up; defaults to the
  # ACS::RegionId pseudo parameter.
  regionId:
    Label:
      en: RegionId
      zh-cn: 地域ID
    Type: String
    AssociationProperty: RegionId
    Default: '{{ ACS::RegionId }}'
  # Compute Nest service instance ID; the getInstance task matches it against
  # the instance tag acs:computenest:serviceInstanceId.
  serviceInstanceId:
    Label:
      en: TargetInstance
      zh-cn: 目标实例
    Type: String
    AssociationProperty: ALIYUN::ComputeNest::ServiceInstance::ServiceInstanceId
    AssociationPropertyMetadata:
      Disabled: true
  # Directory in which the command is executed on the instance.
  workingDir:
    Description: 脚本执行路径
    Type: String
    Default: /root/ChatGLM-Efficient-Tuning
  # Path of the fine-tuning script.
  finetuneScriptPath:
    Description: 微调脚本路径
    Type: String
    Default: src/train_bash.py
  # Dataset name, passed to the script as --dataset.
  dataset:
    Description: 数据集名称
    Type: String
    Default: alpaca_gpt4_zh
  # Base model, passed to the script as --model_name_or_path.
  modelName:
    Type: String
    AllowedValues:
    - THUDM/chatglm-6b
    - THUDM/chatglm2-6b
    Default: THUDM/chatglm-6b
  # Fine-tuning stage, passed to the script as --stage.
  stage:
    Description: 微调阶段
    Type: String
    AllowedValues:
    - sft
    - ppo
    - rm
    Default: sft
  # Fine-tuning type, passed to the script as --finetuning_type.
  finetuneType:
    Description: 微调类型
    Type: String
    AllowedValues:
    - lora
    - p-tuning
    - full
    Default: lora
  # Number of training epochs, passed to the script as --num_train_epochs.
  trainingEpoch:
    Description: 训练轮次
    Type: Number
    Default: 3
  # Training precision; the chosen value is turned into a bare CLI flag
  # (--<precision>) by the runCommand task.
  # NOTE(review): confirm the training script accepts --fp32 and --fp64 as
  # flags; only --fp16 is known to be exercised by the default.
  precision:
    Description: 训练精度
    Type: String
    AllowedValues:
    - fp16
    - fp32
    - fp64
    Default: fp16
  # Output location of the trained model, passed to the script as
  # --output_dir.
  outputDir:
    Description: 模型输出地址
    Type: String
    Default: path_to_sft_checkpoint
  # Timeout handed to the runCommand task's `timeout` property.
  timeout:
    Label:
      en: Timeout
      zh-cn: 超时时间
    Type: Number
    Default: 600
Tasks:
# Task getInstance: selects the running ECS instances in the given region
# that carry the tag acs:computenest:serviceInstanceId=<serviceInstanceId>,
# and exposes their IDs as the list output `instanceIds`.
- Name: getInstance
  Action: ACS::SelectTargets
  Description:
    zh-cn: 获取ECS实例
    en: Views the ECS instances
  Properties:
    RegionId: '{{ regionId }}'
    ResourceType: ALIYUN::ECS::Instance
    Filters:
    - Type: All
      RegionId: '{{regionId}}'
      Parameters:
        # Only instances in the Running state are selected.
        Status: Running
        RegionId: '{{regionId}}'
        Tags:
        - Value: '{{serviceInstanceId}}'
          Key: acs:computenest:serviceInstanceId
  Outputs:
    # Instance IDs extracted from the selection result.
    instanceIds:
      ValueSelector: Instances.Instance[].InstanceId
      Type: List
# Task runCommand: for every instance ID returned by getInstance, runs a
# shell script on the instance that activates the chatglm_etuning conda
# environment and launches the fine-tuning script in the background (nohup),
# appending stdout/stderr to finetune_log.log in the working directory.
#
# Fixes relative to the previous revision:
#   * The script path now comes from the `finetuneScriptPath` parameter
#     instead of a hard-coded src/train_bash.py (the parameter's default is
#     that same path, so default behaviour is unchanged).
#   * CUDA_VISIBLE_DEVICES is exported; a bare `VAR=value` line only sets a
#     shell variable and is not inherited by the python child process.
#
# NOTE(review): `--{{precision}}` is passed verbatim as a CLI flag; confirm
# the training script accepts every allowed precision value.
- Name: runCommand
  Action: ACS::ECS::RunCommand
  Description: 执行云助手命令
  Properties:
    commandContent: |-
      #!/bin/bash
      source /root/anaconda3/bin/activate chatglm_etuning
      export CUDA_VISIBLE_DEVICES=0
      nohup /root/anaconda3/envs/chatglm_etuning/bin/python {{finetuneScriptPath}} \
        --model_name_or_path {{modelName}} \
        --stage {{stage}} \
        --do_train \
        --dataset {{dataset}} \
        --finetuning_type {{finetuneType}} \
        --output_dir {{outputDir}} \
        --per_device_train_batch_size 4 \
        --gradient_accumulation_steps 4 \
        --lr_scheduler_type cosine \
        --logging_steps 10 \
        --save_steps 1000 \
        --learning_rate 5e-5 \
        --num_train_epochs {{trainingEpoch}} \
        --plot_loss \
        --{{precision}} >> finetune_log.log 2>&1 &
    workingDir: '{{workingDir}}'
    # The current loop item is one instance ID from getInstance.instanceIds.
    instanceId: '{{ ACS::TaskLoopItem }}'
    commandType: RunShellScript
    timeout: '{{timeout}}'
  Loop:
    Items: '{{ getInstance.instanceIds }}'
    Outputs:
      # Joins the per-iteration `commandOutput` values into one list.
      commandOutputs:
        AggregateType: Fn::ListJoin
        AggregateField: commandOutput
  Outputs:
    # Output of a single command invocation.
    commandOutput:
      Type: String
      ValueSelector: invocationOutput
# Template outputs: the aggregated list of command outputs collected by the
# runCommand task loop.
Outputs:
  commandOutputs:
    Value: '{{ runCommand.commandOutputs }}'
    Type: List