# 目录
1. [安装包](#安装包)
2. [持续预训练](#持续预训练)
3. [全参监督训练](#全参监督训练)
4. [LoRA监督训练](#lora监督训练)
5. [LoRAMEL](#loramel)
6. [评估](#评估)


## 1.安装包

In [None]:

%pip install -r requirements.txt
%pip install "deepspeed==0.14.0" --upgrade
%pip install modelscope
%pip install flash-attn==2.5.9 --no-build-isolation
# %DS_BUILD_CPU_ADAM=1  BUILD_UTILS=1  pip install deepspeed==0.14.3 -U
%DS_BUILD_CPU_ADAM=1  BUILD_UTILS=1  pip install deepspeed==0.14.0 -U
# %conda install pytorch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 pytorch-cuda=12.1 -c pytorch -c nvidia

# %pip install jieba
# %pip install prettytable
# %pip install accelerate --upgrade
# %pip uninstall -y transformers
# %pip install git+https://github.com/huggingface/transformers

In [None]:
# Restart the notebook's Python process (Databricks utility call) after the
# package installation cell has run.
dbutils.library.restartPython()

## 2.持续预训练

In [None]:
import os

import mlflow
import torch
from pyspark.ml.torch.distributor import TorchDistributor

# --- Environment for the distributed run -----------------------------------
# NCCL: verbose logging, with the P2P-related switches set as below.
os.environ["NCCL_DEBUG"] = "INFO"
os.environ["NCCL_IGNORE_DISABLED_P2P"] = "1"
os.environ["NCCL_P2P_DISABLE"] = "1"
os.environ["TORCH_EXTENSIONS_DIR"] = "/root/.cache/torch_extensions/py310_cu118"

# Export the workspace URL and token through environment variables.
host = "https://" + spark.conf.get("spark.databricks.workspaceUrl")
os.environ["DATABRICKS_HOST"] = host
os.environ["DATABRICKS_TOKEN"] = mlflow.utils.databricks_utils.get_databricks_host_creds().token

# MLflow experiment located under the current user's workspace folder.
username = spark.sql("SELECT current_user()").first()['current_user()']
experiment_path = f'/Users/{username}/finetune-llama3-8b'
mlflow.set_experiment(experiment_path)
os.environ["EXPERIMENT_PATH"] = experiment_path
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"

# --- Path parameters --------------------------------------------------------
base_path = "/Volumes/main/default/default_volume/erikyzzhang/long_context"
data_dir = base_path + "/data"
model_path = base_path + "/models"

model_name_or_path = model_path + "/llama3-8B-8k"
model_max_length = 16384 * 2
output_dir = model_path + "/llama3-8B-32k-ft"
data_path = data_dir + "/redpajama"
use_databricks = True

# Create the output directory if it does not exist yet (no error if it does).
os.makedirs(output_dir, exist_ok=True)

# One training process per visible GPU on this node.
NUM_PROCESSES = torch.cuda.device_count()
print(f"We're using {NUM_PROCESSES} GPUs")

distributor = TorchDistributor(num_processes=NUM_PROCESSES, local_mode=True, use_gpu=True)
# `use_databricks` is handed over as its own positional argument right after
# the `--use_databricks` flag, exactly as before.
single_node_multi_gpu_ckpt_path = distributor.run(
    "./fine-tune.py",
    f"--model_name_or_path={model_name_or_path}",
    f"--data_path={data_path}",
    f"--model_max_length={model_max_length}",
    f"--output_dir={output_dir}",
    "--use_databricks",
    use_databricks,
)


## 3.全参监督训练


In [None]:
import os

import mlflow
import torch
from pyspark.ml.torch.distributor import TorchDistributor

# --- Environment for the distributed run -----------------------------------
# NCCL: verbose logging, with the P2P-related switches set as below.
os.environ["NCCL_DEBUG"] = "INFO"
os.environ["NCCL_IGNORE_DISABLED_P2P"] = "1"
os.environ["NCCL_P2P_DISABLE"] = "1"
os.environ["TORCH_EXTENSIONS_DIR"] = "/root/.cache/torch_extensions/py310_cu118"

# Export the workspace URL and token through environment variables.
host = "https://" + spark.conf.get("spark.databricks.workspaceUrl")
os.environ["DATABRICKS_HOST"] = host
os.environ["DATABRICKS_TOKEN"] = mlflow.utils.databricks_utils.get_databricks_host_creds().token

# MLflow experiment located under the current user's workspace folder.
username = spark.sql("SELECT current_user()").first()['current_user()']
experiment_path = f'/Users/{username}/finetune-llama3-8b'
mlflow.set_experiment(experiment_path)
os.environ["EXPERIMENT_PATH"] = experiment_path
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"

# --- Parameters -------------------------------------------------------------
base_path = "/Volumes/main/default/default_volume/erikyzzhang/long_context"
data_dir = base_path + "/data"
model_path = base_path + "/models"

# Starts from the output directory of the continued pre-training cell.
model_name_or_path = model_path + "/llama3-8B-32k-ft"
model_max_length = 16384 * 2
output_dir = model_path + "/llama3-8B-32k-ft-sft"
# NOTE(review): filter_mode is defined but never passed to the training
# script below -- confirm whether a `--filter_mode` argument is missing.
filter_mode = "all"
data_path = data_dir + "/LongAlpaca-12k.json"
use_databricks = True

# Create the output directory if it does not exist yet (no error if it does).
os.makedirs(output_dir, exist_ok=True)

# One training process per visible GPU on this node.
NUM_PROCESSES = torch.cuda.device_count()
print(f"We're using {NUM_PROCESSES} GPUs")

distributor = TorchDistributor(num_processes=NUM_PROCESSES, local_mode=True, use_gpu=True)
# `use_databricks` is handed over as its own positional argument right after
# the `--use_databricks` flag, exactly as before.
single_node_multi_gpu_ckpt_path = distributor.run(
    "./supervised-fine-tune.py",
    f"--model_name_or_path={model_name_or_path}",
    f"--data_path={data_path}",
    f"--model_max_length={model_max_length}",
    f"--output_dir={output_dir}",
    "--use_databricks",
    use_databricks,
)

## 4.LoRA监督训练

In [None]:
# Train
import os

import mlflow
import torch
from pyspark.ml.torch.distributor import TorchDistributor

# --- Environment for the distributed run -----------------------------------
# NCCL: verbose logging, with the P2P-related switches set as below.
os.environ["NCCL_DEBUG"] = "INFO"
os.environ["NCCL_IGNORE_DISABLED_P2P"] = "1"
os.environ["NCCL_P2P_DISABLE"] = "1"
os.environ["TORCH_EXTENSIONS_DIR"] = "/root/.cache/torch_extensions/py310_cu118"

# Export the workspace URL and token through environment variables.
host = "https://" + spark.conf.get("spark.databricks.workspaceUrl")
os.environ["DATABRICKS_HOST"] = host
os.environ["DATABRICKS_TOKEN"] = mlflow.utils.databricks_utils.get_databricks_host_creds().token

# MLflow experiment located under the current user's workspace folder.
username = spark.sql("SELECT current_user()").first()['current_user()']
experiment_path = f'/Users/{username}/finetune-llama3-8b'
mlflow.set_experiment(experiment_path)
os.environ["EXPERIMENT_PATH"] = experiment_path
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"

# --- Parameters -------------------------------------------------------------
base_path = "/Volumes/main/default/default_volume/erikyzzhang/long_context"
data_dir = base_path + "/data"
model_path = base_path + "/models"

model_name_or_path = model_path + "/llama3-8B-8k"
model_max_length = 16384 * 2
output_dir = model_path + "/llama3-8B-32k-sft-lora-adapter-resampling-full-data"
data_path = data_dir + "/LongAlpaca-12k.json"
use_databricks = True

# Create the output directory if it does not exist yet (no error if it does).
os.makedirs(output_dir, exist_ok=True)

# One training process per visible GPU on this node.
NUM_PROCESSES = torch.cuda.device_count()
print(f"We're using {NUM_PROCESSES} GPUs")

distributor = TorchDistributor(num_processes=NUM_PROCESSES, local_mode=True, use_gpu=True)
# `use_databricks` is handed over as its own positional argument right after
# the `--use_databricks` flag, exactly as before.
single_node_multi_gpu_ckpt_path = distributor.run(
    "./supervised-fine-tune-lora.py",
    f"--model_name_or_path={model_name_or_path}",
    f"--data_path={data_path}",
    f"--model_max_length={model_max_length}",
    f"--output_dir={output_dir}",
    "--use_databricks",
    use_databricks,
)

In [None]:
# Merge
base_path = "/Volumes/main/default/default_volume/erikyzzhang/long_context"
steps_list = ["step-490", "step-686", "step-882"]
context_size = 16384 * 2
cache_dir = base_path + "/cache_dir"

base_model = base_path + "/models/llama3-8B-8k"
peft_model = base_path + "/models/llama3-8B-32k-sft-lora-adapter-resampling" + "/step-98" ######### 需要修改
save_path = base_path + "/models/llama3-8B-32k-sft-lora-step-98"   ######### 需要修改

command = f"python merge_lora_weights_and_save_hf_model.py --base_model {base_model} --peft_model {peft_model} --context_size {context_size} --save_path {save_path} --cache_dir {cache_dir}"
print(f"Running command: {command}")
!{command}

## 5.LoRAMEL

In [None]:
import os

import mlflow
import torch
from pyspark.ml.torch.distributor import TorchDistributor

# --- Environment for the distributed run -----------------------------------
# NCCL: verbose logging, with the P2P-related switches set as below.
os.environ["NCCL_DEBUG"] = "INFO"
os.environ["NCCL_IGNORE_DISABLED_P2P"] = "1"
os.environ["NCCL_P2P_DISABLE"] = "1"
os.environ["TORCH_EXTENSIONS_DIR"] = "/root/.cache/torch_extensions/py310_cu118"

# Export the workspace URL and token through environment variables.
host = "https://" + spark.conf.get("spark.databricks.workspaceUrl")
os.environ["DATABRICKS_HOST"] = host
os.environ["DATABRICKS_TOKEN"] = mlflow.utils.databricks_utils.get_databricks_host_creds().token

# MLflow experiment located under the current user's workspace folder.
username = spark.sql("SELECT current_user()").first()['current_user()']
experiment_path = f'/Users/{username}/finetune-llama3-8b'
mlflow.set_experiment(experiment_path)
os.environ["EXPERIMENT_PATH"] = experiment_path
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"

# --- Parameters -------------------------------------------------------------
base_path = "/Volumes/main/default/default_volume/erikyzzhang/long_context"
data_dir = base_path + "/data"
model_path = base_path + "/models"

model_name_or_path = model_path + "/llama3-8B-8k"
model_max_length = 16384 * 2
output_dir = model_path + "/llama3-8B-32k-sft-loramel"
data_path = data_dir + "/LongAlpaca-12k.json"
use_databricks = True

# Create the output directory if it does not exist yet (no error if it does).
os.makedirs(output_dir, exist_ok=True)

# One training process per visible GPU on this node.
NUM_PROCESSES = torch.cuda.device_count()
print(f"We're using {NUM_PROCESSES} GPUs")

distributor = TorchDistributor(num_processes=NUM_PROCESSES, local_mode=True, use_gpu=True)
# `use_databricks` is handed over as its own positional argument right after
# the `--use_databricks` flag, exactly as before.
single_node_multi_gpu_ckpt_path = distributor.run(
    "./supervised-fine-tune-LoRAMEL.py",
    f"--model_name_or_path={model_name_or_path}",
    f"--data_path={data_path}",
    f"--model_max_length={model_max_length}",
    f"--output_dir={output_dir}",
    "--use_databricks",
    use_databricks,
)

## 6.评估

In [None]:
# Evaluate
# # llama3-8B-32k-ft
import os
steps_list = ["step-490", "step-686", "step-882"]
base_path = "/Volumes/main/default/default_volume/erikyzzhang/long_context"
data_path = base_path + "/longbench/data"
longbench_dir = base_path + "/longbench"
port = 12356
maxlen = 31500
use_databricks = True

model_name = "llama3-8B-32k-sft-lora-490"       ######### 需要修改
model_path = base_path + "/models/llama3-8B-32k-sft-lora-step-490"     ######### 需要修改


if not os.path.exists(longbench_dir):
    os.makedirs(longbench_dir)

!CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python LongBench/pred_db.py --model_path {model_path} --model_name {model_name} --data_path {data_path} --longbench_dir {longbench_dir} --maxlen {maxlen} --port {port}
!python ./LongBench/eval_db.py --model_name {model_name} --longbench_dir {longbench_dir}
!python ./LongBench/analysis_db.py --longbench_dir {longbench_dir}