In [None]:
import subprocess
import os


In [None]:
# Root directory for all datasets and model checkpoints used by this notebook.
# The default is the original volume location; set LONG_CONTEXT_BASE_PATH to
# run the notebook against a different storage root without editing code.
# A trailing slash is stripped because the rest of the notebook builds paths
# by concatenating "/sub/dir" onto base_path.
base_path = (
    os.environ.get("LONG_CONTEXT_BASE_PATH")
    or "/Volumes/main/default/default_volume/erikyzzhang/long_context"
).rstrip("/")
data_path = base_path + "/data"      # datasets live here
model_path = base_path + "/models"   # model checkpoints live here

In [None]:
from datasets import load_dataset
import os
import shutil

# Target directory and cache directory for the RedPajama sample.
# The path is built from base_path with an explicit separator so that it is
# exactly "<base_path>/data/redpajama", the location the continued
# pre-training cell reads from. (The previous `data_path + 'redpajama'`
# produced ".../dataredpajama".) base_path is used instead of data_path
# because later cells rebind data_path to other locations.
target_dir = os.path.join(base_path, 'data', 'redpajama')
cache_dir = target_dir

# Point the Hugging Face datasets cache at the same directory.
# NOTE(review): `datasets` has already been imported at this point, so this
# variable may not influence it; the explicit cache_dir argument below is
# what this call relies on - confirm.
os.environ['HF_DATASETS_CACHE'] = cache_dir

# Load (and download, if needed) the dataset into the cache directory.
# NOTE(review): data_dir is kept as in the original call; it is presumably
# not a "download directory" option for hub datasets - verify against the
# load_dataset documentation.
dataset = load_dataset('togethercomputer/RedPajama-Data-1T-Sample', cache_dir=cache_dir, data_dir=target_dir)

print(f"Dataset downloaded to {target_dir}")
print(dataset)

# List every file that ended up under the target directory.
print("Files in target directory:")
for root, dirs, files in os.walk(target_dir):
    for file in files:
        print(os.path.join(root, file))



In [None]:
# Download the LongAlpaca-12k.json dataset.
# The file is placed directly in "<base_path>/data" because the supervised
# fine-tuning cell reads "<base_path>/data/LongAlpaca-12k.json". The previous
# `data_path + 'longalpaca'` had no path separator and pointed at a directory
# that no other cell reads. base_path is used instead of data_path because
# later cells rebind data_path.
longalpaca_dir = os.path.join(base_path, 'data')

# -c resumes a partial download and does not fetch the file again when the
# cell is re-run (without it wget would create "LongAlpaca-12k.json.1").
subprocess.run(
    [
        'wget', '-c',
        'https://huggingface.co/datasets/Yukang/LongAlpaca-12k/resolve/main/LongAlpaca-12k.json',
        '-P', longalpaca_dir,
    ],
    check=True,
)

## 1. 初始化环境

In [None]:
# Install modelscope.
# pip is invoked through the kernel's own interpreter so the package is
# installed into the environment this notebook runs in, not into whatever
# `pip` happens to be first on PATH.
import sys

subprocess.run([sys.executable, '-m', 'pip', 'install', 'modelscope'], check=True)

In [None]:
import transformers
import torch
from modelscope import snapshot_download

# Fetch Meta-Llama-3-8B-Instruct through ModelScope, using model_path as the
# cache directory. The value returned by snapshot_download is handed to
# transformers below as the model location.
model_id = snapshot_download(
    "LLM-Research/Meta-Llama-3-8B-Instruct",
    cache_dir=model_path,
)

# Options for the text-generation pipeline: bfloat16 weights on the CUDA device.
pipeline_options = dict(
    model=model_id,
    model_kwargs=dict(torch_dtype=torch.bfloat16),
    device="cuda",
)
pipeline = transformers.pipeline("text-generation", **pipeline_options)

In [None]:
import sys

# Rename the downloaded model directory to the name the training cells use.
# The move is skipped when the destination already exists so the cell can be
# re-run: a second `mv` would otherwise fail (source gone) or nest the source
# inside the existing destination directory.
downloaded_model_dir = base_path + '/models/LLM-Research/Meta-Llama-3-8B-Instruct'
local_model_dir = base_path + '/models/llama3-8B-8k'
if not os.path.isdir(local_model_dir):
    subprocess.run(['mv', downloaded_model_dir, local_model_dir], check=True)

# All pip commands go through the kernel's interpreter so they act on the
# environment this notebook runs in.
pip_cmd = [sys.executable, '-m', 'pip']

# Install the dependencies listed in requirements.txt.
subprocess.run(pip_cmd + ['install', '-r', 'requirements.txt'], check=True)

# deepspeed: remove any existing build, drop cached wheels, then rebuild
# version 0.14.0 with the build flags below.
# -y is required: without it pip waits for an interactive confirmation that
# a notebook cell cannot provide.
subprocess.run(pip_cmd + ['uninstall', '-y', 'deepspeed'], check=True)
# Cache purge is best-effort; a failure here must not abort the setup.
subprocess.run(pip_cmd + ['cache', 'purge'], check=False)
deepspeed_build_env = dict(os.environ, DS_BUILD_CPU_ADAM='1', BUILD_UTILS='1')
subprocess.run(
    pip_cmd + ['install', 'deepspeed==0.14.0', '-U'],
    check=True,
    env=deepspeed_build_env,
)

# Install flash-attn.
subprocess.run(pip_cmd + ['install', 'flash-attn', '--no-build-isolation'], check=True)


## 2. 持续预训练

In [None]:
import os
import subprocess

# Continued pre-training: extend the 8k base model to a 32k context window.
model_name_or_path = base_path + '/models/llama3-8B-8k'
model_max_length = 16384 * 2
output_dir = base_path + "/models/llama3-8B-32k-ft"
# NOTE: this rebinds the notebook-level data_path from the configuration cell.
data_path = base_path + "/data/redpajama"

# Create the output directory if it does not exist yet.
os.makedirs(output_dir, exist_ok=True)

# Build the launch command (8 processes on this node).
command = [
    'torchrun', '--nproc_per_node=8', 'fine-tune.py',
    '--data_path', data_path,
    '--model_name_or_path', model_name_or_path,
    '--model_max_length', str(model_max_length),
    '--output_dir', output_dir
]

# Run the command and stream its output line by line.
# stderr is merged into stdout: with two separate pipes drained one after the
# other, the child can block forever once the stderr pipe buffer fills while
# this loop is still waiting on stdout.
with subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) as process:
    for line in process.stdout:
        print(line, end='')

# Leaving the `with` block has already waited for the process to exit.
# Fail loudly so that the following cells do not run on a missing checkpoint.
if process.returncode != 0:
    print(f"Process failed with return code {process.returncode}")
    raise subprocess.CalledProcessError(process.returncode, command)


## 3. 监督训练

In [None]:
import os
import subprocess

# Parameters for supervised fine-tuning of the 32k-context checkpoint.
model_name_or_path =  base_path + "/models/llama3-8B-32k-ft"
model_max_length = 16384 * 2
output_dir =  base_path + "/models/llama3-8B-32k-ft-sft"
filter_mode = "all"
# NOTE: this rebinds the notebook-level data_path from the configuration cell.
data_path = base_path + "/data/LongAlpaca-12k.json"

# Create the output directory if it does not exist yet.
os.makedirs(output_dir, exist_ok=True)

# Build the launch command (8 processes on this node, explicit master port).
command = [
    'torchrun', '--nproc_per_node=8', '--master_port=29501', 'supervised-fine-tune.py',
    '--model_name_or_path', model_name_or_path,
    '--data_path', data_path,
    '--model_max_length', str(model_max_length),
    '--filter_mode', filter_mode,
    '--output_dir', output_dir
]

# Run the command and stream its output line by line.
# stderr is merged into stdout: with two separate pipes drained one after the
# other, the child can block forever once the stderr pipe buffer fills while
# this loop is still waiting on stdout.
with subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) as process:
    for line in process.stdout:
        print(line, end='')

# Leaving the `with` block has already waited for the process to exit.
# Fail loudly so that the evaluation cell does not run on a missing checkpoint.
if process.returncode != 0:
    print(f"Process failed with return code {process.returncode}")
    raise subprocess.CalledProcessError(process.returncode, command)


## 4. 评估

In [None]:
import json
import os
import subprocess
from IPython.display import display, clear_output

# Switch into the LongBench directory under the current working directory.
# The check makes the cell re-runnable: once the kernel is already inside
# LongBench, a second relative chdir would look for LongBench/LongBench.
relative_path = 'LongBench'
current_dir = os.getcwd()
if os.path.basename(current_dir) == relative_path:
    new_dir = current_dir
else:
    new_dir = os.path.join(current_dir, relative_path)
    os.chdir(new_dir)

# Name, location and maximum input length of the model under evaluation.
MODEL_NAME = "llama3-8B-32k-ft-sft"
MODEL_PATH = base_path + "/models/" + MODEL_NAME
MAX_LENGTH = 31000


def _set_json_entry(json_file, key, value):
    """Set ``key`` to ``value`` in the JSON object stored in ``json_file``.

    The file is read, the entry is added or overwritten, and the whole object
    is written back with 4-space indentation.
    """
    with open(json_file, 'r') as handle:
        mapping = json.load(handle)
    mapping[key] = value
    with open(json_file, 'w') as handle:
        json.dump(mapping, handle, indent=4)


# Register the model path in model2path.json.
display("Updating model2path.json...")
model2path_file = './config/model2path.json'
_set_json_entry(model2path_file, MODEL_NAME, MODEL_PATH)

# Register the maximum length in model2maxlen.json.
display("Updating model2maxlen.json...")
model2maxlen_file = './config/model2maxlen.json'
_set_json_entry(model2maxlen_file, MODEL_NAME, MAX_LENGTH)

# Add the model name to the `choices=` list in pred.py's argument parser.
display("Updating pred.py...")
pred_file = './pred.py'
with open(pred_file, 'r') as handle:
    pred_lines = handle.readlines()

model_entry = f'"{MODEL_NAME}"'
if not any(model_entry in pred_line for pred_line in pred_lines):
    # Only patch when the model is not listed yet; otherwise every re-run of
    # this cell would insert another copy of the same entry.
    for index, pred_line in enumerate(pred_lines):
        if 'choices=' in pred_line:
            # Assumes the choices list continues on the following line(s).
            pred_lines.insert(index + 1, ' ' * 12 + model_entry + ',\n')
            with open(pred_file, 'w') as handle:
                handle.writelines(pred_lines)
            break
    else:
        display("Warning: no 'choices=' line found in pred.py; model name not registered.")

def run_subprocess(command):
    """Run ``command`` through the shell and show its latest output line.

    Each line the command prints replaces the previously displayed one, so
    the cell shows only the most recent line (progress-style output).

    stderr is merged into stdout. Previously stderr was sent to a pipe that
    was never read, which hid error messages and could block the child
    process forever once that pipe's buffer filled up.

    Args:
        command: Command passed to ``subprocess.Popen`` with ``shell=True``.

    Returns:
        The exit code of the command.
    """
    process = subprocess.Popen(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        shell=True,
    )
    # Iterating the pipe yields lines until the child closes its output.
    for output in process.stdout:
        clear_output(wait=True)
        display(output.strip())
    process.stdout.close()
    return process.wait()

# shutil is imported here because this cell uses it below and must not rely
# on an import made by an earlier, unrelated cell.
import shutil

# Run model prediction. A failure stops the cell: evaluating without
# predictions would be meaningless.
display("Running model prediction...")
pred_rc = run_subprocess('CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python ./pred.py --model ' + MODEL_NAME)
if pred_rc != 0:
    raise RuntimeError(f"pred.py failed with return code {pred_rc}")

# Run model evaluation on the predictions.
display("Running model evaluation...")
eval_rc = run_subprocess('python ./eval.py --model ' + MODEL_NAME)
if eval_rc != 0:
    raise RuntimeError(f"eval.py failed with return code {eval_rc}")

# Show the results. This step only displays output, so a failure is reported
# but does not prevent the predictions from being moved to the volume.
display("Displaying results...")
analysis_rc = run_subprocess('python analysis.py --filter ' + MODEL_NAME)
if analysis_rc != 0:
    display(f"analysis.py failed with return code {analysis_rc}")

display("Script execution completed.")

# Move the prediction results to the volume.
source_dir = os.path.join("./pred", MODEL_NAME)
target_dir = os.path.join(base_path, "LongBench/pred")
os.makedirs(target_dir, exist_ok=True)
# target_dir exists, so the source directory is moved inside it.
shutil.move(source_dir, target_dir)
