# 快速入门

## 示例一：单机房单芯片仿真

In [None]:
! python3 main.py --model-name llama2-70b --num-gpus 256 --nproc-per-node 8 --gpu-infos a800 --global-batch-size 512 --sequence-length 4096 --top 3

## 示例二：单机房多芯片仿真

In [None]:
! python3 main.py --model-name llama2-70b --num-gpus 256 --nproc-per-node 8 --gpu-infos a800:128,h800:160 --global-batch-size 512 --sequence-length 4096 --top 3

## 并行策略搜索

### 设定参数

In [None]:
# Command-line settings for the strategy search. Values are kept as strings
# because they are passed verbatim as arguments to main.py.
info = dict(
    [
        ("model-name", "llama2-70b"),
        ("num-gpus", "256"),
        ("nproc-per-node", "8"),
        ("gpu-infos", "a800"),
        ("global-batch-size", "512"),
        ("sequence-length", "4096"),
        ("top", "3"),
    ]
)

### 执行搜索并展示结果

In [19]:
import subprocess
import json
from tabulate import tabulate

# Build the main.py command line from the settings stored in `info`.
command = [
    "python3", "main.py",
    "--model-name", info['model-name'],
    "--num-gpus", info['num-gpus'],
    "--nproc-per-node", info['nproc-per-node'],
    "--gpu-infos", info['gpu-infos'],
    "--global-batch-size", info['global-batch-size'],
    "--sequence-length", info['sequence-length'],
    "--top", info['top']
]

# Defined before the try block so the table code below never sees an
# undefined name when the command fails or produces no parsable output.
output_list = []

# Run the command with subprocess, capturing stdout/stderr as text.
try:
    result = subprocess.run(command, check=True, capture_output=True, text=True)
except subprocess.CalledProcessError as e:
    print("Command failed with error:", e)
    # The captured stderr usually carries the real reason for the failure.
    if e.stderr:
        print(e.stderr)
else:
    # Parse the output line by line; each result is expected on its own line.
    for line in result.stdout.splitlines():
        if not line.strip():
            continue  # blank lines carry no result
        try:
            # Swap single quotes for double quotes so the line parses as JSON.
            json_output = json.loads(line.replace("'", '"'))
        except json.JSONDecodeError as json_error:
            print("Failed to decode JSON for line:", line, "Error:", json_error)
            continue
        if not isinstance(json_output, dict):
            # A bare number/string/list parses as JSON but is not a result row.
            print("Skipping non-object line:", line)
            continue
        if 'strategy' in json_output:
            # Pretty-print the nested strategy so it renders as a multi-line cell.
            json_output['strategy'] = json.dumps(json_output['strategy'], indent=4)
        output_list.append(json_output)

if output_list:
    # Column order follows the first result; missing keys become empty cells.
    headers = list(output_list[0].keys())
    rows = [[item.get(key, "") for key in headers] for item in output_list]

    # Render the results as a grid table with tabulate.
    print(tabulate(rows, headers=headers, tablefmt="grid"))
else:
    print("No results to display.")

+------------------+--------------------+---------------------+-------------------------------------------------+
|   sim_throughput |   sim_total_time_s | max-use-mem-size    | strategy                                        |
|           118247 |            17.7354 | [72.52196502685547] | {                                               |
|                  |                    |                     |     "pipeline-model-parallel-size": 8,          |
|                  |                    |                     |     "tensor-model-parallel-size": 2,            |
|                  |                    |                     |     "micro-batch-size": 1,                      |
|                  |                    |                     |     "use-flash-attn": "",                       |
|                  |                    |                     |     "sequence-parallel": "",                    |
|                  |                    |                     |     "use-distributed-opt