llama inference test #515

Open
HandH1998 opened this issue Jun 29, 2023 · 3 comments

HandH1998 (Contributor) commented Jun 29, 2023

I built lightseq on CUDA 11.4 successfully, then ran a llama-13B inference test on an A100-80G with max_step=1024. When max_batch_size < 11, it works fine. The problem is that when I set max_batch_size >= 11, I get: lightseq/csrc/ops_new/sampling.cc.cu(73): an illegal memory access was encountered. Running with CUDA_LAUNCH_BLOCKING=1 to locate the problem reports the error at lightseq/csrc/ops_new/sampling.cc.cu(57): an illegal memory access was encountered. Memory usage is about 40G, so it is not an OOM problem. The following is my inference test script. Please help me with the problem.

import time
import argparse
import numpy as np
import torch
import lightseq.inference as lsi
from transformers import LlamaTokenizer, LlamaForCausalLM

def ls_llama(model, inputs):
    torch.cuda.synchronize()
    start_time = time.perf_counter()
    results = model.infer(inputs)
    torch.cuda.synchronize()
    end_time = time.perf_counter()
    return results, end_time - start_time

def ls_generate(model, tokenizer, inputs):
    print("=========lightseq=========")
    print("lightseq generating...")
    ls_res_ids, ls_time = ls_llama(model, inputs)

    ls_res_ids = np.squeeze(ls_res_ids, axis=1)
    # ls_res = tokenizer.batch_decode(ls_res_ids, skip_special_tokens=True)
    ls_res = tokenizer.batch_decode(ls_res_ids)
    print("lightseq results:")
    for sent in ls_res:
        print(sent)

    input_seq_len = inputs.shape[1]
    input_bsz = inputs.shape[0]
    input_total_tokens = input_seq_len * input_bsz

    print("input_seq_len: {}".format(input_seq_len))
    print("input_bsz: {}".format(input_bsz))
    print("input_total_tokens: {}".format(input_total_tokens))

    output_total_tokens = ls_res_ids.size
    gen_total_tokens = output_total_tokens - input_total_tokens
    output_seq_len = [seq.size for seq in ls_res_ids]

    print("output_total_tokens: {}".format(output_total_tokens))
    print("output_seq_len: {}".format(output_seq_len))
    print("gen_total_tokens: {}".format(gen_total_tokens))
    print(f"lightseq time: {ls_time}s")
    print("gen_speed: {} tokens/s".format(gen_total_tokens / ls_time))


def warmup(ls_tokenizer, ls_model, sentences):
    ls_inputs = ls_tokenizer(sentences, return_tensors="pt", padding=True)["input_ids"]
    ls_generate(ls_model, ls_tokenizer, ls_inputs)

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--user_input", action="store_true")
    args = parser.parse_args()
    print("initializing gpt tokenizer...")
    ls_tokenizer = LlamaTokenizer.from_pretrained(
        "/home/zy/lightseq/llama/13b"
    )
    ls_tokenizer.add_special_tokens({"pad_token": "[PAD]"})

    print("creating lightseq model...")
    llama_weight_path = "/home/zy/lightseq/llama_13b.hdf5"
    ls_model = lsi.Llama(llama_weight_path, max_batch_size=11)

    # lightseq gpt perplexity supports batch inference with different lengths,
    # but sampling doesn't
    sentences = [
        "Are you a pig?",
        "I love you, but you say that",
        "I love you, but you say that",
        "I love you, but you say that",
        "I love you, but you say that",
        "I love you, but you say that",
        "I love you, but you say that",
        "I love you, but you say that",
        "Are you a pig?",
        "I love you, but you say that",
        "I love you, but you say that",
    ]
    print("====================START warmup====================")
    warmup(
        ls_tokenizer,
        ls_model,
        sentences,
    )
    print("====================END warmup====================")

    while True:
        if args.user_input:
            sentences = [input("input the masked sentence:\n")]

        print("tokenizing the sentences...")

        ls_inputs = ls_tokenizer(sentences, return_tensors="pt", padding=True)[
            "input_ids"
        ]
        ls_generate(ls_model, ls_tokenizer, ls_inputs)

        if not args.user_input:
            break


if __name__ == "__main__":
    main()
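
For reference, the error locations above come from running with blocking kernel launches so that the reported source line is accurate. Below is a minimal sketch of one way to do that from Python (the original run may simply have exported the variable in the shell); it is an illustration, not part of the script above, and the variable must be set before torch/lightseq create the CUDA context:

import os

# Force synchronous kernel launches so CUDA errors are reported at the
# offending call site instead of at a later, unrelated one. Must be set
# before the CUDA context is created.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

import torch
import lightseq.inference as lsi  # imported only after the env var is set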
ChristineSeven commented Aug 2, 2023

Using your code, I got this error: module 'lightseq.inference' has no attribute 'Llama'. Could you tell me how you got past this? @HandH1998

HandH1998 (Contributor, Author) commented

> Using your code, I got this error: module 'lightseq.inference' has no attribute 'Llama'. Could you tell me how you got past this? @HandH1998

It seems that you didn't compile it correctly.
[screenshot of the lightseq build configuration]
Change use_new_arch to ON.
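
After rebuilding with that option enabled, a quick sanity check can confirm the new-architecture models are actually exported. This is just a sketch using standard Python introspection; nothing here is lightseq-specific beyond the import:

import lightseq.inference as lsi

# If the extension was built without the new architecture, "Llama" will be
# absent here and lsi.Llama raises AttributeError.
print(hasattr(lsi, "Llama"))
print([name for name in dir(lsi) if not name.startswith("_")])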

ChristineSeven commented

@HandH1998 Thanks.
