In [21]:
from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import math
import torch
import sys
import json
import re
from tqdm import tqdm
from llama_attn_replace import replace_llama_attn
import deepspeed
import gc
import random
import warnings
warnings.filterwarnings('ignore')

## DeepSpeed Offloading

In [22]:
# DeepSpeed settings: ZeRO stage 3 with optimizer state and parameters both
# offloaded to pinned CPU memory, a train batch size of 1, and 4-bit
# asymmetric weight quantization at initialization.
ds_config = dict(
    zero_optimization=dict(
        stage=3,
        offload_optimizer=dict(device="cpu", pin_memory=True),
        offload_param=dict(device="cpu", pin_memory=True),
    ),
    train_batch_size=1,
    weight_quantization=dict(
        quantized_initialization=dict(
            num_bits=4,
            group_size=64,
            group_dim=1,
            symmetric=False,
        ),
    ),
)

## Parameters

In [23]:
# Prompt templates; each one has a single `{instruction}` placeholder.
PROMPT_DICT = {
    # Generic instruction/response template.
    "prompt_no_input": (
        "Below is an instruction that describes a task. Write a response that "
        "appropriately completes the request.\n\n"
        "### Instruction:\n"
        "{instruction}\n\n"
        "### Response:"
    ),
    # [INST] template carrying a <<SYS>> system message.
    "prompt_no_input_llama2": (
        "<s>[INST] <<SYS>>\n"
        "You are a helpful, respectful and honest assistant. "
        "Always answer as helpfully as possible, while being safe.  "
        "Your answers should not include any harmful, unethical, racist, sexist, "
        "toxic, dangerous, or illegal content. "
        "Please ensure that your responses are socially unbiased and positive in nature.\n\n"
        "If a question does not make any sense, or is not factually coherent, "
        "explain why instead of answering something not correct. "
        "If you don't know the answer to a question, please don't share false information.\n"
        "<</SYS>> \n\n {instruction} [/INST]"
    ),
    # Bare [INST] wrapper without a system message.
    "prompt_llama2": "[INST]{instruction}[/INST]",
}

In [24]:
# Checkpoint to evaluate (alternative: "Yukang/LongAlpaca-7B").
model_name = "merged/continue-qm-16k"
cache_dir = "./cache"
# Requested context window in tokens.
context_size = 16 * 1024

## Config, Model and Tokenizer

In [None]:
# Apply the project-local LLaMA attention patch before the model is built.
# NOTE(review): `inference=True` presumably selects the inference-time
# attention path; confirm in llama_attn_replace.
replace_llama_attn(inference=True)

In [None]:
# Load the checkpoint's configuration on its own so it can be adjusted before
# the model is instantiated from it.
config = AutoConfig.from_pretrained(model_name)

In [None]:
# When the requested window is larger than the one the config declares, enable
# linear RoPE scaling with the ratio rounded up to a whole number. A missing
# (None) or zero `max_position_embeddings` leaves the config untouched.
orig_ctx_len = getattr(config, "max_position_embeddings", None)
if orig_ctx_len and orig_ctx_len < context_size:
    scaling_factor = float(math.ceil(context_size / orig_ctx_len))
    config.rope_scaling = dict(type="linear", factor=scaling_factor)

In [None]:
# Load the checkpoint with the adjusted config, quantized to 4-bit NF4 with
# double quantization.
# NOTE(review): torch_dtype is float16 while the 4-bit compute dtype is
# bfloat16; confirm the mix is intentional.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    config=config,
    torch_dtype=torch.float16,
    device_map="auto",
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
    ),
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Resize the embedding table to 32001 entries.
# NOTE(review): presumably the base vocabulary plus one added token; confirm
# this equals len(tokenizer), which is only loaded in a later cell.
model.resize_token_embeddings(32001)

Embedding(32001, 4096, padding_idx=0)

In [None]:
# Switch to evaluation mode, then wrap the model with torch.compile when
# running PyTorch 2 or newer on a non-Windows platform.
model.eval()
if sys.platform != "win32" and torch.__version__ >= "2":
    model = torch.compile(model)

In [None]:
# DeepSpeed path only: wrap the model in an engine configured by ds_config.
# The call's result is unpacked as a 4-tuple; only the first item (the engine)
# is kept.
model_engine, _, _, _ = deepspeed.initialize(
    model=model,
    config_params=ds_config,
)

[2023-12-04 21:29:43,006] [INFO] [logging.py:96:log_dist] [Rank -1] DeepSpeed info: version=0.12.4, git-hash=unknown, git-branch=unknown
[2023-12-04 21:29:43,007] [INFO] [comm.py:637:init_distributed] cdb=None
[2023-12-04 21:29:43,007] [INFO] [comm.py:652:init_distributed] Not using the DeepSpeed or dist launchers, attempting to detect MPI environment...
[2023-12-04 21:29:43,128] [INFO] [comm.py:702:mpi_discovery] Discovered MPI settings of world_rank=0, local_rank=0, world_size=1, master_addr=172.31.17.147, master_port=29500
[2023-12-04 21:29:43,129] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
[2023-12-04 21:29:43,976] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False
[2023-12-04 21:29:43,978] [INFO] [logging.py:96:log_dist] [Rank 0] Creating ZeRO Offload
[2023-12-04 21:29:44,118] [INFO] [utils.py:795:see_memory_usage] DeepSpeedZeRoOffload initialize [begin]
[2023-12-04 21:29:44,119] [INFO] [utils.py:796

In [None]:
# DeepSpeed path only: move the engine to the CUDA device and rebind the name
# to the returned object.
model_engine = model_engine.to("cuda")

In [None]:
# The tokenizer's maximum length is the larger of the requested context window
# and the one declared by the checkpoint config. `orig_ctx_len` is None when the
# config has no `max_position_embeddings`; treating that as 0 avoids the
# TypeError an int-vs-None comparison would raise.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    model_max_length=max(context_size, orig_ctx_len or 0),
    padding_side="right",
    use_fast=False,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## Inference

In [None]:
def build_generator(
    model, tokenizer, temperature=0.6, top_p=0.9, max_gen_len=512,
    repetition_penalty=1.1, device="cuda"):
    """Create a function that maps a prompt string to sampled model output.

    Args:
        model: Either an object exposing ``generate`` directly, or a wrapper
            (such as the DeepSpeed engine) that exposes the underlying model
            as ``.module``.
        tokenizer: Callable tokenizer that also provides ``decode``.
        temperature: Sampling temperature passed to ``generate``.
        top_p: Nucleus-sampling threshold passed to ``generate``.
        max_gen_len: Upper bound on the number of newly generated tokens.
        repetition_penalty: Repetition penalty passed to ``generate``.
        device: Device the tokenized prompt is moved to before generation.

    Returns:
        A function ``response(prompt) -> str`` that returns only the generated
        continuation, with surrounding whitespace removed.
    """
    # Unwrap engine-style wrappers; a plain model is used as it is.
    generator = getattr(model, "module", model)

    def response(prompt):
        inputs = tokenizer(prompt, return_tensors="pt").to(device)
        prompt_len = inputs["input_ids"].shape[-1]

        output = generator.generate(
            **inputs,
            max_new_tokens=max_gen_len,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
        )

        # The returned sequence starts with the prompt tokens, so slice them
        # off by length. Splitting the decoded text on the prompt string is
        # fragile: it raises IndexError whenever decoding does not reproduce
        # the prompt character-for-character, and cuts the answer short if the
        # prompt text reappears in the generation.
        completion = tokenizer.decode(
            output[0][prompt_len:], skip_special_tokens=True
        )

        # Drop anything after a literal end-of-sequence marker, if one remains.
        return completion.split("</s>")[0].strip()

    return response

In [None]:
# DeepSpeed path: build the prompt -> text callable around the engine.
respond = build_generator(model_engine, tokenizer, max_gen_len=512)
# Non-DeepSpeed alternative:
# respond = build_generator(model, tokenizer, max_gen_len=512)

In [None]:
# Template used for every prompt below. Despite the variable name, this selects
# the bare "[INST]{instruction}[/INST]" entry, not one of the "prompt_no_input*"
# templates.
prompt_no_input = PROMPT_DICT["prompt_llama2"]

In [None]:
# Instruction appended after each transcript when the prompt is built; the
# trailing newline is part of the prompt text.
question = "Summarize the meeting transcript in two sentences.\n"

### File-based batch inference

In [None]:
def file_inference_mb(question, in_file, out_file):
    """Generate a prediction for every record in a JSON file and write JSONL.

    Relies on two notebook-level names: ``prompt_no_input`` (a template with an
    ``{instruction}`` placeholder) and ``respond`` (a prompt -> text callable).

    Args:
        question: Instruction appended, after a newline, to each record's
            source text.
        in_file: Path to a JSON file containing an iterable of records, each
            with the keys "id", "source" and "summary".
        out_file: Destination path. One JSON object per line is written with
            the keys "target", "prediction" and "id". An existing file is
            overwritten; missing parent directories are created.
    """
    import os  # local import: the notebook's import cell does not provide it

    with open(in_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    # open(..., "w") does not create missing directories, so do it up front
    # instead of failing before the first record.
    out_dir = os.path.dirname(out_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(out_file, "w", encoding="utf-8") as f:
        for meeting in tqdm(data):
            # Read all fields first so a malformed record fails before any
            # generation time is spent on it.
            meeting_id = meeting["id"]
            material = meeting["source"]
            target = meeting["summary"]

            with torch.no_grad():
                prompt = prompt_no_input.format_map(
                    {"instruction": material + "\n%s" % question}
                )
                output = respond(prompt=prompt)

            # Newlines are removed so each record stays on one readable line.
            sample_result = {
                "target": re.sub(r'\n', '', target),
                "prediction": re.sub(r'\n', '', output),
                "id": meeting_id,
            }
            print(sample_result["prediction"])
            json.dump(sample_result, f)
            f.write('\n')
            # Flush per record so finished predictions are on disk even if the
            # kernel dies part-way through the run.
            f.flush()

            # Drop references to the large per-record strings and release
            # cached GPU memory before the next prompt.
            del sample_result, material, target, prompt, output
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

In [None]:
# Input JSON file and output JSONL destination for the inference run below.
in_file = "../MeetingBank/test_segment_16k_sample_100.json"
out_file = "./output/output_MB_16k.jsonl"

In [None]:
# Generate a prediction for every record in in_file and write the results to
# out_file, one JSON object per line.
file_inference_mb(question, in_file, out_file)

  0%|          | 0/100 [00:00<?, ?it/s]This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (2048). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.
  0%|          | 0/100 [03:37<?, ?it/s]


KeyboardInterrupt: 