In [1]:
# ! pip install bert_score

In [1]:
import pandas as pd
import numpy as np
import evaluate

In [2]:
# from transformers import LlamaModel, LlamaConfig
# from accelerate import init_empty_weights
# config = LlamaConfig()
# with init_empty_weights():
#     model = LlamaModel(config)

In [3]:
import torch
from transformers import BitsAndBytesConfig
from transformers import AutoModelForCausalLM, AutoTokenizer


# Get the type
compute_dtype = getattr(torch, "float16")

# BitsAndBytesConfig int-4 config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=False,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype
)


# hf_model_repo = "meta-llama/Llama-2-7b-chat-hf"

hf_model_repo = "ilufy/llama2-7b-coaxnn-paper-qna-fuinetuned-merged-270"


# Get the tokenizer
tokenizer = AutoTokenizer.from_pretrained(hf_model_repo)

# Load the model
model = AutoModelForCausalLM.from_pretrained(hf_model_repo,
                                             quantization_config=bnb_config,
                                             device_map="auto")

use_transformers_pipeline = False

if use_transformers_pipeline:
    import transformers

    tokenizer = AutoTokenizer.from_pretrained(hf_model_repo,  trust_remote_code=True)
    pipeline = transformers.pipeline(
        "text-generation", #"question-answering",
        model=hf_model_repo,
        trust_remote_code=True
    )

    sequences = pipeline(
        prompt,
        temperature=0.6,
        eos_token_id=tokenizer.eos_token_id,
        max_length=200,
    )

    for seq in sequences:
        print(seq['generated_text'])





Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]



In [4]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head): Lin

In [5]:
df = pd.read_csv("data/coaxnn_paper_qna_train.csv")[:256]

df['text_question'] = 'Question:\n' + df['Question']
df['text_answer'] = 'Answer:\n' + df['Answer']

In [6]:
eval_metric = evaluate.load("rouge")

In [None]:
# prompt = "How much speedup does CoAxNN have on CIFAR dataset?"

# prompt = "What is the speedup achieved by CoAxNN on different models?"

# prompt = "What is the advantage of using CoAxNN?"

# prompt = "What is search space when the number of stages is �?"

# prompt = "What is the output of Algorithm 2 in CoAxNN?"

# prompt = "Fig. 1 shows the optimization effect of the ResNet-56 using different configuration parameters under the specified requirements of accuracy on the CIFAR-10 dataset. What does the triples (𝑥, 𝑦, 𝑧) represent?"

rand_idx = np.random.randint(len(df))

prompt = df['text_question'].iloc[rand_idx]

references = [df['text_answer'].iloc[rand_idx]]
predictions = []

# prompt = "Question:\n" + prompt

# Generate response
input_ids = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids
outputs = model.generate(input_ids=input_ids,
                         max_new_tokens=200,
                         temperature=0.6)

gen_tokens = outputs[:, input_ids.shape[1]:]

result = tokenizer.batch_decode(gen_tokens, skip_special_tokens=True)[0]

predictions = [result]

metric = eval_metric.compute(predictions = predictions, references = references)
# metric = eval_metric.compute(lang='en', predictions = predictions, references = references)


# Print the result

print(f"[Prompt] \n{prompt}")
print()
print(f"[GT] \n{references[0]}")
print()
print(f"Generated response:\n{result}")

print(f"metric: {metric}")


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [13]:
# free VRAM
del model
import gc
gc.collect()

torch.cuda.empty_cache()
