In [None]:
! pip install -U peft accelerate
! pip install -U sentencepiece
! pip install -U transformers langchain

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m31.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaTokenizer, StoppingCriteria, StoppingCriteriaList, TextIteratorStreamer
from transformers import pipeline

# Hub identifiers: the base LLaMA checkpoint and the PEFT adapter applied on top of it.
model_name = "decapoda-research/llama-7b-hf"
adapters_name = 'lucas0/empath-llama-7b'

print(f"Starting to load the model {model_name} into memory")

# Load the base weights in bfloat16 with the whole model mapped to device index 0,
# wrap them with the adapter, then fold the adapter back into a plain model.
# (The author left 4-bit loading, `load_in_4bit=True`, disabled.)
m = PeftModel.from_pretrained(
    AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map={"": 0},
        torch_dtype=torch.bfloat16,
    ),
    adapters_name,
).merge_and_unload()

# The tokenizer comes from the base checkpoint; its BOS id is overridden to 1.
# NOTE(review): presumably the checkpoint's tokenizer config ships a different
# BOS id — confirm before removing the override.
tok = LlamaTokenizer.from_pretrained(model_name)
tok.bos_token_id = 1

# Token ids meant to stop generation; not referenced by the cells visible here.
stop_token_ids = [0]

print(f"Successfully loaded the model {model_name} into memory")

Starting to load the model decapoda-research/llama-7b-hf into memory


Loading checkpoint shards:   0%|          | 0/33 [00:00<?, ?it/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'LLaMATokenizer'. 
The class this function is called from is 'LlamaTokenizer'.


Successfully loaded the model decapoda-research/llama-7b-hf into memory


In [None]:
from typing import Any, Dict, List, Mapping, Optional

from pydantic import Extra, root_validator

from langchain.callbacks.manager import CallbackManagerForLLMRun
from langchain.llms.base import LLM
from langchain.llms.utils import enforce_stop_tokens

from langchain import PromptTemplate, LLMChain

class HuggingFaceHugs(LLM):
  """LangChain `LLM` wrapper around a locally loaded `transformers` pipeline.

  The wrapper builds a pipeline from an already-instantiated model and
  tokenizer and exposes it through LangChain's `_call` hook, returning only
  the text generated after the prompt.

  Args:
    model: Model object handed to `transformers.pipeline`.
    tokenizer: Tokenizer object handed to `transformers.pipeline`.
    task: Pipeline task name; defaults to "text-generation".
    max_length: Value forwarded to the pipeline as `max_length` on every
      call; defaults to 100 (the previously hard-coded value).
  """
  # The `transformers` pipeline used for inference (set in `__init__`).
  pipeline: Any
  # Forwarded as `max_length` to the pipeline on each call.
  max_length: int = 100

  class Config:
    """Configuration for this pydantic object."""
    extra = Extra.forbid

  def __init__(self, model, tokenizer, task="text-generation", max_length=100):
    super().__init__()
    # Inside this method `pipeline` resolves to the module-level
    # `transformers.pipeline` factory, not to the field declared above.
    self.pipeline = pipeline(task, model=model, tokenizer=tokenizer)
    self.max_length = max_length

  @property
  def _llm_type(self) -> str:
    """Return type of llm."""
    # NOTE(review): identifier kept unchanged for compatibility, although the
    # wrapper runs a local pipeline rather than calling the Hub.
    return "huggingface_hub"

  def _call(self, prompt, stop: Optional[List[str]] = None, run_manager: Optional[CallbackManagerForLLMRun] = None, **kwargs: Any) -> str:
    """Generate a completion for `prompt` and return it without the prompt.

    Args:
      prompt: Text fed to the pipeline.
      stop: Optional stop sequences; the completion is cut at the first one.
      run_manager: Accepted for LangChain compatibility; not used here.
      **kwargs: Accepted for LangChain compatibility; ignored.

    Returns:
      The generated text that follows the prompt, truncated at the first
      stop sequence when `stop` is non-empty.
    """
    # Run the inference; the pipeline output contains the prompt followed by
    # the newly generated text.
    generated = self.pipeline(prompt, max_length=self.max_length)[0]['generated_text']

    # Echo the full generation (prompt + completion) so the notebook shows
    # exactly what the model produced.
    print(generated)

    # Strip the prompt BEFORE enforcing stop sequences: a stop string that
    # also occurs in the prompt (e.g. "Question:") would otherwise cut the
    # text inside the prompt and leave an empty completion.
    completion = generated[len(prompt):]

    # An empty list carries no stop sequences, so it is treated like None.
    if stop:
      completion = enforce_stop_tokens(completion, stop)
    return completion


# Prompt text with a single `{input}` slot for the user's question.
template = (
    ' Hey llama, you like to eat quinoa. Whatever question I ask you, '
    'you reply with "Waffles, waffles, waffles!".\n'
    ' Question: {input} Answer: '
)
prompt = PromptTemplate(input_variables=["input"], template=template)

# Wrap the already-loaded model and tokenizer so LangChain can drive them.
hf_model = HuggingFaceHugs(tokenizer=tok, model=m)

# Tie the prompt and the LLM together, then run one question through the chain.
chain = LLMChain(llm=hf_model, prompt=prompt)

chain("Who is Princess Momo?")

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


 Hey llama, you like to eat quinoa. Whatever question I ask you, you reply with "Waffles, waffles, waffles!".
 Question: Who is Princess Momo? Answer:  She is a princess.  She is a princess.  She is a princess.  She is a princess.  She is a princess.  She is a princess.  She is a princess.  She is


{'input': 'Who is Princess Momo?',
 'text': ' She is a princess.  She is a princess.  She is a princess.  She is a princess.  She is a princess.  She is a princess.  She is a princess.  She is'}

In [None]:
# (intentionally empty cell — a stray "ß" keystroke here raised NameError on Run All)