In [7]:
from torch import cuda, bfloat16
import transformers

model_name = '/home/ubuntu/model/falcon-40b-instruct'
# Set device to run the inference
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'
# set quantization configuration to load large Falcon model with less GPU memory
# this requires the `bitsandbytes` library
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)

model = transformers.AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    quantization_config=bnb_config,
    device_map='auto'
)
# Set model to inference mode
model.eval()
print(f"Model loaded on {device}")


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /home/ubuntu/anaconda3/envs/ldm/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cuda113.so
CUDA SETUP: CUDA runtime path found: /home/ubuntu/anaconda3/envs/ldm/lib/libcudart.so.11.0
CUDA SETUP: Highest compute capability among GPUs detected: 8.0
CUDA SETUP: Detected CUDA version 113
CUDA SETUP: Loading binary /home/ubuntu/anaconda3/envs/ldm/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cuda113.so...


Either way, this might cause trouble in the future:
If you get `CUDA error: invalid device function` errors, the above might be the cause and the solution is to make sure only one ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] in the paths that we search based on your env.
  warn(msg)


Loading checkpoint shards:   0%|          | 0/9 [00:00<?, ?it/s]

Model loaded on cuda:0


In [8]:
from transformers import StoppingCriteria, StoppingCriteriaList
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
import torch
# we create a list of stopping criteria
stop_token_ids = [
    tokenizer.convert_tokens_to_ids(x) for x in [
        ['Human', ':'], ['AI', ':']
    ]
]
stop_token_ids = [torch.LongTensor(x).to(device) for x in stop_token_ids]

In [9]:
from transformers import StoppingCriteria, StoppingCriteriaList
# define custom stopping criteria object
class StopOnTokens(StoppingCriteria):
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        for stop_ids in stop_token_ids:
            if torch.eq(input_ids[0][-len(stop_ids):], stop_ids).all():
                return True
        return False

stopping_criteria = StoppingCriteriaList([StopOnTokens()])

In [10]:
generate_text = transformers.pipeline(
    model=model, tokenizer=tokenizer,
    return_full_text=True,  # langchain expects the full text
    task='text-generation',
    # we pass model parameters here too
    stopping_criteria=stopping_criteria,  # without this model rambles during chat
    temperature=0.0,  # 'randomness' of outputs, 0.0 is the min and 1.0 the max
    max_new_tokens=512,  # mex number of tokens to generate in the output
    repetition_penalty=1.1  # without this output begins repeating
)

The model 'RWForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCausalLM', 'PLBartForCausalLM', 'ProphetNetForCausalLM', 'QDQBertLMHeadModel', 'ReformerModelWithLMHead', 'RemBertForCausalLM', 'RobertaForCausalLM', 'RobertaPreLayerNormForC

In [16]:
import langchain
from langchain.llms import HuggingFacePipeline
from langchain.chains.conversation.memory import ConversationBufferWindowMemory
from langchain import PromptTemplate, LLMChain

prompt_template = \
"""The following is a friendly conversation between a human and an AI. The AI is talkative and provides lots of specific details from its context. If the AI does not know the answer to a question, it truthfully says it does not know.

Current conversation:
{history}
Human: {input}
AI:"""

prompt = PromptTemplate(
    input_variables=["history", "input"],
    template=prompt_template,
)

memory = ConversationBufferWindowMemory( memory_key="history", k=5, return_messages=True)

In [17]:
from langchain.chains import ConversationChain
llm = HuggingFacePipeline(pipeline=generate_text)
chat = ConversationChain(
    llm=llm,
    memory=memory,
    verbose=True,
    prompt=prompt
)


In [18]:
query = \
"""can you write me a simple Python script that calculates the circumference
of a circle given a radius `r`"""

res = chat.predict(input=query)
res

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.




[1m> Entering new ConversationChain chain...[0m
Prompt after formatting:
[32;1m[1;3mThe following is a friendly conversation between a human and an AI. The AI is talkative and provides lots of specific details from its context. If the AI does not know the answer to a question, it truthfully says it does not know.

Current conversation:
[]
Human: can you write me a simple Python script that calculates the circumference
of a circle given a radius `r`
AI:[0m

[1m> Finished chain.[0m


" Sure! Here's a Python code snippet that should do what you need:\n\n```python\nimport math\n\ndef circumference_of_circle(r):\n    return 2 * math.pi * r\n```\n\nYou can call this function with any value for `r`, and it will return the circumference of a circle with that radius. For example:\n\n```python\nprint(circumference_of_circle(5))  # Output: 31.41592653589793\n```"

In [15]:
query = \
"""what was the function of the code you provided?"""
res = chat.predict(input=query)
print(res)

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.




[1m> Entering new ConversationChain chain...[0m
Prompt after formatting:
[32;1m[1;3mThe following is a friendly conversation between a human and an AI. The AI is talkative and provides lots of specific details from its context. If the AI does not know the answer to a question, it truthfully says it does not know.

Current conversation:
[HumanMessage(content='can you write me a simple Python script that calculates the circumference\nof a circle given a radius `r`', additional_kwargs={}, example=False), AIMessage(content=" Sure! Here's a Python code snippet that should do what you need:\n\n```python\nimport math\n\ndef circumference_of_circle(r):\n    return 2 * math.pi * r\n```\n\nYou can call this function with any value for `r`, and it will return the circumference of a circle with that radius. For example:\n\n```python\nprint(circumference_of_circle(5))  # Output: 31.41592653589793\n```", additional_kwargs={}, example=False)]
Human: what was the function of the code you provide

In [19]:
query = \
"""Can you provide the same code for cone?"""
res = chat.predict(input=query)
print(res)

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.




[1m> Entering new ConversationChain chain...[0m
Prompt after formatting:
[32;1m[1;3mThe following is a friendly conversation between a human and an AI. The AI is talkative and provides lots of specific details from its context. If the AI does not know the answer to a question, it truthfully says it does not know.

Current conversation:
[HumanMessage(content='can you write me a simple Python script that calculates the circumference\nof a circle given a radius `r`', additional_kwargs={}, example=False), AIMessage(content=" Sure! Here's a Python code snippet that should do what you need:\n\n```python\nimport math\n\ndef circumference_of_circle(r):\n    return 2 * math.pi * r\n```\n\nYou can call this function with any value for `r`, and it will return the circumference of a circle with that radius. For example:\n\n```python\nprint(circumference_of_circle(5))  # Output: 31.41592653589793\n```", additional_kwargs={}, example=False)]
Human: Can you provide the same code for cone?
AI:[