# Model init

## GPT4All-J 6B

In [1]:

from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM, T5Tokenizer, T5ForConditionalGeneration, LlamaTokenizer, LlamaForCausalLM, GenerationConfig, pipeline
import torch
from langchain.llms import HuggingFacePipeline

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, StoppingCriteria, StoppingCriteriaList

# Load the checkpoint in 8-bit via bitsandbytes.
# NOTE(review): llm_int8_enable_fp32_cpu_offload presumably only matters when
# some modules are placed on the CPU; every entry below targets device 0.
quantization_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_enable_fp32_cpu_offload=True)

# Module-name -> device-index placement; every listed entry goes to device 0.
# NOTE(review): "transformer.word_embeddings*" and "transformer.wte.weight" do not
# look like the same naming scheme as "transformer.h" / "transformer.ln_f" —
# confirm these keys against the loaded model's module names.
device_map = {
    "transformer.word_embeddings": 0,
    "transformer.word_embeddings_layernorm": 0,
    "lm_head": 0,
    "transformer.h": 0,
    "transformer.ln_f": 0,
    "transformer.wte.weight":0,
}

# The tokenizer is loaded without a revision, while the model is pinned to
# the "v1.3-groovy" revision of the same repository.
tokenizer = AutoTokenizer.from_pretrained("nomic-ai/gpt4all-j")
model = AutoModelForCausalLM.from_pretrained(
        "nomic-ai/gpt4all-j",
        revision="v1.3-groovy",
        torch_dtype=torch.float16,
        device_map=device_map,
        quantization_config=quantization_config,
    )


Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
CUDA SETUP: CUDA runtime path found: C:\Users\Aviv9\miniconda3\envs\LLMBots2\bin\cudart64_110.dll
CUDA SETUP: Highest compute capability among GPUs detected: 8.6
CUDA SETUP: Detected CUDA version 117
CUDA SETUP: Loading binary c:\Users\Aviv9\miniconda3\envs\LLMBots2\lib\site-packages\bitsandbytes\libbitsandbytes_cuda117.dll...


  warn(msg)
  warn(msg)
  warn(msg)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

## MPT-7B

In [None]:
# MPT-7B test
# pip install einops

from torch import cuda, bfloat16
import transformers

# Alternative kept for reference: let accelerate infer the placement instead of
# using the hand-written device_map defined further down in this cell.
# from accelerate import infer_auto_device_map
# device_map = infer_auto_device_map(model, max_memory={0: "9.5GiB", "cpu": "30GiB"})
# Device string for the current CUDA device, or 'cpu' when CUDA is unavailable.
# It is only referenced by a commented-out `device=device` argument in the
# pipeline cell below.
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

# Load the checkpoint in 8-bit via bitsandbytes.
quantization_config = transformers.BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_enable_fp32_cpu_offload=True
)


# Placement table for the MPT checkpoint: every listed module goes to device 0.
# Blocks 0-24 and 26-31 are placed as whole blocks; block 25 is listed
# sub-module by sub-module. Key order matches the original literal.
device_map = {
    'transformer.wte': 0,
    'transformer.emb_drop': 0,
    **{f'transformer.blocks.{index}': 0 for index in range(25)},
    **{
        f'transformer.blocks.25.{part}': 0
        for part in (
            'norm_1',
            'attn.Wqkv',
            'attn.out_proj',
            'norm_2',
            'ffn',
            'resid_attn_dropout',
            'resid_ffn_dropout',
        )
    },
    **{f'transformer.blocks.{index}': 0 for index in range(26, 32)},
    'transformer.norm_f': 0,
}

# Load MPT-7B with the 8-bit quantization config and the explicit device map.
# NOTE(review): trust_remote_code=True lets the loader run Python code shipped
# in the 'mosaicml/mpt-7b' repository — only keep it for sources you trust.
model = transformers.AutoModelForCausalLM.from_pretrained(
    'mosaicml/mpt-7b',
    trust_remote_code=True,
    torch_dtype=bfloat16,
    max_seq_len=2048,
    quantization_config=quantization_config,
    device_map=device_map
)

# Switch the model to evaluation mode (inference only).
model.eval()

# The tokenizer is loaded from a different repository than the model weights.
tokenizer = transformers.AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
print("model loaded")



In [None]:
# MPT-7B stopping criteria

import torch
from transformers import StoppingCriteria, StoppingCriteriaList

# MPT-7B is trained to add "<|endoftext|>" at the end of generations;
# look up the id(s) of that token so generation can be stopped on it.
stop_token_ids = tokenizer.convert_tokens_to_ids(["<|endoftext|>"])

# Custom stopping criterion: end generation once a stop token was just produced.
class StopOnTokens(StoppingCriteria):
    """Report a stop when the newest token is one of ``stop_token_ids``.

    Only the first sequence of the batch (``input_ids[0]``) is inspected.
    """

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        newest_token = input_ids[0][-1]
        return any(newest_token == candidate for candidate in stop_token_ids)


stopping_criteria = StoppingCriteriaList([StopOnTokens()])

# Text-generation pipeline around the MPT model; generation parameters are
# passed here so that every call through the pipeline uses them.
# NOTE(review): do_sample is not set here — confirm that temperature/top_p/top_k
# actually take effect with the model's default generation settings.
generate_text = transformers.pipeline(
    model=model, tokenizer=tokenizer,
    return_full_text=True,  # langchain expects the full text
    task='text-generation',
    # device=device,
    # we pass model parameters here too
    stopping_criteria=stopping_criteria,  # without this model will ramble
    temperature=0.1,  # 'randomness' of outputs; lower values are less random
    top_p=0.15,  # select from top tokens whose probabilities add up to 15%
    top_k=0,  # select from top 0 tokens (because zero, relies on top_p)
    max_new_tokens=64,  # max number of tokens to generate in the output
    repetition_penalty=1.1  # without this output begins repeating
)

## Vicuna-7b

In [1]:
# Vicuna-7b test
# pip install sentencepiece protobuf<=3.20.0
# pip install git+https://github.com/zphang/transformers.git@llama_push
from torch import cuda, bfloat16
from accelerate import infer_auto_device_map
import transformers

# Alternative kept for reference: let accelerate infer the placement instead of
# using the hand-written device_map defined further down in this cell.
# device_map = infer_auto_device_map(model, max_memory={0: "10GiB", "cpu": "30GiB"})
# Device string for the current CUDA device, or 'cpu' when CUDA is unavailable.
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

# Load the checkpoint in 8-bit via bitsandbytes.
quantization_config = transformers.BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_enable_fp32_cpu_offload=True
)

# Placement table for the Vicuna checkpoint: every listed module goes to device 0.
# Layers 0-24 and 26-31 are placed as whole layers; layer 25 is listed
# sub-module by sub-module. Key order matches the original literal.
device_map = {
    'model.embed_tokens': 0,
    **{f'model.layers.{index}': 0 for index in range(25)},
    **{
        f'model.layers.25.{part}': 0
        for part in (
            'self_attn.q_proj',
            'self_attn.k_proj',
            'self_attn.v_proj',
            'self_attn.o_proj',
            'self_attn.rotary_emb',
            'mlp',
            'input_layernorm',
            'post_attention_layernorm',
        )
    },
    **{f'model.layers.{index}': 0 for index in range(26, 32)},
    'model.norm': 0,
    'lm_head': 0,
}

# Load Vicuna-7B with the 8-bit quantization config and the explicit device map.
model = transformers.AutoModelForCausalLM.from_pretrained(
    'Tribbiani/vicuna-7b',
    torch_dtype=bfloat16,
    quantization_config=quantization_config,
    device_map=device_map
)

# Switch the model to evaluation mode (inference only).
model.eval()
print("model loaded")
# The tokenizer comes from the same repository as the model weights.
tokenizer = transformers.AutoTokenizer.from_pretrained("Tribbiani/vicuna-7b")
print("tokenizer loaded")




Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
CUDA SETUP: CUDA runtime path found: C:\Users\Aviv9\miniconda3\envs\LLMBots2\bin\cudart64_110.dll
CUDA SETUP: Highest compute capability among GPUs detected: 8.6
CUDA SETUP: Detected CUDA version 117
CUDA SETUP: Loading binary c:\Users\Aviv9\miniconda3\envs\LLMBots2\lib\site-packages\bitsandbytes\libbitsandbytes_cuda117.dll...


  warn(msg)
  warn(msg)
  warn(msg)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

model loaded
tokenizer loaded


# Creating agents

## Task list creator

In [2]:
# ReAct-style prompt describing a single tool ("Current Search") and the
# Thought / Action / Result / Observation answer format.
# This is a plain string with no substitution placeholders; `template` is
# reassigned in the Appendix before it is used to build a PromptTemplate.
template = """Answer the following questions as best you can.
You can interact with a computer program that can give you access to the following tools:

Current Search: useful for when you need to answer questions about current events or the current state of the world

The way you use the tools is by specifying a json blob that will be referred as $JSON_BLOB.
Specifically, this json should have a `action` key (with the name of the tool to use) and a `action_input` key (with the tools input).

The only values that should be in the "action" field are: Current Search

The $JSON_BLOB should only contain a SINGLE action, do NOT return a list of multiple actions. Here is an example of a valid $JSON_BLOB:

```
{
    "action": $TOOL_NAME,
    "action_input": $INPUT
}
```
You would be given a task as follows:
Question: <the input question you must answer>.

You should always answer step by step in the following format:
Thought: <think about the question and how you are going to answer it>
Action: <specify an action from the tools you can access>
```
$JSON_BLOB
```
Result: <the result of your action>
Observation: <make an observation based on the action Results>.
... (this Thought/Action/Result/Observation can repeat N times)
Thought: I now know the final answer
Final Answer: <the final answer to the original input question>

Begin! Reminder to always use the exact characters `Final Answer` when responding.
"""

In [2]:
# Prompt asking the model for a numbered task list (at most 10 tasks) for a
# fixed objective. The string is a raw f-string without placeholders, so the
# '\n' inside it reaches the model as a literal backslash followed by 'n'.
templateTaskCreation =  rf"""
You are an AI who creates a precise task list to accomplish a certain objective. Each task should start with a number of the task and should end with '.\n'. The task list should be short and contain MAXIMUM 10 tasks. Each task should start with a specific action.  Don't write anything other than the tasks.
Objective: Plan a 5 days trip to Madrid, include historical locations and most recommended low budget restaurants, you should also create itinerary for the trip.
Assistant: let's think step by step:"""

In [42]:
# Inspect which token ids the loaded tokenizer produces for a single "#".
tokenizer("#")

{'input_ids': [1, 396], 'attention_mask': [1, 1]}

In [48]:
"Human" in tokenizer.decode([835, 12968])

True

In [61]:
import os
import torch
import transformers    

class StopOnTokens(transformers.StoppingCriteria):
    """Stop generation once the output ends with ids 2277, 29937, 12968, 29901.

    A later cell of this notebook decodes those four ids to ' ### Human:'.
    Only the first sequence of the batch (``input_ids[0]``) is inspected.
    """

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        stop_sequence = [2277, 29937, 12968, 29901]
        # Compare the final four generated ids against the stop sequence.
        trailing_ids = input_ids[0][-4:].tolist()
        return trailing_ids == stop_sequence

# Tokenize the task-creation prompt and generate the task list.
message = templateTaskCreation
# Put the inputs on the device the model lives on instead of hard-coding
# "cuda". The previous `device = torch.cuda.is_available()` line was dropped:
# it rebound `device` (a device string in the loading cells) to an unused bool.
inputs = tokenizer(message, return_tensors="pt").to(model.device)
# NOTE(review): do_sample is not set here — confirm whether temperature/top_k/
# top_p are meant to take effect for this call.
tokens = model.generate(
    **inputs,
    max_new_tokens=512,
    temperature=0.1,
    top_k=0,
    top_p=0.15,
    repetition_penalty=1.1,
    # Stop as soon as the model starts a new "### Human:" turn.
    stopping_criteria=transformers.StoppingCriteriaList([StopOnTokens()]),
    # streamer = transformers.TextStreamer(tokenizer=tokenizer),
)

Stopping


In [50]:
# Keep only the newly generated ids of the first sequence by slicing off as
# many ids as the prompt has (second dimension of inputs['input_ids']).
completion_tokens = tokens[0][inputs['input_ids'].size(1):]
# Decode to text, leaving out special tokens.
completion = tokenizer.decode(completion_tokens, skip_special_tokens=True)
print(completion)

 

1. Research the best historical locations in Madrid
2. Find the most recommended low budget restaurants in Madrid
3. Create an itinerary for the trip including the historical locations and restaurants
4. Organize transportation for the trip
5. Book accommodations for the trip
6. Purchase tickets for any attractions or tours
7. Pack for the trip
8. Inform family members about the plan
9. Print out the itinerary and packing list
10. Double check all arrangements before departing

Please let me know if there is something else I can help you with.
### Human:


In [60]:
# Check that the last four generated ids equal the stop sequence used by StopOnTokens.
list(tokens[0][-4:]) == [2277, 29937, 12968, 29901]

True

In [58]:
# Decode the four stop ids to see the text they stand for.
tokenizer.decode([2277, 29937, 12968, 29901])

' ### Human:'

In [7]:
def parse_tasks(completion):
    """Extract the task descriptions from a numbered task list.

    Args:
        completion: Text in which each task sits on its own line in the
            form ``<number>. <description>``.

    Returns:
        list[str]: The descriptions in order of appearance, without the
        ``<number>.`` prefix and without surrounding whitespace. Lines that
        do not start with a number followed by a period are skipped.
    """
    tasks = []
    for line in completion.splitlines():
        number, separator, description = line.strip().partition('.')
        description = description.strip()
        # Match the numeric prefix explicitly: dropping a fixed two characters
        # mangled two-digit numbers ("10.") and kept non-task lines such as a
        # closing remark or a "### Human:" marker.
        if separator and number.isdigit() and description:
            tasks.append(description)
    return tasks

# Parse the decoded completion and show one task per line.
tasks = parse_tasks(completion)
print('\n'.join(tasks))

 Research historical locations in Madrid
 Find low budget restaurants in Madrid
 Create an itinerary for the trip
 Plan transportation options
 Finalize the plan
 Review and finalize the plan
 Print the plan
 Share the plan with family members or friends
 Enjoy your trip!


## Tool chooser

In [21]:
# First draft of the tool-chooser prompt: asks for exactly one tool from a
# fixed list, answered as JSON with the keys tool_name and tool_input.
# A later cell assigns a different prompt to the same name.
template_action_chooser =  r"""
You are an AI who uses tools in order to complete your Objective.
You can only use exactly ONE tool.
The tool you are planning to must be one of the following tools:
1. WRITE_FILE(content)
2. READ_FILE(path)
3. RUN_PYTHON(code)
4. SEARCH_ONLINE(query)
5. EXTRACT_INFO(url, the information to look for)
6. NO_TOOL

Answer in JSON format with the following keys:
tool_name, tool_input.
Do not write anything other than the chosen tool.
Assistant: """

In [31]:
from transformers import TextStreamer    

# Second version of the tool-chooser prompt (replaces the earlier assignment to
# the same name). It lists six actions with their schemas, asks for exactly one
# action followed by a metadata JSON object, and ends with the trip-planning
# objective. The string is raw and has no substitution placeholders.
template_action_chooser =  r"""
Your decisions must always be made independently without seeking user assistance. Play to your strengths as an LLM and pursue simple strategies with no legal complications.


CONSTRAINTS:
1. No user assistance.
2. Cannot run Python code that requires user input.


ACTIONS:

1. "READ_FILE": read the current state of a file. The schema for the action is:

READ_FILE: <PATH>

2. "WRITE_FILE": write a block of text to a file. The schema for the action is:

WRITE_FILE: <PATH>
```
<TEXT>
```

3. "RUN_PYTHON": run a Python file. The schema for the action is:

RUN_PYTHON: <PATH>

4. "SEARCH_ONLINE": search online and get back a list of URLs relevant to the query. The schema for the action is:

SEARCH_ONLINE: <QUERY>

5. EXTRACT_INFO: extract specific information from a webpage. The schema for the action is:

EXTRACT_INFO: <URL>, <a brief instruction to GPT for information to extract>

6. "SHUTDOWN": shut down the program. The schema for the action is:

SHUTDOWN


RESOURCES:
1. File contents after reading file.
2. Online search results returning URLs.
3. Output of running a Python file.


PERFORMANCE EVALUATION:
1. Continuously review and analyze your actions to ensure you are performing to the best of your abilities. 
2. Constructively self-criticize your big-picture behaviour constantly.
3. Reflect on past decisions and strategies to refine your approach.
4. Every action has a cost, so be smart and efficent. Aim to complete tasks in the least number of steps.


Write only one action. The action must one of the actions specified above and must be written according to the schema specified above.

After the action, also write the following metadata JSON object, which must be parsable by Python's json.loads():
{
    "criticism": "<constructive self-criticism of actions performed so far, if any>",
    "reason": "<a sentence explaining the action above>",
    "plan": "<a short high-level plan in plain English>",
    "speak": "<a short summary of thoughts to say to the user>"
}

If you want to run an action that is not in the above list of actions, send the SHUTDOWN action instead and explain which action you wanted to run in the metadata JSON object.
So, write one action and one metadata JSON object, nothing else.
Plan a 5 days trip to Madrid, include historical locations and most recommended low budget restaurants, you should also create itinerary for the trip.
Determine which next action to use, and write one valid action, a newline, and one valid metadata JSON object, both according to the specified schema:
"""

# Tokenize the action-chooser prompt and stream the model's reply to stdout.
message = template_action_chooser
# Put the inputs on the device the model lives on instead of hard-coding
# "cuda". The previous `device = torch.cuda.is_available()` line was dropped:
# it rebound `device` (a device string in the loading cells) to an unused bool.
inputs = tokenizer(message, return_tensors="pt").to(model.device)
# NOTE(review): do_sample is not set here — confirm whether temperature is
# meant to take effect for this call.
tokens = model.generate(
    **inputs,
    max_new_tokens=512,
    temperature=0.1,
    # top_k=0,
    # top_p=0.5,
    repetition_penalty=1.05,
    streamer=TextStreamer(tokenizer=tokenizer),
    # stopping_criteria = stopping_criteria
)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Your decisions must always be made independently without seeking user assistance. Play to your strengths as an LLM and pursue simple strategies with no legal complications.


CONSTRAINTS:
1. No user assistance.
2. Cannot run Python code that requires user input.


ACTIONS:

1. "READ_FILE": read the current state of a file. The schema for the action is:

READ_FILE: <PATH>

2. "WRITE_FILE": write a block of text to a file. The schema for the action is:

WRITE_FILE: <PATH>
```
<TEXT>
```

3. "RUN_PYTHON": run a Python file. The schema for the action is:

RUN_PYTHON: <PATH>

4. "SEARCH_ONLINE": search online and get back a list of URLs relevant to the query. The schema for the action is:

SEARCH_ONLINE: <QUERY>

5. EXTRACT_INFO: extract specific information from a webpage. The schema for the action is:

EXTRACT_INFO: <URL>, <a brief instruction to GPT for information to extract>

6. "SHUTDOWN": shut down the program. The schema for the action is:

SHUTDOWN


RESOURCES:
1. File contents af

KeyboardInterrupt: 

# Appendix

In [None]:
# Generation args
max_new_tokens = 1024
temperature = 0.5
top_k = 5
top_p = 0.9

# Wrap whichever model/tokenizer pair is currently loaded in a text-generation
# pipeline. The budget is passed as max_new_tokens (it used to be passed as
# max_length, which also counts the prompt tokens, so a long prompt ate into
# the space left for the answer).
# NOTE(review): do_sample is not set here — confirm that temperature/top_k/top_p
# take effect with the model's default generation settings.
pipe = pipeline(
    "text-generation",
    tokenizer=tokenizer,
    model=model,
    max_new_tokens=max_new_tokens,
    temperature=temperature,
    top_k=top_k,
    top_p=top_p,
)

# Expose the pipeline to LangChain as an LLM.
local_llm = HuggingFacePipeline(pipeline=pipe)

In [None]:
from langchain import PromptTemplate, HuggingFaceHub, LLMChain


# ReAct-style prompt used to build the PromptTemplate below. It is rendered
# with format-style substitution, so literal braces in the JSON example are
# doubled. The trailing {human_input} placeholder is where the user's question
# is inserted; without it the question never reaches the model and the
# declared input variable does not match the template.
template = """Answer the following questions as best you can.
You can interact with a computer program that can give you access to the following tools:

Current Search: useful for when you need to answer questions about current events or the current state of the world

The way you use the tools is by specifying a json blob.
Specifically, this json should have a `action` key (with the name of the tool to use) and a `action_input` key (with the tools input).

The only values that should be in the "action" field are: Current Search

The $JSON_BLOB should only contain a SINGLE action, do NOT return a list of multiple actions. Here is an example of a valid $JSON_BLOB:

```
{{
    "action": $TOOL_NAME,
    "action_input": $INPUT
}}
```
You would be given a task as follows:
Question: the input question you must answer.

You should always answer in the following format:
Thought: think about the question and how you are going to answer it
Action: (optional)
```
$JSON_BLOB
```
After requesting action you should STOP and wait for the computer program to return the action result.
Result: <the result of your action>
Observation: you should make an observation based on the action Results.
... (this Thought/Action/Observation can repeat N times)
Thought: I now know the final answer
Final Answer: the final answer to the original input question

Begin! Reminder to always use the exact characters `Final Answer` when responding.

{human_input}
"""

# Prompt with a single input variable, human_input, which the chain cells
# below supply via llm_chain.predict(human_input=...).
# NOTE: `template` needs a `{human_input}` placeholder for that value to
# appear in the rendered prompt.
prompt = PromptTemplate(
    input_variables=["human_input"], 
    template=template
)

In [None]:
from langchain.callbacks.base import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.memory import ConversationBufferWindowMemory

# Chain that renders `prompt` and sends it to the local pipeline-backed LLM;
# a stdout streaming handler is attached and verbose logging is on.
llm_chain = LLMChain(prompt=prompt, llm=local_llm, callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]), verbose=True)
# question = "How many cities are in london?"

In [None]:
# Running list of conversation lines; joined with newlines before each predict call.
chat_history = []

In [None]:
# Add the trip-planning question to the history and send the whole history
# as the chain's human_input.
# NOTE: re-running this cell appends the same question to chat_history again.
chat_history.append("Question: Plan a 5 days trip to Madrid, include historical locations and most recommended low budget restaurants, you should also create itinerary for the trip.")
answer = llm_chain.predict(human_input="\n".join(chat_history))

In [None]:
# Show the chain's reply to the trip-planning question.
print(answer)

In [None]:
# Probe the chain with a shell-style input.
print(llm_chain.predict(human_input="ls ~"))

In [None]:
# Probe the chain with a factual question.
print(llm_chain.predict(human_input="Can you list the last 10 presidents of the united states?"))

In [None]:
from langchain.agents import Tool
from langchain.agents import AgentType
from langchain.memory import ConversationBufferMemory
from langchain.utilities import GoogleSearchAPIWrapper
from langchain.agents import initialize_agent

import os

# Google Custom Search credentials are read from the environment so that no
# secret is stored in the notebook. Set GOOGLE_API_KEY and GOOGLE_CSE_ID before
# running this cell; a missing variable raises KeyError here.
# NOTE(review): the API key that used to be hard-coded in this cell should be
# treated as leaked and revoked.
search = GoogleSearchAPIWrapper(
    google_api_key=os.environ["GOOGLE_API_KEY"],
    google_cse_id=os.environ["GOOGLE_CSE_ID"],
)

# Single tool offered to the agent: a Google search exposed as "Current Search".
tools = [
    Tool(
        name="Current Search",
        func=search.run,
        description="useful for when you need to answer questions about current events or the current state of the world",
    ),
]

# Conversation memory stored under the "chat_history" key and handed to the agent.
memory = ConversationBufferMemory(memory_key="chat_history")
agent_chain = initialize_agent(
    tools,
    local_llm,
    agent=AgentType.CHAT_ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True,
    memory=memory,
)

In [None]:
# Run the search-enabled agent on a sample question.
agent_chain.run(input="list the last 10 presidents of the united states")