In [1]:
import langchain
from langchain.agents import initialize_agent, AgentType, Tool
from langchain.chains import RetrievalQA
from langchain.document_loaders import ArxivLoader
from langchain.embeddings import LlamaCppEmbeddings
from langchain.indexes import VectorstoreIndexCreator
from langchain.llms import LlamaCpp
from langchain.prompts import PromptTemplate

In [2]:
# Globals
embedding_model = 'hkunlp/instructor-xl'
instruction_model_path = "/Users/bsantanna/dev/workspace/community/Llama-2-7b-chat-hf/gguf-model-f16.bin"
n_gpu_layers = 1
n_batch = 512
n_ctx = 4096

In [3]:
# initialize embeddings model
# embedding = HuggingFaceInstructEmbeddings(model_name=embedding_model)
embedding = LlamaCppEmbeddings(model_path=instruction_model_path,
                               n_gpu_layers=n_gpu_layers,
                               n_batch=n_batch,
                               f16_kv=True)

llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from /Users/bsantanna/dev/workspace/community/Llama-2-7b-chat-hf/gguf-model-f16.bin (version GGUF V2 (latest))
llama_model_loader: - tensor    0:                token_embd.weight f16      [  4096, 32000,     1,     1 ]
llama_model_loader: - tensor    1:               output_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    2:                    output.weight f16      [  4096, 32000,     1,     1 ]
llama_model_loader: - tensor    3:              blk.0.attn_q.weight f16      [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.attn_k.weight f16      [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    5:              blk.0.attn_v.weight f16      [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    6:         blk.0.attn_output.weight f16      [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    7:           blk.0.attn_

In [4]:
# Load LLM instruction following model
llm = LlamaCpp(
    model_path=instruction_model_path,
    n_gpu_layers=n_gpu_layers,
    n_batch=n_batch,
    n_ctx=n_ctx,
    f16_kv=True,
)

llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from /Users/bsantanna/dev/workspace/community/Llama-2-7b-chat-hf/gguf-model-f16.bin (version GGUF V2 (latest))
llama_model_loader: - tensor    0:                token_embd.weight f16      [  4096, 32000,     1,     1 ]
llama_model_loader: - tensor    1:               output_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    2:                    output.weight f16      [  4096, 32000,     1,     1 ]
llama_model_loader: - tensor    3:              blk.0.attn_q.weight f16      [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.attn_k.weight f16      [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    5:              blk.0.attn_v.weight f16      [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    6:         blk.0.attn_output.weight f16      [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    7:           blk.0.attn_

In [5]:
# Initialize documents
loader = ArxivLoader(query="dynamic scaling in self-similar systems", load_max_docs=20)
documents = loader.load()

In [6]:
# Initialize index
index = VectorstoreIndexCreator(embedding=embedding).from_documents(documents)


llama_print_timings:        load time =  1154.63 ms
llama_print_timings:      sample time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time =  1153.67 ms /   283 tokens (    4.08 ms per token,   245.30 tokens per second)
llama_print_timings:        eval time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time =  1155.54 ms

llama_print_timings:        load time =  1154.63 ms
llama_print_timings:      sample time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time =   800.20 ms /   242 tokens (    3.31 ms per token,   302.42 tokens per second)
llama_print_timings:        eval time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time =   801.68 ms

llama_print_timings:        load time =  1154.63 ms
llama_print_timings:   

In [7]:
# initialize retriever
retriever = index.vectorstore.as_retriever()

In [8]:
# initialize retrieval qa chain
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="refine",
    retriever=retriever
)

In [83]:
# tools = load_tools(['llm-math'], llm=llm)
tools = [
    Tool(
        name="scientific_papers_retrieval",
        func=qa_chain.run,
        description="Useful for when you need to clarify your understanding in problem solving concepts. Input should be a fully formed question that improves your understanding. Ensure that the question does not make reference to any unclear pronouns used in the preceding conversation.",
    ),
]

In [117]:
# initialize agent

PREFIX = """[INST] <<SYS>>
Follow a list of steps to effectively solve the multiple-choice problem enclosed within triple backticks and after solving the problem choose the best option (A-E).

1. Form a comprehensive understanding of the question until you can solve the problem with confidence.
   
2. Break down the question to identify incremental steps that lead to the solution. Provide detailed reasoning throughout the process, including analysis of relevant concepts and cross-validation to ensure accuracy and confidence in your response.
   
3. Progressively work through the solution, focusing on increasing your confidence in the accuracy of options (A-E).

4. Assess and eliminate options (A-E) that are incorrect or inaccurate.

5. Validate the remaining options using a cause-and-effect approach along with necessary calculations.

6. Identify the correct option and return the corresponding letter (A-E). Before finalizing your answer, ensure your calculations substantiate its correctness to ensure accuracy and confidence in your response.

7. Choose the most precise option from (A-E) that most closely aligns with your conclusion based on the evidence and reasoning you have provided.

You have access to the following tools: """

SUFFIX = """<</SYS>>
Question: ```{input}```
Thought: {agent_scratchpad}
[/INST]"""
react = initialize_agent(
    tools,
    llm,
    agent=AgentType.CHAT_ZERO_SHOT_REACT_DESCRIPTION,
    return_intermediate_steps=True,
    verbose=True,
    handle_parsing_errors=True,
    agent_kwargs={
        'prefix': PREFIX,
        'suffix': SUFFIX
    })

In [118]:
# Initialize prompt 
prompt = PromptTemplate.from_template("""
{question}
{options}
""")

In [119]:
# Format input
question = """Which of the following is an accurate definition of dynamic scaling in self-similar systems?"""
options = """
A - Dynamic scaling refers to the evolution of self-similar systems, where data obtained from snapshots at fixed times exhibits similarity to the respective data taken from snapshots of any earlier or later time. This similarity is tested by a certain time-dependent stochastic variable x.

B - Dynamic scaling refers to the non-evolution of self-similar systems, where data obtained from snapshots at fixed times is similar to the respective data taken from snapshots of any earlier or later time. This similarity is tested by a certain time-dependent stochastic variable x.
	
C - Dynamic scaling refers to the evolution of self-similar systems, where data obtained from snapshots at fixed times is dissimilar to the respective data taken from snapshots of any earlier or later time. This dissimilarity is tested by a certain time-independent stochastic variable y.

D - Dynamic scaling refers to the non-evolution of self-similar systems, where data obtained from snapshots at fixed times is dissimilar to the respective data taken from snapshots of any earlier or later time. This dissimilarity is tested by a certain time-independent stochastic variable y.

E - Dynamic scaling refers to the evolution of self-similar systems, where data obtained from snapshots at fixed times is independent of the respective data taken from snapshots of any earlier or later time. This independence is tested by a certain time-dependent stochastic variable z.
"""
formatted_input = prompt.format_prompt(
    question=question,
    options=options
)

In [120]:
langchain.debug = True
result = react(formatted_input)
langchain.debug = False

[32;1m[1;3m[chain/start][0m [1m[1:chain:AgentExecutor] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[1:chain:AgentExecutor > 2:chain:LLMChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[llm/start][0m [1m[1:chain:AgentExecutor > 2:chain:LLMChain > 3:llm:LlamaCpp] Entering LLM run with input:
[0m{
  "prompts": [
    "System: Answer the following questions as best you can. You have access to the following tools:\n\nscientific_papers_retrieval: Useful for when you need to clarify your understanding in problem solving concepts. Input should be a fully formed question that improves your understanding. Ensure that the question does not make reference to any unclear pronouns used in the preceding conversation.\n\nThe way you use the tools is by specifying a json blob.\nSpecifically, this json should have a `action` key (with the name of the tool to use) and a `action_input` key (with the input to the tool going here).\n\nThe only values t

Llama.generate: prefix-match hit


[36;1m[1;3m[llm/end][0m [1m[1:chain:AgentExecutor > 2:chain:LLMChain > 3:llm:LlamaCpp] [732ms] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": "Could you please answer question A?",
        "generation_info": null
      }
    ]
  ],
  "llm_output": null,
  "run": null
}
[36;1m[1;3m[chain/end][0m [1m[1:chain:AgentExecutor > 2:chain:LLMChain] [733ms] Exiting Chain run with output:
[0m{
  "text": "Could you please answer question A?"
}
[32;1m[1;3m[tool/start][0m [1m[1:chain:AgentExecutor > 4:tool:_Exception] Entering Tool run with input:
[0m"Invalid or incomplete response"
[36;1m[1;3m[tool/end][0m [1m[1:chain:AgentExecutor > 4:tool:_Exception] [0ms] Exiting Tool run with output:
[0m"Invalid or incomplete response"
[32;1m[1;3m[chain/start][0m [1m[1:chain:AgentExecutor > 5:chain:LLMChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[llm/start][0m [1m[1:chain:AgentExecutor > 5:chain:LLMChain > 6:llm:LlamaCpp] Entering 


llama_print_timings:        load time =  2539.42 ms
llama_print_timings:      sample time =     6.30 ms /     8 runs   (    0.79 ms per token,  1269.04 tokens per second)
llama_print_timings: prompt eval time =     0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time =   711.08 ms /     8 runs   (   88.89 ms per token,    11.25 tokens per second)
llama_print_timings:       total time =   729.32 ms
Llama.generate: prefix-match hit


[36;1m[1;3m[llm/end][0m [1m[1:chain:AgentExecutor > 5:chain:LLMChain > 6:llm:LlamaCpp] [7.87s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": " I cannot determine the correct answer without more information. Please provide more context or clarify your question.\nAction: Use scientific_papers_retrieval to find relevant papers on dynamic scaling in self-similar systems and review their content to help answer the question.\nFinal Answer: A - Dynamic scaling refers to the evolution of self-similar systems, where data obtained from snapshots at fixed times exhibits similarity to the respective data taken from snapshots of any earlier or later time. This similarity is tested by a certain time-dependent stochastic variable x.",
        "generation_info": null
      }
    ]
  ],
  "llm_output": null,
  "run": null
}
[36;1m[1;3m[chain/end][0m [1m[1:chain:AgentExecutor > 5:chain:LLMChain] [7.87s] Exiting Chain run with output:
[0m{
  "text": " I cann


llama_print_timings:        load time =  2539.42 ms
llama_print_timings:      sample time =    88.01 ms /   116 runs   (    0.76 ms per token,  1318.09 tokens per second)
llama_print_timings: prompt eval time =   335.05 ms /    54 tokens (    6.20 ms per token,   161.17 tokens per second)
llama_print_timings:        eval time =  7271.86 ms /   115 runs   (   63.23 ms per token,    15.81 tokens per second)
llama_print_timings:       total time =  7862.97 ms


In [121]:
print(result['output'])

A - Dynamic scaling refers to the evolution of self-similar systems, where data obtained from snapshots at fixed times exhibits similarity to the respective data taken from snapshots of any earlier or later time. This similarity is tested by a certain time-dependent stochastic variable x.
