In [2]:
from torch import cuda
import transformers
from langchain.llms import HuggingFacePipeline, OpenAI

# Hugging Face Hub id of the chat model used as the local LLM.
model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# Use the currently selected CUDA device when one is available, else the CPU.
device = f"cuda:{cuda.current_device()}" if cuda.is_available() else "cpu"

model = transformers.AutoModelForCausalLM.from_pretrained(model_id)

# Move the weights to the selected device explicitly; without this the model
# is never placed on `device` and the message printed below would be wrong
# whenever CUDA is available.
model = model.to(device)

# Inference only: put the module in evaluation mode.
model.eval()

print(f"Model loaded on {device}")

tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_id
)

import torch
# Strings that should end a generation when they appear at the tail of the output.
stop_list = ['<|user|>', '\n```\n']

# Encode each stop string WITHOUT special tokens. With the default settings the
# tokenizer adds special tokens (e.g. a leading BOS id) to the encoding, and a
# sequence starting with BOS can never match the tail of generated text.
# NOTE(review): sentencepiece-based tokenizers may still encode a string
# differently in isolation than mid-sentence -- verify the ids before relying
# on these sequences.
stop_token_ids = [
    torch.LongTensor(tokenizer(x, add_special_tokens=False)['input_ids']).to(device)
    for x in stop_list
]

from transformers import StoppingCriteria, StoppingCriteriaList

# define custom stopping criteria object
class StopOnTokens(StoppingCriteria):
    """Stopping criterion that fires when the sequence ends with a stop sequence.

    The stop sequences are read from the module-level ``stop_token_ids`` list
    (one 1-D tensor of token ids per stop string).
    """

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        """Return True if the first sequence in ``input_ids`` ends with any stop sequence.

        Args:
            input_ids: token ids generated so far; only row 0 is inspected.
            scores: unused, accepted for interface compatibility.
            **kwargs: unused.

        Returns:
            True when generation should stop, False otherwise.
        """
        generated = input_ids[0]
        for stop_ids in stop_token_ids:
            stop_len = len(stop_ids)
            # An empty stop sequence, or one longer than what has been
            # generated so far, cannot match; comparing them anyway would
            # raise a shape-mismatch error in torch.
            if stop_len == 0 or stop_len > len(generated):
                continue
            # Compare on the device of the generated ids so a stop tensor
            # created on a different device does not raise.
            if torch.equal(generated[-stop_len:], stop_ids.to(generated.device)):
                return True
        return False

stopping_criteria = StoppingCriteriaList([StopOnTokens()])

# Text-generation pipeline wrapping the local model; it is handed to LangChain below.
generate_text = transformers.pipeline(
    model=model,
    tokenizer=tokenizer,
    # Run generation on the device selected above; without this the pipeline
    # is not told which device to use and may keep the model on the CPU.
    device=device,
    return_full_text=True,  # langchain expects the full text
    task='text-generation',
    # stopping_criteria=stopping_criteria,
    # we pass model parameters here too
    max_new_tokens=128,  # max number of tokens to generate in the output
    repetition_penalty=1.3  # without this output begins repeating
)

# LangChain LLM wrapper around the Hugging Face pipeline.
llm = HuggingFacePipeline(pipeline=generate_text)

Model loaded on cpu


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [3]:
from langchain.chains import ConversationalRetrievalChain, LLMChain
from langchain.memory import ConversationSummaryBufferMemory
from langchain_core.prompts import PromptTemplate
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader
from langchain.tools import Tool

In [4]:
# Parse the source PDF into LangChain documents.
loader = PyPDFLoader('/data/study/huggingface.pdf')
documents = loader.load()

# Split the documents into chunks (chunk_size=1000, chunk_overlap=20).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=20,
    chunk_size=1000,
)
all_splits = text_splitter.split_documents(documents)

# Sentence-embedding model, configured to run on the CPU.
model_kwargs = {"device": "cpu"}
model_name = "sentence-transformers/all-mpnet-base-v2"
embeddings = HuggingFaceEmbeddings(
    model_kwargs=model_kwargs,
    model_name=model_name,
)

# Embed every chunk and index the result in a FAISS vector store.
vectorstore = FAISS.from_documents(all_splits, embeddings)

In [5]:
# Summary-buffer conversation memory driven by the local LLM. It is exposed to
# chains under "chat_history" and records the "question" input of each call.
memory = ConversationSummaryBufferMemory(
    llm=llm,
    memory_key="chat_history",
    input_key="question",
    max_token_limit=256,
    return_messages=True,
)

In [6]:
# Prompt that asks the model to merge the chat history and a follow-up question
# into one standalone question. Inputs: {chat_history}, {question}.
# The "flow up question" typo was corrected to "follow up question" so the
# instruction matches the wording used in the rest of the prompt.
question_generator_template = PromptTemplate.from_template(
    "<|system|>\n"
    "You are an assistant who takes a chat history and a follow up question and "
    "combine the chat history and follow up question into "
    "a standalone question. Chat History: {chat_history}</s>"
    "\n<|user|>\n"
    "Follow up question: {question}</s>"
    "\n<|assistant|>\n"
)

# Answering prompt: the system turn carries the instructions, the retrieved
# {context} and the {chat_history}; the user turn carries the {question}.
qa_template = PromptTemplate.from_template(
    "<|system|>\n"
    "You are an assistant who reads a context and respond the user question "
    "based on it, if the answer is not in the context do not make it up\n"
    "Context:\n"
    "{context}\n"
    "{chat_history}</s>\n"
    "<|user|>\n"
    "{question}</s>\n"
    "<|assistant|>\n"
)

# Prompt for the "Python code" tool; its only input is {question}.
python_template = PromptTemplate.from_template(
    "<|system|>\n"
    "You are an assistant who write and execute python code to solve "
    "the user question respond always with the code and the execution "
    "result only</s>\n"
    "<|user|>\n"
    "{question}</s>\n"
    "<|assistant|>\n"
)

# Retrieval chain over the FAISS index, using the local LLM, the custom
# condense/answer prompts and the shared conversation memory.
chain = ConversationalRetrievalChain.from_llm(
    llm,
    retriever=vectorstore.as_retriever(),
    memory=memory,
    condense_question_prompt=question_generator_template,
    combine_docs_chain_kwargs={"prompt": qa_template},
    rephrase_question=False,
    verbose=True,
)

# Plain LLM chain that formats the question with python_template and sends it
# to the local LLM.
python_chain = LLMChain(
    prompt=python_template,
    llm=llm,
    verbose=True,
)

In [7]:
# Tools offered to the agent; each one forwards the agent's input to a chain.
# NOTE(review): both callables are `.invoke`, so the value handed back to the
# agent is whatever the chain's invoke() returns -- confirm that is the
# intended observation format.
tools = [
    Tool(
        name='Search documents',
        func=chain.invoke,
        description='Always try to use this tool first if the answer could be found in the provided context.',
    ),
    Tool(
        name='Python code',
        func=python_chain.invoke,
        description='This tool is for execute and write python code to solve some specific problem, use it to find out what a code does too.',
    ),
]

In [8]:
from langchain.agents import AgentExecutor, create_react_agent

In [9]:
# ReAct-style agent prompt. Placeholders: {tools} and {tool_names} describe the
# available tools, {input} is the user question, and {agent_scratchpad} holds
# the Thought/Action/Observation trace accumulated so far.
# The template text is whitespace-sensitive; edit it with care.
prompt = PromptTemplate.from_template(
'''
Answer the following questions as best you can. You have access to the following tools:

{tools}

Use the following format:

Question: the input question you must answer
Thought: you should always think about what to do
Action: the action to take, should be one of [{tool_names}]
Action Input: the input to the action
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can repeat N times)
Thought: I now know the final answer
Final Answer: the final answer to the original input question

Begin!
Question: {input}
Thought:{agent_scratchpad}
'''
)

In [10]:
import os

# SECURITY: never hard-code API keys in a notebook. The key is read from the
# OPENAI_API_KEY environment variable; a missing variable raises KeyError
# immediately instead of failing later with an authentication error.
# The key that was previously committed in this cell must be treated as
# leaked and revoked.
router = OpenAI(openai_api_key=os.environ["OPENAI_API_KEY"])

# ReAct agent: the OpenAI model routes each question to one of the tools.
agent = create_react_agent(router, tools, prompt)

  warn_deprecated(


In [11]:
# Executor that runs the agent loop over the tools, capped at 5 iterations.
agent_executor = AgentExecutor(
    agent=agent,
    tools=tools,
    max_iterations=5,
    verbose=True,
)

In [12]:
import langchain
# Turn on LangChain's global debug flag for the cells that follow.
langchain.debug = True

In [24]:
# Question expected to be answered from the indexed PDF.
agent_executor.invoke(
    {"input": "what is huggingface?"}
)

[32;1m[1;3m[chain/start][0m [1m[1:chain:AgentExecutor] Entering Chain run with input:
[0m{
  "input": "what is huggingface?"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:AgentExecutor > 2:chain:RunnableSequence] Entering Chain run with input:
[0m{
  "input": "what is huggingface?",
  "intermediate_steps": []
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:AgentExecutor > 2:chain:RunnableSequence > 3:chain:RunnableAssign<agent_scratchpad>] Entering Chain run with input:
[0m{
  "input": "what is huggingface?",
  "intermediate_steps": []
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:AgentExecutor > 2:chain:RunnableSequence > 3:chain:RunnableAssign<agent_scratchpad> > 4:chain:RunnableParallel<agent_scratchpad>] Entering Chain run with input:
[0m{
  "input": "what is huggingface?",
  "intermediate_steps": []
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:AgentExecutor > 2:chain:RunnableSequence > 3:chain:RunnableAssign<agent_scratchpad> > 4:chain:RunnableParallel<agent_scratchpad> > 5

{'input': 'what is huggingface?',
 'output': 'A huggingface company is a technology company founded by the creator of the open source natural language processing platform called Hugging Face. It was established in 2016 after the founder, Ian Goodfellow II, left Google Brain Research Lab due to disagreements over intellectual property rights related to neural networks. Since then, huggingface has grown rapidly, becoming one of the largest providers of pre-trained language models like BERT, RoBERTA, XLM, etc., along with other advanced technologies in Natural Language Processing (NLP). Its mission statement emphasizes creating high-quality, user-friendly, and open-source NLP tools for developers and researchers.'}

In [25]:
# Follow-up question that refers back to the previous one ("there").
agent_executor.invoke(
    {"input": "what is models are hosted there?"}
)

[32;1m[1;3m[chain/start][0m [1m[1:chain:AgentExecutor] Entering Chain run with input:
[0m{
  "input": "what is models are hosted there?"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:AgentExecutor > 2:chain:RunnableSequence] Entering Chain run with input:
[0m{
  "input": "what is models are hosted there?",
  "intermediate_steps": []
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:AgentExecutor > 2:chain:RunnableSequence > 3:chain:RunnableAssign<agent_scratchpad>] Entering Chain run with input:
[0m{
  "input": "what is models are hosted there?",
  "intermediate_steps": []
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:AgentExecutor > 2:chain:RunnableSequence > 3:chain:RunnableAssign<agent_scratchpad> > 4:chain:RunnableParallel<agent_scratchpad>] Entering Chain run with input:
[0m{
  "input": "what is models are hosted there?",
  "intermediate_steps": []
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:AgentExecutor > 2:chain:RunnableSequence > 3:chain:RunnableAssign<agent_scratchpad> 

{'input': 'what is models are hosted there?',
 'output': "Yes, hosted models refer to pre-trained language models that have already been trained on large amounts of data but are still accessible via APIs. This means they can be used without having to download them locally first. Hosted models typically come at a higher cost than custom models since they require more resources such as computing power and storage space. However, hosting models offers several benefits including faster inference times, better accuracy compared to custom models, and greater scalability when dealing with larger datasets. Some popular hosted models include OpenAI's GPT-3, Microsoft's Bing Speech API, and IBM Watson's Text Analytics."}

In [13]:
# Coding request, meant to exercise the "Python code" tool.
agent_executor.invoke(
    {"input": "write a python function to sum two numbers"}
)

[32;1m[1;3m[chain/start][0m [1m[1:chain:AgentExecutor] Entering Chain run with input:
[0m{
  "input": "write a python function to sum two numbers"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:AgentExecutor > 2:chain:RunnableSequence] Entering Chain run with input:
[0m{
  "input": "write a python function to sum two numbers",
  "intermediate_steps": []
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:AgentExecutor > 2:chain:RunnableSequence > 3:chain:RunnableAssign<agent_scratchpad>] Entering Chain run with input:
[0m{
  "input": "write a python function to sum two numbers",
  "intermediate_steps": []
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:AgentExecutor > 2:chain:RunnableSequence > 3:chain:RunnableAssign<agent_scratchpad> > 4:chain:RunnableParallel<agent_scratchpad>] Entering Chain run with input:
[0m{
  "input": "write a python function to sum two numbers",
  "intermediate_steps": []
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:AgentExecutor > 2:chain:RunnableSequence > 3

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[36;1m[1;3m[llm/end][0m [1m[1:chain:AgentExecutor > 2:chain:RunnableSequence > 7:llm:OpenAI] [1.04s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": "Make a function that takes two numbers and adds them together\nAction: Python code\nAction Input: def sum(a,b): return a+b",
        "generation_info": {
          "finish_reason": "stop",
          "logprobs": null
        },
        "type": "Generation"
      }
    ]
  ],
  "llm_output": {
    "token_usage": {
      "prompt_tokens": 182,
      "completion_tokens": 28,
      "total_tokens": 210
    },
    "model_name": "gpt-3.5-turbo-instruct"
  },
  "run": null
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:AgentExecutor > 2:chain:RunnableSequence > 8:parser:ReActSingleInputOutputParser] Entering Parser run with input:
[0m{
  "input": "Make a function that takes two numbers and adds them together\nAction: Python code\nAction Input: def sum(a,b): return a+b"
}
[36;1m[1;3m[chain/end][0m [1m[1:ch

{'input': 'write a python function to sum two numbers',
 'output': 'def sum(a,b): return a+b'}