Install chromadb and tiktoken packages using pip

PART 1: Web Question-Answering

In [3]:
import os

# Read the OpenAI API key from a local file so the secret never appears in the
# notebook itself. The context manager closes the file handle as soon as the key
# has been read; a bare open() would leave it open for the life of the kernel.
with open("./openai_key") as f:
    openai_key = f.read().strip()  # drop surrounding whitespace / trailing newline

# Export the key through the process environment.
# NOTE(review): presumably the OpenAI-backed LangChain components used in the
# following cells read OPENAI_API_KEY from here -- confirm against their docs.
os.environ["OPENAI_API_KEY"] = openai_key

In [4]:
from langchain.document_loaders import WebBaseLoader
from langchain.indexes import VectorstoreIndexCreator

# Page whose contents are queried in this part of the notebook.
homepage_url = "https://fsalim.github.io"

# Loader for that page; it is handed to the index creator below.
loader = WebBaseLoader(homepage_url)

# Build a queryable index from the loader, using the creator's default settings.
index_creator = VectorstoreIndexCreator()
index = index_creator.from_loaders([loader])

# Ask a question against the indexed page; the answer is the cell's output.
question = "At which conference was Flora the PC Co-chair? "
index.query(question)

' UbiComp 2020'

In [5]:
# Second question against the same `index` object built in the previous cell.
question = "How many awards has Flora received, as per her homepage? "
awards_answer = index.query(question)
# Bare last expression so the notebook displays the answer string.
awards_answer

' Flora has received 11 awards.'

PART 2: Running chains on an LLM on your machine: LLaMA

In [1]:
# pip install speechrecognition and pyttsx3

In [11]:
from langchain.llms import LlamaCpp
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

In [42]:
import os

from langchain.llms import LlamaCpp
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

# Location of the local GGML weights file. Set the LLAMA_MODEL_PATH environment
# variable to point at your own copy; when it is unset the original author's
# path is used, so behaviour on that machine is unchanged.
model_path = os.environ.get(
    "LLAMA_MODEL_PATH",
    "/Users/aditya/Documents/GitHub/llama/llama-7b.ggmlv3.q4_0.bin",
)

n_gpu_layers = 1  # Metal set to 1 is enough.
n_batch = 64  # Should be between 1 and n_ctx, consider the amount of RAM of your Apple Silicon Chip.

# Callback manager with a stdout streaming handler attached; it is passed to the
# model below and reused by later cells.
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

# Make sure the model path is correct for your system!
llm = LlamaCpp(
    model_path=model_path,
    n_gpu_layers=n_gpu_layers,
    n_batch=n_batch,
    # Context size; the load log reports it back as "n_ctx = 128".
    # NOTE(review): the run further down stops mid-sentence after ~94 sampled
    # tokens on a 31-token prompt, which suggests this small window is what cuts
    # the answer short -- raise n_ctx if complete answers are wanted.
    n_ctx=128,
    f16_kv=True,
    callback_manager=callback_manager,
    verbose=True,  # emits the llama.cpp load log and timing lines
)


llama.cpp: loading model from /Users/aditya/Documents/GitHub/llama/llama-7b.ggmlv3.q4_0.bin
llama_model_load_internal: format     = ggjt v3 (latest)
llama_model_load_internal: n_vocab    = 32000
llama_model_load_internal: n_ctx      = 128
llama_model_load_internal: n_embd     = 4096
llama_model_load_internal: n_mult     = 256
llama_model_load_internal: n_head     = 32
llama_model_load_internal: n_layer    = 32
llama_model_load_internal: n_rot      = 128
llama_model_load_internal: freq_base  = 10000.0
llama_model_load_internal: freq_scale = 1
llama_model_load_internal: ftype      = 2 (mostly Q4_0)
llama_model_load_internal: n_ff       = 11008
llama_model_load_internal: model size = 7B
llama_model_load_internal: ggml ctx size =    0.08 MB
llama_model_load_internal: mem required  = 5287.72 MB (+ 1026.00 MB per state)
llama_new_context_with_model: kv self size  =   64.00 MB
AVX = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 0 | ARM_FMA = 0 | F16C = 0 | F

In [46]:
# PromptTemplate is not imported by any earlier cell, so import it here to keep
# the notebook runnable from a fresh kernel (Restart & Run All).
from langchain.prompts import PromptTemplate

# Prompt with a single `{question}` slot. The trailing "Let's think step by
# step." nudges the model towards a longer, reasoned answer.
template = """Question: {question}

Answer: Let's think step by step. """
prompt = PromptTemplate(template=template, input_variables=["question"])

In [47]:
# LLMChain is not imported by any earlier cell, so import it here to keep the
# notebook runnable from a fresh kernel (Restart & Run All).
from langchain.chains import LLMChain

# Chain pairing the current `prompt` template with the local `llm`
# (both defined in earlier cells).
llm_chain_llama = LLMChain(prompt=prompt, llm=llm)

In [48]:
# Run the chain on a single question. The model was built with a stdout
# streaming callback and verbose=True, so generated text and timing lines are
# printed while the call runs; the returned string is shown as the cell output.
parameter = "Who is the better CEO: Mark Zuckerberg or Elon Musk? "
llama_answer = llm_chain_llama.run(parameter)
llama_answer

Llama.generate: prefix-match hit


 First, Mark has a few advantages over Elon in that he built a successful company (Facebook) while Elon has yet to build a commercially viable product on his own; however, Elon is working on it now with the Tesla and SpaceX projects, so both are still "works in progress."  But for Mark's sake, let's say he has 15-20 years head start on this, so that


llama_print_timings:        load time = 14223.83 ms
llama_print_timings:      sample time =   122.95 ms /    94 runs   (    1.31 ms per token,   764.52 tokens per second)
llama_print_timings: prompt eval time = 15294.35 ms /    31 tokens (  493.37 ms per token,     2.03 tokens per second)
llama_print_timings:        eval time = 685601.68 ms /    93 runs   ( 7372.06 ms per token,     0.14 tokens per second)
llama_print_timings:       total time = 701988.23 ms


' First, Mark has a few advantages over Elon in that he built a successful company (Facebook) while Elon has yet to build a commercially viable product on his own; however, Elon is working on it now with the Tesla and SpaceX projects, so both are still "works in progress."  But for Mark\'s sake, let\'s say he has 15-20 years head start on this, so that'

In [56]:
# Replacement prompt: same `{question}` slot as before, but the instruction now
# asks for a two-word answer instead of step-by-step reasoning.
template = (
    "Question: {question}\n"
    "\n"
    "Answer in exactly two words: "
)
prompt = PromptTemplate(input_variables=["question"], template=template)

In [59]:
import os

# Location of the local GGML weights file. Set the LLAMA_MODEL_PATH environment
# variable to point at your own copy; when it is unset the original author's
# path is used, so behaviour on that machine is unchanged.
model_path = os.environ.get(
    "LLAMA_MODEL_PATH",
    "/Users/aditya/Documents/GitHub/llama/llama-7b.ggmlv3.q4_0.bin",
)

n_batch = 32

# Rebuild the model with a much smaller context size. `n_gpu_layers` and
# `callback_manager` are reused from the earlier model-construction cell.
llm = LlamaCpp(
    model_path=model_path,
    n_gpu_layers=n_gpu_layers,
    n_batch=n_batch,
    # Context size; the load log reports it back as "n_ctx = 33".
    # NOTE(review): the run below evaluates a 30-token prompt and samples only
    # 3 tokens, so this window appears to be what limits the answer length --
    # it also cuts the reply off mid-answer (' [Elon').
    n_ctx=33,
    f16_kv=True,
    callback_manager=callback_manager,
    verbose=True,  # emits the llama.cpp load log and timing lines
)

# Re-create the chain so it uses the current `prompt` and the new `llm`.
llm_chain_llama = LLMChain(prompt=prompt, llm=llm)

llama.cpp: loading model from /Users/aditya/Documents/GitHub/llama/llama-7b.ggmlv3.q4_0.bin
llama_model_load_internal: format     = ggjt v3 (latest)
llama_model_load_internal: n_vocab    = 32000
llama_model_load_internal: n_ctx      = 33
llama_model_load_internal: n_embd     = 4096
llama_model_load_internal: n_mult     = 256
llama_model_load_internal: n_head     = 32
llama_model_load_internal: n_layer    = 32
llama_model_load_internal: n_rot      = 128
llama_model_load_internal: freq_base  = 10000.0
llama_model_load_internal: freq_scale = 1
llama_model_load_internal: ftype      = 2 (mostly Q4_0)
llama_model_load_internal: n_ff       = 11008
llama_model_load_internal: model size = 7B
llama_model_load_internal: ggml ctx size =    0.08 MB
llama_model_load_internal: mem required  = 5287.72 MB (+ 1026.00 MB per state)
llama_new_context_with_model: kv self size  =   16.50 MB
AVX = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 0 | ARM_FMA = 0 | F16C = 0 | FP

In [60]:
# Ask the same question again through the rebuilt chain. Generated text and
# timing lines are printed while the call runs; the returned string is shown as
# the cell output.
parameter = "Who is the better CEO: Mark Zuckerberg or Elon Musk? "
llama_answer = llm_chain_llama.run(parameter)
llama_answer

 [Elon


llama_print_timings:        load time = 15680.60 ms
llama_print_timings:      sample time =     3.53 ms /     3 runs   (    1.18 ms per token,   850.82 tokens per second)
llama_print_timings: prompt eval time = 15680.39 ms /    30 tokens (  522.68 ms per token,     1.91 tokens per second)
llama_print_timings:        eval time = 16207.50 ms /     2 runs   ( 8103.75 ms per token,     0.12 tokens per second)
llama_print_timings:       total time = 31913.30 ms


' [Elon'