In [None]:
# llama.cpp env setup
! git clone https://github.com/ggerganov/llama.cpp
! cd llama.cpp
! make
! python3 -m pip install -r requirements.txt

In [None]:
# Download https://huggingface.co/ziqingyang/chinese-alpaca-2-7b/tree/main

# pth -> f16
! python3 convert.py ../chinese-alpaca-2-7b/

In [None]:
# f16 -> q4
! python3 ./quantize ../chinese-alpaca-2-7b/ggml-model-f16.bin ../chinese-alpaca-2-7b/gml-model-q4_0.bin q4_0

In [None]:
# [CPU]
! pip install  llama-cpp-python

In [None]:
# [GPU]
# nvcc install
! wget https://developer.download.nvidia.com/compute/cuda/12.2.0/local_installers/cuda_12.2.0_535.54.03_linux.run
! sudo sh cuda_12.2.0_535.54.03_linux.run
! sudo apt install nvidia-cuda-toolkit
! mkdir build
! cd build
! cmake .. -DLLAMA_CUBLAS=ON
! cmake --build . --config Release
! CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install --upgrade --force-reinstall llama-cpp-python --no-cache-dir

In [1]:
! pip install langchain

Collecting langchain
  Obtaining dependency information for langchain from https://files.pythonhosted.org/packages/80/d6/1160e4df19e54f1a421883873e82d0f6aba2a368c8c9ea46a9d170c585bf/langchain-0.0.248-py3-none-any.whl.metadata
  Downloading langchain-0.0.248-py3-none-any.whl.metadata (14 kB)
Collecting PyYAML>=5.4.1 (from langchain)
  Obtaining dependency information for PyYAML>=5.4.1 from https://files.pythonhosted.org/packages/7d/39/472f2554a0f1e825bd7c5afc11c817cd7a2f3657460f7159f691fbb37c51/PyYAML-6.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Downloading PyYAML-6.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.1 kB)
Collecting SQLAlchemy<3,>=1.4 (from langchain)
  Obtaining dependency information for SQLAlchemy<3,>=1.4 from https://files.pythonhosted.org/packages/84/bc/72e7fce7151e2540b72776b515f10bee72d68112965b90b4cf400d39b6f1/SQLAlchemy-2.0.19-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Downloading SQLAl

In [1]:
from langchain.llms import LlamaCpp
from langchain import PromptTemplate, LLMChain
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

In [2]:
# Prompt with a single `{question}` placeholder. The answer is primed with a
# "step by step" instruction so the model reasons before answering.
template = (
    "Question: {question}\n"
    "\n"
    "Answer: Let's work this out in a step by step way to be sure we have the right answer."
)

prompt = PromptTemplate(input_variables=["question"], template=template)

In [3]:
# Stream generated tokens to stdout one by one as they are produced.
stdout_handlers = [StreamingStdOutCallbackHandler()]
callback_manager = CallbackManager(stdout_handlers)
# The LlamaCpp constructors below pass `verbose=True` together with this
# manager; verbose is required for the callbacks to be invoked.

In [9]:
# [CPU]
# Make sure the model path is correct for your system!
# The path is absolute (no leading "./"), matching the GPU cell and the
# location reported in the recorded model-loading log.
llm = LlamaCpp(
    model_path="/home/sung/llm/chinese-alpaca-2-7b/gml-model-q4_0.bin",
    callback_manager=callback_manager,
    verbose=True,  # needed so the streaming callback receives tokens
)

In [4]:
# [GPU]
# Offloading knobs -- tune both to the model and the VRAM available.
n_batch = 8  # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU.
n_gpu_layers = 40  # Change this value based on your model and your GPU VRAM pool.

# Make sure the model path is correct for your system!
llm = LlamaCpp(
    verbose=True,
    callback_manager=callback_manager,
    n_batch=n_batch,
    n_gpu_layers=n_gpu_layers,
    model_path="/home/sung/llm/chinese-alpaca-2-7b/gml-model-q4_0.bin",
)

ggml_init_cublas: found 1 CUDA devices:
  Device 0: NVIDIA GeForce RTX 2060 SUPER, compute capability 7.5
llama.cpp: loading model from /home/sung/llm/chinese-alpaca-2-7b/gml-model-q4_0.bin
llama_model_load_internal: format     = ggjt v3 (latest)
llama_model_load_internal: n_vocab    = 55296
llama_model_load_internal: n_ctx      = 512
llama_model_load_internal: n_embd     = 4096
llama_model_load_internal: n_mult     = 5504
llama_model_load_internal: n_head     = 32
llama_model_load_internal: n_head_kv  = 32
llama_model_load_internal: n_layer    = 32
llama_model_load_internal: n_rot      = 128
llama_model_load_internal: n_gqa      = 1
llama_model_load_internal: rnorm_eps  = 1.0e-06
llama_model_load_internal: n_ff       = 11008
llama_model_load_internal: freq_base  = 10000.0
llama_model_load_internal: freq_scale = 1
llama_model_load_internal: ftype      = 2 (mostly Q4_0)
llama_model_load_internal: model size = 7B
llama_model_load_internal: ggml ctx size =    0.08 MB
llama_model_load_inte

In [5]:
# Bind the prompt template to the loaded model so the chain can be run
# with just a question.
llm_chain = LLMChain(
    llm=llm,
    prompt=prompt,
)

In [6]:
# Sample query that needs two facts combined (a birth year, then that
# year's Super Bowl winner).
# NOTE(review): the recorded answer below names the Pittsburgh Steelers;
# that looks incorrect (Super Bowl XXVIII, January 1994, was won by the
# Dallas Cowboys) -- treat the output as a demo of the pipeline only.
question = (
    "What NFL team won the Super Bowl in the year Justin Bieber was born?"
)

# Last expression of the cell: the returned answer is displayed as output.
llm_chain.run(question)

答案：在贾斯汀·比伯出生的那个赛季，匹兹堡钢人赢得了超级碗冠军。


llama_print_timings:        load time =   331.69 ms
llama_print_timings:      sample time =    26.66 ms /    23 runs   (    1.16 ms per token,   862.62 tokens per second)
llama_print_timings: prompt eval time =  1536.70 ms /    45 tokens (   34.15 ms per token,    29.28 tokens per second)
llama_print_timings:        eval time =   387.68 ms /    22 runs   (   17.62 ms per token,    56.75 tokens per second)
llama_print_timings:       total time =  2103.36 ms


'答案：在贾斯汀·比伯出生的那个赛季，匹兹堡钢人赢得了超级碗冠军。'