In [5]:
import os
import time
from datetime import datetime
from pathlib import Path

import pytz
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index import download_loader, LLMPredictor, ServiceContext, VectorStoreIndex, LangchainEmbedding
from llama_index.llms import LlamaCPP
from llama_index.llms.llama_utils import (completion_to_prompt, messages_to_prompt)

In [12]:
# Input locations used by the cells below. Each can be overridden with an
# environment variable so the notebook is not tied to one machine's directory
# layout; when the variable is unset or empty, the original path is used.
# CSV file that is loaded and indexed.
DATA = os.environ.get("LLAMA_NB_DATA") or "./data/data_pp.csv"
# GGML model file handed to LlamaCPP(model_path=...).
MODEL = os.environ.get("LLAMA_NB_MODEL") or "/home/catsmile/models/llama-2-7b-chat.ggmlv3.q4_1.bin"
# Embedding model location handed to HuggingFaceEmbeddings.
EMBEDDING = os.environ.get("LLAMA_NB_EMBEDDING") or "/home/catsmile/embeddings/BAAI_bge-small-en/"

In [3]:
# Load the pre-downloaded Llama-2 chat weights through llama.cpp.
# (Instead of model_path, a model_url such as
#  https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML/resolve/main/llama-2-7b-chat.ggmlv3.q4_0.bin
#  can be given to have the GGML file downloaded automatically.)
# The Llama-2 prompt formatters (messages_to_prompt / completion_to_prompt)
# are not passed here, so no Llama-2 specific input transformation is applied.
llm = LlamaCPP(
    verbose=True,
    model_path=MODEL,
    # kwargs forwarded to the model's __init__(); n_gpu_layers must be at
    # least 1 to use the GPU, so 0 keeps the run on the CPU.
    model_kwargs=dict(n_gpu_layers=0),
    # kwargs forwarded to the model's __call__(); none are needed.
    generate_kwargs=dict(),
    # Llama-2 has a 4096-token context window; this run uses only 512.
    context_window=512,
    # NOTE(review): a limit of 0 is passed through unchanged - confirm how the
    # installed llama_index / llama.cpp versions interpret it.
    max_new_tokens=0,
    temperature=0,
)

llama.cpp: loading model from /home/catsmile/models/llama-2-7b-chat.ggmlv3.q4_1.bin
llama_model_load_internal: format     = ggjt v3 (latest)
llama_model_load_internal: n_vocab    = 32000
llama_model_load_internal: n_ctx      = 512
llama_model_load_internal: n_embd     = 4096
llama_model_load_internal: n_mult     = 256
llama_model_load_internal: n_head     = 32
llama_model_load_internal: n_head_kv  = 32
llama_model_load_internal: n_layer    = 32
llama_model_load_internal: n_rot      = 128
llama_model_load_internal: n_gqa      = 1
llama_model_load_internal: rnorm_eps  = 5.0e-06
llama_model_load_internal: n_ff       = 11008
llama_model_load_internal: freq_base  = 10000.0
llama_model_load_internal: freq_scale = 1
llama_model_load_internal: ftype      = 3 (mostly Q4_1)
llama_model_load_internal: model size = 7B
llama_model_load_internal: ggml ctx size =    0.08 MB
llama_model_load_internal: mem required  = 4017.35 MB (+  256.00 MB per state)
llama_new_context_with_model: kv self size  =  25

In [14]:
# Wrap the local llama.cpp model so llama_index can drive it.
llm_predictor = LLMPredictor(llm=llm)

# Load the embedding model stored at EMBEDDING.
# The path has to be given as `model_name`: `cache_folder` only selects where
# downloaded models are stored and leaves the model itself at langchain's
# default, so the model named by the path was never the one being used.
# Assumes EMBEDDING is a directory containing the saved model files - TODO confirm.
embed_model = LangchainEmbedding(HuggingFaceEmbeddings(model_name=EMBEDDING))

# Bundle both so index construction and querying use the local models.
service_context = ServiceContext.from_defaults(embed_model=embed_model, llm_predictor=llm_predictor)

In [15]:
# Fetch the CSV loader class by name. Other loader names noted by the author:
# SimpleCSVReader, PandasCSVReader.
CSVReader = download_loader("PagedCSVReader")

# Read the CSV into documents, embed them into a vector index, and expose the
# index through a query engine.
csv_path = Path(DATA)
documents = CSVReader().load_data(file=csv_path)
index = VectorStoreIndex.from_documents(documents, service_context=service_context)
query_engine = index.as_query_engine()

In [16]:
# Llama-2 chat-style prompt: a <<SYS>> section asking for code to be wrapped in
# triple backticks, followed by the user question inside [INST] ... [/INST].
# NOTE(review): none of the visible cells below pass `prompt` to the query
# engine - confirm whether it is still needed.
prompt = (
    "\n"
    "[INST] <<SYS>>\n"
    "As an advanced language model, you can generate code as part of your responses. \n"
    "To make the code more noticeable and easier to read, please encapsulate it within triple backticks.\n"
    "For instance, if you're providing Python code, wrap it as follows:\n"
    "\n"
    "```python\n"
    "print('hellow world')\n"
    "```\n"
    "<</SYS>>\n"
    "\n"
    "{prompt} [/INST]\n"
).format(prompt="How can I provide you a dataframe or CSV file to analyze?")

In [17]:
# Ask the indexed CSV a question, keep the full result object for later
# inspection, and display only the answer text.
question = "Describe the dataset"
response = query_engine.query(question)
response.response

Llama.generate: prefix-match hit

llama_print_timings:        load time = 23513.24 ms
llama_print_timings:      sample time =    58.86 ms /   112 runs   (    0.53 ms per token,  1902.85 tokens per second)
llama_print_timings: prompt eval time =     0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time = 26728.00 ms /   112 runs   (  238.64 ms per token,     4.19 tokens per second)
llama_print_timings:       total time = 27037.21 ms


" The dataset contains two rows of data, each representing a student's information. The students are enrolled in different courses, with one student enrolled in Informatics Engineering and the other student dropped out of Nursing. The mothers of both students work in agriculture-related occupations, while the fathers work in various fields such as legislative power, executive bodies, and directors. None of the students hold scholarships. The age of the student at enrollment ranges from 18 to 39 years old."

In [18]:
# Display the whole result object (answer text plus the retrieved source nodes),
# not just the answer string.
response

Response(response=" The dataset contains two rows of data, each representing a student's information. The students are enrolled in different courses, with one student enrolled in Informatics Engineering and the other student dropped out of Nursing. The mothers of both students work in agriculture-related occupations, while the fathers work in various fields such as legislative power, executive bodies, and directors. None of the students hold scholarships. The age of the student at enrollment ranges from 18 to 39 years old.", source_nodes=[NodeWithScore(node=TextNode(id_='7cb60cb2-e013-4619-81b0-96dad3056cfc', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='defc3053-72cf-4303-b1a4-16d150e56262', node_type=None, metadata={}, hash='a6f02b237dc5ecf56bcfcbca93899164934ba35eeb570e4f630b118faf57a973')}, hash='a6f02b237dc5ecf56bcfcbca93899164934ba35eeb570e4f630b118faf57a973', te