In [1]:
import getpass
import os

import torch
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.core.indices.service_context import ServiceContext
from llama_index.embeddings.langchain import LangchainEmbedding
from llama_index.llms.llama_cpp import LlamaCPP
from llama_index.llms.llama_cpp.llama_utils import messages_to_prompt,completion_to_prompt

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# SECURITY: never store the Hugging Face access token in the notebook itself.
# Any token that has ever been saved in this cell must be treated as leaked:
# revoke it in the Hugging Face account settings and create a new one.
#
# The token is taken from the HUGGINGFACEHUB_API_TOKEN environment variable.
# Only when that variable is missing or empty is the user prompted for it;
# getpass does not echo the input, so the secret is not captured in the
# cell source or its output.
if not os.environ.get("HUGGINGFACEHUB_API_TOKEN"):
    os.environ["HUGGINGFACEHUB_API_TOKEN"] = getpass.getpass(
        "Hugging Face API token: "
    )

In [3]:
# Read everything found under the sample-PDF folder (path is relative to the
# notebook's working directory) into a list of document objects.
documents = SimpleDirectoryReader(
    input_dir="../Test pdfs/",
).load_data()

In [4]:
# Location of the model weights: a Q4_K_M GGUF build of
# Mistral-7B-Instruct v0.1 hosted on the Hugging Face Hub. Passing a URL lets
# LlamaCPP download the file automatically; a different (e.g. larger)
# quantised GGUF checkpoint can be used by changing this URL.
MISTRAL_GGUF_URL = 'https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF/resolve/main/mistral-7b-instruct-v0.1.Q4_K_M.gguf'

llm = LlamaCPP(
    # --- model source -------------------------------------------------------
    # The model comes from the URL; model_path is left unset. To use a
    # pre-downloaded file, give its path here instead of a model_url.
    model_url=MISTRAL_GGUF_URL,
    model_path=None,
    # --- generation settings ------------------------------------------------
    temperature=0.3,
    max_new_tokens=256,
    context_window=4096,
    # --- pass-through kwargs ------------------------------------------------
    # generate_kwargs is forwarded to __call__(), model_kwargs to __init__().
    generate_kwargs={},
    # NOTE(review): the earlier comment said "set to at least 1 to use GPU",
    # yet the value is -1. In llama-cpp-python -1 presumably means "offload
    # all layers to the GPU" -- confirm against the installed version.
    model_kwargs={"n_gpu_layers": -1},
    # --- prompt formatting --------------------------------------------------
    # NOTE(review): these llama_utils helpers were described here as producing
    # the Llama 2 prompt format, while the model loaded above is
    # Mistral-Instruct -- confirm the template is the intended one.
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    # Verbose mode; the cell output shows the llama.cpp loader logs.
    verbose=True,
)

llama_model_loader: loaded meta data with 20 key-value pairs and 291 tensors from C:\Users\manje\AppData\Local\llama_index\models\mistral-7b-instruct-v0.1.Q4_K_M.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = mistralai_mistral-7b-instruct-v0.1
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_mo

llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32000]   = [0.000000, 0.000000, 0.000000, 0.0000...
llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32000]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
llama_model_loader: - kv  19:               general.quantization_version u32              = 2
llama_model_loader: - type  f32:   65 tensors
llama_model_loader: - type q4_K:  193 tensors
llama_model_loader: - type q6_K:   33 tensors
llm_load_vocab: special tokens definition check successful ( 259/32000 ).
llm_load_print_meta: format           = GGUF V2
llm_load_print_meta: arch             = llama
llm_load_print_meta: vocab type       = SPM
llm_load

In [5]:
# Create the Hugging Face embedding model through LangChain first, then wrap
# it in the LlamaIndex adapter so it can be passed on as `embed_model`.
bge_embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")
embed_model = LangchainEmbedding(bge_embeddings)

In [6]:
# Bundle the LLM and the embedding model together with a chunk size of 256
# so that the index construction below can receive them as one object.
# NOTE(review): the cell output echoes this call as the source line of a
# warning, presumably the ServiceContext deprecation notice -- consider
# migrating to llama_index.core.Settings (confirm for the installed version).
service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model=embed_model,
    chunk_size=256,
)

  service_context = ServiceContext.from_defaults(


In [7]:
# Build the vector store index from the loaded documents, using the LLM,
# embedding model and chunk size carried by service_context.
index = VectorStoreIndex.from_documents(
    documents,
    service_context=service_context,
)

In [8]:
# Ask a single question against the index through a query engine created
# with default options; the result is kept in `response` for display.
question = "What is Integer Programming?"
query_engine = index.as_query_engine()
response = query_engine.query(question)


llama_print_timings:        load time =   35863.07 ms
llama_print_timings:      sample time =      46.81 ms /   238 runs   (    0.20 ms per token,  5083.84 tokens per second)
llama_print_timings: prompt eval time =   48308.02 ms /   693 tokens (   69.71 ms per token,    14.35 tokens per second)
llama_print_timings:        eval time =   29571.58 ms /   237 runs   (  124.77 ms per token,     8.01 tokens per second)
llama_print_timings:       total time =   78718.08 ms /   930 tokens


In [9]:
# Show the response's string form (the generated answer text).
print(str(response))

 Integer Programming (IP) is a type of optimization problem where the objective is to either maximize or minimize a specific goal while adhering to certain constraints. The solutions to IP problems must be integral (i.e., whole numbers) rather than real-valued. These types of problems are solvable using linear algebra when the constraints are linear, ensuring polynomial time complexity. However, finding integer-only solutions to IP problems can be difficult, and currently, IP problems are not solvable in polynomial time (they are NP-complete). The general IP problem involves finding a solution (σ1, σ2, σ3, ..., σm) in Nm of the system a11σ1+a12σ2+···+a1mσm=b1, a21σ1+a22σ2+···+a2mσm=b2, ..., an1σ1+an2σ2+···+anmσm=bn, where aij, bi, and cij are given integers and real numbers, respectively.
