In [1]:
from langchain_community.document_loaders import DirectoryLoader, TextLoader, PyPDFLoader, WebBaseLoader, PyPDFDirectoryLoader

# Read every file matching *.pdf in ./data/google/ with PyPDFLoader,
# displaying a progress bar while the files are parsed.
loader = DirectoryLoader(
    path="./data/google/",
    glob="*.pdf",
    loader_cls=PyPDFLoader,
    show_progress=True,
)
docs = loader.load()

100%|██████████| 14/14 [00:24<00:00,  1.75s/it]


In [2]:
from langchain.text_splitter import CharacterTextSplitter

# Cut the loaded documents into chunks with a target size of 1000 and no
# overlap between neighbouring chunks.
# NOTE(review): CharacterTextSplitter splits only on its separator, so a chunk
# can presumably exceed chunk_size when the separator is absent — confirm the
# resulting chunk lengths for these PDFs.
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs_split = text_splitter.split_documents(docs)

In [3]:
# Vector store setup: Pinecone client plus the embedding backends
import os
import pinecone 
from pinecone import Pinecone
from langchain_community.vectorstores import Pinecone as PineconeVectorStore
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain_community.embeddings import LlamaCppEmbeddings, HuggingFaceEmbeddings

# Pinecone client; the API key is read from the PINECONE_API_KEY environment
# variable rather than being written into the notebook.
pc = Pinecone(
    api_key=os.environ.get("PINECONE_API_KEY"),
)


In [4]:
# Use a free embedding model (sentence-transformers all-MiniLM-L6-v2), run on the CPU.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={"device": "cpu"},
)

# Embed the split documents and store them in the Pinecone index named 'd384'.
# NOTE(review): re-running this cell presumably uploads the same chunks again —
# confirm whether the index should be cleared or the upload skipped on re-runs.
doc_db = PineconeVectorStore.from_documents(docs_split, embeddings, index_name="d384")

In [None]:
# Sanity-check retrieval on its own, before any LLM is involved.
query = "What were the most important events for Google in 2021?"
search_docs = doc_db.similarity_search(query=query)
# To inspect the top hit, look at search_docs[0].page_content.

In [5]:
from langchain_community.llms       import LlamaCpp, CTransformers
import os

# Location of the GGUF model weights. Set the LLAMA_MODEL_PATH environment
# variable to point at a different file; when it is unset, the original
# hard-coded location is used so existing setups keep working.
LLAMA_MODEL_PATH = os.environ.get(
    "LLAMA_MODEL_PATH",
    "e:/models/llama/llama-2-7b-chat.Q6_K.gguf",
)

# Local Llama 2 chat model served through llama.cpp.
llm = LlamaCpp(
    model_path=LLAMA_MODEL_PATH,
    # NOTE(review): the loader log reports llama.block_count = 32, so 40
    # presumably just means "offload every layer" — confirm.
    n_gpu_layers=40,
    # Context window passed to llama.cpp; the loader log reports a model
    # context_length of 4096.
    n_ctx=2048,
    n_batch=256,  # Batch size for model processing
)


llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from e:/models/llama/llama-2-7b-chat.Q6_K.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = LLaMA v2
llama_model_loader: - kv   2:                       llama.context_length u32              = 4096
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 11008
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:                 llama.attention.head_count u32  

In [6]:
from langchain.chains import RetrievalQA

# Question-answering chain: chunks fetched by the Pinecone retriever are passed
# to the local LLM using the 'stuff' chain type.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=doc_db.as_retriever(),
)

# Ask a question; the chain returns a dict holding the query and the answer.
query = "What were the earnings in 2022?"
result = qa.invoke({"query": query})

result


llama_print_timings:        load time =     798.58 ms
llama_print_timings:      sample time =       3.77 ms /    20 runs   (    0.19 ms per token,  5300.82 tokens per second)
llama_print_timings: prompt eval time =    3067.65 ms /  1108 tokens (    2.77 ms per token,   361.19 tokens per second)
llama_print_timings:        eval time =     982.32 ms /    19 runs   (   51.70 ms per token,    19.34 tokens per second)
llama_print_timings:       total time =    4116.43 ms /  1127 tokens


{'query': 'What were the earnings in 2022?',
 'result': ' Total revenues for 2022 were $69,685 million.'}