In [None]:
%pip install -Uq weaviate-client langchain tiktoken pypdf rapidocr-onnxruntime
%pip install -Uq python-dotenv
%pip install -Uq langchain-community
%pip install -Uq tqdm

In [1]:
import os
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file
# Check if the environment variables are loaded
print("LOAD:", os.getenv("LOAD"))

LOAD: True


In [5]:
from langchain.vectorstores import Weaviate
import weaviate
# from langchain_community.vectorstores import Weaviate
from weaviate.classes.init import Auth

# Best practice: store your credentials in environment variables
weaviate_url = os.environ["WEAVIATE_URL"]
weaviate_api_key = os.environ["WEAVIATE_API_KEY"]

# Connect to Weaviate Cloud
# client = weaviate.Client(
#     url=weaviate_url, auth_client_secret=weaviate.AuthApiKey(weaviate_api_key)
# )

client = weaviate.connect_to_weaviate_cloud(
    cluster_url=weaviate_url,
    auth_credentials=Auth.api_key(weaviate_api_key),
)


print(client.is_ready())



True


In [None]:
# fixing unicode error in google colab
import locale
locale.getpreferredencoding = lambda: "UTF-8"

In [None]:
%pip install -Uq sentence-transformers
%pip install -Uq langchain-huggingface
%pip install -Uq ipywidgets
%pip install jupyterlab-widgets

In [7]:
# specify embedding model (using huggingface sentence transformer)
# from langchain_huggingface import HuggingFaceEmbeddings
# from tqdm.notebook import tqdm
# import time
# embedding_model_name = "sentence-transformers/all-mpnet-base-v2"
# #model_kwargs = {"device": "cuda"}
# embeddings = HuggingFaceEmbeddings(
#   model_name=embedding_model_name,
#   #model_kwargs=model_kwargs
# )
from langchain_huggingface import HuggingFaceEmbeddings
# from tqdm.notebook import tqdm
from tqdm import tqdm
import time

embedding_model_name = "sentence-transformers/all-mpnet-base-v2"
embeddings = HuggingFaceEmbeddings(
    model_name=embedding_model_name,
)

sentences = ["This is a test sentence.", "Another example sentence.", "More sentences for embeddings."]
embedded_sentences = []

for sentence in tqdm(sentences, desc="Embedding Sentences"):
    embedding = embeddings.embed_query(sentence)
    embedded_sentences.append(embedding)
    time.sleep(0.5)  # Simulate processing delay


Embedding Sentences: 100%|██████████| 3/3 [00:01<00:00,  1.52it/s]


### Load multiple types of pdf using the langchain just check with the document

https://python.langchain.com/docs/how_to/document_loader_pdf/

In [9]:
from langchain_community.document_loaders import PyPDFLoader
loader = PyPDFLoader("../eBook.pdf", extract_images=True)
pages = loader.load()

In [10]:
# pages

In [11]:
# Split text into chunks
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
docs = text_splitter.split_documents(pages)

In [None]:
# docs

In [19]:
# vector_db = Weaviate.from_documents(
#     docs, embeddings, client=client, by_text=False
# )
# from langchain_community.vectorstores import Weaviate

vector_db = Weaviate.from_documents(
    docs, embeddings, client=client, by_text=False
)

AttributeError: 'WeaviateClient' object has no attribute 'schema'

In [None]:
print(vector_db.similarity_search("what is rag?", k=3)[0].page_content)

In [None]:
print(vector_db.similarity_search("what is rag?", k=3)[1].page_content)

In [None]:
print(vector_db.similarity_search("what is rag?", k=3)[2].page_content)

In [None]:
print(
    vector_db.similarity_search(
        "what is attention?", k=3)
    )

In [None]:
from langchain.prompts import ChatPromptTemplate

template="""You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer the question.
If you don't know the answer, just say that you don't know.
Use ten sentences maximum and keep the answer concise.
Question: {question}
Context: {context}
Answer:
"""

In [None]:
prompt=ChatPromptTemplate.from_template(template)

In [None]:
prompt

In [None]:
from langchain import HuggingFaceHub

In [None]:
from google.colab import userdata
huggingfacehub_api_token=userdata.get('HuGGINGFACE_TOKEN')

In [None]:
model = HuggingFaceHub(
    huggingfacehub_api_token=huggingfacehub_api_token,
    repo_id="mistralai/Mistral-7B-Instruct-v0.1",
    model_kwargs={"temperature":1, "max_length":180}
)

In [None]:
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser

In [None]:
output_parser=StrOutputParser()

In [None]:
retriever=vector_db.as_retriever()

In [None]:
rag_chain = (
    {"context": retriever,  "question": RunnablePassthrough()}
    | prompt
    | model
    | output_parser
)

In [None]:
print(rag_chain.invoke("what is rag system?"))

In [None]:
print(rag_chain.invoke("How does the RAG model differ from traditional language generation models?"))