In [None]:
from dotenv import load_dotenv
import streamlit as st
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
import os

# Vector Store

## Qdrant Vectorstore 

In [None]:
from langchain.vectorstores import Qdrant
import os
import qdrant_client

# Qdrant connection settings. A non-empty value that is already present in
# the environment wins; otherwise the literal on the right is used, so you can
# either export the variables before starting the kernel or fill them in here.
os.environ['QDRANT_HOST'] = os.environ.get('QDRANT_HOST') or ''
os.environ['QDRANT_API_KEY'] = os.environ.get('QDRANT_API_KEY') or ''
os.environ['QDRANT_COLLECTION_NAME'] = os.environ.get('QDRANT_COLLECTION_NAME') or ''

def get_vector_store():
    """Build a LangChain ``Qdrant`` vector store bound to a Qdrant collection.

    Connection settings are read from the environment variables
    ``QDRANT_HOST``, ``QDRANT_API_KEY`` and ``QDRANT_COLLECTION_NAME``.

    The collection is created only when it does not exist yet, so calling
    this function again does not wipe vectors that were stored earlier.

    Returns:
        Qdrant: vector store for the configured collection that embeds texts
        and queries with ``OpenAIEmbeddings``.
    """
    collection_name = os.getenv("QDRANT_COLLECTION_NAME")

    # Client that connects to the Qdrant instance.
    client = qdrant_client.QdrantClient(
        os.getenv("QDRANT_HOST"),
        api_key=os.getenv("QDRANT_API_KEY"),
    )

    # Embedding model used by the vector store.
    embeddings = OpenAIEmbeddings()

    existing_names = {c.name for c in client.get_collections().collections}
    if collection_name not in existing_names:
        # `size` has to equal the embedding dimension; 1536 presumably matches
        # the default OpenAIEmbeddings model - confirm if the model changes.
        vectors_config = qdrant_client.http.models.VectorParams(
            size=1536,
            distance=qdrant_client.http.models.Distance.COSINE,
        )
        client.create_collection(
            collection_name=collection_name,
            vectors_config=vectors_config,
        )

    # Vector store view over the (possibly pre-existing) collection.
    vector_store = Qdrant(
        client=client,
        collection_name=collection_name,
        embeddings=embeddings,
    )

    return vector_store


# get the vector store
vector_store = get_vector_store()


#################### create chain
user_question = st.text_input("Ask a question about your PDF:")
qa = RetrievalQA.from_chain_type(
    llm=OpenAI(),
    chain_type="stuff",
    retriever=vector_store.as_retriever(),
)

# The text input is empty until the user types something, so only run the
# chain (an LLM call) when there is an actual question, and show the result.
answer = None
if user_question:
    answer = qa.run(user_question)
    st.write(answer)

## Pinecone Vectorstore

##### Install All the Required Packages

In [None]:
!pip install langchain
!pip install pinecone-client
!pip install pypdf

In [None]:
!pip install openai
!pip install tiktoken

#### Import All the Required Libraries

In [None]:
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.vectorstores import Pinecone
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
import os

#### Load the PDF Files

In [None]:
!mkdir pdfs

In [None]:
!gdown 1hPQlXrX8FbaYaLypxTmeVOFNitbBMlEE -O pdfs/yolov7paper.pdf
!gdown 1vILwiv6nS2wI3chxNabMgry3qnV67TxM -O pdfs/rachelgreecv.pdf

#### Extract the Text from the PDFs

In [None]:
loader = PyPDFDirectoryLoader("pdfs")
data = loader.load()

In [None]:
data

#### Split the Extracted Data into Text Chunks

In [None]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)

In [None]:
text_chunks = text_splitter.split_documents(data)

In [None]:
text_chunks

In [None]:
len(text_chunks)

In [None]:
text_chunks[1]

In [None]:
text_chunks[2]

In [None]:
text_chunks[3]

#### Download the Embeddings

In [None]:
import os

# A non-empty OPENAI_API_KEY that is already set in the environment is kept;
# otherwise the literal on the right is used (paste your key there if needed).
os.environ['OPENAI_API_KEY'] = os.environ.get('OPENAI_API_KEY') or ""

In [None]:
embeddings = OpenAIEmbeddings()

In [None]:
result = embeddings.embed_query("How are you!")

In [None]:
len(result)

#### Initializing the Pinecone

In [None]:
# Pinecone credentials are taken from the environment; a variable that is not
# set yields an empty string.
PINECONE_API_KEY, PINECONE_API_ENV = (
    os.getenv(var_name, '')
    for var_name in ('PINECONE_API_KEY', 'PINECONE_API_ENV')
)

In [None]:
import pinecone

# Name of the Pinecone index to use - put the name of your own index here.
index_name = "test"

# Initialize the Pinecone client. The API key is found at app.pinecone.io;
# the environment is shown next to the API key in the console.
pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_API_ENV)


#### Create Embeddings for each of the Text Chunk

In [None]:
# from_documents embeds each chunk's page_content and also stores the chunk's
# metadata with the vector, so search hits can be traced back to their origin.
docsearch = Pinecone.from_documents(text_chunks, embeddings, index_name=index_name)

#### If you already have an index, you can load it like this

In [None]:
docsearch = Pinecone.from_existing_index(index_name, embeddings)
docsearch

#### Similarity Search

In [None]:
query = "YOLOv7 outperforms which models"

In [None]:
docs = docsearch.similarity_search(query, k=3)

In [None]:
docs

#### Creating a LLM Model Wrapper

In [None]:
llm = OpenAI()

In [None]:
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=docsearch.as_retriever())


#### Q/A

In [None]:
query = "YOLOv7 outperforms which models"

In [None]:
qa.run(query)

In [None]:
query = "Rachel Green Experience"

In [None]:
qa.run(query)

In [None]:
import sys

In [None]:
# Interactive question loop: type 'exit' to stop; blank input is ignored.
while True:
    user_input = input("Input Prompt: ").strip()
    if user_input == 'exit':
        print('Exiting')
        # Leave the loop with break: sys.exit() would raise SystemExit inside
        # the notebook kernel instead of simply ending the cell.
        break
    if user_input == '':
        continue
    result = qa({'query': user_input})
    print(f"Answer: {result['result']}")

## Chroma DB

In [None]:
!pip -q install chromadb openai langchain tiktoken

In [None]:
!pip show chromadb

In [None]:
!wget -q https://www.dropbox.com/s/vs6ocyvpzzncvwh/new_articles.zip

In [None]:
!unzip -q new_articles.zip -d new_articles

#### Setting up Environment

In [None]:
import os

# A non-empty OPENAI_API_KEY that is already set in the environment is kept;
# otherwise the literal on the right is used (paste your key there if needed).
os.environ['OPENAI_API_KEY'] = os.environ.get('OPENAI_API_KEY') or ""

#### Import some libraries

In [None]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import TextLoader

#### Load data

In [None]:
# Relative path: the archive was unzipped into ./new_articles, so this works
# from any working directory, not only when the notebook runs from /content.
loader = DirectoryLoader("new_articles/", glob="./*.txt", loader_cls=TextLoader)

In [None]:
document = loader.load()

In [None]:
document

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap = 200)
text = text_splitter.split_documents(document)

In [None]:
text

In [None]:
len(text)

In [None]:
text[1]

In [None]:
text[2]

#### Creating DB

In [None]:
# Directory where Chroma keeps its files on disk.
persist_directory = 'db'

embedding = OpenAIEmbeddings()

# Embed every chunk in `text` and store the vectors in a Chroma database kept
# under persist_directory. (No module is imported under the name `embeddings`
# here, so the notebook-level `embeddings` object is not rebound.)
vectordb = Chroma.from_documents(documents=text,
                                 embedding=embedding,
                                 persist_directory=persist_directory)

In [None]:
# Persist the DB to disk, then drop the in-memory handle so the database has
# to be loaded again from persist_directory before further use.
vectordb.persist()
vectordb = None

In [None]:
# Now we can load the persisted database from disk, and use it as normal.
vectordb = Chroma(persist_directory=persist_directory,
                  embedding_function=embedding)

#### Make a retriever

In [None]:
retriever = vectordb.as_retriever()

In [None]:
docs = retriever.get_relevant_documents("How much money did Microsoft raise?")

In [None]:
len(docs)

In [None]:
docs

In [None]:
retriever = vectordb.as_retriever(search_kwargs={"k": 2})

In [None]:
retriever.search_type

In [None]:
retriever.search_kwargs

#### Make a chain

In [None]:
from langchain.chains import RetrievalQA

In [None]:
llm=OpenAI()

In [None]:
llm

In [None]:
# Create the chain that answers questions. It reuses the `llm` instance built
# above instead of constructing a second OpenAI client;
# return_source_documents=True makes the result carry the retrieved documents
# under 'source_documents'.
qa_chain = RetrievalQA.from_chain_type(llm=llm,
                                  chain_type="stuff",
                                  retriever=retriever,
                                  return_source_documents=True)

In [None]:
## Cite sources
def process_llm_response(llm_response):
    """Print the chain's answer followed by the source of each document used.

    Args:
        llm_response: dict with a ``'result'`` entry (the answer) and,
            optionally, a ``'source_documents'`` entry holding objects that
            expose a ``metadata`` dict.

    A missing ``'source_documents'`` entry is treated as "no sources", and a
    document whose metadata has no ``'source'`` key is listed as
    ``<unknown source>`` instead of raising ``KeyError``.
    """
    print(llm_response['result'])
    print('\n\nSources:')
    for source in llm_response.get("source_documents", []):
        print(source.metadata.get('source', '<unknown source>'))

In [None]:
# full example
query = "How much money did Microsoft raise?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

In [None]:
# break it down
query = "What is the news about Pando?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

#### Deleting the DB


In [None]:
!zip -r db.zip ./db

In [None]:
# To clean up, delete the collection and then call persist() again
vectordb.delete_collection()
vectordb.persist()

# Delete the on-disk directory as well ('db' is the persist_directory)
!rm -rf db/

#### Starting again: restoring the DB from the archive

In [None]:
!unzip db.zip

## Weaviate

https://console.weaviate.cloud/

In [None]:
!pip install weaviate-client
!pip install langchain
!pip install openai

In [None]:
import os

# Credentials are read from the environment so they do not have to be saved
# in the notebook; the literals on the right are only fallbacks used when the
# corresponding variable is unset or empty.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") or ""
WEAVIATE_API_KEY = os.environ.get("WEAVIATE_API_KEY") or ""
WEAVIATE_CLUSTER = os.environ.get("WEAVIATE_CLUSTER") or "https://"

#### Data Reading

In [None]:
!mkdir data

In [None]:
!pip install unstructured
!pip install "unstructured[pdf]"

In [None]:
from langchain.document_loaders import DirectoryLoader

loader = DirectoryLoader("./data",glob = "**/*.pdf")
data = loader.load()

In [None]:
data

#### Text Splitting

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
docs = text_splitter.split_documents(data)

In [None]:
docs

In [None]:
len(docs)

#### Embedding Conversion

In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(openai_api_key= OPENAI_API_KEY)

In [None]:
embeddings

#### Vector Database Storage

In [None]:
import weaviate
from langchain.vectorstores import Weaviate

# Connect to the Weaviate cluster using API-key authentication.
WEAVIATE_URL = WEAVIATE_CLUSTER
auth_config = weaviate.auth.AuthApiKey(api_key=WEAVIATE_API_KEY)

client = weaviate.Client(
    url=WEAVIATE_URL,
    auth_client_secret=auth_config,
    # The OpenAI key is sent along as a request header.
    additional_headers={"X-OpenAI-Api-key": OPENAI_API_KEY},
    startup_period=10,
)

In [None]:
client.is_ready()

In [None]:
# define input structure
# CAUTION: delete_all() drops the schema that currently exists on the cluster
# before the "Chatbot" class is (re)created below.
client.schema.delete_all()
schema = {
    "classes": [
        {
            "class": "Chatbot",
            "description": "Documents for chatbot",
            "vectorizer": "text2vec-openai",
            "moduleConfig": {"text2vec-openai": {"model": "ada", "type": "text"}},
            "properties": [
                {
                    "dataType": ["text"],
                    "description": "The content of the paragraph",
                    "moduleConfig": {
                        "text2vec-openai": {
                            "skip": False,
                            "vectorizePropertyName": False,
                        }
                    },
                    "name": "content",
                },
            ],
        },
    ]
}

client.schema.create(schema)

# LangChain wrapper over the "Chatbot" class: text lives in the "content"
# property and the "source" attribute is requested back with each hit.
vectorstore = Weaviate(client, "Chatbot", "content", attributes=["source"])

In [None]:
# load text into the vectorstore
# Build the two parallel lists directly: unpacking list(zip(*pairs)) raises
# ValueError when `docs` is empty, whereas this simply yields two empty lists.
texts = [doc.page_content for doc in docs]
meta = [doc.metadata for doc in docs]
text_meta_pair = list(zip(texts, meta))
vectorstore.add_texts(texts, meta)

#### Similarity Measurement

In [None]:
query = "what is a yolo?"

# retrieve text related to the query
# The number of hits is controlled by `k`; `top_k` is not a parameter of
# similarity_search, so it had no effect and the default of 4 applied.
# Raise `k` if more context is needed - keeping in mind that the "stuff" chain
# used below puts every returned document into a single prompt.
docs = vectorstore.similarity_search(query, k=4)

In [None]:
docs

In [None]:
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI

In [None]:
# define chain
chain = load_qa_chain(
    OpenAI(openai_api_key = OPENAI_API_KEY,temperature=0),
    chain_type="stuff")

In [None]:
# create answer
chain.run(input_documents=docs, question=query)

## FAISS DB

In [None]:
# FAISS was never imported in this notebook; bring it into scope here.
# (The LangChain wrapper additionally needs a faiss package, e.g. faiss-cpu,
# to be installed.)
from langchain.vectorstores import FAISS

# Index the chunk texts, embedded with `embeddings`.
db_faiss = FAISS.from_texts(texts, embeddings)

In [None]:
#### Simple similarity
# NOTE(review): `query1` is not defined in any cell visible in this notebook;
# define it before running this cell (TODO confirm the intended question).
# Prints the text of the best-matching chunk.
faiss_q1 = db_faiss.similarity_search(query1)
print(faiss_q1[0].page_content)

In [None]:
similar_query1 = db_faiss.similarity_search_with_score(query1)
similar_query1

In [None]:
# NOTE(review): `query2` is not defined in any cell visible in this notebook;
# define it before running this cell (TODO confirm the intended question).
# The FAISS hits are passed to the QA chain as the input documents.
faiss_q2 = db_faiss.similarity_search(query2)
chain.run(input_documents = faiss_q2, question = query2)