In [13]:
from dotenv import load_dotenv
import streamlit as st
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
import os

# Vector Store

## Qdrant Vectorstore 

In [None]:
from langchain.vectorstores import Qdrant
import os
import qdrant_client

os.environ['QDRANT_HOST']= ''
os.environ['QDRANT_API_KEY'] = ''
os.environ['QDRANT_COLLECTION_NAME'] = ''

def get_vector_store():
    
    
    #create a client that will connect to Qdrant resources
    client = qdrant_client.QdrantClient(
        os.getenv("QDRANT_HOST"),
        api_key=os.getenv("QDRANT_API_KEY")
    )
    
    #Create an OpenAIEmbedding Object 
    embeddings = OpenAIEmbeddings()
   
    # To create Collection
    vectors_config = qdrant_client.http.models.VectorParams(
        size = 1536,
        distance = qdrant_client.http.models.Distance.COSINE
    )
    
    # To Create New Collection
    client.recreate_collection(
        collection_name= os.getenv('QDRANT_COLLECTION_NAME'),
        vectors_config = vectors_config,
    )

    #Create a Vector store of collection at cloud 
    vector_store = Qdrant(
        client=client, 
        collection_name=os.getenv("QDRANT_COLLECTION_NAME"), 
        embeddings=embeddings,
    )
    
    return vector_store


# get the vector store
vector_store = get_vector_store()


#################### create chain 
user_question = st.text_input("Ask a question about your PDF:")
qa = RetrievalQA.from_chain_type(
        llm=OpenAI(),
        chain_type="stuff",
        retriever=vector_store.as_retriever()
        )
answer = qa.run(user_question)

## Pinecone Vectorstore

##### Install All the Required Packages

In [None]:
!pip install langchain
!pip install pinecone-client
!pip install pypdf

Collecting pinecone-client
  Downloading pinecone_client-5.0.1-py3-none-any.whl.metadata (19 kB)
Collecting pinecone-plugin-inference<2.0.0,>=1.0.3 (from pinecone-client)
  Downloading pinecone_plugin_inference-1.1.0-py3-none-any.whl.metadata (2.2 kB)
Collecting pinecone-plugin-interface<0.0.8,>=0.0.7 (from pinecone-client)
  Downloading pinecone_plugin_interface-0.0.7-py3-none-any.whl.metadata (1.2 kB)
Downloading pinecone_client-5.0.1-py3-none-any.whl (244 kB)
   ---------------------------------------- 0.0/244.8 kB ? eta -:--:--
   ---------- ----------------------------- 61.4/244.8 kB 1.7 MB/s eta 0:00:01
   -------------------------------------- - 235.5/244.8 kB 2.9 MB/s eta 0:00:01
   ---------------------------------------- 244.8/244.8 kB 2.5 MB/s eta 0:00:00
Downloading pinecone_plugin_inference-1.1.0-py3-none-any.whl (85 kB)
   ---------------------------------------- 0.0/85.4 kB ? eta -:--:--
   ---------------------------------------- 85.4/85.4 kB 5.0 MB/s eta 0:00:00
Downlo

In [None]:
!pip install openai
!pip install tiktoken



#### Import All the Required Libraries

In [None]:
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.vectorstores import Pinecone
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
import os

#### Load the PDF Files

In [None]:
!mkdir pdfs

In [None]:
!gdown 1hPQlXrX8FbaYaLypxTmeVOFNitbBMlEE -O pdfs/yolov7paper.pdf
!gdown 1vILwiv6nS2wI3chxNabMgry3qnV67TxM -O pdfs/rachelgreecv.pdf

Downloading...
From: https://drive.google.com/uc?id=1hPQlXrX8FbaYaLypxTmeVOFNitbBMlEE
To: /content/pdfs/yolov7paper.pdf
100% 2.27M/2.27M [00:00<00:00, 14.6MB/s]
Downloading...
From: https://drive.google.com/uc?id=1vILwiv6nS2wI3chxNabMgry3qnV67TxM
To: /content/pdfs/rachelgreecv.pdf
100% 271k/271k [00:00<00:00, 3.62MB/s]


#### Extract the Text from the PDF's

In [None]:
loader = PyPDFDirectoryLoader("pdfs")
data = loader.load()

In [None]:
data

[Document(page_content='YOLOv7: Trainable bag-of-freebies sets new state-of-the-art for real-time object\ndetectors\nChien-Yao Wang1, Alexey Bochkovskiy, and Hong-Yuan Mark Liao1\n1Institute of Information Science, Academia Sinica, Taiwan\nkinyiu@iis.sinica.edu.tw, alexeyab84@gmail.com, and liao@iis.sinica.edu.tw\nAbstract\nYOLOv7 surpasses all known object detectors in both\nspeed and accuracy in the range from 5 FPS to 160 FPS\nand has the highest accuracy 56.8% AP among all known\nreal-time object detectors with 30 FPS or higher on GPU\nV100. YOLOv7-E6 object detector (56 FPS V100, 55.9%\nAP) outperforms both transformer-based detector SWIN-\nL Cascade-Mask R-CNN (9.2 FPS A100, 53.9% AP) by\n509% in speed and 2% in accuracy, and convolutional-\nbased detector ConvNeXt-XL Cascade-Mask R-CNN (8.6\nFPS A100, 55.2% AP) by 551% in speed and 0.7% AP\nin accuracy, as well as YOLOv7 outperforms: YOLOR,\nYOLOX, Scaled-YOLOv4, YOLOv5, DETR, Deformable\nDETR, DINO-5scale-R50, ViT-Adapter-B and

#### Split the Extracted Data into Text Chunks

In [None]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)

In [None]:
text_chunks = text_splitter.split_documents(data)

In [None]:
text_chunks

[Document(page_content='YOLOv7: Trainable bag-of-freebies sets new state-of-the-art for real-time object\ndetectors\nChien-Yao Wang1, Alexey Bochkovskiy, and Hong-Yuan Mark Liao1\n1Institute of Information Science, Academia Sinica, Taiwan\nkinyiu@iis.sinica.edu.tw, alexeyab84@gmail.com, and liao@iis.sinica.edu.tw\nAbstract\nYOLOv7 surpasses all known object detectors in both\nspeed and accuracy in the range from 5 FPS to 160 FPS\nand has the highest accuracy 56.8% AP among all known', metadata={'source': 'pdfs/yolov7paper.pdf', 'page': 0}),
 Document(page_content='real-time object detectors with 30 FPS or higher on GPU\nV100. YOLOv7-E6 object detector (56 FPS V100, 55.9%\nAP) outperforms both transformer-based detector SWIN-\nL Cascade-Mask R-CNN (9.2 FPS A100, 53.9% AP) by\n509% in speed and 2% in accuracy, and convolutional-\nbased detector ConvNeXt-XL Cascade-Mask R-CNN (8.6\nFPS A100, 55.2% AP) by 551% in speed and 0.7% AP\nin accuracy, as well as YOLOv7 outperforms: YOLOR,\nYOLOX,

In [None]:
len(text_chunks)

168

In [None]:
text_chunks[1]

Document(page_content='real-time object detectors with 30 FPS or higher on GPU\nV100. YOLOv7-E6 object detector (56 FPS V100, 55.9%\nAP) outperforms both transformer-based detector SWIN-\nL Cascade-Mask R-CNN (9.2 FPS A100, 53.9% AP) by\n509% in speed and 2% in accuracy, and convolutional-\nbased detector ConvNeXt-XL Cascade-Mask R-CNN (8.6\nFPS A100, 55.2% AP) by 551% in speed and 0.7% AP\nin accuracy, as well as YOLOv7 outperforms: YOLOR,\nYOLOX, Scaled-YOLOv4, YOLOv5, DETR, Deformable', metadata={'source': 'pdfs/yolov7paper.pdf', 'page': 0})

In [None]:
text_chunks[2]

Document(page_content='DETR, DINO-5scale-R50, ViT-Adapter-B and many other\nobject detectors in speed and accuracy. Moreover, we train\nYOLOv7 only on MS COCO dataset from scratch without\nusing any other datasets or pre-trained weights. Source\ncode is released in https://github.com/WongKinYiu/yolov7.\n1. Introduction\nReal-time object detection is a very important topic in\ncomputer vision, as it is often a necessary component in\ncomputer vision systems. For example, multi-object track-', metadata={'source': 'pdfs/yolov7paper.pdf', 'page': 0})

In [None]:
text_chunks[3]

Document(page_content='ing [94, 93], autonomous driving [40, 18], robotics [35, 58],\nmedical image analysis [34, 46], etc. The computing de-\nvices that execute real-time object detection is usually some\nmobile CPU or GPU, as well as various neural processing\nunits (NPU) developed by major manufacturers. For exam-\nple, the Apple neural engine (Apple), the neural compute\nstick (Intel), Jetson AI edge devices (Nvidia), the edge TPU\n(Google), the neural processing engine (Qualcomm), the AI', metadata={'source': 'pdfs/yolov7paper.pdf', 'page': 0})

#### Downlaod the Embeddings

In [None]:
import os

os.environ['OPENAI_API_KEY'] = ""

In [None]:
embeddings = OpenAIEmbeddings()

In [None]:
result = embeddings.embed_query("How are you!")

In [None]:
len(result)

1536

#### Initializing the Pinecone

In [None]:
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY', '')
PINECONE_API_ENV = os.environ.get('PINECONE_API_ENV', '')

In [None]:
import pinecone
# initialize pinecone
pinecone.init(
    api_key=PINECONE_API_KEY,  # find at app.pinecone.io
    environment=PINECONE_API_ENV  # next to api key in console
)
index_name = "test" # put in the name of your pinecone index here


  from tqdm.autonotebook import tqdm


#### Create Embeddings for each of the Text Chunk

In [None]:
docsearch = Pinecone.from_texts([t.page_content for t in text_chunks], embeddings, index_name=index_name)

#### If you already have an index, you can load it like this

In [None]:
docsearch = Pinecone.from_existing_index(index_name, embeddings)
docsearch

<langchain.vectorstores.pinecone.Pinecone at 0x78f9e0e67460>

#### Similarity Search

In [None]:
query = "YOLOv7 outperforms which models"

In [None]:
docs = docsearch.similarity_search(query, k=3)

In [None]:
docs

[Document(page_content='YOLOv7-tiny 6.2 3.5 320 30.8% 47.3% 32.2% 10.0% 31.9% 52.2%\nimprovement -39% -49% - = = = -0.9 = +0.7\nYOLOR-E6 [81] 115.8M 683.2G 1280 55.7% 73.2% 60.7% 40.1% 60.4% 69.2%\nYOLOv7-E6 97.2M 515.2G 1280 55.9% 73.5% 61.1% 40.6% 60.3% 70.0%\nimprovement -19% -33% - +0.2 +0.3 +0.4 +0.5 -0.1 +0.8\nYOLOR-D6 [81] 151.7M 935.6G 1280 56.1% 73.9% 61.2% 42.4% 60.5% 69.9%\nYOLOv7-D6 154.7M 806.8G 1280 56.3% 73.8% 61.4% 41.3% 60.6% 70.1%\nYOLOv7-E6E 151.7M 843.2G 1280 56.8% 74.4% 62.1% 40.8% 62.1% 70.6%'),
 Document(page_content='YOLOv5-L6 (r6.1) [23] 76.8M 445.6G 1280 63 - / 53.7% - -\nYOLOX-X [21] 99.1M 281.9G 640 58 51.5% / 51.1% - -\nYOLOv7-E6 97.2M 515.2G 1280 56 56.0% /55.9% 73.5% 61.2%\nYOLOR-E6 [81] 115.8M 683.2G 1280 45 55.8% / 55.7% 73.4% 61.1%\nPPYOLOE-X [85] 98.4M 206.6G 640 45 52.2% / 51.9% 69.9% 56.5%\nYOLOv7-D6 154.7M 806.8G 1280 44 56.6% /56.3% 74.0% 61.8%\nYOLOv5-X6 (r6.1) [23] 140.7M 839.2G 1280 38 - / 55.0% - -\nYOLOv7-E6E 151.7M 843.2G 1280 36 56.8% /56.8

#### Creating a LLM Model Wrapper

In [None]:
llm = OpenAI()

In [None]:
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=docsearch.as_retriever())


#### Q/A

In [None]:
query = "YOLOv7 outperforms which models"

In [None]:
qa.run(query)

' YOLOv7 outperforms YOLOv5-L6 (r6.1), YOLOX-X, YOLOR-E6, PPYOLOE-X, YOLOv7-D6, YOLOv5-X6 (r6.1), YOLOv7-E6E, YOLOv5-X (r6.1), YOLOR-CSP, YOLOR-CSP-X, YOLOv7-tiny-SiLU, YOLOv7, and YOLOv7-X.'

In [None]:
query = "Rachel Green Experience"

In [None]:
qa.run(query)

' Rachel Green has a PhD in English from the University of Illinois at Urbana-Champaign. Her dissertation title was "Down on the Farm: World War One and the Emergence of Literary Modernism in the American South". She also holds an MA in English from Butler University, and has received a Summer Research Grant from the Center for Summer Studies, a Graduate College Conference Travel Grant from the University of Illinois, the Most Outstanding Butler Woman award from Butler University, and an Academic Scholarship from Butler University. She has published multiple works, and has presented at conferences.'

In [None]:
import sys

In [None]:
while True:
  user_input = input(f"Input Prompt: ")
  if user_input == 'exit':
    print('Exiting')
    sys.exit()
  if user_input == '':
    continue
  result = qa({'query': user_input})
  print(f"Answer: {result['result']}")

Input Prompt: what is yolo v7
Answer:  YOLOv7 is a real-time object detector which surpasses all known object detectors in both speed and accuracy. It has the highest accuracy of 56.8% AP among all known detectors and can run from 5 FPS to 160 FPS.
Input Prompt: tell me about Rechel Green
Answer:  Rachel Green is a PhD in English from the University of Illinois at Urbana-Champaign. Her dissertation title was “Down on the Farm: World War One and the Emergence of Literary Modernism in the American South.” She also has a MA in English and was awarded a Summer Research Grant, a Graduate College Conference Travel Grant, Most Outstanding Butler Woman, and an Academic Scholarship. She has published extensively and has given multiple conference presentations.
Input Prompt: exit
Exiting


SystemExit: ignored

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


## Chroma DB

In [None]:
!pip -q install chromadb openai langchain tiktoken

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/479.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.9/479.8 kB[0m [31m3.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m479.8/479.8 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.0/77.0 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m45.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m65.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m80.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.9/92.9 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━

In [None]:
!pip show chromadb

Name: chromadb
Version: 0.4.15
Summary: Chroma.
Home-page: 
Author: 
Author-email: Jeff Huber <jeff@trychroma.com>, Anton Troynikov <anton@trychroma.com>
License: 
Location: /usr/local/lib/python3.10/dist-packages
Requires: bcrypt, chroma-hnswlib, fastapi, grpcio, importlib-resources, kubernetes, numpy, onnxruntime, opentelemetry-api, opentelemetry-exporter-otlp-proto-grpc, opentelemetry-sdk, overrides, posthog, pulsar-client, pydantic, pypika, requests, tenacity, tokenizers, tqdm, typer, typing-extensions, uvicorn
Required-by: 


In [None]:
!wget -q https://www.dropbox.com/s/vs6ocyvpzzncvwh/new_articles.zip

In [None]:
!unzip -q new_articles.zip -d new_articles

#### Setting up Environment

In [None]:
import os

os.environ['OPENAI_API_KEY'] = ""

#### Import some libraries

In [None]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import TextLoader

#### Load data

In [None]:
loader = DirectoryLoader("/content/new_articles/", glob = "./*.txt", loader_cls= TextLoader)

In [None]:
document = loader.load()

In [None]:
document

[Document(page_content='Google I/O 2023 is next week; here’s what we’re expecting A whole bunch of new hardware, coupled with a lot of AI and the best look yet at Android 14\n\nGoogle’s annual developer conference, Google I/O, returns to Mountain View’s Shoreline Amphitheater next week, and for the first time in four years, we’ll be returning along with it. The kickoff keynote is always jammed-packed full of information, debuting all of the different software projects the company has been working on for the past year.\n\nUpdate: Google just went ahead and announced the Pixel Fold over on Twitter. The company gave a good look at the upcoming foldable smartphone from just about every angle. That means all three of the expected pieces of hardware – including the Pixel 7a and Pixel Tablet – have officially been announced.\n\nThe event, which kicks off May 10 at 10 AM PT will be a big showcase for everything that’s on the way for Android 14. The company has, arguably, missed a step when it 

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap = 200)
text = text_splitter.split_documents(document)

In [None]:
text

[Document(page_content='Google I/O 2023 is next week; here’s what we’re expecting A whole bunch of new hardware, coupled with a lot of AI and the best look yet at Android 14\n\nGoogle’s annual developer conference, Google I/O, returns to Mountain View’s Shoreline Amphitheater next week, and for the first time in four years, we’ll be returning along with it. The kickoff keynote is always jammed-packed full of information, debuting all of the different software projects the company has been working on for the past year.\n\nUpdate: Google just went ahead and announced the Pixel Fold over on Twitter. The company gave a good look at the upcoming foldable smartphone from just about every angle. That means all three of the expected pieces of hardware – including the Pixel 7a and Pixel Tablet – have officially been announced.', metadata={'source': '/content/new_articles/05-05-google-i-o-2023-is-next-week-heres-what-were-expecting.txt'}),
 Document(page_content='The event, which kicks off May 1

In [None]:
len(text)

233

In [None]:
text[1]

Document(page_content='The event, which kicks off May 10 at 10 AM PT will be a big showcase for everything that’s on the way for Android 14. The company has, arguably, missed a step when it comes to the current generative AI land rush — hell, who could have predicted after all of these years that Bing would finally have a moment?\n\nCEO Sundar Pichai will no doubt be making the case that the company continues to lead the way in the world of artificial intelligence. There’s always been a fair bit of the stuff at the event largely focused on practical real-world applications like mobile imaging and dealing with customer service. This year, however, I’d say it’s safe to say the company is going to go bonkers with the stuff.', metadata={'source': '/content/new_articles/05-05-google-i-o-2023-is-next-week-heres-what-were-expecting.txt'})

In [None]:
text[2]

Document(page_content='Hardware, meanwhile, is always a bit of a crapshoot at developer conferences. But after an off-year for the industry at large, a deluge of rumors are aligning, pointing to what’s likely to be an unusually consumer electronics-focused keynote. Given the fact that the last bit is my focus at TechCrunch, I’m going to start the list there.\n\nThe Pixel 7a is about as sure as bets get. Google has settled into a comfortable release cadence: releasing a flagship in the fall, followed by a budget device in the spring. The former is designed to be an ideal showcase for its latest mobile operating system and first-party silicon, while the latter makes some compromises for price, while maintaining as many of its predecessors as possible.\n\nHow to show excitement without shouting? Asking for a friend Coming to @Flipkart on 11th May. pic.twitter.com/il6GUx3MmR — Google India (@GoogleIndia) May 2, 2023', metadata={'source': '/content/new_articles/05-05-google-i-o-2023-is-next

#### Creating DB

In [None]:
from langchain import embeddings
persist_directory = 'db'

embedding = OpenAIEmbeddings()

vectordb = Chroma.from_documents(documents=text,
                                 embedding=embedding,
                                 persist_directory=persist_directory)

In [None]:
# persiste the db to disk
vectordb.persist()
vectordb = None

In [None]:
# Now we can load the persisted database from disk, and use it as normal.
vectordb = Chroma(persist_directory=persist_directory,
                  embedding_function=embedding)

#### Make a retriever

In [None]:
retriever = vectordb.as_retriever()

In [None]:
docs = retriever.get_relevant_documents("How much money did Microsoft raise?")

In [None]:
len(docs)

2

In [None]:
docs

[Document(page_content='April 28, 2023\n\nVC firms including Sequoia Capital, Andreessen Horowitz, Thrive and K2 Global are picking up new shares, according to documents seen by TechCrunch. A source tells us Founders Fund is also investing. Altogether the VCs have put in just over $300 million at a valuation of $27 billion to $29 billion. This is separate to a big investment from Microsoft announced earlier this year, a person familiar with the development told TechCrunch, which closed in January. The size of Microsoft’s investment is believed to be around $10 billion, a figure we confirmed with our source.\n\nApril 25, 2023\n\nCalled ChatGPT Business, OpenAI describes the forthcoming offering as “for professionals who need more control over their data as well as enterprises seeking to manage their end users.”', metadata={'source': '/content/new_articles/05-03-chatgpt-everything-you-need-to-know-about-the-ai-powered-chatbot.txt'}),
 Document(page_content='The amount that Google investe

In [None]:
retriever = vectordb.as_retriever(search_kwargs={"k": 2})

In [None]:
retriever.search_type

'similarity'

In [None]:
retriever.search_kwargs

{'k': 2}

#### Make a chain

In [None]:
from langchain.chains import RetrievalQA

In [None]:
llm=OpenAI()

In [None]:
llm

OpenAI(client=<class 'openai.api_resources.completion.Completion'>, openai_api_key='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', openai_api_base='', openai_organization='', openai_proxy='')

In [None]:
# create the chain to answer questions
qa_chain = RetrievalQA.from_chain_type(llm=OpenAI(),
                                  chain_type="stuff",
                                  retriever=retriever,
                                  return_source_documents=True)

In [None]:
## Cite sources
def process_llm_response(llm_response):
    print(llm_response['result'])
    print('\n\nSources:')
    for source in llm_response["source_documents"]:
        print(source.metadata['source'])

In [None]:
# full example
query = "How much money did Microsoft raise?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 Around $10 billion.


Sources:
/content/new_articles/05-03-chatgpt-everything-you-need-to-know-about-the-ai-powered-chatbot.txt
/content/new_articles/05-03-checks-the-ai-powered-data-protection-project-incubated-in-area-120-officially-exits-to-google.txt


In [None]:
# break it down
query = "What is the news about Pando?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 Pando has raised $30 million in a Series B round, bringing its total raised to $45 million. The new capital will be put toward expanding Pando’s global sales, marketing and delivery capabilities.


Sources:
/content/new_articles/05-03-ai-powered-supply-chain-startup-pando-lands-30m-investment.txt
/content/new_articles/05-03-ai-powered-supply-chain-startup-pando-lands-30m-investment.txt


#### Deleteing the DB


In [None]:
!zip -r db.zip ./db

  adding: db/ (stored 0%)
  adding: db/chroma.sqlite3 (deflated 42%)
  adding: db/340c51aa-f60a-48b6-b4fd-2fea61654443/ (stored 0%)
  adding: db/340c51aa-f60a-48b6-b4fd-2fea61654443/length.bin (deflated 74%)
  adding: db/340c51aa-f60a-48b6-b4fd-2fea61654443/data_level0.bin (deflated 100%)
  adding: db/340c51aa-f60a-48b6-b4fd-2fea61654443/header.bin (deflated 61%)
  adding: db/340c51aa-f60a-48b6-b4fd-2fea61654443/link_lists.bin (stored 0%)


In [None]:
# To cleanup, you can delete the collection
vectordb.delete_collection()
vectordb.persist()

# delete the directory
!rm -rf db/

#### Starting again loading the db

In [None]:
!unzip db.zip

Archive:  db.zip
   creating: db/
  inflating: db/chroma.sqlite3       
   creating: db/340c51aa-f60a-48b6-b4fd-2fea61654443/
  inflating: db/340c51aa-f60a-48b6-b4fd-2fea61654443/length.bin  
  inflating: db/340c51aa-f60a-48b6-b4fd-2fea61654443/data_level0.bin  
  inflating: db/340c51aa-f60a-48b6-b4fd-2fea61654443/header.bin  
 extracting: db/340c51aa-f60a-48b6-b4fd-2fea61654443/link_lists.bin  


## Weaviate

https://console.weaviate.cloud/

In [None]:
!pip install weaviate-client
!pip install langchain
!pip install openai

Collecting weaviate-client
  Downloading weaviate_client-3.25.2-py3-none-any.whl (120 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/120.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━[0m [32m112.6/120.3 kB[0m [31m3.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m120.3/120.3 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Collecting validators<1.0.0,>=0.21.2 (from weaviate-client)
  Downloading validators-0.22.0-py3-none-any.whl (26 kB)
Collecting authlib<2.0.0,>=1.2.1 (from weaviate-client)
  Downloading Authlib-1.2.1-py2.py3-none-any.whl (215 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m215.3/215.3 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: validators, authlib, weaviate-client
Successfully installed authlib-1.2.1 validators-0.22.0 weaviate-client-3.25.2
Collecting langchain
  Downloading

In [None]:
OPENAI_API_KEY = ""
WEAVIATE_API_KEY = ""
WEAVIATE_CLUSTER = "https://"

#### Data Reading

In [None]:
!mkdir data

In [None]:
!pip install unstructured
!pip install "unstructured[pdf]"

Collecting unstructured
  Downloading unstructured-0.10.28-py3-none-any.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
Collecting filetype (from unstructured)
  Downloading filetype-1.2.0-py2.py3-none-any.whl (19 kB)
Collecting python-magic (from unstructured)
  Downloading python_magic-0.4.27-py2.py3-none-any.whl (13 kB)
Collecting emoji (from unstructured)
  Downloading emoji-2.8.0-py2.py3-none-any.whl (358 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m358.9/358.9 kB[0m [31m29.4 MB/s[0m eta [36m0:00:00[0m
Collecting python-iso639 (from unstructured)
  Downloading python_iso639-2023.6.15-py3-none-any.whl (275 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m275.1/275.1 kB[0m [31m26.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langdetect (from unstructured)
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
from langchain.document_loaders import DirectoryLoader

loader = DirectoryLoader("./data",glob = "**/*.pdf")
data = loader.load()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [None]:
data

[Document(page_content='You Only Look Once (YOLO): Unified, Real-Time Object Detection\n\nPresenter: Shivang Singh\n\nSept 2nd, 2021\n\nCS391R: Robot Learning (Fall 2021)\n\n1\n\nProblem Addressed: Object Detection\n\n❖ Object detection is the problem of both\n\nlocating AND classifying objects\n\n❖ Goal of YOLO algorithm is to do object\n\ndetection both fast AND with high\n\naccuracy\n\n“Deep Learning for Vision Systems” (Elgendy)\n\nCS391R: Robot Learning (Fall 2021)\n\nObject Detection vs Classification\n\n2\n\nImportance of Object Detection for Robotics\n\n❖ Visual modality is very powerful\n\n❖ Humans are able to detect objects and do\n\nVision based vs LIDAR (self driving)\n\nperception using just this modality in real time\n\n(not needing radar)\n\n❖ If we want responsive robot systems that\n\nwork in real time (without specialized\n\nsensors) almost real time vision based object\n\ndetection can help greatly\n\nTesla Investor Day Presentation\n\nCS391R: Robot Learning (Fall 20

#### Text Splitting

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
docs = text_splitter.split_documents(data)

In [None]:
docs

[Document(page_content='You Only Look Once (YOLO): Unified, Real-Time Object Detection\n\nPresenter: Shivang Singh\n\nSept 2nd, 2021\n\nCS391R: Robot Learning (Fall 2021)\n\n1\n\nProblem Addressed: Object Detection\n\n❖ Object detection is the problem of both\n\nlocating AND classifying objects\n\n❖ Goal of YOLO algorithm is to do object\n\ndetection both fast AND with high\n\naccuracy\n\n“Deep Learning for Vision Systems” (Elgendy)\n\nCS391R: Robot Learning (Fall 2021)\n\nObject Detection vs Classification\n\n2\n\nImportance of Object Detection for Robotics\n\n❖ Visual modality is very powerful\n\n❖ Humans are able to detect objects and do\n\nVision based vs LIDAR (self driving)\n\nperception using just this modality in real time\n\n(not needing radar)\n\n❖ If we want responsive robot systems that\n\nwork in real time (without specialized\n\nsensors) almost real time vision based object\n\ndetection can help greatly\n\nTesla Investor Day Presentation\n\nCS391R: Robot Learning (Fall 20

In [None]:
len(docs)

10

#### Embedding Convertion

In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(openai_api_key= OPENAI_API_KEY)

In [None]:
embeddings

OpenAIEmbeddings(client=<class 'openai.api_resources.embedding.Embedding'>, model='text-embedding-ada-002', deployment='text-embedding-ada-002', openai_api_version='', openai_api_base='', openai_api_type='', openai_proxy='', embedding_ctx_length=8191, openai_api_key='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', openai_organization='', allowed_special=set(), disallowed_special='all', chunk_size=1000, max_retries=6, request_timeout=None, headers=None, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False)

#### Vector Database Storage

In [None]:
import weaviate
from langchain.vectorstores import Weaviate

#Connect to weaviate Cluster
auth_config = weaviate.auth.AuthApiKey(api_key = WEAVIATE_API_KEY)
WEAVIATE_URL = WEAVIATE_CLUSTER

client = weaviate.Client(
    url = WEAVIATE_URL,
    additional_headers = {"X-OpenAI-Api-key": OPENAI_API_KEY},
    auth_client_secret = auth_config,
    startup_period = 10
)

In [None]:
client.is_ready()

True

In [None]:
# define input structure
client.schema.delete_all()
client.schema.get()
schema = {
    "classes": [
        {
            "class": "Chatbot",
            "description": "Documents for chatbot",
            "vectorizer": "text2vec-openai",
            "moduleConfig": {"text2vec-openai": {"model": "ada", "type": "text"}},
            "properties": [
                {
                    "dataType": ["text"],
                    "description": "The content of the paragraph",
                    "moduleConfig": {
                        "text2vec-openai": {
                            "skip": False,
                            "vectorizePropertyName": False,
                        }
                    },
                    "name": "content",
                },
            ],
        },
    ]
}

client.schema.create(schema)
vectorstore = Weaviate(client, "Chatbot", "content", attributes=["source"])

In [None]:
# load text into the vectorstore
text_meta_pair = [(doc.page_content, doc.metadata) for doc in docs]
texts, meta = list(zip(*text_meta_pair))
vectorstore.add_texts(texts, meta)

['9c496404-7515-4b3c-8b8e-88e2dc09dfc8',
 '0c6883df-8242-4011-a51d-13efd4d6dd39',
 '3ad7f0e4-d5b9-4ffc-9fb0-1d4a4b31f610',
 '4845d89b-4e65-4ec3-ac46-e182793e9b68',
 '32df9a02-fdf1-4e71-81b5-1103264f7499',
 '6e2be198-6bb7-4374-8bf8-bc3f42aefd17',
 '5074cbc0-870c-416e-a37c-63a8f28a06bd',
 '59fd3d13-29f3-4001-af59-7fc48a4e1bec',
 'cb52baa4-bd78-4491-801e-21b074f6aa8d',
 '631ccd1a-e35d-4eb4-94ae-8beb3c1f99e4']

#### Similarity Measurement

In [None]:
query = "what is a yolo?"

# retrieve text related to the query
docs = vectorstore.similarity_search(query, top_k=20)

In [None]:
docs

[Document(page_content='You Only Look Once (YOLO): Unified, Real-Time Object Detection\n\nPresenter: Shivang Singh\n\nSept 2nd, 2021\n\nCS391R: Robot Learning (Fall 2021)\n\n1\n\nProblem Addressed: Object Detection\n\n❖ Object detection is the problem of both\n\nlocating AND classifying objects\n\n❖ Goal of YOLO algorithm is to do object\n\ndetection both fast AND with high\n\naccuracy\n\n“Deep Learning for Vision Systems” (Elgendy)\n\nCS391R: Robot Learning (Fall 2021)\n\nObject Detection vs Classification\n\n2\n\nImportance of Object Detection for Robotics\n\n❖ Visual modality is very powerful\n\n❖ Humans are able to detect objects and do\n\nVision based vs LIDAR (self driving)\n\nperception using just this modality in real time\n\n(not needing radar)\n\n❖ If we want responsive robot systems that\n\nwork in real time (without specialized\n\nsensors) almost real time vision based object\n\ndetection can help greatly\n\nTesla Investor Day Presentation\n\nCS391R: Robot Learning (Fall 20

In [None]:
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI

In [None]:
# define chain
chain = load_qa_chain(
    OpenAI(openai_api_key = OPENAI_API_KEY,temperature=0),
    chain_type="stuff")

In [None]:
# create answer
chain.run(input_documents=docs, question=query)

' YOLO is an algorithm for object detection that is unified, real-time, and has high accuracy. It is presented by Shivang Singh in the CS391R: Robot Learning (Fall 2021) course on Sept 2nd, 2021.'

## FAISS DB

In [None]:
db_faiss = FAISS.from_texts(texts, embeddings)

In [None]:
#### Simple similarity
faiss_q1 = db_faiss.similarity_search(query1)
print(faiss_q1[0].page_content)

In [None]:
similar_query1 = db_faiss.similarity_search_with_score(query1)
similar_query1

In [None]:
faiss_q2 = db_faiss.similarity_search(query2)
chain.run(input_documents = faiss_q2, question = query2)