# Vector Stores

In [1]:
from qdrant_client import QdrantClient
from langchain_core.documents import Document
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_qdrant import QdrantVectorStore
import openai
import os

### Embedding Model

In [2]:
# Build the HuggingFace embedding wrapper (BAAI/bge-small-en-v1.5) that both
# vector stores below receive as their embedding model.
embed_model = HuggingFaceEmbeddings(
    model_name="BAAI/bge-small-en-v1.5",
)

  embed_model = HuggingFaceEmbeddings(model_name='BAAI/bge-small-en-v1.5')
  from tqdm.autonotebook import tqdm, trange


In [3]:
# Point a PyPDFLoader at the roadmap PDF (relative path, resolved against the working directory)
loaders = PyPDFLoader(file_path="AI_Engineer_Roadmap.pdf")

# Read the PDF into a list of Document objects (each carries a 'page' entry in its metadata)
pages = loaders.load()

In [5]:
# Sanity check: how many Document objects the loader returned
len(pages)

13

In [4]:
# Inspect one loaded page (index 9) to see its metadata and raw extracted text
pages[9]

Document(metadata={'source': 'AI_Engineer_Roadmap.pdf', 'page': 9}, page_content=' \n   \ncodebasics.io  \n \n10 \no Track B (Affordable Fees):  \n▪ Included in  the above Master Machine Learning for Data Science & AI  \n \nWeek 23, 24: Machine Learning Projects  with Deployment       \n \n• You need to finish two end to end ML projects. One on Regression , the other on \nClassification  \n• Regression Project: Bangalore property price prediction  \no YouTube playlist link: https://bit.ly/3ivycWr  \no Project covers following  \n▪ Data cleaning  \n▪ Feature engineering  \n▪ Model building and hyper parameter tuning  \n▪ Write flask server as a web backend  \n▪ Building website for price prediction  \n▪ Deployment to AWS  \n• Classification Project: Sports celebrity image classification  \no YouTube playlist link: https://bit.ly/3ioaMSU  \no Project covers following  \n▪ Data collection and data cleaning  \n▪ Feature engineering and model training  \n▪ Flask server as a web backend  \n▪

### Splitting the Document into Chunks

In [7]:
# Splitter configured for chunks of size 1500 with an overlap of 150 between
# consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=150)

### Metadata Preprocessing

In [6]:
# NOTE(review): Document is already imported from langchain_core.documents in the
# first cell; this import rebinds the same name — presumably redundant, confirm and remove.
from langchain.docstore.document import Document

In [8]:
# Re-chunk every page and wrap each chunk in a Document with simplified metadata:
# a fixed source label plus a 1-based page number (the loader's "page" value is
# 0-based, hence the + 1).
doc_list = [
    Document(
        page_content=chunk,
        metadata={"source": "AI Roadmap", "page_no": pdf_page.metadata["page"] + 1},
    )
    for pdf_page in pages
    for chunk in text_splitter.split_text(pdf_page.page_content)
]

In [9]:
# Inspect one chunk to confirm the rewritten metadata (source / page_no)
doc_list[10]

Document(metadata={'source': 'AI Roadmap', 'page_no': 10}, page_content='codebasics.io  \n \n10 \no Track B (Affordable Fees):  \n▪ Included in  the above Master Machine Learning for Data Science & AI  \n \nWeek 23, 24: Machine Learning Projects  with Deployment       \n \n• You need to finish two end to end ML projects. One on Regression , the other on \nClassification  \n• Regression Project: Bangalore property price prediction  \no YouTube playlist link: https://bit.ly/3ivycWr  \no Project covers following  \n▪ Data cleaning  \n▪ Feature engineering  \n▪ Model building and hyper parameter tuning  \n▪ Write flask server as a web backend  \n▪ Building website for price prediction  \n▪ Deployment to AWS  \n• Classification Project: Sports celebrity image classification  \no YouTube playlist link: https://bit.ly/3ioaMSU  \no Project covers following  \n▪ Data collection and data cleaning  \n▪ Feature engineering and model training  \n▪ Flask server as a web backend  \n▪ Building website

In [10]:
# Total number of chunks produced across all pages
len(doc_list)

16

### Qdrant Vector Store

#### Qdrant Credentials

In [13]:
# Qdrant connection settings, read from the environment so that secrets never
# have to be typed into (and saved with) the notebook.
# Each value falls back to an empty string when its variable is unset.
qdrant_url = os.environ.get("QDRANT_URL", "")                    # URL of the Qdrant service
qdrant_key = os.environ.get("QDRANT_API_KEY", "")                # API key for the Qdrant service
collection_name = os.environ.get("QDRANT_COLLECTION_NAME", "")   # collection that stores the vectors

In [14]:
# Build the Qdrant-backed vector store from the chunks in doc_list, using
# embed_model as the embedding model. The service is addressed by
# qdrant_url / qdrant_key and the vectors go into collection_name.
qdrant = QdrantVectorStore.from_documents(
    documents=doc_list,
    embedding=embed_model,
    url=qdrant_url,
    api_key=qdrant_key,
    collection_name=collection_name,
)

### Query Vector Store

In [15]:
# Question to look up in the Qdrant collection
query = "what is Ai roadmap?"

# Ask the Qdrant store for the 5 chunks it considers most similar to the question
results = qdrant.similarity_search(query=query, k=5)

In [16]:
# Inspect the fourth returned document (index 3), including its metadata
results[3]

Document(metadata={'source': 'AI Roadmap', 'page_no': 11, '_id': 'e1ec305a-f62b-4c51-b272-90fcb26bfa90', '_collection_name': 'AI_Roadmap'}, page_content='codebasics.io  \n \n11 \n• Linktree  \no Helpful to add multiple links in one page.  \n \n• Assignment  \no In above two projects make following changes  \n☐ Use FastAPI  instead of flask . FastAPI tutorial: https://youtu.be/Wr1JjhTt1Xg  \n☐ Regression project : Instead of property prediction, take any other project \nof your interest from Kaggle for regress ion \n☐ Classification project : Instead of sports celebrity classification, take any \nother project of your interest from Kaggle for classification and build end to \nend solution along with deployment to AWS or Azure  \n     ☐ Add a link of your projects in your resume and LinkedIn.  \n(Tag Codebasics, Dhaval Patel and Hemanand Vadivel with the hashtag \n#dsroadmap24 so we can engage to increase your visibility)  \n \n \nWeek 25, 26, 27 : Deep Learning           \n \n• Topics  

In [17]:
# Show only the text of the first returned document
results[0].page_content

'codebasics.io  \n \n1 \nAI Engineer Roadmap for Beginners  \nFollowing is the roadmap  to learning  AI Engineer  (also known as ML Engineer ) skills for a total \nbeginner. It includes FREE learning resources for technical skills (or tool skills) and soft (or core) skills  \n          \nPrerequisites : You must have skills or interests  to build skills in Coding and Math. Without these two \nyou cannot become an AI engineer.  \nTotal Duration: 8 Months  (4 hours  of study Every Day ) \nAlso, AI Engineer = Data Scientist + Software Engineer  \n \n \nWeek 0: Do Proper Research and protect yourself from SCAMS.  \n \n Unfortunately, a lot of systematic scams are happening in ed tech, especially in the \ndata field where aspirants are provided with false promises like a 100% job guarantee or \ntrapped into “Masterclasses” which are nothing but sales pitches to upsell their l ow-grade \ncourses at exorbitant prices. You need to do complete research about the market and \nmentors before star

### Pinecone Vector Store

In [39]:
# Pinecone settings, read from the environment so that the API key never has
# to be typed into (and saved with) the notebook.
# Each value falls back to an empty string when its variable is unset.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", "")   # API key for Pinecone
index_name = os.environ.get("PINECONE_INDEX_NAME", "")      # Pinecone index that stores the vectors

#### Data Upsertion in Pinecone

In [40]:
# Alias the LangChain Pinecone vector store class for use in the cells below
from langchain_pinecone import PineconeVectorStore as lang_pinecone
# NOTE(review): os is already imported in the first cell; this repeat is redundant
import os
# Export the key to the process environment — presumably where the Pinecone
# integration looks for it; confirm against the langchain_pinecone docs
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY

In [41]:
# Build the Pinecone-backed vector store from the chunks in doc_list, using
# embed_model as the embedding model; vectors are stored in the index named
# by index_name.
vector = lang_pinecone.from_documents(
    documents=doc_list,
    embedding=embed_model,
    index_name=index_name,
)

In [45]:
# Search term for the Pinecone index (rebinds `query`, which the Qdrant section also used)
query = "Classification"

# Ask the Pinecone-backed store for the 5 chunks it considers most similar to the term
pinecone_results = vector.similarity_search(query=query, k=5)

In [43]:
# Display every document returned by the Pinecone search
pinecone_results

[Document(id='463daf4b-cc13-422f-ad00-669a8cc79b19', metadata={'page_no': 8.0, 'source': 'AI Roadmap'}, page_content='codebasics.io  \n \n8 \nWeek 17: Exploratory Data Analysis (EDA)           \n \n• Exploratory Data Analysis (EDA)  \no https://www.kaggle.com/code?searchQuery=exploratory+data+analysis  \no Use the above link to search for exploratory data analysis notebooks.  \no Practice EDA using at least 3 datasets.  \n▪ e.g. https://www.kaggle.com/datasets/rishabhkarn/ipl -auction -\n2023/data  \n \n• Assignment  \n☐ Perform EDA (Exploratory data analysis on at least 2 additional datasets  on \nKaggle)  \n \nWeek 18, 19, 20, 21 : Machine Learning          \n \n• Machine Learning : Preprocessing  \no Handling NA values , outlier treatment, data normalization  \no One hot encoding, label encoding  \no Feature engineering  \no Train test split  \no Cross validation  \n• Machine Learning: Model  Building  \no Types of ML: Supervised, Unsupervised  \no Supervised: Regression vs Classifi

In [44]:
# Show only the text of the fourth returned document (index 3)
pinecone_results[3].page_content

'codebasics.io  \n \n2 \n• https://bit.ly/4at9Jaw  \n• https://bit.ly/477IOOs  \n• https://bit.ly/3GPD7dp  \n \n \nWeek 1 and 2: Computer Science Fundamentals 💻 \n \n• Topics  \no Data representation: Bits and Bytes, Storing text and numbers, Binary number \nsystem.  \no Basics of computer networks, IP addresses, Internet routing protocol  \no UDP, TCP, HTTP, and The World Wide Web  \no Programming basics: variables, strings, and numbers, if condition, loops  \no Algorithm basics  \n• Learning Resources  \no Khan Academy  course: https://bit.ly/42DUXtW  \no In the above course , only follow the first 4 sections (1) Digital Information (2) The \nInternet (3) Programming (4) Algorithms . Completing the remaining sections is \noptional . Do it if you have time and interest.  \n \nWeek 3 and 4: Beginners P ython                                                                \n \n• Topics  \no Variables, Numbers, Strings  \no Lists, Dictionaries, Sets, Tuples  \no If condition, for loop  \n