# Retrievals

In [20]:
#import libraries
from langchain_core.documents import Document
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from pinecone import Pinecone
import openai
import os

In [2]:
# Initialize embedding model with BAAI/bge-small-en-v1.5
embed_model = HuggingFaceEmbeddings(model_name='BAAI/bge-small-en-v1.5')

  embed_model = HuggingFaceEmbeddings(model_name='BAAI/bge-small-en-v1.5')
  from tqdm.autonotebook import tqdm, trange


In [3]:
# Load the PDF document using PyPDFLoader
loaders = PyPDFLoader("AI_Engineer_Roadmap.pdf")

# Extract pages from the loaded PDF
pages = loaders.load()

In [4]:
pages[9]

Document(metadata={'source': 'AI_Engineer_Roadmap.pdf', 'page': 9}, page_content=' \n   \ncodebasics.io  \n \n10 \no Track B (Affordable Fees):  \n▪ Included in  the above Master Machine Learning for Data Science & AI  \n \nWeek 23, 24: Machine Learning Projects  with Deployment       \n \n• You need to finish two end to end ML projects. One on Regression , the other on \nClassification  \n• Regression Project: Bangalore property price prediction  \no YouTube playlist link: https://bit.ly/3ivycWr  \no Project covers following  \n▪ Data cleaning  \n▪ Feature engineering  \n▪ Model building and hyper parameter tuning  \n▪ Write flask server as a web backend  \n▪ Building website for price prediction  \n▪ Deployment to AWS  \n• Classification Project: Sports celebrity image classification  \no YouTube playlist link: https://bit.ly/3ioaMSU  \no Project covers following  \n▪ Data collection and data cleaning  \n▪ Feature engineering and model training  \n▪ Flask server as a web backend  \n▪

In [5]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1500,
    chunk_overlap = 150
)

In [6]:
# Create an empty list to store processed document chunks
doc_list = []

# Iterate over each page in the extracted pages
for page in pages:
    # Split the page content into smaller chunks
    pg_split = text_splitter.split_text(page.page_content)

    # Iterate over each chunk and create Document objects
    for pg_sub_split in pg_split:
        # Metadata for each chunk, including source and page number
        metadata = {"source": "AI policy", "page_no": page.metadata["page"] + 1}

        # Create a Document object with content and metadata
        doc_string = Document(page_content=pg_sub_split, metadata=metadata)

        # Append the Document object to the list
        doc_list.append(doc_string)

In [24]:
PINECONE_API_KEY=""
index_name=""

In [23]:
from langchain_pinecone import PineconeVectorStore as lang_pinecone
import os
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY

In [25]:
# Convert documents into vectors using LangPinecone
vector = lang_pinecone.from_documents(
    doc_list,                # List of Document objects to be converted into vectors
    embed_model,             # Embedding model used for generating vector representations
    index_name=index_name    # Name of the Pinecone index where vectors will be stored
)

In [11]:
question = "Classification?"

In [14]:
docs_ss = vector.similarity_search(question,k=5)

In [15]:
docs_ss[0].page_content

'codebasics.io  \n \n12 \nWeek 28, 29, 30 : NLP or Computer Vision  & GenAI  📃 \n \n• Many AI engineers  choose a specialized track which is either NLP or Computer vision. \nYou don’t need to learn both.  \n• Natural Language Processing (N LP) \no Topics  \n▪ Regex  \n▪ Text presentation: Count vectorizer, TF -IDF, BOW, Word2Vec, \nEmbeddings  \n▪ Text classification: Naïve Bayes  \n▪ Fundamentals of Spacy & NLTP library  \n▪ One end to end project  \no Learning Resources  \n▪ NLP YouTube playlist: https://bit.ly/3XnjfEZ  \n \n• Comput er Vision (CV)  \no Topics  \n▪ Basic image processing techniques: Filtering, Edge Detection, Image \nScaling, Rotation  \n▪ Library to use: OpenCV  \n▪ Convolutional Neural Networks (CNN) – Already covered in deep \nlearning.  \n▪ Data preprocessing, augmentation – Already covered in deep learning.  \n• Assignment  \n☐ NLP Track: Complete exercises in this playlist: https://bit.ly/3XnjfEZ  \n \nWeek 31, 32 : LLM & Langchain 📃 \n \n• Topics  \no What is 

#### Drawbacks of Similarity Search

In [16]:
docs_ss[1].page_content

'codebasics.io  \n \n11 \n• Linktree  \no Helpful to add multiple links in one page.  \n \n• Assignment  \no In above two projects make following changes  \n☐ Use FastAPI  instead of flask . FastAPI tutorial: https://youtu.be/Wr1JjhTt1Xg  \n☐ Regression project : Instead of property prediction, take any other project \nof your interest from Kaggle for regress ion \n☐ Classification project : Instead of sports celebrity classification, take any \nother project of your interest from Kaggle for classification and build end to \nend solution along with deployment to AWS or Azure  \n     ☐ Add a link of your projects in your resume and LinkedIn.  \n(Tag Codebasics, Dhaval Patel and Hemanand Vadivel with the hashtag \n#dsroadmap24 so we can engage to increase your visibility)  \n \n \nWeek 25, 26, 27 : Deep Learning           \n \n• Topics  \no What is a neural network? Forward propagation, back propagation  \no Building multilayer perceptron  \no Special neural network architectures  \n▪ Co

In [28]:
from langchain.retrievers import SVMRetriever
from langchain.retrievers import TFIDFRetriever

In [29]:
texts = [doc.page_content for doc in doc_list]
svm_retriever = SVMRetriever.from_texts(texts, embed_model)
tfidf_retriever = TFIDFRetriever.from_texts(texts)

In [30]:
question = "machine learning?"
docs_svm=svm_retriever.get_relevant_documents(question)
docs_svm[0]

  docs_svm=svm_retriever.get_relevant_documents(question)


Document(metadata={}, page_content='codebasics.io  \n \n8 \nWeek 17: Exploratory Data Analysis (EDA)           \n \n• Exploratory Data Analysis (EDA)  \no https://www.kaggle.com/code?searchQuery=exploratory+data+analysis  \no Use the above link to search for exploratory data analysis notebooks.  \no Practice EDA using at least 3 datasets.  \n▪ e.g. https://www.kaggle.com/datasets/rishabhkarn/ipl -auction -\n2023/data  \n \n• Assignment  \n☐ Perform EDA (Exploratory data analysis on at least 2 additional datasets  on \nKaggle)  \n \nWeek 18, 19, 20, 21 : Machine Learning          \n \n• Machine Learning : Preprocessing  \no Handling NA values , outlier treatment, data normalization  \no One hot encoding, label encoding  \no Feature engineering  \no Train test split  \no Cross validation  \n• Machine Learning: Model  Building  \no Types of ML: Supervised, Unsupervised  \no Supervised: Regression vs Classification  \no Linear models  \n▪ Linear regression, logistic regression  \n▪ Gradien

In [31]:
question = "deep learning?"
docs_tfidf=tfidf_retriever.get_relevant_documents(question)
docs_tfidf[0]

Document(metadata={}, page_content='codebasics.io  \n \n11 \n• Linktree  \no Helpful to add multiple links in one page.  \n \n• Assignment  \no In above two projects make following changes  \n☐ Use FastAPI  instead of flask . FastAPI tutorial: https://youtu.be/Wr1JjhTt1Xg  \n☐ Regression project : Instead of property prediction, take any other project \nof your interest from Kaggle for regress ion \n☐ Classification project : Instead of sports celebrity classification, take any \nother project of your interest from Kaggle for classification and build end to \nend solution along with deployment to AWS or Azure  \n     ☐ Add a link of your projects in your resume and LinkedIn.  \n(Tag Codebasics, Dhaval Patel and Hemanand Vadivel with the hashtag \n#dsroadmap24 so we can engage to increase your visibility)  \n \n \nWeek 25, 26, 27 : Deep Learning           \n \n• Topics  \no What is a neural network? Forward propagation, back propagation  \no Building multilayer perceptron  \no Special n