In [1]:
import os
from dotenv import load_dotenv
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_openai import OpenAIEmbeddings
from langchain_core.documents import Document

from langchain_community.vectorstores import Chroma

import numpy as np
from typing import List, Dict, Any


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
load_dotenv()

True

## Sample Data

In [3]:
sample_docs = [
    """
Machine Learning Fundamentals

Machine learning is a field of artificial intelligence that focuses on building systems
that can learn patterns from data and make predictions or decisions without being explicitly programmed.
Instead of following fixed rules, machine learning models improve their performance as they are exposed
to more data over time.

Common types of machine learning include supervised learning, unsupervised learning,
and reinforcement learning. These techniques are widely used in recommendation systems,
fraud detection, and predictive analytics.
""",

    """
Deep Learning Fundamentals

Deep learning is a subfield of machine learning that uses multi-layer neural networks,
often called deep neural networks, to model complex patterns in large datasets.
These networks are inspired by the structure of the human brain and are particularly
effective when working with large amounts of unstructured data.

Deep learning is widely used in image recognition, speech processing,
autonomous systems, and natural language understanding.
""",

    """
Natural Language Processing

Natural Language Processing (NLP) is a branch of artificial intelligence that enables
computers to understand, interpret, and generate human language in a meaningful way.
NLP combines linguistics, machine learning, and deep learning techniques to process text and speech data.

Common NLP applications include chatbots, search engines, sentiment analysis,
text summarization, and machine translation.
"""
]


In [4]:
sample_docs

['\nMachine Learning Fundamentals\n\nMachine learning is a field of artificial intelligence that focuses on building systems\nthat can learn patterns from data and make predictions or decisions without being explicitly programmed.\nInstead of following fixed rules, machine learning models improve their performance as they are exposed\nto more data over time.\n\nCommon types of machine learning include supervised learning, unsupervised learning,\nand reinforcement learning. These techniques are widely used in recommendation systems,\nfraud detection, and predictive analytics.\n',
 '\nDeep Learning Fundamentals\n\nDeep learning is a subfield of machine learning that uses multi-layer neural networks,\noften called deep neural networks, to model complex patterns in large datasets.\nThese networks are inspired by the structure of the human brain and are particularly\neffective when working with large amounts of unstructured data.\n\nDeep learning is widely used in image recognition, speech 

In [5]:
dir_path = "sample_docs"
os.makedirs(dir_path, exist_ok=True)

for i, doc in enumerate(sample_docs):
    file_path = os.path.join(dir_path, f"doc{i}.txt")
    with open(file_path, "w") as f:
        f.write(doc)

print(f"Files created in: {dir_path}")


Files created in: sample_docs


In [6]:
from langchain_community.document_loaders import DirectoryLoader
loader = DirectoryLoader(
    dir_path,
    glob="*.txt",
    loader_cls=TextLoader,
    loader_kwargs={'encoding': 'utf-8'}
)
documents = loader.load()
print(f"length of document: {len(documents)}")
for i in range(len(documents)):
    print(f"\ndocument preview: {documents[i].page_content[:100]}\n")

length of document: 3

document preview: 
Machine Learning Fundamentals

Machine learning is a field of artificial intelligence that focuses 


document preview: 
Deep Learning Fundamentals

Deep learning is a subfield of machine learning that uses multi-layer n


document preview: 
Natural Language Processing

Natural Language Processing (NLP) is a branch of artificial intelligen



## Document spliting

In [7]:
doc_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_size=500,
    chunk_overlap=50,
    length_function=len
)

chunks = doc_splitter.split_documents(documents)

print(f"Number of chunks: {len(chunks)}\n")

for i, chunk in enumerate(chunks):
    print(f"Chunk {i+1}:")
    print(chunk.page_content[:150])
    print("-" * 60)


Number of chunks: 4

Chunk 1:
Machine Learning Fundamentals

Machine learning is a field of artificial intelligence that focuses on building systems
that can learn patterns from da
------------------------------------------------------------
Chunk 2:
Common types of machine learning include supervised learning, unsupervised learning,
and reinforcement learning. These techniques are widely used in r
------------------------------------------------------------
Chunk 3:
Deep Learning Fundamentals

Deep learning is a subfield of machine learning that uses multi-layer neural networks,
often called deep neural networks, 
------------------------------------------------------------
Chunk 4:
Natural Language Processing

Natural Language Processing (NLP) is a branch of artificial intelligence that enables
computers to understand, interpret,
------------------------------------------------------------


## Embedding models

In [8]:
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
embeddings=OpenAIEmbeddings(model="text-embedding-3-small")

## Initialize chromaDB vector store

In [9]:
chunks

[Document(metadata={'source': 'sample_docs/doc0.txt'}, page_content='Machine Learning Fundamentals\n\nMachine learning is a field of artificial intelligence that focuses on building systems\nthat can learn patterns from data and make predictions or decisions without being explicitly programmed.\nInstead of following fixed rules, machine learning models improve their performance as they are exposed\nto more data over time.'),
 Document(metadata={'source': 'sample_docs/doc0.txt'}, page_content='Common types of machine learning include supervised learning, unsupervised learning,\nand reinforcement learning. These techniques are widely used in recommendation systems,\nfraud detection, and predictive analytics.'),
 Document(metadata={'source': 'sample_docs/doc1.txt'}, page_content='Deep Learning Fundamentals\n\nDeep learning is a subfield of machine learning that uses multi-layer neural networks,\noften called deep neural networks, to model complex patterns in large datasets.\nThese network

In [10]:
# Create chromaDB
persist_dir = "./chroma_db"

# initialize chromadb with open ai embedding
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory=persist_dir,
    collection_name="rag_collection"
)
print(f"number of vectors created: {vectorstore._collection.count()}")
print(f"directory: {persist_dir}")

number of vectors created: 8
directory: ./chroma_db


## Test Similarity search

In [20]:
query = "What are different types of machine learning?"

search1 = vectorstore.similarity_search(query, k=3)
search1

[Document(metadata={'source': 'sample_docs/doc0.txt'}, page_content='Common types of machine learning include supervised learning, unsupervised learning,\nand reinforcement learning. These techniques are widely used in recommendation systems,\nfraud detection, and predictive analytics.'),
 Document(metadata={'source': 'sample_docs/doc0.txt'}, page_content='Common types of machine learning include supervised learning, unsupervised learning,\nand reinforcement learning. These techniques are widely used in recommendation systems,\nfraud detection, and predictive analytics.'),
 Document(metadata={'source': 'sample_docs/doc0.txt'}, page_content='Machine Learning Fundamentals\n\nMachine learning is a field of artificial intelligence that focuses on building systems\nthat can learn patterns from data and make predictions or decisions without being explicitly programmed.\nInstead of following fixed rules, machine learning models improve their performance as they are exposed\nto more data over 

In [21]:
print(f"Query: {query}\nNumber of similar matches: {len(search1)}\n")

for i, doc in enumerate(search1, start=1):
    print(f"Match {i}:")
    print(f"Content: {doc.page_content[:200]}")
    print(f"Metadata: {doc.metadata}\n")


Query: What are different types of machine learning?
Number of similar matches: 3

Match 1:
Content: Common types of machine learning include supervised learning, unsupervised learning,
and reinforcement learning. These techniques are widely used in recommendation systems,
fraud detection, and predic
Metadata: {'source': 'sample_docs/doc0.txt'}

Match 2:
Content: Common types of machine learning include supervised learning, unsupervised learning,
and reinforcement learning. These techniques are widely used in recommendation systems,
fraud detection, and predic
Metadata: {'source': 'sample_docs/doc0.txt'}

Match 3:
Content: Machine Learning Fundamentals

Machine learning is a field of artificial intelligence that focuses on building systems
that can learn patterns from data and make predictions or decisions without being
Metadata: {'source': 'sample_docs/doc0.txt'}



## Advanced similarity search

In [22]:
search2 = vectorstore.similarity_search_with_score(query, k=3)
search2

[(Document(metadata={'source': 'sample_docs/doc0.txt'}, page_content='Common types of machine learning include supervised learning, unsupervised learning,\nand reinforcement learning. These techniques are widely used in recommendation systems,\nfraud detection, and predictive analytics.'),
  0.5003141760826111),
 (Document(metadata={'source': 'sample_docs/doc0.txt'}, page_content='Common types of machine learning include supervised learning, unsupervised learning,\nand reinforcement learning. These techniques are widely used in recommendation systems,\nfraud detection, and predictive analytics.'),
  0.5009902119636536),
 (Document(metadata={'source': 'sample_docs/doc0.txt'}, page_content='Machine Learning Fundamentals\n\nMachine learning is a field of artificial intelligence that focuses on building systems\nthat can learn patterns from data and make predictions or decisions without being explicitly programmed.\nInstead of following fixed rules, machine learning models improve their pe

## Understanding similarity

- ChromaDB uses L2 distance(Euclidan distance)
    - Lower score = more similar(closer in vector space)
    - score of 0 = identical vectors
    - ranges 0 to 2 (but can be higher)
- Cosine similarity
    - Higher score = more similarity
    - score of 1 = identical
    - range -1 to 1


## initialize LLM, Rag chain, prmopt template, Query the RAG system

In [None]:
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(
    model_name="gpt-3.5-turbo"
)

In [27]:
test_response = llm.invoke("what is Large language model")
test_response

AIMessage(content='A large language model is a type of artificial intelligence model that can understand and generate human language at a sophisticated level. These models are typically trained on vast amounts of text data to learn patterns and structures of language in order to generate human-like responses. Large language models are used in various applications such as natural language processing, chatbots, machine translation, and text generation. Some popular examples of large language models include GPT-3 (Generative Pre-trained Transformer 3) and BERT (Bidirectional Encoder Representations from Transformers).', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 106, 'prompt_tokens': 12, 'total_tokens': 118, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_provider': 'openai', '

In [29]:
from langchain.chat_models.base import init_chat_model
llm= init_chat_model("openai:gpt-3.5-turbo") 
test_response = llm.invoke("what is Large language model")
test_response

AIMessage(content='A large language model is a type of artificial intelligence system that is trained to understand and generate human language. These models are typically based on deep learning techniques, such as neural networks, and are capable of processing and generating vast amounts of text data. Large language models have many applications, including text generation, language translation, and natural language understanding. Examples of large language models include GPT-3 (Generative Pre-trained Transformer 3) and BERT (Bidirectional Encoder Representations from Transformers).', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 99, 'prompt_tokens': 12, 'total_tokens': 111, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_provider': 'openai', 'model_name': 'gpt-3.5-turbo-0125',

## modern RAG

In [50]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser


In [43]:
## convert vector store to retriver
retriever = vectorstore.as_retriever(
    search_kwargs={"k": 3}
)


In [46]:
system_prompt = """
You are a knowledgeable and concise assistant.
Use ONLY the information provided in the context to answer the user's question.
Do NOT use prior knowledge or make assumptions.
If the answer cannot be found in the context, respond with:
"I don't know based on the provided context."
Be clear, factual, and concise in your response.
"""

prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "Context:\n{context}\n\nQuestion:\n{question}")
])

In [47]:
prompt

ChatPromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], input_types={}, partial_variables={}, template='\nYou are a knowledgeable and concise assistant.\nUse ONLY the information provided in the context to answer the user\'s question.\nDo NOT use prior knowledge or make assumptions.\nIf the answer cannot be found in the context, respond with:\n"I don\'t know based on the provided context."\nBe clear, factual, and concise in your response.\n'), additional_kwargs={}), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template='Context:\n{context}\n\nQuestion:\n{question}'), additional_kwargs={})])

In [51]:
# create document chain
# LLM
llm = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0
)

# RAG chain (NO document chain needed)
rag_chain = (
    {
        "context": retriever,
        "question": lambda x: x
    }
    | prompt
    | llm
    | StrOutputParser()
)

In [52]:
query = "What is the difference between machine learning and deep learning?"
answer = rag_chain.invoke(query)

print(answer)


Based on the provided context, machine learning is a field of artificial intelligence that focuses on building systems that learn patterns from data and make predictions or decisions without being explicitly programmed. In contrast, deep learning is a subfield of machine learning that specifically uses multi-layer neural networks to model complex patterns in large datasets, often inspired by the structure of the human brain. Deep learning is particularly effective with large amounts of unstructured data.
