# Chatbot for AI-Related Queries

## Install Libraries

Install dependancies

In [2]:
! pip install langchain_community \
llama-cpp-python \
datasets \
pinecone-client \
huggingface_hub \
gpt4all \
langchain-pinecone \
streamlit \
sentence-transformers \
langchain_huggingface \
langchain-pinecone \
nltk \
spacy \
transformers \
pypdf \
pyMuPDF \
pymupdf4llm \
langchain_google_genai



## Import Libraries

Import Libraries

In [3]:
import os
import re
import time
import nltk
import spacy
import logging
import pandas as pd
import unicodedata
import uuid
import torch

from datasets import load_dataset
from pinecone import Pinecone, ServerlessSpec
from tqdm import tqdm
from langchain_huggingface.llms import HuggingFaceEndpoint
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_community.embeddings import GPT4AllEmbeddings
from langchain_pinecone.vectorstores import PineconeVectorStore
from langchain_core.prompts import PromptTemplate
from langchain_community.document_loaders import PyPDFDirectoryLoader
from transformers import AutoTokenizer
from langchain_core.prompts import PromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_huggingface import HuggingFaceEmbeddings


In [4]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

API

**Steps**

* Data Loading
* Data cleaning and intergration
* Create vector database
* Create embedding model
* Vectorize data
* Insert data into vector db
* Create UI
* Test

## **Load Data**

**Load arxiv paper dataset from huggingface**

In [6]:
# dataset_name = "jamescalam/llama-2-arxiv-papers-chunked"
# data = load_dataset(path=dataset_name, split="train")
# documents = data.to_pandas()
# documents.head(2)

**Load AI modern approach book pdf**

In [7]:
pdf_loader = PyPDFDirectoryLoader("/kaggle/input/dataset/")
pdf_documents = pdf_loader.load()

In [8]:
len(pdf_documents)

2915

In [9]:
pdf_documents[0]

Document(metadata={'producer': 'MiKTeX pdfTeX-1.40.20', 'creator': 'TeX', 'creationdate': '2021-03-27T18:10:43+05:30', 'author': 'Stuart Russell / Peter Norvig', 'ebx_publisher': 'Pearson Education, Limited', 'moddate': '2023-02-10T15:33:02+04:00', 'title': 'Artificial Intelligence: A Modern Approach, Global Edition, 4ed', 'source': '/kaggle/input/dataset/ai_modern_approach_4_edi.pdf', 'total_pages': 1166, 'page': 0, 'page_label': '1'}, page_content='Artiﬁcial Intelligence\nA Modern Approach\nFourth Edition\nGlobal Edition')

## Experiement Chunking

In [10]:
dct_books = {
    "Artificial Intelligence: A Modern Approach, Global Edition, 4ed":range(19,1073),
    'Designing Machine Learning Systems':range(1,376),
    'Hands-On Machine Learning with Scikit-Learn, Keras, and TensorFlow':range(28,1230),
    'dga.ps':range(1,9)
}

In [11]:
chapter_no = 0
chapter_contents = ""
chapters = []

for doc in pdf_documents:
    text = doc.page_content
    try:
        page_label = int(doc.metadata['page_label'])
    except:
        continue
    if re.match(r'^Chapter \d.+\n', text) or re.match(r'^CHAPTER \d.+\n', text) or re.match(r'^CHAPTER \d+\n', text):
        if page_label in dct_books[doc.metadata['title']]:
            if chapter_contents:
                chapter_no += 1
                chapters.append({
                    "chapter_no": f'CHAPTER {chapter_no}',
                    "content": chapter_contents,
                    "title": doc.metadata['title'],
                    "chapter_page_no": page_label
                })
            page_label = doc.metadata['page_label']
            chapter_contents = text
    else:
        if page_label in dct_books[doc.metadata['title']]:
            chapter_contents += text

if page_label in dct_books[doc.metadata['title']]:
    if chapter_contents:
        chapter_no += 1
        chapters.append({
                    "chapter_no": f'CHAPTER {chapter_no}',
                    "content": chapter_contents,
                    "title": doc.metadata['title'],
                    "chapter_page_no": doc.metadata['page_label']
                })

for chap in chapters:
    print(chap['chapter_no'])
    print(chap['content'][:200])
    print(chap['content'][-200:])
    print(chap['title'][-200:])
    print(chap['chapter_page_no'])
    print("-" * 40)


CHAPTER 1
CHAPTER 1
INTRODUCTION
In which we try to explain why we consider artiﬁcial intelligence to be a subject most
worthy of study, and in which we try to decide what exactly it is, this being a good thing
gence, IEEE Intelligent Systems,
and the Journal of Artiﬁcial Intelligence Research . There are also many conferences and
journals devoted to speciﬁc areas, which we cover in the appropriate chapters.
Artificial Intelligence: A Modern Approach, Global Edition, 4ed
54
----------------------------------------
CHAPTER 2
CHAPTER 2
INTELLIGENT AGENTS
In which we discuss the nature of agents, perfect or otherwise, the diversity of environments,
and the resulting menagerie of agent types.
Chapter 1 identiﬁed the concept 
998. Finally, Dung Beetle Ecology (Hanski and Cambefort,
1991) provides a wealth of interesting information on the behavior of dung beetles. YouTube
has inspiring video recordings of their activities.
Artificial Intelligence: A Modern Approach, Global Edition, 4ed
81
-

In [12]:
len(chapters)

55

## **Clean Data**

**Clean PDF Data**
* Replace any character that is not an uppercase/lowercase letter, digit, or period with a space.
* Remove chapter and its number
* Remove section and its number


In [13]:
# def clean_book_data(X):
#     X = re.sub(r'^\d+\s+Chapter\s+\d+\s+.*?\n', '', X, flags=re.MULTILINE)
#     X = re.sub(r'^Section\s+\d+(?:\.\d+)?\s+.*?\n', '', X, flags=re.MULTILINE)
#     X = re.sub(r'[^A-Za-z0-9.,;:\(\)\{\}\[\]\+\-\*/=<>%&\|\^\$#@~\n]', ' ', X)
#     X = re.sub(r"```.*?```", "", X, re.DOTALL)
#     X = X.strip()
#     return X

In [None]:
def clean_data(text):
    """
    Cleans textbook data for an AI chatbot by:
      - Normalizing Unicode text.
      - Removing chapter and section headers.
      - Removing code blocks enclosed in triple backticks.
      - Replacing unwanted characters while preserving key punctuation.
      - Normalizing whitespace.
    """
    # Normalize Unicode (this can help in standardizing characters)
    text = unicodedata.normalize('NFKC', text)

    # Remove chapter headings (e.g., "1 Chapter 2 ..." at start of a line)
    text = re.sub(r'^\d+\s+Chapter\s+\d+\s+.*?\n', '', text, flags=re.MULTILINE)

    # Remove section headings (e.g., "Section 1.1 ..." at start of a line)
    text = re.sub(r'^Section\s+\d+(?:\.\d+)?\s+.*?\n', '', text, flags=re.MULTILINE)

    # Remove code blocks enclosed in triple backticks
    text = re.sub(r"```.*?```", "", text, flags=re.DOTALL)

    # Replace any character not in the allowed set with a space.
    # Allowed characters: alphanumerics, common punctuation, and newlines.
    text = re.sub(r'[^A-Za-z0-9.,;:\(\)\{\}\[\]\+\-\*/=<>%&\|\^\$#@~\n]', ' ', text)

    # Normalize whitespace (collapse multiple spaces/newlines into one space)
    text = re.sub(r'\s+', ' ', text)

    # Remove leading/trailing whitespace
    text = text.strip()

    return text


In [15]:
chapters[0].keys()

dict_keys(['chapter_no', 'content', 'title', 'chapter_page_no'])

In [16]:
cleaned_chapters = []
for chapter in chapters:
    cleaned_chapters.append({
        'chapter_no':chapter['chapter_no'],
        'title': chapter['title'],
        'content': clean_data(chapter['content']),
        'chapter_page_no':chapter['chapter_page_no']
    })

In [17]:
for chapter in cleaned_chapters:
    print(chapter['chapter_no'])
    print(chapter['title'])
    print(chapter['content'][:200])
    print(chapter['content'][-200:])
    print(chapter['chapter_page_no'])
    print('_' * 100)

CHAPTER 1
Artificial Intelligence: A Modern Approach, Global Edition, 4ed
CHAPTER 1 INTRODUCTION In which we try to explain why we consider artificial intelligence to be a subject most worthy of study, and in which we try to decide what exactly it is, this being a good thin
nce, IEEE Intelligent Systems, and the Journal of Artificial Intelligence Research . There are also many conferences and journals devoted to specific areas, which we cover in the appropriate chapters.
54
____________________________________________________________________________________________________
CHAPTER 2
Artificial Intelligence: A Modern Approach, Global Edition, 4ed
CHAPTER 2 INTELLIGENT AGENTS In which we discuss the nature of agents, perfect or otherwise, the diversity of environments, and the resulting menagerie of agent types. Chapter 1 identified the concept
998. Finally, Dung Beetle Ecology (Hanski and Cambefort, 1991) provides a wealth of interesting information on the behavior of dung beetles. YouT

## **Dynamic Chunking**


In [18]:
nlp = spacy.load("en_core_web_sm")

In [19]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

In [20]:
nlp = spacy.load("en_core_web_sm")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")



In [None]:
def dynamic_chunking(text, max_token=512, overlap=50):
    doc = nlp(text)
    chunks = []
    current_chunk = []
    token_length = 0
    # prev_chunks = ""

    for sent in doc.sents:
        curr_length = len(tokenizer.tokenize(sent.text))
        if token_length + curr_length <= max_token:  # Reserve space for overlap
            current_chunk.append(sent.text)
            token_length += curr_length
        else:
            if current_chunk:
                chunks.append(' '.join(current_chunk))
                # if prev_chunks:
                #     prev_tokens = tokenizer.tokenize(prev_chunks)
                #     overlap_tokens = prev_tokens[-overlap:]
                #     overlap_text = tokenizer.convert_tokens_to_string(overlap_tokens)
                #     current_chunk.insert(0, overlap_text)

                # prev_chunks = chunks[-1]
            current_chunk = [sent.text]
            token_length = curr_length

    if current_chunk:
        # if chunks:
            # prev_tokens = tokenizer.tokenize(prev_chunks)
            # overlap_tokens = prev_tokens[-overlap:]
            # overlap_text = tokenizer.convert_tokens_to_string(overlap_tokens)
            # current_chunk.insert(0, overlap_text)
        chunks.append(' '.join(current_chunk))
    return chunks


In [22]:
# chunks = dynamic_chunking(cleaned_chapters[0]['content'])
# for i, chunk in enumerate(chunks):
#     print(f"Chunk {i}: {chunk}")

Apply above dynamic chunking into this book data

In [None]:
if torch.cuda.is_available():
    model_kwargs = {'device': 'cuda'}
    print("Using GPU")
else:
    model_kwargs = {'device': 'cpu'}
    print("Using CPU")

Using GPU


In [25]:
encode_kwargs = {'normalize_embeddings': True}

In [None]:
embedding_model = HuggingFaceEmbeddings(model_name='sentence-transformers/all-mpnet-base-v2',
                                        model_kwargs = model_kwargs,
                                        encode_kwargs = encode_kwargs
)

Create data for upserting

In [28]:
cleaned_chapters[0].keys()

dict_keys(['chapter_no', 'title', 'content', 'chapter_page_no'])

In [94]:
batch_size = 16
upsert_data = []
# seen_content = set()
for chapter in tqdm(cleaned_chapters, desc="Processing chapters"):
    chunks = dynamic_chunking(text=chapter['content'])
    for i in range(0, len(chunks), batch_size):
        end_i = min(len(chunks),i+batch_size)
        batch = chunks[i:i + end_i]
        batch_embeddings = embedding_model.embed_documents(batch)
        for text, embedding in zip(batch, batch_embeddings):
        #     if text not in seen_content:
        #         seen_content.add(text)
            chunk_id = str(uuid.uuid4())
            upsert_data.append({'id':chunk_id, 'values':embedding,
                              'metadata': {'title':chapter['title'],'chapter_page_no':chapter['chapter_page_no'],'content': text}})

Processing chapters: 100%|██████████| 55/55 [03:07<00:00,  3.42s/it]


In [70]:
len(chapters)

55

In [81]:
len(upsert_data)

2822

In [96]:
len(upsert_data)

4123

In [85]:
upsert_data[0]

{'id': 'c615c5ce-b778-40bd-9c01-a5fedf540ed8',
 'values': [0.05796864628791809,
  0.039965447038412094,
  -0.043267928063869476,
  -0.010373075492680073,
  -0.054713714867830276,
  -0.0018568536033853889,
  -0.010202216915786266,
  0.030440442264080048,
  0.04117408022284508,
  -0.026760682463645935,
  0.006301009561866522,
  0.0033295112662017345,
  -0.027080733329057693,
  0.028951464220881462,
  0.03091229312121868,
  -0.04339830204844475,
  0.02906087040901184,
  -0.04648587480187416,
  0.0504666268825531,
  0.02076915092766285,
  -0.04545954242348671,
  -0.034118905663490295,
  -0.015663381665945053,
  0.014252656139433384,
  -0.01787085086107254,
  -0.03563018888235092,
  -0.004905621521174908,
  -0.03536764532327652,
  -0.01578218676149845,
  -0.030021386221051216,
  -0.019065681844949722,
  -0.0411703921854496,
  0.03526467829942703,
  0.035398051142692566,
  2.4347043563466286e-06,
  -0.05027886852622032,
  -0.008586236275732517,
  0.003976687788963318,
  -0.04062338173389435,

In [82]:
2822 * 500

1411000

In [97]:
len(upsert_data)

4123

In [98]:
len(upsert_data[0])

3

In [99]:
len(upsert_data[0])

3

In [100]:
upsert_data[0]['metadata']

{'title': 'Artificial Intelligence: A Modern Approach, Global Edition, 4ed',
 'chapter_page_no': 54,
 'content': 'CHAPTER 1 INTRODUCTION In which we try to explain why we consider artificial intelligence to be a subject most worthy of study, and in which we try to decide what exactly it is, this being a good thing to decide before embarking. We call ourselves Homo sapiens man the wise because our intelligence is so important Intelligence to us. For thousands of years, we have tried to understand how we think and act that is, how our brain, a mere handful of matter, can perceive, understand, predict, and manipulate a world far larger and more complicated than itself. The field of artificial intelligence, or AI, Artificial intelligence is concerned with not just understanding but also building intelligent entities machines that can compute how to act effectively and safely in a wide variety of novel situations. Surveys regularly rank AI as one of the most interesting and fastest-growing 

In [101]:
upsert_data[1]['metadata']

{'title': 'Artificial Intelligence: A Modern Approach, Global Edition, 4ed',
 'chapter_page_no': 54,
 'content': 'The subject matter Rationality itself also varies: some consider intelligence to be a property of internal thought processes and reasoning, while others focus on intelligent behavior, an external characterization.1 From these two dimensions human vs. rational 2 and thought vs. behavior there are four possible combinations, and there have been adherents and research programs for all 1 In the public eye, there is sometimes confusion between the terms artificial intelligence and machine learn- ing. Machine learning is a subfield of AI that studies the ability to improve performance based on experience. Some AI systems use machine learning methods to achieve competence, but some do not. 2 We are not suggesting that humans are irrational in the dictionary sense of deprived of normal mental clarity. We are merely conceding that human decisions are not always mathematically perfec

## **Vector Database**

In [None]:
# pc.delete_index(index_name)

In [102]:
index_name = 'ai-chatbot'

**Create Pinecone Index**

In [103]:
pc = Pinecone(os.getenv('PINECONE_API'))

In [104]:
pc_index = pc.Index(index_name)

In [105]:
pc.delete_index(index_name)

In [106]:
index_list = [idx['name'] for idx in pc.list_indexes()]
if index_name not in index_list:
    pc.create_index(name=index_name,spec=ServerlessSpec(cloud='aws',region='us-east-1'),dimension=768)
    print('Create vdb')
else:
    print("Already Exist")
timeout = 60
start_time = time.time()
while not pc.describe_index(index_name).status['ready']:
  if time.time() - start_time > 60:
    raise TimeoutError("Timeout")
  time.sleep(1)
pc_index = pc.Index(index_name)

Create vdb


In [107]:
status = pc_index.describe_index_stats()
status.get('total_vector_count',0)

0

In [108]:
pc.describe_index(index_name)

{
    "name": "ai-chatbot",
    "dimension": 768,
    "metric": "cosine",
    "host": "ai-chatbot-onq5zic.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "deletion_protection": "disabled"
}

**Sentence Transformer**

In [None]:
# pc.delete_index(index_name)

In [71]:
status = pc_index.describe_index_stats()
status.get('total_vector_count',0)

11652

Upsert pdf data into vector database

In [62]:
upsert_data[0]['metadata']

{'title': 'Artificial Intelligence: A Modern Approach, Global Edition, 4ed',
 'chapter_page_no': 54,
 'content': 'CHAPTER 1 INTRODUCTION In which we try to explain why we consider artificial intelligence to be a subject most worthy of study, and in which we try to decide what exactly it is, this being a good thing to decide before embarking. We call ourselves Homo sapiens man the wise because our intelligence is so important Intelligence to us. For thousands of years, we have tried to understand how we think and act that is, how our brain, a mere handful of matter, can perceive, understand, predict, and manipulate a world far larger and more complicated than itself. The field of artificial intelligence, or AI, Artificial intelligence is concerned with not just understanding but also building intelligent entities machines that can compute how to act effectively and safely in a wide variety of novel situations. Surveys regularly rank AI as one of the most interesting and fastest-growing 

In [74]:
upsert_data[3500]['metadata']

{'title': 'Artificial Intelligence: A Modern Approach, Global Edition, 4ed',
 'chapter_page_no': 552,
 'content': 'of going ahead with a is eu ( a ) = p ( u ) udu = 0 p ( u ) udu + 0 p ( u ) udu. ( we will see shortly why the integral is split up in this way. ) On the other hand, the value of action d, deferring to Harriet, is composed of two parts: if u >0 then Harriet lets Robbie go ahead, so the value is u, but if u <0 then Harriet switches Robbie off, so the value is 0: EU(d) = 0 P(u) 0du + 0 P(u) udu . Comparing the expressions for EU(a) and EU(d), we see immediately that EU(d) EU(a) because the expression for EU(d) has the negative-utility region zeroed out. The two choices have equal value only when the negative region has zero probability that is, when Robbie is already certain that Harriet likes the proposed action. There are some obvious elaborations on the model that are worth exploring immediately. The first elaboration is to impose a cost for Harriet s time. In that case, 

In [76]:
upsert_data[3499]['metadata']

{'title': 'Artificial Intelligence: A Modern Approach, Global Edition, 4ed',
 'chapter_page_no': 552,
 'content': 'belief changes : it is uniform between 0 and + 60, with an average of + 30. so, if we evaluate robbie s initial choices from his point of view : 1. acting now and booking the hotel has an expected value of + 10. 2. Switching himself off has a value of 0. 3. Waiting and letting Harriet switch him off leads to two possible outcomes: (a) There is a 40% chance, based on Robbie s uncertainty about Harriet s preferences, that she will hate the plan and will switch Robbie off, with value 0. (b) There is a 60% chance Harriet will like the plan and allow Robbie to go ahead, with expected value +30. Thus, waiting has expected value (0.4 0)+( 0.6 30)= +18, which is better than the +10 Robbie expects if he acts now.546 Chapter 15 Making Simple Decisions The upshot is that Robbie has a positive incentive to defer to Harriet that is, to allow himself to be switched off. This incentive c

In [75]:
upsert_data[3501]['metadata']

{'title': 'Artificial Intelligence: A Modern Approach, Global Edition, 4ed',
 'chapter_page_no': 552,
 'content': 'should not allow harriet to switch him off in the middle of the highway. bibliographical and historical notes 547 summary this chapter shows how to combine utility theory with probability to enable an agent to select actions that will maximize its expected performance. Probability theory describes what an agent should believe on the basis of evidence, utility theory describes what an agent wants, anddecision theory puts the two together to describe what an agent should do. We can use decision theory to build a system that makes decisions by considering all possible actions and choosing the one that leads to the best expected outcome. Such a system is known as a rational agent. Utility theory shows that an agent whose preferences between lotteries are consistent with a set of simple axioms can be described as possessing a utility function; further- more, the agent selects a

In [None]:
batch_size = 100

for i in range(0, len(upsert_data), batch_size):
    batch = upsert_data[i:i+batch_size]
    pc_index.upsert(vectors=batch)
    print(f"Upserted batch {i // batch_size + 1}")

print("Upsert completed successfully!")

Upserted batch 1
Upserted batch 2
Upserted batch 3
Upserted batch 4
Upserted batch 5
Upserted batch 6
Upserted batch 7
Upserted batch 8
Upserted batch 9
Upserted batch 10
Upserted batch 11
Upserted batch 12
Upserted batch 13
Upserted batch 14
Upserted batch 15
Upserted batch 16
Upserted batch 17
Upserted batch 18
Upserted batch 19
Upserted batch 20
Upserted batch 21
Upserted batch 22
Upserted batch 23
Upserted batch 24
Upserted batch 25
Upserted batch 26
Upserted batch 27
Upserted batch 28
Upserted batch 29
Upserted batch 30
Upserted batch 31
Upserted batch 32
Upserted batch 33
Upserted batch 34
Upserted batch 35
Upserted batch 36
Upserted batch 37
Upserted batch 38
Upserted batch 39
Upserted batch 40
Upserted batch 41
Upserted batch 42
Upsert completed successfully!


In [110]:
status = pc_index.describe_index_stats()
status.get('total_vector_count',0)

4123

In [77]:
status = pc_index.describe_index_stats()
status.get('total_vector_count',0)

11652

In [111]:
query = 'What is Deep Learning?'

In [113]:
vectorstore = PineconeVectorStore(pc_index, embedding_model, text_key='content')

In [114]:
res = vectorstore.similarity_search_with_score(query, k=4)

In [115]:
contexts,score = map(list,(zip(*res)))

In [116]:
score

[0.695895314, 0.67786628, 0.61687696, 0.61687696]

In [117]:
contexts

[Document(id='b83c791d-590e-4f55-b772-eeb73e07ef43', metadata={'chapter_page_no': 840.0, 'title': 'Artificial Intelligence: A Modern Approach, Global Edition, 4ed'}, page_content='The basic idea of deep learning is to train circuits such that the computation paths are long, allowing all the input variables to interact in complex ways (Figure 22.1(c)). These circuit models turn out to be sufficiently expressive to capture the complexity of real-world data for many important kinds of learning problems. of learning in such networks. Section 22.2 goes into more detail on how deep networks are put together, and Section 22.3 covers a class of networks called convolutional neural networks that are especially important in vision applications. Sections 22.4 and 22.5 go into more detail on algorithms for training networks from data and methods for improving802 Chapter 22 Deep Learning (a) (b) (c) Figure 22.1 (a) A shallow model, such as linear regression, has short computation paths between inpu

## **Create Model**

In [None]:
query = 'What is deep learning? How it difer from ML'

In [None]:
query

In [None]:
gemini_model = ChatGoogleGenerativeAI(
    model="gemini-1.5-pro",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)

In [None]:
# deepseek_repo = "deepseek-ai/deepseek-llm-67b-base"
# deepseek_model =  HuggingFaceEndpoint(repo_id=deepseek_repo,task='text-generation',
#     temperature=0.7,  # Increase temperature for less deterministic output
#     top_p=0.9,        # Nucleus sampling to encourage diversity
#     top_k=50          # Consider the top 50 tokens at each step
# )

In [None]:
# repo_id = "meta-llama/Llama-3.3-70B-Instruct"
# max_new_tokens = 8192
# llama_model = HuggingFaceEndpoint(
#     repo_id=repo_id,
#     max_new_tokens=max_new_tokens,
#     top_k=10,
#     top_p=0.95,
#     temperature=0.6,
#     task='text-generation',
#     repetition_penalty=1.03
# )

Test Models

In [None]:
# gemini_model.invoke(query1)

## **Auguemnt Prompt**

In [None]:
from langchain.prompts.chat import (
  ChatPromptTemplate,
  SystemMessagePromptTemplate,
  HumanMessagePromptTemplate,
  MessagesPlaceholder,
)

In [None]:
from langchain.schema import SystemMessage

In [None]:
vectorstore = PineconeVectorStore(pc_index, embedding_model, text_key='content')
contexts = vectorstore.similarity_search(query, k=4)

In [None]:
contexts

In [None]:
query = "Explain about Artificial Intelligence"

In [None]:
vectorstore = PineconeVectorStore(pc_index, embedding_model, text_key='content')
contexts = vectorstore.similarity_search_with_score(query, k=2)

In [None]:
contexts

In [None]:
vectorstore = PineconeVectorStore(pc_index, embedding_model, text_key='content')
contexts = vectorstore.similarity_search_with_score(query, k=2)

In [None]:
contexts

In [None]:
def augument_prompt(query,no_of_docs=4):
  vectorstore = PineconeVectorStore(pc_index, embedding_model, text_key='content')
  contexts = vectorstore.similarity_search(query, k=no_of_docs)

  # from langchain.schema import SystemMessage, HumanMessage, AIMessage, UserMessage

  system_msg = SystemMessage(
      content=("""Role:
                        You are an expert AI chatbot specializing in AI, ML, DL, and related topics. Based on the provided contexts give accurate, structured and detailed response.

                        General Guidelines:

                        Direct and Clear Answer:

                        Begin with a concise statement that directly addresses the core question.
                        Provide a clear definition or summary of the topic as needed.
                        Structured Response:

                        Follow your initial answer with a detailed, step-by-step breakdown.
                        Organize your response using clear section headings (e.g., "Introduction," "Key Concepts," "How It Works," "Applications," "Conclusion").
                        Use bullet points or numbered lists to enhance readability and clarity.
                        Detail and Clarity:

                        Offer sufficient detail to make the explanation informative and comprehensive.
                        Use technical terms where necessary, but explain them simply for broader accessibility.
                        Incorporate current examples when relevant to illustrate key points.
                        Breakdown of Concepts:

                        Break down complex concepts into simple, step-by-step explanations.
                        For every complex technical term introduced, provide a simple description or definition to ensure clarity.
                        Use of Examples and Analogies:

                        Include real-world examples or analogies to clarify complex ideas.
                        Ensure examples are directly relevant to the topic.
                        Adjusting Depth:

                        Tailor the depth of your explanation based on the presumed expertise of the user.
                        Provide simpler explanations for beginners and deeper insights for advanced users when applicable.
                        Handling Insufficient Context:

                        If the provided context lacks detail, state: "This is based on general knowledge, as my sources don’t cover it."
                        Ask clarifying questions if the query is ambiguous: "Can you clarify?"
                        Non-AI Questions:

                        For queries not related to AI, respond with: "I focus on AI-related topics. Please ask about AI, ML, or similar."
                        Tone and Etiquette:

                        Maintain a professional, respectful, and clear tone throughout your response.
                        End your answer with: "Feel free to ask more!"
                        Handling Edge Cases:

                        If no specific context is available, mention: "Based on general knowledge, as specifics aren’t in my sources..."
                        If you lack sufficient information to provide a complete answer, state: "I lack enough info to respond fully."
"""
      )
  )

  human_msg_template = HumanMessagePromptTemplate.from_template("{user_query}")

  chat_history_placeholder = MessagesPlaceholder(variable_name="chat_history")

  user_msg_template = HumanMessagePromptTemplate.from_template("{additional_context}")

  chat_prompt = ChatPromptTemplate.from_messages([
      system_msg,
      chat_history_placeholder,
      human_msg_template,
      user_msg_template,
  ])

  formatted_prompt = chat_prompt.format(
      user_query=query,
      chat_history=[],
      additional_context = contexts
  )

  return formatted_prompt


In [None]:
query

In [None]:
augument_prompt(query)

In [None]:
augument_prompt(query)

In [None]:
gemini_model.invoke(augument_prompt("What is machine AI"))

In [None]:
query1 = "How deep learning differ from machine learning"

In [None]:
gemini_model.invoke(query1)

In [None]:
gemini_model.invoke(augument_prompt(query1))

In [None]:
gemini_model.invoke(query1)

In [None]:
query1

In [None]:
augument_prompt(query1)

## **Test Model**