# Chatbot for AI-Related Queries

## Install Libraries

Install dependancies

In [None]:
! pip install langchain_community \
llama-cpp-python \
datasets \
pinecone-client \
huggingface_hub \
gpt4all \
langchain-pinecone \
streamlit \
sentence-transformers \
langchain_huggingface \
langchain-pinecone \
nltk \
spacy \
transformers \
pypdf \
pyMuPDF \
pymupdf4llm \
langchain_google_genai

## Import Libraries

Import Libraries

In [7]:
import os
import re
import time
import nltk
import spacy
import logging
import pandas as pd


from datasets import load_dataset
from pinecone import Pinecone, ServerlessSpec
from tqdm import tqdm
from langchain_huggingface.llms import HuggingFaceEndpoint
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_community.embeddings import GPT4AllEmbeddings
from langchain_pinecone.vectorstores import PineconeVectorStore
from langchain_core.prompts import PromptTemplate
from langchain_community.document_loaders import PyPDFDirectoryLoader
from transformers import AutoTokenizer
from langchain_core.prompts import PromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI

In [8]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

API

In [None]:
os.environ['HUGGINGFACEHUB_API_TOKEN'] = os.getenv('HUGGINGFACEHUB_API_TOKEN')
os.environ["GOOGLE_API_KEY"] = os.getenv('GOOGLE_API_KEY')
os.environ['PINECONE_API'] = os.getenv('PINECONE_API')

**Steps**

* Data Loading
* Data cleaning and intergration
* Create vector database
* Create embedding model
* Vectorize data
* Insert data into vector db
* Create UI
* Test

## **Load Data**

**Load arxiv paper dataset from huggingface**

In [None]:
dataset_name = "jamescalam/llama-2-arxiv-papers-chunked"
data = load_dataset(path=dataset_name, split="train")
data

Convert data into pandas dataframe

In [None]:
documents = data.to_pandas()
documents.head(2)

In [12]:
documents.shape

(4838, 15)

**Load AI modern approach book pdf**

In [None]:
pdf_loader = PyPDFDirectoryLoader("/content/")
pdf_documents = pdf_loader.load()

In [None]:
len(pdf_documents)

In [None]:
pdf_documents[100]

In [16]:
book_df = pd.DataFrame([{'source':doc.metadata['source'],'page_label':doc.metadata['page_label'],'text':doc.page_content}for doc in pdf_documents[19:1073]])

In [None]:
book_df

In [None]:
len(documents.iloc[0]['chunk'])

In [None]:
len(book_df.iloc[0]['text'])

In [None]:
book_df.iloc[0]['text']

## **Clean Data**

Clean Arvix Data
*  Removes all characters except letters (both uppercase and lowercase), digits, and whitespace.
* Replaces control characters (ASCII codes 0–31 and 127) with a space.
* Collapses multiple whitespace characters into a single space.
* Removes any extra whitespace that appears directly before punctuation marks.
* Ensures that there is a space after punctuation if one isn’t already present.
*  Replaces any left curly brace { with a hyphen -.
* Removes any right curly brace } from the text.
* Strips leading and trailing whitespace from each text string.

In [21]:
def preprocess_doc(X):
    chunk1 = X.apply(lambda x: re.sub(r'[^a-zA-Z0-9.\s]', '', x))
    chunk2 = chunk1.apply(lambda x: re.sub(r'[\x00-\x1F\x7F]', ' ', x))
    chunk3 = chunk2.apply(lambda x: re.sub(r'\s+', ' ', x))
    chunk4 = chunk3.apply(lambda x: re.sub(r'\s+([,.!?;:])', r'\1', x))
    chunk5 = chunk4.apply(lambda x: re.sub(r'([,.!?;:])(?=\S)', r'\1 ', x))
    chunk6 = chunk5.apply(lambda x: re.sub(r'\{', '-', x))
    chunk7 = chunk6.apply(lambda x: re.sub(r'\}', '', x))
    chunk7 = chunk7.apply(lambda x: x.strip())
    return chunk7

**Clean PDF Data**
* Replace any character that is not an uppercase/lowercase letter, digit, or period with a space.
* Remove chapter and its number
* Remove section and its number


In [22]:
def clean_book_data(X):
  X = re.sub(r'[^A-Za-z0-9.]',' ',X)
  pattern_chapter = r'^\d+\s+Chapter\s+\d+\s+.*?'
  re.sub(pattern_chapter, '', book_df.loc[100,'text'], count=1)
  X = re.sub(pattern_chapter, '', X, count=1)
  pattern_section = r'^\s+Section\s+\d+\s+.*?'
  pattern_section = r'^Section\s+\d+(?:\.\d+)?\s+.*?'
  X = re.sub(pattern_section, '', X, count=1)
  return X

In [23]:
book_df['text'] = book_df['text'].apply(clean_book_data)

In [None]:
book_df

## **Dynamic Chunking**


In [25]:
nlp = spacy.load("en_core_web_sm")

In [None]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

Function to dynamically chunk the text based on token size

In [27]:
def dynamic_chunking(text,max_token=512,overlap=50):
  doc = nlp(text)
  chunks = []
  current_chunk = []
  token_length = 0
  for sent in doc.sents:
    token_length = len(tokenizer.tokenize(sent.text))
    if token_length + token_length <= max_token:
      current_chunk.append(sent.text)
      max_token += token_length
    else:
      if chunks:
        prev_tokens = tokenizer.tokenize(chunks[-1])
        overlap_tokens = prev_tokens[-overlap:]
        overlap_text = ' '.join(overlap_tokens)
        chunks.append(' '.join(current_chunk) + ' ' + overlap_text)
      else:
        chunks.append(' '.join(current_chunk))
      current_chunk = [sent.text]
      token_length = token_length
  if current_chunk:
    chunks.append(' '.join(current_chunk))
  return chunks


In [None]:
book_df.head(3)

Apply above dynamic chunking into this book data

In [29]:
dynamic_chunked_data = []
for _,row in tqdm(book_df.iterrows(), desc="Dynamically chunking data"):
  chunks = dynamic_chunking(row['text'])
  new_row = row.copy()
  for idx,chunk in enumerate(chunks):
    new_row['id'] = new_row['page_label'] + '-' + str(idx)
    new_row['text'] = chunk
    dynamic_chunked_data.append(new_row)

Dynamically chunking data: 1054it [01:34, 11.11it/s]


In [30]:
dynamical_chunked_book_df = pd.DataFrame(dynamic_chunked_data)

In [None]:
dynamical_chunked_book_df

Dynamic chunking Arvix Data

In [32]:
dynamic_chunked_arvix = []
for _,row in tqdm(documents.iterrows(), desc="Dynamically chunking data"):
  chunks = dynamic_chunking(row['chunk'])
  new_row = row.copy()
  for idx,chunk in enumerate(chunks):
    new_row['chunk-id'] = row['chunk-id'] + f"-{chunks.index(chunk)}"
    new_row['chunk'] = chunk
    dynamic_chunked_arvix.append(new_row)

Dynamically chunking data: 4838it [01:59, 40.52it/s]


In [33]:
dynamic_chunked_arvix_df = pd.DataFrame(dynamic_chunked_arvix)

In [None]:
dynamic_chunked_arvix_df.head(2)

In [35]:
dynamical_chunked_book_df.rename(columns={'text': 'chunk'}, inplace=True)

Reset Index

In [36]:
dynamical_chunked_book_df = dynamical_chunked_book_df.reset_index(drop=True)
dynamic_chunked_arvix_df = dynamic_chunked_arvix_df.reset_index(drop=True)

In [37]:
len(documents)

4838

In [38]:
len(dynamic_chunked_arvix_df)

4869

In [39]:
len(book_df)

1054

In [40]:
len(dynamical_chunked_book_df)

1055

## **Vector Database**

In [41]:
index_name = 'ai-chatbot'

**Create Pinecone Index**

In [42]:
pc = Pinecone(os.getenv('PINECONE_API'))
index_list = [idx['name'] for idx in pc.list_indexes()]
if index_name not in index_list:
  pc.create_index(name=index_name,spec=ServerlessSpec(cloud='aws',region='us-east-1'),dimension=384)
timeout = 60
start_time = time.time()
while not pc.describe_index(index_name).status['ready']:
  if time.time() - start_time > 60:
    raise TimeoutError("Timeout")
  time.sleep(1)
pc_index = pc.Index(index_name)

In [43]:
status = pc_index.describe_index_stats()
status.get('total_vector_count',0)

5892

In [None]:
pc.describe_index(index_name)

In [45]:
from langchain_community.embeddings import SentenceTransformerEmbeddings

**Sentence Transformer**

In [None]:
embedding_model = SentenceTransformerEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')

In [47]:
# pc.delete_index(index_name)

In [48]:
status = pc_index.describe_index_stats()
status.get('total_vector_count',0)

5892

Upsert pdf data into vector database

In [49]:
batch_size = 100
if status['total_vector_count'] == 0:
  for i in tqdm(range(0,len(dynamical_chunked_book_df),batch_size)):
    i_end = min(i+batch_size,len(dynamical_chunked_book_df))
    batch = dynamical_chunked_book_df[i:i_end]
    ids = batch['id'].astype(str).to_list()
    chunks = batch['chunk'].to_list()
    embed_chunk = embedding_model.encode(chunks)
    metadata = batch[['id','chunk']].to_dict(orient='records')
    pc_index.upsert(vectors=list(zip(ids,embed_chunk,metadata)))
else:
  print('Already Inserted data')

Already Inserted data


In [50]:
status = pc_index.describe_index_stats()
status.get('total_vector_count',0)

5892

Upsert arvix dataset into vector database

In [51]:
status = pc_index.describe_index_stats()
if status.get('total_vector_count', 0) <= 1054:
    for i in tqdm(range(0, len(dynamic_chunked_arvix_df), batch_size)):
        i_end = min(len(dynamic_chunked_arvix_df), i + batch_size)
        batch = dynamic_chunked_arvix_df[i:i_end]
        ids = (batch['doi'].astype(str) + '-' +
               batch['chunk-id'].astype(str)).to_list()
        chunk = batch['chunk'].to_list()
        embeds = embedding_model.encode(chunk)
        meta_data = batch[['chunk', 'source', 'title']
                          ].to_dict(orient='records')
        pc_index.upsert(vectors=list(zip(ids, embeds, meta_data)))
else:
    print("Alredy Created")

Alredy Created


In [52]:
status = pc_index.describe_index_stats()
status.get('total_vector_count',0)

5892

In [53]:
query = 'What is Machine Learning?'
vectorstore = PineconeVectorStore(pc_index, embedding_model, text_key='chunk')
contexts = vectorstore.similarity_search(query, k=3)

In [None]:
contexts

## **Create Model**

In [55]:
query = 'What is deep learning? How it difer from ML'

In [56]:
query

'What is deep learning? How it difer from ML'

In [57]:
gemini_model = ChatGoogleGenerativeAI(
    model="gemini-1.5-pro",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)

In [95]:
deepseek_repo = "deepseek-ai/deepseek-llm-67b-base"
deepseek_model =  HuggingFaceEndpoint(repo_id=deepseek_repo,task='text-generation',
    temperature=0.7,  # Increase temperature for less deterministic output
    top_p=0.9,        # Nucleus sampling to encourage diversity
    top_k=50          # Consider the top 50 tokens at each step
)

In [96]:
repo_id = "meta-llama/Llama-3.3-70B-Instruct"
max_new_tokens = 8192
llama_model = HuggingFaceEndpoint(
    repo_id=repo_id,
    max_new_tokens=max_new_tokens,
    top_k=10,
    top_p=0.95,
    temperature=0.6,
    task='text-generation',
    repetition_penalty=1.03
)

Test Models

In [None]:
llama_model.invoke(query)

In [None]:
deepseek_model.invoke(query)

In [None]:
gemini_model.invoke(query)

In [63]:
models = [
    gemini_model,
    llama_model,
    deepseek_model
]

## **Auguemnt Prompt**

In [73]:
from langchain.prompts.chat import (
  ChatPromptTemplate,
  SystemMessagePromptTemplate,
  HumanMessagePromptTemplate,
  MessagesPlaceholder,
)

In [100]:
from langchain.schema import SystemMessage

In [86]:
def augument_prompt(query,no_of_docs=3):
  vectorstore = PineconeVectorStore(pc_index, embedding_model, text_key='chunk')
  contexts = vectorstore.similarity_search(query, k=no_of_docs)

  # from langchain.schema import SystemMessage, HumanMessage, AIMessage, UserMessage

  system_msg = SystemMessage(
      content=(
          "You are a highly knowledgeable AI expert specializing in Artificial Intelligence, "
          "machine learning, and Deep Learning. Answer questions with detailed, clear, and technically accurate explanations. "
          "Always provide examples or analogies when they help clarify complex topics."
      )
  )

  human_msg_template = HumanMessagePromptTemplate.from_template("{user_query}")

  chat_history_placeholder = MessagesPlaceholder(variable_name="chat_history")

  user_msg_template = HumanMessagePromptTemplate.from_template("{additional_context}")

  chat_prompt = ChatPromptTemplate.from_messages([
      system_msg,
      chat_history_placeholder,
      human_msg_template,
      user_msg_template,
  ])

  formatted_prompt = chat_prompt.format(
      user_query=query,
      chat_history=[],
      additional_context = contexts
  )

  return formatted_prompt


In [None]:
query

In [None]:
models[0].invoke(augument_prompt(query))

In [None]:
augument_prompt(query)

In [None]:
llama_model.invoke(augument_prompt("What is machine AI"))

In [None]:
models[2].invoke(augument_prompt(query))

## **Test Model**