In [1]:
import os
import re
import pdfplumber
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.docstore import InMemoryDocstore
from langchain.schema import Document
from langchain.llms import OpenAI
from langchain.agents import initialize_agent, Tool
from langchain.prompts import PromptTemplate
from langchain.memory import ConversationBufferMemory
from transformers import GPT2Tokenizer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.tools import DuckDuckGoSearchResults
from transformers import BlipProcessor, BlipForConditionalGeneration

# Step 1: Set OpenAI API Key
os.environ["OPENAI_API_KEY"] = "uXKKfHuxEQhzRQT3BlbkFJJVpR-jjaVD0PqhjzuYyb261t4fOWRIv5X1-EHCBhkA"

# Initialize the LLM
llm = OpenAI(temperature=0.7)

# Embedding Model for Text
embedding_model = OpenAIEmbeddings()

# Step 2: Load PDF with pdfplumber and Split into Chunks
def load_pdf_and_split_into_chunks(pdf_path, chunk_size=1000):
    chunks = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if text:
                # Split text into chunks using RecursiveCharacterTextSplitter
                text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=0)
                page_chunks = text_splitter.split_text(text)
                for chunk in page_chunks:
                    chunks.append(chunk)
    return chunks

# Step 3: Embed the PDF Chunks
def embed_pdf_chunks(chunks):
    # Ensure the chunks are non-empty strings
    clean_chunks = [chunk for chunk in chunks if isinstance(chunk, str) and chunk.strip()]
    embeddings = embedding_model.embed_documents(clean_chunks)
    return embeddings

# Step 4: Create FAISS Vector Store
def create_faiss_vector_store(embeddings, chunks):
    # Create document objects
    documents = [Document(page_content=chunk) for chunk in chunks]
    
    # Zip embeddings with text chunks
    text_embedding_pairs = list(zip(chunks, embeddings))
    
    # Create FAISS vector store
    vector_store = FAISS.from_embeddings(text_embedding_pairs, embedding_model)
    return vector_store

# Step 5: Search PDF using embeddings and return relevant chunks
def search_pdf_with_embeddings(query, vector_store):
    # Ensure the query is a string
    if not isinstance(query, str):
        return "Query should be a string."

    # Clean the query input if necessary
    query = query.strip()

    # Log the query to verify
    print(f"Processed Query: {query}")
    
    # Check for empty queries
    if not query:
        return "Query cannot be empty."

    try:
        # Obtain the query embedding
        query_embedding = embedding_model.embed_query(query)
        
        # Log the embedding to verify it
        print(f"Query Embedding: {query_embedding}")

        # Perform similarity search with the query embedding
        results = vector_store.similarity_search_by_vector(query_embedding, k=5)  # Return top 5 results
    except Exception as e:
        return f"Error during embedding or search: {e}"

    if results:
        return "\n\n".join([res.page_content for res in results])
    else:
        return "No relevant information found in the PDF."

# Step 6: Initialize the PDF reader with FAISS search
def initialize_pdf_search_tool(pdf_path):
    chunks = load_pdf_and_split_into_chunks(pdf_path)
    embeddings = embed_pdf_chunks(chunks)
    vector_store = create_faiss_vector_store(embeddings, chunks)
    return vector_store

# Load and embed the PDF
pdf_vector_store = initialize_pdf_search_tool('Barad_Praveen_kumar_DS.pdf')

# PDF Search Tool
pdf_tool = Tool(
    name="PDF Reader",
    func=lambda query: search_pdf_with_embeddings(query, pdf_vector_store),
    description="Extract information from the PDF file using embeddings."
)

def search_url(query):
    search_tool = DuckDuckGoSearchResults()
    return search_tool.run(query)

url_tool = Tool(
    name="Search URL",
    func=search_url,
    description="Fetch information from the web using URL search."
)
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")#this is preprocessing reducing the size and numerical format
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

def describe_image(image_path):
    try:
        image = Image.open(image_path)
        inputs = processor(image, return_tensors="pt")#convert into tensors
        out = model.generate(**inputs)#tensors input and give  output of token its
        description = processor.decode(out[0], skip_special_tokens=True) #it will take ids and  and decode into words
        return description
    except Exception as e:
        return f"Error describing image: {e}"

# Image Tool
image_tool = Tool(
    name="Image Describer",
    func=lambda query: describe_image('pngtree-lotus-flower-jpg-pink-lotus-flower-image_13023952.jpg'),
    description="Describe an image."
)


# In-memory conversation buffer for chat history
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

# Combine all tools (Here, only the PDF tool is used for demonstration)
tools = [pdf_tool, url_tool,image_tool]

# Initialize the agent with tools, memory, and output parsing error handling
agent_with_memory = initialize_agent(
    tools=tools,
    llm=llm,
    agent_type="conversational-react-description",
    memory=memory,
    verbose=True,
    handle_parsing_errors=True
)

# Step 7: Create a prompt template to guide the agent
prompt_template = PromptTemplate(
    input_variables=["chat_history", "user_input"],
    template="""
    You are a helpful assistant. Use the following conversation history and user input to respond.

    Conversation history:
    {chat_history}

    User: {user_input}

    Only use tools (PDF Reader, Search, Image Describer, or Calculator) when explicitly asked by the user or when it's clear that the query requires them.

    Assistant:"""
)

# Load GPT-2 tokenizer to estimate token usage
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

def count_tokens(text):
    return len(tokenizer.encode(text))

# Step 8: Function to interact with the agent using memory and prompt templates
# Limit the number of messages in chat history based on token size
def chatbot_conversation(user_input):
    # Clear memory for a fresh conversation
    memory.clear()
    
    # Fetch the chat history from memory
    chat_history = memory.load_memory_variables({})["chat_history"]
    
    # Prepare the prompt
    prompt = prompt_template.format(
        chat_history=chat_history,
        user_input=user_input
    )
    
    # Check token count
    token_count = count_tokens(prompt)
    print(f"Token count: {token_count}")
    
    # If token count exceeds the limit, trim chat history
    max_tokens_allowed = 4097 - 256  # Leave space for completion tokens
    while token_count > max_tokens_allowed:
        # Trim the oldest message from chat history
        chat_history = chat_history[1:]
        prompt = prompt_template.format(
            chat_history=chat_history,
            user_input=user_input
        )
        token_count = count_tokens(prompt)
        print(f"Reduced token count: {token_count}")
    
    # Get response from agent if within token limit
    if token_count <= 4097:
        response = agent_with_memory.run(prompt)
        return response
    else:
        return "Prompt is too long to process. Please shorten the conversation."


# Step 9: Continuous Conversation Loop for Chatbot Interaction
# Example usage:



  from .autonotebook import tqdm as notebook_tqdm
  llm = OpenAI(temperature=0.7)
  embedding_model = OpenAIEmbeddings()
  agent_with_memory = initialize_agent(


: 

In [11]:
def run_chatbot():
    print("Chatbot initialized. Type 'exit' to stop the conversation.")
    
    while True:
        user_input = input("You: ")  # Get input from user in the notebook
        if user_input.lower() == "exit":
            print("Exiting chatbot...")
            break
        
        response = chatbot_conversation(user_input)  # Call the conversation function
        print(f"Assistant: {response}")

# Run the chatbot
run_chatbot()


Chatbot initialized. Type 'exit' to stop the conversation.


  response = agent_with_memory.run(prompt)
Error in StdOutCallbackHandler.on_chain_start callback: AttributeError("'NoneType' object has no attribute 'get'")


Token count: 91
[32;1m[1;3m I should use Search URL since the input requires me to fetch information from the web.
Action: Search URL
Action Input: "praveen kumar barad studied"[0m
Observation: [33;1m[1;3msnippet: Praveen Kumar's win in the Men's High Jump - T64 Final at the Paris Paralympics 2024 marks India's sixth gold medal, setting a new national record. The young athlete from Noida surpassed his Tokyo ..., title: Who is Praveen Kumar? Noida's para athlete and India's 6th gold ..., link: https://timesofindia.indiatimes.com/sports/paris-paralympics/who-is-praveen-kumar-noidas-para-athlete-and-indias-6th-gold-medallist/articleshow/113142134.cms, snippet: Praveen Kumar on Friday won India's 6th gold medal at Paris Paralympics when he topped the podium in the Men's High Jump T64 event. USA's Derek Loccident claimed the silver with a best jump of 2.06m. Poland's Maciej Lepiato and Uzbekistan's Temurbek Giyazov cleared 2.03m, finished joint third and won two bronze medals in this e

Error in StdOutCallbackHandler.on_chain_start callback: AttributeError("'NoneType' object has no attribute 'get'")


Token count: 96
[32;1m[1;3m This is a complex question and will likely require multiple steps to find the answer.
Action: Search URL
Action Input: "praveen kumar barad techwave consulting india studied"[0m
Observation: [33;1m[1;3msnippet: Praveen Barad Worked as software engineer Techwave Published Jun 20, 2024 + Follow Natural Language Processing (NLP) stands at the intersection of artificial intelligence (AI) and machine learning ..., title: Exploring Natural Language Processing (NLP) - LinkedIn, link: https://www.linkedin.com/pulse/exploring-natural-language-processing-nlp-praveen-barad-y0lrc, snippet: Techwave Consulting India Private Limited is an unlisted private company incorporated on 12 January, 2015. It is classified as a private limited company and is located in Hyderabad, Telangana. It's authorized share capital is INR 5.00 cr and the total paid-up capital is INR 4.57 cr. Techwave Consulting India's operating revenues range is INR ..., title: TECHWAVE CONSULTING INDIA 

Error in StdOutCallbackHandler.on_chain_start callback: AttributeError("'NoneType' object has no attribute 'get'")


Token count: 91
[32;1m[1;3m I should use the Search URL tool to fetch information from the web using the given query.
Action: Search URL
Action Input: "education of praveen kumar barad"[0m
Observation: [33;1m[1;3msnippet: Praveen Kumar's win in the Men's High Jump - T64 Final at the Paris Paralympics 2024 marks India's sixth gold medal, setting a new national record. The young athlete from Noida surpassed his Tokyo ..., title: Who is Praveen Kumar? Noida's para athlete and India's 6th gold ..., link: https://timesofindia.indiatimes.com/sports/paris-paralympics/who-is-praveen-kumar-noidas-para-athlete-and-indias-6th-gold-medallist/articleshow/113142134.cms, snippet: Praveen Kumar on Friday won India's 6th gold medal at Paris Paralympics when he topped the podium in the Men's High Jump T64 event. USA's Derek Loccident claimed the silver with a best jump of 2.06m. Poland's Maciej Lepiato and Uzbekistan's Temurbek Giyazov cleared 2.03m, finished joint third and won two bronze medals i

Error in StdOutCallbackHandler.on_chain_start callback: AttributeError("'NoneType' object has no attribute 'get'")


Token count: 95
[32;1m[1;3m This is a general user input, I should ask for more information
Action: Ask for more information
Action Input: What would you like to know?[0m
Observation: Ask for more information is not a valid tool, try one of [PDF Reader, Search URL, Image Describer].
Thought:[32;1m[1;3m The user input is a company name and a person name, I should use Search URL tool to fetch information
Action: Use Search URL
Action Input: techwave consulting india praveen kumar barad[0m
Observation: Use Search URL is not a valid tool, try one of [PDF Reader, Search URL, Image Describer].
Thought:[32;1m[1;3m The user input is a company name and a person name, I should use Image Describer tool to describe the company logo or person's image
Action: Use Image Describer
Action Input: techwave consulting india praveen kumar barad[0m
Observation: Use Image Describer is not a valid tool, try one of [PDF Reader, Search URL, Image Describer].
Thought:[32;1m[1;3m The user input is a pe

Error in StdOutCallbackHandler.on_chain_start callback: AttributeError("'NoneType' object has no attribute 'get'")


Token count: 88
[32;1m[1;3m I should use a Search URL because the user is asking for information that can be found online.
Action: Search URL
Action Input: "temperature of rajasthan"[0m
Observation: [33;1m[1;3msnippet: Weather Underground provides local & long-range weather forecasts, weatherreports, maps & tropical weather conditions for the Jaipur area. ... Rajasthan, India Weather Conditions star_ratehome. 83 ..., title: Jaipur, India Weather Conditions | Weather Underground, link: https://www.wunderground.com/weather/in/jaipur, snippet: September 6, 2024. Weather in Rajasthan, Jaipur: Get the current weather, hourly and weekly weather forecast for Jaipur, along with current temperature, rainfall, wind speed, humidity, air-quality, 15-days weather forecast and season trend for Jaipur., title: Current weather and temperature in Jaipur: Hourly and weekly weather ..., link: https://www.skymetweather.com/forecast/weather/india/rajasthan/jaipur/jaipur, snippet: The best time to visi

In [12]:
pip install -r requirements.txt



