In [7]:
import os
import openai
from PyPDF2 import PdfReader
import pdfplumber
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain
from langchain.memory import ConversationBufferMemory
from langchain_core.documents import Document
from dotenv import load_dotenv, find_dotenv
from langchain import PromptTemplate
from docx import Document as DocxDocument  
from langchain.chains import LLMChain
from langchain_openai import ChatOpenAI,OpenAIEmbeddings
from uuid import uuid4
from langchain_community.retrievers import BM25Retriever


# Read settings from the .env file located by find_dotenv() into the process
# environment, then hand the OpenAI key (None when the variable is unset)
# to the openai module.
load_dotenv(find_dotenv())
openai.api_key = os.environ.get('OPENAI_API_KEY')

In [8]:
# Folder that holds the source documents ("." = the notebook's working directory)
directory_path = "."

# (file name, tag) pairs describing each document to be indexed
files_with_tags = [("AboutUs.pdf", "AboutUs"), ("Programs.pdf", "Programs")]

In [9]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI

# Define paths
faiss_index_path = "faiss_index12"
# FAISS.save_local() writes "<index_name>.faiss" (index_name defaults to "index").
# Testing for that file instead of the bare folder means a directory left empty or
# half-written by an interrupted run triggers a rebuild rather than a failed load.
faiss_index_file = os.path.join(faiss_index_path, "index.faiss")
documents = []

# Both the load path and the build path need the same models, so create them once.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
LLM = ChatOpenAI(model="gpt-4o-mini", temperature=0.2)

if os.path.exists(faiss_index_file):
    # NOTE(security): allow_dangerous_deserialization=True lets load_local unpickle
    # the stored docstore, which can execute arbitrary code. Only load an index
    # directory that this notebook created itself.
    document_search = FAISS.load_local(faiss_index_path, embeddings, allow_dangerous_deserialization=True)
    print("FAISS index loaded successfully.")
else:
    # Initialize a text splitter
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=800,
        chunk_overlap=200,
        length_function=len
    )

    # Load and tag the PDF documents
    for file_name, tag in files_with_tags:
        if not file_name.lower().endswith('.pdf'):
            # Only PDFs are handled here; say so instead of dropping the file silently.
            print(f"Skipping unsupported file type: {file_name}")
            continue

        file_path = os.path.join(directory_path, file_name)
        with pdfplumber.open(file_path) as pdf:
            # extract_text() can yield None for a page without extractable text
            # (e.g. a scanned image) in some pdfplumber releases; treat it as empty.
            page_texts = [page.extract_text() or "" for page in pdf.pages]

        # Join pages with a newline so the last line of one page is not fused
        # with the first line of the next before splitting on "\n".
        text = "\n".join(page_texts)

        # Split the text into chunks and tag each one with its origin
        for chunk in text_splitter.split_text(text):
            documents.append(
                Document(page_content=chunk, metadata={"source": file_name, "tag": tag})
            )

    if not documents:
        # Fail with a clear message rather than an obscure error from inside FAISS.
        raise ValueError(
            "No text could be extracted from the configured files; nothing to index."
        )

    # Create a FAISS vector store from the documents
    document_search = FAISS.from_documents(documents, embeddings)

    # Save the FAISS index locally
    os.makedirs(faiss_index_path, exist_ok=True)  # Ensure the directory exists
    document_search.save_local(faiss_index_path)
    print("FAISS index created and saved successfully.")

# One UUID per freshly built document (empty when the index was loaded from disk).
# Kept for later cells; the ids are not passed to the vector store here.
uuids = [str(uuid4()) for _ in documents]

# Smoke test: run one similarity search against the loaded or newly created index
docs = document_search.similarity_search("qux")


FAISS index loaded successfully.


In [10]:
# Prompt text for the main question-answering chain. It contains three
# placeholders that must be supplied when the prompt is rendered:
# {history}, {context} and {question}.
# NOTE(review): the guideline "current YEAR  is 2024" is hard-coded in the prompt
# text and will go stale — confirm whether it should be injected dynamically.
demo_template = """
  You are a helpful and friendly conversational assistant that provides concise and accurate answers regarding information on the TUKL Lab website at NUST.
            Follow these guidelines:
            - Read the context and chat history carefully before answering.
            - Correct any spelling mistakes in the user's messages.
            - Ask clarifying questions if more information is needed.
            - Always maintain a polite and respectful tone.
            - Keep your responses very relevant.
            - Utilize the chat history and context to gather information and provide accurate answers.
            - If the provided context is unclear or incomplete, ask for clarification to ensure your answers are relevant and helpful.
            - Please make sense of abbreviations if any are used.
            - current YEAR  is 2024. 
            - Please keep your answers brief and conversational, like in a real-life conversation.
            

Please respond to the following question after reading the context and chat history with respect and clarity:
History: {history}
Context: {context}
Question: {question}
"""

# Conversation buffer that holds the chat history
memory = ConversationBufferMemory()

# Wrap the template text; it takes question, context and history values
prompt = PromptTemplate(template=demo_template, input_variables=['question', 'context', 'history'])

# Chain that pairs the prompt with the chat model
qa_chain = LLMChain(llm=LLM, prompt=prompt)


In [11]:


def predict_source_and_file(query):
    """Ask the chat model which tag and which file best match a query.

    The candidate tags and file names come from the module-level
    ``files_with_tags`` list; the module-level ``LLM`` is queried once per
    prediction.

    Args:
        query: The user's question as plain text.

    Returns:
        A ``(predicted_source, predicted_file_name)`` tuple. Both are the
        model's replies with surrounding whitespace stripped; the source is
        additionally lower-cased. Nothing forces the model to echo one of the
        candidates verbatim, so validate the values before using them as
        exact-match filters.
    """
    # Extract sources (tags) and file names separately
    sources = [tag for _, tag in files_with_tags]
    file_names = [file_name for file_name, _ in files_with_tags]

    def ask(instruction):
        # Chat models are queried through .invoke() with a message list. Calling
        # the model object directly with raw role/content dicts (as this function
        # used to) relies on the deprecated __call__ path, which expects message
        # objects. The instruction stays in the system role as before.
        return LLM.invoke([("system", instruction)]).content.strip()

    # Create prompts for predicting the source and file name
    source_prompt = f"Given the query: '{query}', predict the most appropriate source from the following: {sources}."
    file_name_prompt = f"Given the query: '{query}', predict the most appropriate file name from the following: {file_names}."

    predicted_source = ask(source_prompt).lower()
    predicted_file_name = ask(file_name_prompt)
    return predicted_source, predicted_file_name

def get_response(input_text, k=40):
    """Answer a user question from the indexed documents and record the turn.

    Steps: fetch the ``k`` most similar chunks from ``document_search``, draft
    an answer with ``qa_chain`` (context + chat history), refine the draft with
    a second synthesis prompt, print the result, and save the exchange in the
    module-level ``memory``.

    Args:
        input_text: The user's question. Empty or None input is ignored.
        k: Number of chunks to retrieve for the context. Defaults to 40, the
            value that was previously hard-coded.

    Returns:
        The final answer text, or None when ``input_text`` is empty.
    """
    if not input_text:
        return None

    # Perform initial similarity search
    docs = document_search.similarity_search(input_text, k=k)

    # Generate context from the similarity search results
    context = " ".join(doc.page_content for doc in docs)

    # Get history from memory (empty string, not a list, if nothing is stored)
    history = memory.load_memory_variables({}).get('history', '')

    # First pass: draft an answer. invoke() replaces the deprecated Chain.run()
    # and returns a dict holding the generated text under the chain's output key.
    draft_inputs = {"context": context, "question": input_text, "history": history}
    response = qa_chain.invoke(draft_inputs)[qa_chain.output_key]

    # Prompt text for the final synthesis (content indentation kept as-is,
    # since it is part of the text sent to the model)
    final_template = """
        You are an expert assistant tasked with synthesizing responses. You have received the following two responses, and you need to generate a concise and informative final answer:
        You have to choose best answers from the following responses and generate a final answer .You should answer based on the context and chat history.
        {question}
        {response}
        {context}

        Please generate a final answer based on the above information.
        """

    # The declared variables must match the placeholders actually used in the
    # template ({question}, {response}, {context}); the former
    # 'combined_response' name did not appear in it at all.
    final_prompt = PromptTemplate(
        input_variables=['question', 'response', 'context'],
        template=final_template
    )

    # Create a new LLMChain for the final synthesis
    final_qa_chain = LLMChain(prompt=final_prompt, llm=LLM)

    # Second pass: generate the final answer
    final_inputs = {"response": response, "context": context, "question": input_text}
    final_answer = final_qa_chain.invoke(final_inputs)[final_qa_chain.output_key]

    # Print the final answer
    print(f"Final Answer: {final_answer}")

    # Update chat history
    memory.save_context({"input": input_text}, {"output": final_answer})
    print("Chat history:", memory.load_memory_variables({}))

    return final_answer
        



In [12]:
# Run one sample question through the whole pipeline
input_text = "Who is rector of nutech"
get_response(input_text)

  warn_deprecated(


Final Answer: The current Rector of the National University of Technology (NUTECH) is Lt Gen (Retd) Moazzam Ejaz, HI (M). He has been instrumental in establishing NUTECH as a fully functional university focused on integrating academia with industry and character development. NUTECH aims to be a world-class, technology-driven research university dedicated to advancing knowledge in applied sciences, engineering, and technology, while fostering entrepreneurship and innovation. The university offers a range of undergraduate and graduate programs, as well as vocational training, to equip students with the skills needed for the modern workforce. NUTECH's mission is to contribute to Pakistan's socio-economic progress by producing skilled professionals and promoting sustainable development through strong industry partnerships.
Chat history: {'history': "Human: Who is rector of nutech\nAI: The current Rector of the National University of Technology (NUTECH) is Lt Gen (Retd) Moazzam Ejaz, HI (M)

In [13]:
from langchain.memory import ConversationBufferMemory

# Start over with a brand-new conversation buffer
memory = ConversationBufferMemory()


def clear_chat_history():
    """Erase everything stored in the module-level ``memory`` and report it."""
    memory.clear()
    print("Chat history cleared.")


# Reset the conversation so the next question starts without earlier turns
clear_chat_history()


Chat history cleared.
