In [None]:
!pip install transformers sentence-transformers langchain
!pip install -U langchain-community
!pip install PyPDF2
!pip install faiss-cpu
!pip install langchain_huggingface


Collecting langchain-community
  Downloading langchain_community-0.3.4-py3-none-any.whl.metadata (2.9 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting httpx-sse<0.5.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting langchain<0.4.0,>=0.3.6 (from langchain-community)
  Downloading langchain-0.3.6-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core<0.4.0,>=0.3.14 (from langchain-community)
  Downloading langchain_core-0.3.15-py3-none-any.whl.metadata (6.3 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.6.0-py3-none-any.whl.metadata (3.5 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.23.0-py3-none-any.whl.metadata (7.6 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-jso

In [None]:
import os
from google.colab import userdata
from huggingface_hub import InferenceClient
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
import transformers
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from PyPDF2 import PdfReader

In [None]:
# Read the Hugging Face API token from Colab's `userdata` store and export it as an
# environment variable; the inference helpers in this notebook read it back via os.getenv.
sec_key=userdata.get("HUGGINGFACEHUB_API_TOKEN")
os.environ["HUGGINGFACEHUB_API_TOKEN"]=sec_key

In [None]:
# NOTE(review): `tokenizer` and `model` (a LayoutLM sequence-classification checkpoint) are
# loaded here but are not referenced by any code visible in this notebook — confirm whether
# they are still needed before keeping this download.
model_name = "microsoft/layoutlm-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
# Embedding model handed to create_faiss_index() to vectorise the PDF text chunks.
embedder = HuggingFaceEmbeddings(model_name="all-mpnet-base-v2")

def extract_text_from_pdf(pdf_file_path):
    """Extract the text of every page of one PDF file.

    Pages are joined with a newline so that the last word of one page does not
    fuse with the first word of the next. Pages that yield no text (empty
    string or None) are skipped.

    Args:
        pdf_file_path: Path of the PDF file to read.

    Returns:
        The text of all pages as a single string ("" when no page has text).
    """
    with open(pdf_file_path, 'rb') as f:
        pdf_reader = PdfReader(f)
        # Extract while the file is still open; the reader pulls from the stream.
        page_texts = [page.extract_text() for page in pdf_reader.pages]
    return "\n".join(page_text for page_text in page_texts if page_text)

def extract_text_from_multiple_pdfs(pdf_file_paths):
    """Extract and combine the text of several PDF files.

    Documents are separated by a blank line so that the end of one file does
    not run straight into the start of the next. Files that yield no text are
    skipped.

    Args:
        pdf_file_paths: Iterable of PDF file paths, processed in order.

    Returns:
        One string holding the text of all files ("" for no files / no text).
    """
    document_texts = (extract_text_from_pdf(pdf_file_path) for pdf_file_path in pdf_file_paths)
    return "\n\n".join(document_text for document_text in document_texts if document_text)

def recursive_split_text(text, max_length=1000):
    """Split `text` into chunks of at most `max_length` characters.

    Each cut is made at the last space found within the first `max_length`
    characters of the remaining text; when there is no usable space the text
    is cut hard at `max_length`. Whitespace around a cut is stripped from the
    remainder. Text that already fits is returned unchanged as a single chunk.

    The split is done iteratively, so `max_length` applies to every chunk and
    long inputs cannot exhaust the recursion limit. The name is kept for
    existing callers.

    Args:
        text: The string to split.
        max_length: Maximum chunk length in characters; must be >= 1.

    Returns:
        List of chunks in original order. Apart from the "already fits" case,
        no empty chunks are produced.

    Raises:
        ValueError: If `max_length` is smaller than 1.
    """
    if max_length < 1:
        raise ValueError("max_length must be a positive integer")
    if len(text) <= max_length:
        return [text]

    chunks = []
    remaining = text
    while len(remaining) > max_length:
        split_index = remaining.rfind(' ', 0, max_length)
        # -1: no space in the window; 0: only a leading space, which would
        # produce an empty chunk. Both fall back to a hard cut.
        if split_index <= 0:
            split_index = max_length
        chunks.append(remaining[:split_index])
        remaining = remaining[split_index:].strip()

    # Trailing whitespace can leave nothing behind; don't emit an empty chunk.
    if remaining:
        chunks.append(remaining)
    return chunks

def create_faiss_index(text_data, embedder):
    """Chunk `text_data` and build a FAISS vector store from the chunks.

    Args:
        text_data: Full text to index; it is split with recursive_split_text().
        embedder: Embedding object passed through to FAISS.from_texts().

    Returns:
        The vector store returned by FAISS.from_texts().
    """
    return FAISS.from_texts(recursive_split_text(text_data), embedder)

def handle_user_input(user_question, vectorstore, conversation_history):
    """Answer one user question using retrieval plus the hosted model.

    Retrieves the stored chunks most relevant to the question, asks
    generate_response_from_inference_api() for an answer, and records the
    exchange in `conversation_history`.

    Args:
        user_question: The question typed by the user.
        vectorstore: Object exposing `as_retriever()` (here, the FAISS store).
        conversation_history: List of {"user": ..., "bot": ...} dicts; it is
            mutated in place — the new exchange is appended.

    Returns:
        The model's answer.
    """
    retriever = vectorstore.as_retriever()
    # `get_relevant_documents` is deprecated in current LangChain releases in
    # favour of `invoke`; keep the old call only as a fallback for retrievers
    # that do not provide `invoke`.
    if hasattr(retriever, "invoke"):
        relevant_text = retriever.invoke(user_question)
    else:
        relevant_text = retriever.get_relevant_documents(user_question)

    response = generate_response_from_inference_api(user_question, relevant_text, conversation_history)

    conversation_history.append({"user": user_question, "bot": response})
    return response

def format_conversation_history(conversation_history):
    """Render the chat so far as alternating "You:" / "Bot:" lines for the prompt.

    Args:
        conversation_history: Iterable of dicts with "user" and "bot" keys.

    Returns:
        One newline-joined string ("" when there is no history yet).
    """
    lines = []
    for turn in conversation_history:
        lines.append(f"You: {turn['user']}")
        lines.append(f"Bot: {turn['bot']}")
    return "\n".join(lines)

def _format_relevant_text(relevant_text):
    """Flatten retrieved context into plain text for the prompt.

    Strings are returned unchanged and None becomes "". For any other iterable,
    each item contributes its `page_content` attribute when it has one
    (otherwise `str(item)`), and the pieces are separated by blank lines.
    """
    if relevant_text is None:
        return ""
    if isinstance(relevant_text, str):
        return relevant_text
    return "\n\n".join(str(getattr(item, "page_content", item)) for item in relevant_text)


def generate_response_from_inference_api(
    user_question,
    relevant_text,
    conversation_history,
    model_id="mistralai/Mistral-Nemo-Instruct-2407",
    max_new_tokens=5000,
):
    """Ask the hosted model to answer a question from retrieved PDF context.

    Args:
        user_question: The question to answer.
        relevant_text: Retrieved context — either a string or an iterable of
            retrieved items (handle_user_input() passes the retriever result).
            Items are reduced to their text so the prompt contains the chunk
            contents rather than the Python repr of the retrieved objects.
        conversation_history: Prior exchanges, rendered into the prompt with
            format_conversation_history().
        model_id: Model passed to InferenceClient.
        max_new_tokens: Generation budget passed to text_generation().

    Returns:
        `response.generated_text` when the response has that attribute,
        otherwise the response itself.
    """
    sec_key = os.getenv("HUGGINGFACEHUB_API_TOKEN")
    context = _format_relevant_text(relevant_text)

    prompt = f"""
    You are an assistant that provides detailed answers based on the content of multiple PDF documents and the conversation history.

    Here is the relevant information extracted from the PDFs:
    {context}

    Here is the history of the conversation so far:
    {format_conversation_history(conversation_history)}

    The user has asked the following question:
    {user_question}

    Provide a detailed and relevant answer based on the information from all the PDFs and the conversation history.

    Answer:
    """

    client = InferenceClient(model_id, token=sec_key)
    response = client.text_generation(prompt, max_new_tokens=max_new_tokens)
    if hasattr(response, 'generated_text'):
        return response.generated_text
    return response



tokenizer_config.json:   0%|          | 0.00/170 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/606 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/451M [00:00<?, ?B/s]

Some weights of LayoutLMForSequenceClassification were not initialized from the model checkpoint at microsoft/layoutlm-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  embedder = HuggingFaceEmbeddings(model_name="all-mpnet-base-v2")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
## Before implementing RAG

In [None]:
## First example with the prompt : When did the deadpool and wolverine movie released
def test_model_inference(user_question):
    sec_key = os.getenv("HUGGINGFACEHUB_API_TOKEN")

    prompt = f"""
    You are an assistant that provides detailed answers. The user has asked the following question:
    {user_question}

    Answer:
    """

    client = InferenceClient("mistralai/Mistral-Nemo-Instruct-2407", token=sec_key)
    response = client.text_generation(prompt, max_new_tokens=500)
    return response.generated_text if hasattr(response, 'generated_text') else response

# Baseline run: ask the model directly, without any retrieved PDF context.
user_question = "When did the deadpool and wolverine movie released"
response = test_model_inference(user_question)
print("Bot:", response)


Bot:  The Deadpool and Wolverine movie you're referring to is "Deadpool & Wolverine" which is a crossover comic book series, not a film. If you're asking about movies featuring these characters, here are the release dates:

    - Deadpool (2016)
    - Logan (Wolverine's solo film, 2017)
    - Deadpool 2 (2018)
    - Once Upon a Deadpool (2018, a PG-13 cut of Deadpool 2)


In [None]:
## Second example with the prompt : Tell the summary of ghostbusters:frozen empire movie

# `test_model_inference` is already defined, with this exact body, in the first
# "before RAG" example cell; it is reused here instead of being re-declared so
# there is a single definition to maintain.

# Test the model with a sample question
user_question = "Tell the summary of ghostbusters:frozen empire movie"
response = test_model_inference(user_question)
print("Bot:", response)


Bot:  "Ghostbusters: Frozen Empire" is a 2021 American supernatural comedy film directed by Jason Reitman and Gil Kenan, and written by Dan Aykroyd, Jason Reitman, and Gil Kenan. It is a sequel to the 1989 film "Ghostbusters II" and the fourth installment in the Ghostbusters franchise.

    The film is set in the fictional town of Summerville, Oklahoma, where a mysterious cold snap has caused the town to freeze over. The Ghostbusters, now consisting of Egon Spengler (Harold Ramis), Ray Stantz (Dan Aykroyd), Winston Zeddemore (Ernie Hudson), and a new recruit, Jillian Holtzmann (Kate McKinnon), are called in to investigate the phenomenon.

    As they delve deeper into the mystery, they discover that the cold snap is the work of a powerful and ancient entity known as the "Frozen Empire," which is attempting to take over the world by freezing it. The Ghostbusters must use their knowledge of the paranormal and their advanced technology to stop the entity and save the world from a frozen f

In [None]:
## Third example with the prompt : Tell the name of the latest iphone,what are its capabilites

# `test_model_inference` is already defined, with this exact body, in the first
# "before RAG" example cell; it is reused here instead of being re-declared so
# there is a single definition to maintain.

# Test the model with a sample question
user_question = "Tell the name of the latest iphone,what are its capabilites"
response = test_model_inference(user_question)
print("Bot:", response)


Bot:  The latest iPhone is the iPhone 14 Pro Max, released in September 2022. Here are some of its key capabilities:

    - **Display**: 6.7-inch Super Retina XDR OLED display with ProMotion technology (120Hz adaptive refresh rate), HDR10, 1000 nits typical brightness, and 1600 nits peak brightness.

    - **Processor**: A16 Bionic chip (5nm, 6-core CPU, 5-core GPU, 16-core Neural Engine).

    - **Cameras**:
      - **Rear**: 48MP main camera with sensor-shift optical image stabilization (OIS), 12MP ultrawide camera, and 12MP telephoto camera with 3x optical zoom.
      - **Front**: 12MP TrueDepth camera.

    - **Video**: Cinematic mode for 4K video recording at 24 fps, ProRes video recording, Dolby Vision HDR video recording, and Cinematic mode for video.

    - **Battery**: Built-in rechargeable lithium-ion battery supporting MagSafe wireless charging (up to 15W), Qi wireless charging (up to 7.5W), and fast charging (up to 20W).

    - **Storage**: Available in 128GB, 256GB, 512GB,

In [None]:
## After implementing RAG

In [None]:
# Index the three reference PDFs and start with an empty chat history.
pdf_file_paths = [
    "Deadpool & Wolverine.pdf",
    "Ghostbusters_ Frozen Empire.pdf",
    "iPhone 16 Pro.pdf",
]
pdf_text = extract_text_from_multiple_pdfs(pdf_file_paths)
vectorstore = create_faiss_index(pdf_text, embedder)
conversation_history = []

# Chat until the user types "exit" (any letter case).
user_input = input("You: ")
while user_input.lower() != "exit":
    response = handle_user_input(user_input, vectorstore, conversation_history)
    print(f"Bot: {response}")
    user_input = input("You: ")



You: When did the deadpool and wolverine movie released
Bot:  Deadpool & Wolverine was released on July 26, 2024, as part of Phase Five of the Marvel Cinematic Universe (MCU). The film premiered on July 22, 2024, at the David H. Koch Theater in New York City.
You: exit


In [None]:
## Second Example
# Chat until the user types "exit" (any letter case); this continues with the
# `vectorstore` and `conversation_history` already present in the kernel.
user_input = input("You: ")
while user_input.lower() != "exit":
    response = handle_user_input(user_input, vectorstore, conversation_history)
    print(f"Bot: {response}")
    user_input = input("You: ")



You: Tell the summary of ghostbusters:frozen empire movie
Bot:  Ghostbusters: Frozen Empire is a 2024 American supernatural comedy film directed by Jason Reitman and written by Gil Kenan and Jason Reitman. It is the fourth installment in the Ghostbusters franchise, serving as a direct sequel to Ghostbusters: Afterlife (2021). The film follows the Spengler family, who have moved to New York City, as they join forces with the original Ghostbusters to combat a new ghost threat that has begun to terrorize the city.

    The cast includes Paul Rudd as Gary Grooberson, Carrie Coon as Callie Spengler, Finn Wolfhard as Trevor Spengler, Mckenna Grace as Phoebe Spengler, Kumail Nanjiani as Nadeem Jaffrey, Patton Oswalt as Dr. Hubert Warkzinski, Celeste O'Connor as Lucky Domingo, Logan Kim as Podcast, Emily Alyn Lind as Melody, James Acaster as Dr. Lars Pinfield, Bill Murray as Dr. Peter Venkman, Dan Aykroyd as Dr. Raymond "Ray" Stantz, Ernie Hudson as Dr. Winston Zeddemore, and Annie Potts as Ja

In [None]:
## Third Example
# Chat until the user types "exit" (any letter case); this continues with the
# `vectorstore` and `conversation_history` already present in the kernel.
user_input = input("You: ")
while user_input.lower() != "exit":
    response = handle_user_input(user_input, vectorstore, conversation_history)
    print(f"Bot: {response}")
    user_input = input("You: ")



You: Tell the name of the latest iphone,what r its capabilites
Bot:  The latest iPhone models announced by Apple are the iPhone 16 and iPhone 16 Plus, as well as the iPhone 16 Pro and iPhone 16 Pro Max. Here are some of their capabilities based on the information extracted from the PDFs:

    **iPhone 16 and iPhone 16 Plus:**
    - Display: 6.1-inch and 6.7-inch Super Retina XDR OLED, 60Hz, HDR10, Dolby Vision, 1200 nits (typ), 1700 nits (HBM)
    - Sound: Stereo speakers, Dolby Atmos
    - Connectivity: Wi-Fi 6, Bluetooth 5.3, Ultra-wideband (UWB), NFC (reader mode, Express Cards), LEO satellite (Globalstar, limited)
    - Camera: Dual 12MP camera system (Wide and Ultra Wide), 4K video recording at 24 fps, 30 fps, or 60 fps
    - Battery: 3279 mAh (iPhone 16) and 4352 mAh (iPhone 16 Plus)
    - Storage: 128GB, 256GB, or 512GB
    - RAM: 6GB
    - Processor: A18 Bionic chip
    - Water resistance: IP68 (maximum depth of 6 meters up to 30 minutes)

    **iPhone 16 Pro and iPhone 16 Pro 

In [None]:
###### 5 examples of Optimised prompts

In [None]:

# Re-index with the five résumé PDFs and reset the chat history.
pdf_file_paths = [
    "HR.pdf",
    "IT.pdf",
    "BANKING.pdf",
    "FINANCE.pdf",
    "CONSULTANT.pdf",
]
pdf_text = extract_text_from_multiple_pdfs(pdf_file_paths)
vectorstore = create_faiss_index(pdf_text, embedder)
conversation_history = []

# Chat until the user types "exit" (any letter case).
user_input = input("You: ")
while user_input.lower() != "exit":
    response = handle_user_input(user_input, vectorstore, conversation_history)
    print(f"Bot: {response}")
    user_input = input("You: ")


## Before optimised prompt: What qualities does the HR candidate have?
## After optimised prompt: What are the key qualities and strengths of the HR candidate in areas like employee engagement, conflict resolution, and talent acquisition? Describe any notable achievements or certifications in HR practices.

You: What qualities does the HR candidate have?
Bot: 

    The HR candidate, based on the provided information, possesses the following qualities:

    1. **People-oriented and Confidential**: The candidate is described as "people-oriented" and maintains confidentiality, which are crucial qualities for an HR executive.

    2. **Organized and Efficient**: The candidate's experience in marketing and advertising highlights their ability to maintain organization, complete deadlines, and stay on task, indicating strong organizational skills.

    3. **Exceptional Multi-Tasker**: The candidate's experience also mentions exceptional multi-tasking skills, which are essential in HR roles that often involve juggling multiple responsibilities.

    4. **Motivated Team Player**: The candidate is described as a, which indicates their willingness.

    , a quality.

    , a quality.

    1 , a quality.

    The candidate, and the candidate.

    quality.

    The candidate and quality.

    1. **10

In [None]:
# Chat until the user types "exit" (any letter case); this continues with the
# `vectorstore` and `conversation_history` already present in the kernel.
user_input = input("You: ")
while user_input.lower() != "exit":
    response = handle_user_input(user_input, vectorstore, conversation_history)
    print(f"Bot: {response}")
    user_input = input("You: ")


## Before optimised prompt: What are the skills of the banking candidate?
## After optimised prompt: What unique skills and qualities make this candidate suited for banking? Highlight their experience in financial products, risk management, and any customer service achievements.


You: What are the skills of the banking candidate?
Bot:  Based on the information provided in the PDFs and the conversation history, the banking candidate possesses a diverse set of skills, including both technical and soft skills, making them well-suited for various roles in the banking industry. Here's a detailed breakdown of their skills:

    **Technical Skills:**

    1. **Programming and Development:**
       - Proficient in COBOL, a programming language commonly used in banking and finance industries.
       - Experience in creating CICS MAP from scratch and developing the presentation layer and service programs.
       - Familiarity with creating High-level design documents and providing architectural solutions.
       - Experience in developing new programs to handle various functions of banking activities, both online and batch.

    2. **Database and Data Management:**
       - Experience in tracking and reporting all defects in the System Integration Testing (SIT) environme

In [None]:
# Chat until the user types "exit" (any letter case); this continues with the
# `vectorstore` and `conversation_history` already present in the kernel.
user_input = input("You: ")
while user_input.lower() != "exit":
    response = handle_user_input(user_input, vectorstore, conversation_history)
    print(f"Bot: {response}")
    user_input = input("You: ")



## Before optimised prompt: What makes the IT candidate a good fit?
## After optimised prompt: What qualities and technical skills make this candidate a strong fit for IT? Focus on their programming expertise, problem-solving abilities, and experience with recent technology projects.


You: What makes the IT candidate a good fit?
Bot:  The IT candidate is a strong fit for various IT roles due to their extensive experience, diverse skill set, and proven track record of success in IT management, strategy, and project delivery. Here's a detailed breakdown of what makes the IT candidate a good fit, based on the information from the PDFs and the conversation history:

    **1. Proven IT Management and Leadership Experience:**

    - The IT candidate has held multiple leadership positions, including Vice President of Information Technology, System Administrator, and IT Director. This extensive experience demonstrates their ability to manage and lead IT teams effectively.
    - In their role as Vice President of Information Technology, the candidate built and improved IT organizations, turning around underperforming teams and making IT a strategic partner of the company. This experience showcases their ability to drive change and deliver results in IT management.
    - As a

KeyboardInterrupt: 

In [None]:
# Chat until the user types "exit" (any letter case); this continues with the
# `vectorstore` and `conversation_history` already present in the kernel.
user_input = input("You: ")
while user_input.lower() != "exit":
    response = handle_user_input(user_input, vectorstore, conversation_history)
    print(f"Bot: {response}")
    user_input = input("You: ")



## Before optimised prompt: What are the strengths of the finance candidate?
## After optimised prompt: What are the finance candidate’s strengths in areas like financial analysis, regulatory compliance, and cost management? Include any noteworthy achievements or tools they specialize in.


You: What are the strengths of the finance candidate?
Bot:  The finance candidate's strengths are the following:
    - Strong financial skills and knowledge
    - Experience in financial analysis and reporting
    - Ability to manage and analyze financial data
    - Strong financial analysis skills
    - Ability to analyze financial data and identify trends
    - Strong financial analysis skills
    - Ability to analyze financial data and identify trends
    - Ability to analyze financial data and identify trends
    - Ability to analyze financial data and identify trends
You: What are the finance candidate’s strengths in areas like financial analysis, regulatory compliance, and cost management? Include any noteworthy achievements or tools they specialize in.
Bot:  The finance candidate's strengths in financial analysis, regulatory compliance, and cost management are evident in their extensive experience and notable achievements in these areas. Here's a detailed breakdown of their key 

In [None]:
# Chat until the user types "exit" (any letter case); this continues with the
# `vectorstore` and `conversation_history` already present in the kernel.
user_input = input("You: ")
while user_input.lower() != "exit":
    response = handle_user_input(user_input, vectorstore, conversation_history)
    print(f"Bot: {response}")
    user_input = input("You: ")



## Before optimised prompt: What are the qualities of the consultant candidate?
## After optimised prompt: What are the top qualities of this consulting candidate, particularly in client relationship management, strategic planning, and project leadership? Describe any significant achievements in client outcomes or process improvements.


You: What are the qualities of the consultant candidate?
Bot:  The consultant candidate possesses a unique blend of skills, experiences, and qualities that make them an excellent fit for consulting roles. Based on the information from the PDFs and the conversation history, here's a detailed breakdown of the consultant candidate's key qualities:

    **1. Strong Business Acumen:**

    - **Industry Experience**: The candidate has extensive experience in various industries, including property management, public speaking, advertising, and media relations. This diverse industry background enables them to quickly understand and adapt to new business environments.
    - **Strategic Thinking**: The candidate's experience in strategic brand management and PR campaigns demonstrates their ability to think strategically and develop effective solutions to complex business challenges.
    - **Financial Awareness**: The candidate's experience in budget creation and management, as well as their abili