In [None]:
!pip install langchain_community langchain_huggingface langchain_groq
!pip install pypdf faiss-gpu-cu11 groq gradio

In [3]:
import os
import re
import time
from langchain_groq import ChatGroq
from langchain_community.tools.tavily_search import TavilySearchResults
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.tools.retriever import create_retriever_tool
from langchain_community.tools.tavily_search import TavilySearchResults
from langchain import hub
from langchain.agents import create_tool_calling_agent
from langchain.agents import AgentExecutor

import gradio as gr

In [4]:
# SECURITY: API keys must never be stored in the notebook itself. Any key that was
# previously hardcoded in this cell has to be treated as leaked and rotated.
def _ensure_api_key(name):
    """Make sure the environment variable `name` holds a non-empty API key.

    A value that is already set (for example by the hosting platform's secrets
    manager or the shell that started the kernel) is left untouched. Otherwise
    the user is prompted for it without echoing the input.

    Args:
        name (str): Name of the environment variable to populate.

    Raises:
        ValueError: If the user submits an empty value at the prompt.
    """
    from getpass import getpass  # local import: only needed for this prompt

    if os.environ.get(name):
        return

    entered = getpass(f"Enter {name}: ").strip()
    if not entered:
        raise ValueError(f"{name} is required but no value was provided")
    os.environ[name] = entered


for _key_name in ("GROQ_API_KEY", "TAVILY_API_KEY", "LANGSMITH_API_KEY"):
    _ensure_api_key(_key_name)

In [None]:
# Notebook-wide configuration

# Chunking parameters handed to RecursiveCharacterTextSplitter: chunk size and
# the overlap between consecutive chunks.
CHUNK_SIZE, CHUNK_OVERLAP = 1000, 200

# Embedding model used to build the FAISS index. No arguments are passed, so the
# library's default model is used.
EMBEDDING_MODEL = HuggingFaceEmbeddings()

# Sample resume PDF shipped with the Kaggle dataset.
# NOTE(review): not referenced by the other cells shown here; the UI passes the
# uploaded file's path instead.
PDF_FILE_PATH = "/kaggle/input/demo-pdf-rag/John_Doe_Machine_Learning_Engineer_Resume_v2.pdf"

In [7]:
# Initialize LLM
def initialize_llm():
    """Create the ChatGroq chat model used throughout the notebook.

    Returns:
        ChatGroq: A client for the "deepseek-r1-distill-llama-70b" model with
        temperature 0.5 and streaming enabled. `max_tokens` and `timeout` are
        passed as None (presumably "no explicit limit"; confirm against the
        ChatGroq documentation).
    """
    groq_settings = {
        "model": "deepseek-r1-distill-llama-70b",
        "temperature": 0.5,
        "max_tokens": None,
        "timeout": None,
        "streaming": True,
    }
    return ChatGroq(**groq_settings)

In [8]:
# Module-level model instance used by the smoke-test cells that follow.
# The chat handler (`response`) builds its own instance via `initialize_tools`.
llm = initialize_llm()

In [9]:
# Smoke-test prompt: one system instruction followed by one user question,
# expressed as (role, content) tuples.
system_message = ("system", "You are a Machine Learning expert")
human_message = ("human", "give me a list of Machine Learning models.")
messages = [system_message, human_message]

In [10]:
# Send the smoke-test prompt to the model.
reply = llm.invoke(messages)

# Drop the <think>...</think> section so only the final answer is printed.
# DOTALL lets the pattern span multiple lines.
think_section = re.compile(r"<think>.*?</think>", flags=re.DOTALL)
output = think_section.sub("", reply.content).strip()
print(output)

The landscape of machine learning models is diverse, encompassing various approaches tailored to different data scenarios and tasks. Here's an organized overview of the key categories and models:

### 1. Supervised Learning
- **Definition**: Models are trained on labeled data, where each example is paired with the correct output.
- **Models**:
  - **Decision Trees**: Flowchart-like models for decision-making.
  - **Linear Regression**: Predicts continuous outcomes.
  - **Logistic Regression**: Used for binary classification.
  - **Support Vector Machines (SVMs)**: Find hyperplanes to separate classes.
  - **K-Nearest Neighbors (KNN)**: Predicts based on nearest data points.
  - **Naive Bayes**: Probabilistic models for classification.
  - **Random Forests and Gradient-Boosted Trees**: Ensemble methods for improved performance.
  - **Neural Networks**: Can be used for both classification and regression.

### 2. Unsupervised Learning
- **Definition**: Models find patterns in unlabeled da

In [13]:
# Initialize search tool
def initialize_search_tool():
    """Create the web-search tool offered to the agent.

    Returns:
        TavilySearchResults: The tool constructed with its default settings.
    """
    search_tool = TavilySearchResults()
    return search_tool

In [14]:
# Initialize retriever tool
def initialize_retriever_tool(file_path):
    """
    Build a retriever tool over the PDF located at `file_path`.

    The PDF is loaded with PyPDFLoader, split into overlapping chunks
    (CHUNK_SIZE / CHUNK_OVERLAP), embedded with EMBEDDING_MODEL into a FAISS
    index, and exposed to the agent as a tool named "resume_search".

    Args:
        file_path (str): Path to the PDF file containing the resume data.

    Returns:
        RetrieverTool or None: The retriever tool when the PDF was indexed, or
        None when `file_path` is empty, the file does not exist, or the PDF
        yields no text chunks to index.
    """
    # Nothing to index: no path was supplied or it does not exist on disk.
    if not file_path or not os.path.exists(file_path):
        return None

    # Load the PDF content.
    pages = PyPDFLoader(file_path).load()

    # Split the content into smaller, overlapping chunks for retrieval.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP
    )
    chunks = splitter.split_documents(pages)

    # A PDF without extractable text (for example an image-only scan) produces
    # no chunks, and FAISS cannot build an index from an empty list. Treat that
    # the same as "no document" instead of crashing the chat handler.
    if not chunks:
        return None

    # Embed the chunks into a FAISS index and expose it through a retriever.
    index = FAISS.from_documents(chunks, EMBEDDING_MODEL)
    retriever = index.as_retriever()

    return create_retriever_tool(
        retriever,
        "resume_search",  # Tool name for identifying this retriever
        "Search for information about John Doe's resume. For any questions about his qualifications, experience, and skills, use this tool!",  # Tool description
    )

In [15]:
# Initialize agent executor
def initialize_agent_executor(llm, tools, logs: bool):
    """
    Build an AgentExecutor around a tool-calling agent.

    The agent prompt is pulled from the hub ("hwchase17/openai-functions-agent")
    each time this function is called.

    Args:
        llm: The language model that drives the agent.
        tools: The tools the agent is allowed to call.
        logs (bool): Passed to the executor as `verbose`.

    Returns:
        AgentExecutor: Executor wrapping the agent and its tools.
    """
    agent_prompt = hub.pull("hwchase17/openai-functions-agent")
    return AgentExecutor(
        agent=create_tool_calling_agent(llm, tools, agent_prompt),
        tools=tools,
        verbose=logs,
    )

In [16]:
def initialize_tools(file_path):
    """Create the chat model and the list of tools available to the agent.

    Args:
        file_path: Path of the PDF to index, forwarded to
            `initialize_retriever_tool`.

    Returns:
        tuple: `(llm, tools)`. `tools` always contains the search tool; the
        retriever tool is added only when one could be created.
    """
    llm = initialize_llm()
    tools = [initialize_search_tool()]

    # The retriever is optional; never put a None entry into the tools list.
    retriever_tool = initialize_retriever_tool(file_path)
    if retriever_tool:
        tools.append(retriever_tool)

    return llm, tools

In [17]:
def format_chat_history(chat_history):
    """Render the chat history as text, one "role: content" line per message.

    Args:
        chat_history: Iterable of message dicts with "role" and "content" keys.

    Returns:
        str: The lines joined with newlines; an empty string for no messages.
    """
    lines = []
    for entry in chat_history:
        lines.append(f"{entry['role']}: {entry['content']}")
    return "\n".join(lines)

In [18]:
def get_agent_response(agent_executor, chat_history):
    """Invoke the agent with the formatted chat history and return its answer.

    Args:
        agent_executor: Object exposing `invoke(inputs, config=...)` and
            returning a mapping with an "output" entry.
        chat_history (list): Message dicts with "role" and "content" keys.

    Returns:
        str: The agent's answer with any `<think>...</think>` section removed
        and surrounding whitespace stripped; an empty string when the agent
        returned no output.
    """
    formatted_history = format_chat_history(chat_history)

    response_data = agent_executor.invoke(
        {"input": formatted_history},
        config={"configurable": {"session_id": "<foo>"}},
    )

    # `.get("output", "")` only covers a missing key; `or ""` also covers an
    # "output" that is present but None, so the caller always receives a str.
    answer = response_data.get("output") or ""

    # The smoke-test cell strips the model's <think>...</think> section before
    # printing; do the same here so it never reaches the chat window.
    return re.sub(r"<think>.*?</think>", "", answer, flags=re.DOTALL).strip()

In [19]:
def stream_response(bot_response, chat_history):
    """Reveal the bot's answer one character at a time.

    An empty assistant message is appended to `chat_history` and grown in
    place; after every character the updated history is yielded. Streaming
    ends early as soon as the module-level `stop_generation` flag is true.

    Args:
        bot_response (str): Full answer text to reveal.
        chat_history (list): Conversation history, mutated in place.

    Yields:
        tuple: `("", chat_history)` after each appended character.
    """
    global stop_generation

    # Empty assistant entry that is filled in as characters arrive.
    chat_history.append({"role": "assistant", "content": ""})

    for piece in bot_response:
        if stop_generation:
            return
        latest = chat_history[-1]
        latest["content"] = latest["content"] + piece
        yield "", chat_history  # push the partial answer to the UI
        time.sleep(0.05)  # typing-effect delay between characters

In [20]:
def response(message, chat_history, file_path, max_messages=10):
    """
    Handles user input, generates a response from the chatbot agent, and streams it character by character.

    Parameters:
        message (str): The user's input message to the chatbot.
        chat_history (list): The conversation history as a list of dictionaries.
            It is trimmed and extended in place.
        file_path (str or None): Path of the uploaded PDF, forwarded to
            `initialize_tools`; without a usable file only the search tool is
            available to the agent.
        max_messages (int): Maximum number of earlier messages to keep in the
            history before the new user message is added. Zero or a negative
            value keeps none.

    Yields:
        tuple: An empty string (reserved for additional outputs) and the updated chat history.
    """
    global stop_generation
    stop_generation = False  # Reset stop flag

    # Initialize LLM and tools
    llm, tools = initialize_tools(file_path)
    agent_executor = initialize_agent_executor(llm, tools, False)

    # Limit chat history size, in place. A plain `chat_history[-max_messages:]`
    # would keep the WHOLE history for max_messages == 0 (because -0 == 0) and
    # drop messages from the front for negative values, so compute the number
    # of leading messages to delete explicitly.
    keep = max(max_messages, 0)
    del chat_history[: max(len(chat_history) - keep, 0)]

    # Add user message to chat history
    chat_history.append({"role": "user", "content": message})

    # Get bot response
    bot_response = get_agent_response(agent_executor, chat_history)

    # Stream response character by character
    yield from stream_response(bot_response, chat_history)

In [21]:
def stop_generation_fn():
    """Request that the response currently being streamed stops.

    Sets the module-level `stop_generation` flag to True; `stream_response`
    checks that flag before emitting each character. Wired to the "Stop"
    button in the UI.
    """
    global stop_generation
    stop_generation = True  # picked up by the streaming loop

In [22]:
# Chat UI: conversation window, prompt box, action buttons and a PDF upload slot.
with gr.Blocks(theme=gr.themes.Origin(radius_size=gr.themes.sizes.radius_md)) as demo:
    gr.Markdown("# Chat Bot")
    chatbot = gr.Chatbot(type="messages")
    msg = gr.Textbox(label="Prompt")

    with gr.Row():
        # Left column: action buttons.
        with gr.Column(scale=2):
            btn = gr.Button("Submit", variant="primary")
            gr.ClearButton(components=[msg, chatbot], value="Clear", variant="secondary")
            btn_stop = gr.Button("Stop", variant="stop")
        # Right column: file upload; its path is handed to `response`.
        with gr.Column(scale=2):
            uploaded_file = gr.File(type="filepath")

    # Clicking "Submit" and pressing Enter in the prompt box run the same handler.
    for send_event in (btn.click, msg.submit):
        send_event(
            fn=response,
            inputs=[msg, chatbot, uploaded_file],
            outputs=[msg, chatbot],
        )

    # "Stop" only raises the flag that the streaming loop checks.
    btn_stop.click(fn=stop_generation_fn, inputs=[], outputs=[])

    gr.Examples(
        examples=[
            "What are John Doe's skills?",
            "What is your name?",
            "What is the current exchange rate of the dollar in Iran?",
        ],
        inputs=[msg],
    )

In [23]:
# Shut down any Gradio apps left running from earlier executions of this cell
# before starting a fresh one.
gr.close_all()
# share=True additionally exposes the app through a public *.gradio.live URL
# (see the output below). Anyone with that link can use the bot, and therefore
# the API keys configured above.
demo.launch(share=True)

* Running on local URL:  http://127.0.0.1:7860
* Running on public URL: https://a96be048c0ae5e5b1a.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [None]:
# Stop the running app once the demo is no longer needed.
demo.close()