In [None]:
!pip install langchain-community>=0.2.11
!pip install sentence-transformers
!pip install faiss-gpu
!pip install tiktoken
!pip install bs4
!pip install requests
!pip install python-dotenv
!pip install duckduckgo-search
!pip install langchain_groq
!pip install wikipedia
#####################
!pip install deepgram-sdk

Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2
Collecting tiktoken
  Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m23.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.8.0
Collecting bs4
  Downloading bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Downloading bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Installing colle

In [None]:

from google.colab import userdata

# API Keys
# Both keys are looked up by name through `userdata.get` (google.colab), so no
# secret value is hardcoded in the notebook.
GROQ_API_KEY = userdata.get('GROQ_API_KEY')
# NOTE(review): TAVILY_API_KEY is not referenced by any other visible cell —
# confirm it is still needed.
TAVILY_API_KEY = userdata.get('TAVILY_API_KEY')

# Importing Required Libraries and Modules
from langchain_groq import ChatGroq
from langchain.agents import initialize_agent, load_tools, Tool
from langchain.utilities import WikipediaAPIWrapper
from langchain_community.tools import DuckDuckGoSearchRun
from langchain_community.tools.tavily_search import TavilySearchResults
from langchain.prompts.chat import (
    ChatPromptTemplate, HumanMessagePromptTemplate,
    SystemMessagePromptTemplate, MessagesPlaceholder
)
from langchain.tools import BaseTool
from langchain.vectorstores import FAISS
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.tools.retriever import create_retriever_tool
from langchain.memory import ConversationBufferMemory
from bs4 import BeautifulSoup
import requests
from urllib.parse import urljoin, urlparse
import time

# Initialize Memory
# Conversation buffer that is handed to the agent executor further down.
memory = ConversationBufferMemory()

# Initialize Language Model (LLM)
# The model id lives in one named constant so it is easy to swap.
# "llama-3.1-70b-versatile" has been retired by Groq; "llama-3.3-70b-versatile"
# is the listed replacement.
GROQ_MODEL_NAME = "llama-3.3-70b-versatile"

llm = ChatGroq(
    model_name=GROQ_MODEL_NAME,
    groq_api_key=GROQ_API_KEY,
    temperature=0,  # deterministic answers
)

# Helper Functions
## Normalize URLs
def normalize_url(url):
    """Reduce *url* to ``scheme://netloc/path`` without trailing slashes.

    Anything after the path (params, query string, fragment) is discarded,
    so URLs that differ only in those parts normalize to the same string.
    """
    parts = urlparse(url)
    rebuilt = f"{parts.scheme}://{parts.netloc}{parts.path}"
    return rebuilt.rstrip('/')

## Check for Internal Links
def is_internal_link(url, start_url):
    """Return True when *url* has exactly the same netloc as *start_url*.

    The comparison is a plain string equality on the parsed network
    location, so a different sub-domain or port counts as external.
    """
    return urlparse(start_url).netloc == urlparse(url).netloc

## Scrape Page
def scrape_page(url, start_url, max_internal_links=30, timeout=10):
    """Download one page and return its text plus the internal links on it.

    Args:
        url: Address of the page to fetch.
        start_url: Root URL of the crawl; a link is kept only when
            ``is_internal_link(link, start_url)`` is true.
        max_internal_links: Maximum number of links collected from this page.
            Defaults to 30, the limit that was previously hard-coded.
        timeout: Seconds to wait for the server before the request is
            abandoned.

    Returns:
        A ``(page_text, internal_links)`` tuple: the page text extracted with
        newline separators, and a set of normalized internal URLs. If the
        request fails, the error is printed and ``("", set())`` is returned.
    """
    try:
        # Without a timeout a single unresponsive server stalls the whole crawl.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        html = response.content
    except requests.exceptions.RequestException as e:
        print(f"Failed to retrieve {url}: {e}")
        return "", set()

    soup = BeautifulSoup(html, 'html.parser')

    # Extract text from the page
    page_text = soup.get_text(separator='\n')

    # Relative hrefs are relative to the page they appear on (after any
    # redirect), not to the crawl root; resolving them against start_url
    # produces wrong addresses for pages in sub-directories.
    base_url = response.url or url

    # Find internal links
    internal_links = set()
    for link in soup.find_all('a', href=True):
        if len(internal_links) >= max_internal_links:
            break  # per-page cap reached; nothing more would be added
        full_url = normalize_url(urljoin(base_url, link['href']))
        if is_internal_link(full_url, start_url):
            internal_links.add(full_url)  # the set de-duplicates for us

    return page_text, internal_links

## Scrape Site with Limit
def scrape_site(start_url, max_links=100, delay=1):
    """Crawl a site from *start_url* and collect the text of each page.

    Pages are visited in first-in, first-out order, and the links found on
    each page are queued in sorted order, so repeated runs over unchanged
    pages visit them in the same order.

    Args:
        start_url: Page where the crawl begins; it is also passed to
            ``scrape_page`` as the reference for what counts as internal.
        max_links: Maximum number of pages to fetch.
        delay: Seconds to pause after each request. Defaults to 1, the value
            that was previously hard-coded.

    Returns:
        Dict mapping each fetched URL to the text returned by ``scrape_page``
        (an empty string for pages that failed to download).
    """
    scraped_content = {}
    queue = [start_url]
    # URLs are tracked in normalized form. Discovered links arrive already
    # normalized, so without this the start page (e.g. with a trailing slash)
    # would be fetched a second time under its normalized spelling.
    seen = {normalize_url(start_url)}
    position = 0

    while position < len(queue) and len(scraped_content) < max_links:
        url = queue[position]
        position += 1
        print(f"Scraping {url}")

        page_text, internal_links = scrape_page(url, start_url)
        scraped_content[url] = page_text

        for link in sorted(internal_links):
            key = normalize_url(link)
            if key not in seen:
                seen.add(key)
                queue.append(link)

        time.sleep(delay)  # Avoid excessive requests

    return scraped_content

## Process Scraped Data
def process_data(scraped_data, chunk_size=1500, chunk_overlap=200,
                 model_name="all-MiniLM-L6-v2"):
    """Split the scraped text into chunks and index them in a FAISS store.

    Args:
        scraped_data: Dict mapping URL to page text, as returned by
            ``scrape_site``.
        chunk_size: ``chunk_size`` passed to ``CharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` passed to ``CharacterTextSplitter``.
        model_name: Model name passed to ``HuggingFaceEmbeddings``.

    Returns:
        The FAISS vector store built from the text chunks.

    Raises:
        ValueError: If the scraped pages yield no text chunks at all.
    """
    # One join instead of repeated `+=`; each page's text is still followed by
    # a newline, so the combined string is unchanged.
    all_text = "".join(f"{text}\n" for text in scraped_data.values())

    text_splitter = CharacterTextSplitter(
        separator='\n', chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    docs = text_splitter.split_text(all_text)
    if not docs:
        # Fail with a clear message rather than letting the vector store
        # constructor fail on an empty input list.
        raise ValueError("No text was scraped; cannot build the vector index.")

    embedding_model = HuggingFaceEmbeddings(model_name=model_name)
    return FAISS.from_texts(docs, embedding_model)

# Scraping and Processing
# Crawl the university site and build the vector index from the page text.
url = "https://www.uetmardan.edu.pk/uetm/"
scraped_data = scrape_site(url)
vector_db = process_data(scraped_data)

# Create Retriever Tool
# `create_retriever_tool` hands the agent the retrieved page text. Wrapping
# `retriever.invoke` directly returned a list of Document objects, whose repr
# (ids and empty metadata included) ended up in the agent's prompt.
retriever = vector_db.as_retriever()
retriever_tool = create_retriever_tool(
    retriever,
    name="vectordb",
    description="Use this tool for retrieving information about UET Mardan.",
)

# Pre-Prompt for LLM
define_persona_prompt = '''
You are a highly knowledgeable and friendly salesperson at the University of Engineering and Technology Mardan (UET Mardan).
Your goal is to help potential students and their parents make informed decisions about enrolling in the university.
You understand the needs and concerns of prospective students, and you provide clear, accurate, and persuasive information
to guide them toward making a decision that suits their educational goals. Be sure to highlight the unique advantages of UET Mardan.
'''

# Load Additional Tools
tools = load_tools(["ddg-search", "llm-math", "wikipedia"], llm=llm)
tools.append(retriever_tool)

# The zero-shot ReAct agent builds its prompt from a prefix, the tool list and
# a suffix. The persona has to go in the prefix: a `prompt=` keyword passed to
# `initialize_agent` is not used to build the agent prompt, so the persona
# never reached the model. The standard prefix sentence is kept after the
# persona so the tool list that follows still reads naturally.
agent_prefix = (
    define_persona_prompt.strip()
    + "\n\nAnswer the following questions as best you can. "
    + "You have access to the following tools:"
)

# Initialize Zero-Shot Agent
# NOTE(review): the default zero-shot prompt has no placeholder for
# conversation history, so `memory` presumably records turns without feeding
# them back to the model — confirm whether a conversational agent is intended.
zero_shot_agent = initialize_agent(
    agent="zero-shot-react-description",
    tools=tools,
    llm=llm,
    agent_kwargs={"prefix": agent_prefix},
    memory=memory,
    verbose=True,
    max_iterations=5,
    handle_parsing_errors=True,
)









  memory = ConversationBufferMemory()


Scraping https://www.uetmardan.edu.pk/uetm/
Scraping https://www.uetmardan.edu.pk/uetm/PGAdmissions/pgscholarship
Scraping https://www.uetmardan.edu.pk/uetm/PGAdmissions/pgadmissiontest
Scraping https://www.uetmardan.edu.pk/uetm/Prospectus/index
Scraping https://www.uetmardan.edu.pk/uetm/PGAdmissions/msphdfee
Scraping https://www.uetmardan.edu.pk/uetm/Department/civilengdept
Scraping https://www.uetmardan.edu.pk/uetm/Site/deanmessage
Scraping https://www.uetmardan.edu.pk/uetm/Site/index/about
Scraping https://www.uetmardan.edu.pk/uetm/Department/computersciencedept
Scraping https://www.uetmardan.edu.pk/uetm/PGAdmissions/admissionAd
Scraping https://www.uetmardan.edu.pk/uetm/Department/telecomdept
Scraping https://www.uetmardan.edu.pk/uetm/Department/electricaldept
Scraping https://www.uetmardan.edu.pk/uetm/Site/vcmessage
Scraping https://www.uetmardan.edu.pk/uetm/Site/chancellormessage
Scraping https://www.uetmardan.edu.pk/uetm/Admissions
Scraping https://www.uetmardan.edu.pk/uetm/Admi

  embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

  zero_shot_agent = initialize_agent(


In [None]:
# Example query to the agent

query = "i want to get admission at uet mardan"
# `invoke` replaces the deprecated `run`; it takes the inputs as a dict and
# returns a dict whose "output" entry is the agent's final answer.
result = zero_shot_agent.invoke({"input": query})
response = result["output"]
print(response)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mTo get admission at UET Mardan, I need to find information about the university's admission process, requirements, and eligibility criteria.

Action: vectordb
Action Input: UET Mardan admission[0m
Observation: [36;1m[1;3m[Document(id='126b63b5-ab6b-422b-80fa-fcbe1cbf14d8', metadata={}, page_content='Candidates seeking admission to Undergraduate Programs are required to fill out the Online Application Forms available at \nwww.uetmardan.edu.pk/engineering\n for Engineering Programs\nErstwhile FATA candidates can apply also for Open and Rationalized schemes of BSc. Engineering Programs. If they are applying against the FATA reserved quota seats they may contact the Directorate of Admissions, University of Engineering & Technology Mardan or UET Peshawar for application forms.\nCandidates should deposit Rs.1500/- for undergraduate Programs as the application processing and prospectus fee in A/C NO: PK55KHYB 0179003004139436, Br