In [1]:
import streamlit as st
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain_huggingface import HuggingFaceEndpoint
from dotenv import load_dotenv
import os
import requests
from bs4 import BeautifulSoup

# Load environment variables from .env file
load_dotenv()

# Retrieve the API token from the environment variable
sec_key = "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"

if not sec_key:
    raise ValueError("HUGGINGFACEHUB_API_TOKEN is not set in the environment variables.")

# Set the API token as an environment variable
os.environ["HUGGINGFACEHUB_API_TOKEN"] = sec_key

# Define the repository ID and initialize the HuggingFaceEndpoint
repo_id = "microsoft/Phi-3-mini-4k-instruct"
llm = HuggingFaceEndpoint(repo_id=repo_id, max_length=128, temperature=0.7, token=sec_key, timeout=60)

def fetch_website_content(url):
    """Fetch content from the given URL."""
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        text = ' '.join(p.get_text() for p in soup.find_all('p'))
        return text
    else:
        raise ValueError(f"Failed to retrieve content from {url}, status code: {response.status_code}")

# Streamlit UI
st.header("Anna")
with st.sidebar:
    st.title("Web URLs")
    urls = st.text_area("Enter the URLs from which information has to be extracted (one per line)").splitlines()

# Extract the text
if urls:
    combined_content = ""
    url_chunks_map = {}
    chunk_id = 0

    text_splitter = RecursiveCharacterTextSplitter(
        separators="\n",
        chunk_size=1000,
        chunk_overlap=150,
        length_function=len
    )

    for url in urls:
        if url.strip():
            content = fetch_website_content(url.strip())
            chunks = text_splitter.split_text(content)
            for chunk in chunks:
                url_chunks_map[chunk_id] = {'url': url.strip(), 'content': chunk}
                chunk_id += 1

    # Generating embeddings
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

    # Creating vector store - FAISS
    vector_store = FAISS.from_texts([chunk['content'] for chunk in url_chunks_map.values()], embeddings)

    # Get user question
    user_question = st.text_input("Type your question here")

    # Do similarity search
    if user_question:
        match = vector_store.similarity_search(user_question)
        
        # Find the relevant URL
        relevant_url = None
        for doc in match:
            for chunk_id, chunk in url_chunks_map.items():
                if doc.page_content == chunk['content']:
                    relevant_url = chunk['url']
                    break
            if relevant_url:
                break

        # Output results
        chain = load_qa_chain(llm, chain_type="stuff")
        response = chain.run(input_documents=match, question=user_question)
        
        # Display the response
        st.write(response)
        
        # Display the relevant source URL at the bottom
        if relevant_url:
            st.write("\n\n**Source of the content:**")
            st.write(f"- {relevant_url}")


import psutil
import time

def log_resource_usage(interval=10):
    """Log the resource usage at the specified interval (in seconds)."""
    while True:
        # Get the current CPU and memory usage
        cpu_usage = psutil.cpu_percent(interval=1)
        memory_info = psutil.virtual_memory()
        
        # Log the usage
        print(f"CPU Usage: {cpu_usage}%")
        print(f"Memory Usage: {memory_info.percent}%")
        
        # Wait for the next interval
        time.sleep(interval)

# Start logging resource usage in a separate thread
import threading
threading.Thread(target=log_resource_usage, daemon=True).start()

                    max_length was transferred to model_kwargs.
                    Please make sure that max_length is what you intended.
                    token was transferred to model_kwargs.
                    Please make sure that token is what you intended.
  from .autonotebook import tqdm as notebook_tqdm


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to C:\Users\Aravind\.cache\huggingface\token
Login successful


2024-06-27 13:20:52.156 
  command:

    streamlit run C:\Users\Aravind\anaconda3\envs\py310\lib\site-packages\ipykernel_launcher.py [ARGUMENTS]
2024-06-27 13:20:52.158 Session state does not function when running a script without `streamlit run`


CPU Usage: 1.3%
Memory Usage: 66.3%
CPU Usage: 0.0%
Memory Usage: 65.8%
CPU Usage: 1.2%
Memory Usage: 65.9%
CPU Usage: 0.4%
Memory Usage: 65.7%
CPU Usage: 2.5%
Memory Usage: 65.9%


In [None]:
!streamlit run C:\Users\Aravind\anaconda3\envs\py310\lib\site-packages\ipykernel_launcher.py

CPU Usage: 6.2%
Memory Usage: 67.4%
CPU Usage: 2.0%
Memory Usage: 67.4%
CPU Usage: 1.7%
Memory Usage: 67.4%
CPU Usage: 1.0%
Memory Usage: 67.1%
CPU Usage: 0.4%
Memory Usage: 66.8%
CPU Usage: 1.0%
Memory Usage: 66.5%
