In [15]:
import os
import streamlit as st
import pickle
import PyPDF2
from dotenv import load_dotenv
from PyPDF2 import PdfReader
from langchain.document_loaders import PyPDFLoader
from langchain.chains.summarize import load_summarize_chain
import tempfile
import time
import langchain
from langchain import OpenAI
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredURLLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain


# Set your OpenAI API key
load_dotenv()
openai_api_key = os.environ['OPENAI_API_KEY']
url_file_path = "url_faiss_store_openai.pkl"

# Streamlit setup
st.set_page_config(
    page_title="Personal AI Assistant",
    layout="wide",
    initial_sidebar_state="expanded"
)
st.markdown("<h1 style='text-align: center; color: black;'>🤖 Personal AI Assistant</h1>", unsafe_allow_html=True)
st.sidebar.markdown("<h3 style='text-align: center; color: black;'>Assistant Console</h3>", unsafe_allow_html=True)

# ---- URL Loading & Embedding ----
num_links = st.sidebar.slider("How many links do you want to input?", min_value=1, max_value=5, value=1)
urls = [st.sidebar.text_input(f"URL {i+1}", key=f"url{i}") for i in range(num_links)]
if urls:
    loader = UnstructuredURLLoader(urls=urls)
    data = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(separators=["\n\n", "\n", "."], chunk_size=1000)
    url_docs = text_splitter.split_documents(data)
    if url_docs:
        embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
        url_vectorindex_openai = FAISS.from_documents(url_docs, embeddings)
        with open(url_file_path, "wb") as f:
            pickle.dump(url_vectorindex_openai, f)

# ---- PDF Loading & Embedding ----
uploaded_file = st.sidebar.file_uploader("Upload a PDF file", type=['pdf'])
if uploaded_file:
    pdf_reader = PdfReader(uploaded_file)
    pdf_text = ""
    for page in pdf_reader.pages:
        pdf_text += page.extract_text()
    text_splitter = RecursiveCharacterTextSplitter(separators=["\n\n", "\n", "."], chunk_size= 500)
    pdf_docs = text_splitter.split_text(pdf_text)
    if pdf_docs:
        embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
        pdf_vectors = FAISS.from_texts(pdf_docs, embeddings)
        

# ---- Query Interface ----
llm = OpenAI(temperature=0.9, max_tokens=500, openai_api_key=openai_api_key)
data_source = st.selectbox("What do you want to inquire about?", ["URL", "PDF"])

if data_source == "URL":
    query_url = st.text_input('Ask your question about URLs:')
    if query_url:
        if os.path.exists(url_file_path):  # Ensure URL database exists
            with open(url_file_path, "rb") as f:
                vectorstore = pickle.load(f)
                chain = RetrievalQAWithSourcesChain.from_llm(llm=llm, retriever=vectorstore.as_retriever())
                result = chain({"question": query_url}, return_only_outputs=True)
                st.header("Answer based on URLs:")
                st.subheader(result['answer'])

elif data_source == "PDF":
    query_pdf = st.text_input('Ask your question about PDFs:')
    if query_pdf:
        docs = pdf_vectors.similarity_search(query_pdf)

        chain = load_qa_chain(llm, chain_type="stuff")
        response = chain.run(input_documents=docs, question=query_pdf)
           
        st.write(response)

    if st.button("Summarize PDF"):
        def summarize_pdfs_from_folder(pdfs_folder):
            summaries = []
            for pdf_file in pdfs_folder:
                with tempfile.NamedTemporaryFile(delete=False) as temp_file:
                    temp_path = temp_file.name
                    temp_file.write(pdf_file.getvalue())
                loader = PyPDFLoader(temp_path)
                docs = loader.load_and_split()
                chain = load_summarize_chain(llm, chain_type="map_reduce")
                summary = chain.run(docs)
                summaries.append(summary)
                os.remove(temp_path)
            return summaries

        summaries = summarize_pdfs_from_folder([uploaded_file])
        for summary in summaries:
            st.write(summary)

Error fetching or processing , exception: Exactly one of file, filename and url must be specified.


In [11]:
!pip install unstructured

Collecting unstructured
  Downloading unstructured-0.15.13-py3-none-any.whl.metadata (29 kB)
Collecting chardet (from unstructured)
  Downloading chardet-5.2.0-py3-none-any.whl.metadata (3.4 kB)
Collecting filetype (from unstructured)
  Using cached filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting python-magic (from unstructured)
  Downloading python_magic-0.4.27-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting lxml (from unstructured)
  Using cached lxml-5.3.0-cp310-cp310-win_amd64.whl.metadata (3.9 kB)
Collecting nltk (from unstructured)
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting tabulate (from unstructured)
  Downloading tabulate-0.9.0-py3-none-any.whl.metadata (34 kB)
Collecting beautifulsoup4 (from unstructured)
  Using cached beautifulsoup4-4.12.3-py3-none-any.whl.metadata (3.8 kB)
Collecting emoji (from unstructured)
  Downloading emoji-2.14.0-py3-none-any.whl.metadata (5.7 kB)
Collecting python-iso639 (from unstructured)
  Downloadin