In [1]:
import os
from dotenv import load_dotenv
from langchain_community.document_loaders import WebBaseLoader
from langchain import OpenAI
from langchain_huggingface import HuggingFaceEmbeddings  # Updated import
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
import numpy as np

# Load environment variables (load_dotenv comes from python-dotenv) so the
# os.getenv lookups below can see them.
load_dotenv()

# os.getenv returns None when a variable is not set, so either key may be None.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# NOTE(review): LANGCHAIN_API_KEY is read here but not referenced anywhere else
# in the visible code — confirm whether it is still needed.
LANGCHAIN_API_KEY = os.getenv("LANGCHAIN_API_KEY")

# Initialize the OpenAI LLM client. This module-level object is the `llm`
# handed to RetrievalQA in create_retrieval_qa_chain.
openai = OpenAI(api_key=OPENAI_API_KEY)

# Function to scrape the website
def scrape_website(url):
    """Load the page at ``url`` and return its text as one string.

    The ``page_content`` of every document returned by ``WebBaseLoader`` is
    joined with single spaces.

    Args:
        url: Address of the page to load.

    Returns:
        The combined page text, or ``None`` when the loader yields no
        documents or when any exception is raised while loading (the
        exception is printed, not propagated).
    """
    try:
        pages = WebBaseLoader(url).load()
        # Guard clause: an empty result is reported and treated as a failure.
        if not pages:
            print("No documents found.")
            return None
        return " ".join(page.page_content for page in pages)
    except Exception as err:
        print(f"An error occurred while scraping the website: {err}")
        return None

# Function to divide text into chunks
def chunk_text(text, chunk_size=500, overlap=0):
    """Split ``text`` into fixed-size character windows.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters each chunk shares with the previous
            one. Defaults to 0, which yields back-to-back, non-overlapping
            chunks. Must satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of substrings covering ``text`` in order. The last chunk may
        be shorter than ``chunk_size``. An empty ``text`` gives ``[]``.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is out
            of range.
    """
    # Reject sizes that would otherwise silently produce no chunks (negative)
    # or fail with an unrelated range() error (zero).
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    stride = chunk_size - overlap
    total = len(text)
    chunks = []
    for start in range(0, total, stride):
        chunks.append(text[start:start + chunk_size])
        # Once a window reaches the end of the text, any further window would
        # only repeat characters already covered by the overlap.
        if start + chunk_size >= total:
            break
    return chunks

# Function to convert text chunks into vector embeddings
def text_to_vectors(chunks, model_name="all-MiniLM-L6-v2"):
    """Embed each text chunk with a HuggingFace embedding model.

    Args:
        chunks: List of strings to embed.
        model_name: Name passed to ``HuggingFaceEmbeddings``. Defaults to
            ``"all-MiniLM-L6-v2"``, the model this function previously
            hard-coded.

    Returns:
        A ``(vectors, embeddings)`` tuple: the result of
        ``embeddings.embed_documents(chunks)`` and the embedding object
        itself, so callers can reuse it (e.g. when building a vector store).
    """
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    vectors = embeddings.embed_documents(chunks)
    return vectors, embeddings

# Function to store vectors in a vector store database
def store_vectors(chunks, embeddings):
    """Build a FAISS vector store from text chunks.

    Args:
        chunks: The text chunks to index.
        embeddings: The embedding object handed to ``FAISS.from_texts``
            alongside the chunks.

    Returns:
        Whatever ``FAISS.from_texts(chunks, embeddings)`` returns.
    """
    return FAISS.from_texts(chunks, embeddings)

# Function to create a retrieval QA chain
def create_retrieval_qa_chain(vector_store):
    """Assemble a ``RetrievalQA`` chain on top of ``vector_store``.

    Args:
        vector_store: Object exposing ``as_retriever()``; its retriever is
            what the chain uses to fetch documents.

    Returns:
        The chain produced by ``RetrievalQA.from_chain_type`` using the
        module-level ``openai`` LLM and the "stuff" chain type.
    """
    chain_settings = {
        "llm": openai,
        "chain_type": "stuff",  # "map_reduce" is the alternative mentioned for other needs
        "retriever": vector_store.as_retriever(),
    }
    return RetrievalQA.from_chain_type(**chain_settings)

# Function to get response from LLM
def get_llm_response(query, retrieval_qa):
    """Ask the retrieval QA chain a question and return its answer text.

    Uses ``invoke`` rather than the deprecated ``Chain.run``. ``run`` also
    refuses chains with more than one output key (for example when source
    documents are returned alongside the answer), whereas reading the
    ``"result"`` key works in both cases.

    Args:
        query: The user's question.
        retrieval_qa: A ``RetrievalQA`` chain, which takes its input under
            the ``"query"`` key and places its answer under ``"result"``.

    Returns:
        The answer stored under the ``"result"`` key of the chain output.
    """
    outputs = retrieval_qa.invoke({"query": query})
    return outputs["result"]

# Main execution
if __name__ == "__main__":
    # Prompt for the page to index, then pull down its text.
    url = input("Enter the website URL to scrape: ")
    scraped_text = scrape_website(url)

    # Only continue when scraping produced some non-whitespace text.
    if scraped_text is not None and scraped_text.strip() != "":
        # Preview: show just the first 1000 characters of the scraped text.
        print("Scraped Text:\n", scraped_text[:1000])

        # Split the text and embed every chunk; the counts are debugging output.
        chunks = chunk_text(scraped_text)
        print(f"Number of chunks created: {len(chunks)}")
        vectors, embeddings = text_to_vectors(chunks)
        print(f"Number of vectors created: {len(vectors)}")

        # The embedding object (not the vectors) is what the store builder takes.
        vector_store = store_vectors(chunks, embeddings)

        # Ask for the question first, then build the chain and answer it.
        query = input("Enter your query: ")
        retrieval_qa = create_retrieval_qa_chain(vector_store)

        response = get_llm_response(query, retrieval_qa)
        print("Response:\n", response)
    else:
        print("Failed to retrieve the website content.")


USER_AGENT environment variable not set, consider setting it to identify your requests.
  openai = OpenAI(api_key=OPENAI_API_KEY)


Scraped Text:
 



Artificial intelligence - Wikipedia


























Jump to content







Main menu





Main menu
move to sidebar
hide



		Navigation
	


Main pageContentsCurrent eventsRandom articleAbout WikipediaContact us





		Contribute
	


HelpLearn to editCommunity portalRecent changesUpload file



















Search











Search















Donate








Appearance
















Create account

Log in








Personal tools





 Create account Log in





		Pages for logged out editors learn more



ContributionsTalk




























Contents
move to sidebar
hide




(Top)





1
Goals




Toggle Goals subsection





1.1
Reasoning and problem-solving








1.2
Knowledge representation








1.3
Planning and decision-making








1.4
Learning








1.5
Natural language processing








1.6
Perception








1.7
Social intelligence








1.8
General intelligence










2
Techniques




Toggle Techniques subsection






  from tqdm.autonotebook import tqdm, trange


Number of vectors created: 389


  retrieval_qa = RetrievalQA(vector_store=vector_store, llm=openai)


ValidationError: 4 validation errors for RetrievalQA
combine_documents_chain
  Field required [type=missing, input_value={'vector_store': <langcha...roxy='', logit_bias={})}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.9/v/missing
retriever
  Field required [type=missing, input_value={'vector_store': <langcha...roxy='', logit_bias={})}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.9/v/missing
vector_store
  Extra inputs are not permitted [type=extra_forbidden, input_value=<langchain_community.vect...t at 0x000001CB8B1383D0>, input_type=FAISS]
    For further information visit https://errors.pydantic.dev/2.9/v/extra_forbidden
llm
  Extra inputs are not permitted [type=extra_forbidden, input_value=OpenAI(client=<openai.res...proxy='', logit_bias={}), input_type=OpenAI]
    For further information visit https://errors.pydantic.dev/2.9/v/extra_forbidden