In [1]:
import requests
from bs4 import BeautifulSoup
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI

In [6]:
import os
from dotenv import load_dotenv

# Load environment variables (python-dotenv reads them from a local .env file)
# so that secrets such as OPENAI_API_KEY, read further down via os.getenv,
# do not have to be hard-coded in the notebook.
load_dotenv()

True

## 1. Pull an article from the web

In [2]:
def get_article_text(url, timeout=30):
    """Download a web page and return the text of its ``<p>`` elements.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on an unresponsive
            host and hang the kernel.

    Returns:
        The text of every ``<p>`` tag, in document order, joined with
        newlines. An empty string if the page contains no ``<p>`` tags.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on error responses instead of silently scraping (and later
    # indexing) the body of an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    return "\n".join(p.get_text() for p in soup.find_all("p"))

#### Example article: the AWS explainer on Retrieval-Augmented Generation (RAG)

In [3]:
# AWS explainer page on Retrieval-Augmented Generation; swap in any article URL.
article_url = "https://aws.amazon.com/what-is/retrieval-augmented-generation/"
article_text = get_article_text(article_url)

# Stop here if nothing was extracted: an empty article would otherwise only
# show up later as a harder-to-read failure in the chunking/embedding cells.
if not article_text.strip():
    raise ValueError(f"No paragraph text could be extracted from {article_url}")

## 2. Chunk the content

In [4]:
# Splitter settings: chunk_size and chunk_overlap are measured with len, i.e.
# in characters. Consecutive chunks share up to 50 characters so a sentence
# cut at a boundary still appears with some of its context.
splitter_options = {
    "chunk_size": 500,
    "chunk_overlap": 50,
    "length_function": len,
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)
chunks = text_splitter.split_text(article_text)


## 3. Store in vector DB (FAISS)

In [7]:
# Fail fast with a readable message when the key is missing. os.getenv would
# return None here and the problem would only surface inside the OpenAI client.
if not os.getenv("OPENAI_API_KEY"):
    raise EnvironmentError(
        "OPENAI_API_KEY is not set; add it to your environment or .env file."
    )

# Embed every chunk and index the resulting vectors in an in-memory FAISS store.
embeddings = OpenAIEmbeddings(openai_api_key=os.environ["OPENAI_API_KEY"])
vectorstore = FAISS.from_texts(chunks, embeddings)

## 4. Query and answer

In [8]:
# Chat model that writes the final answer; temperature=0 keeps the output as
# repeatable as possible between runs.
llm = ChatOpenAI(model_name="gpt-4.1-mini", temperature=0)

# Similarity search over the vector store, requesting k=3 results per query.
retriever = vectorstore.as_retriever(
    search_kwargs={"k": 3},
    search_type="similarity",
)

  llm = ChatOpenAI(temperature=0, model_name="gpt-4.1-mini")


In [9]:
# Wire the retriever and the chat model into a question-answering chain.
# return_source_documents=True makes the result also carry the retrieved
# documents under "source_documents", which the query cell prints.
qa_chain_options = {
    "llm": llm,
    "retriever": retriever,
    "return_source_documents": True,
}
qa_chain = RetrievalQA.from_chain_type(**qa_chain_options)

#### Ask a question

In [11]:
# Run one question through the chain, then show the answer followed by a short
# preview (first 200 characters) of each retrieved source chunk.
query = "Explain how RAG improves research efficiency."
result = qa_chain.invoke({"query": query})
answer, sources = result["result"], result["source_documents"]

print("Answer:", answer)
print("\nSources:")
for doc in sources:
    snippet = doc.page_content[:200]
    print("-", snippet, "...")


Answer: RAG (Retrieval-Augmented Generation) improves research efficiency by enabling large language models (LLMs) to retrieve and incorporate relevant, authoritative information from pre-determined knowledge sources during the generation process. This approach offers several benefits that enhance research workflows:

1. **Accurate and Relevant Information:** RAG directs the LLM to access up-to-date and specific data from trusted sources, ensuring that the generated content is accurate and pertinent to the research query.

2. **Source Attribution:** The output can include citations or references, allowing researchers to verify information easily and consult original documents for deeper understanding, which saves time in cross-checking facts.

3. **Access to Latest Data:** By connecting LLMs to live or frequently updated sources such as news sites or social media feeds, RAG ensures that researchers receive the most current information without manually searching multiple platforms.

4. 