In [1]:
pip install beautifulsoup4 requests sentence-transformers faiss-cpu openai streamlit


Note: you may need to restart the kernel to use updated packages.


In [2]:
import requests
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Step 1: Crawl and Scrape Content
def scrape_website(url, timeout=10):
    """Download a page and return the text of its paragraphs and top-level headings.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` can block indefinitely on a stalled host.

    Returns:
        A single string: the text of every <p>, <h1>, <h2> and <h3> tag,
        in document order, joined with single spaces.

    Raises:
        requests.RequestException: On connection problems, timeouts, or a
            4xx/5xx HTTP status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx so error pages are not indexed as real content.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    # Extract text content from <p> and <h1>-<h3> tags
    return ' '.join(tag.get_text() for tag in soup.find_all(['p', 'h1', 'h2', 'h3']))

# Step 2: Segment Text into Chunks
def segment_text(text, chunk_size=500):
    """Split text into consecutive chunks of at most ``chunk_size`` words.

    Words are whatever ``str.split()`` yields (runs of non-whitespace), and
    each chunk is re-joined with single spaces, so original whitespace is
    normalised.

    Args:
        text: The text to segment.
        chunk_size: Maximum number of words per chunk; must be >= 1.

    Returns:
        A list of chunk strings; empty if ``text`` contains no words.

    Raises:
        ValueError: If ``chunk_size`` is less than 1.
    """
    # A negative step would make range() empty and silently drop all the text.
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    words = text.split()
    return [
        ' '.join(words[start:start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]

# Step 3: Generate Vector Embeddings
def generate_embeddings(chunks, model=None):
    """Encode text chunks into embedding vectors.

    Args:
        chunks: Sequence of strings to embed.
        model: Optional already-loaded encoder exposing ``encode(list_of_str)``.
            Pass one to avoid reloading the model on every call. When omitted,
            the pre-trained 'all-MiniLM-L6-v2' SentenceTransformer is loaded.

    Returns:
        Whatever ``model.encode(chunks)`` returns.
    """
    if model is None:
        model = SentenceTransformer('all-MiniLM-L6-v2')  # Pre-trained model
    return model.encode(chunks)

# Step 4: Store Embeddings in a Vector Database
def store_embeddings(embeddings):
    """Build a flat FAISS index over the given embedding matrix.

    Args:
        embeddings: Array-like with one embedding vector per row.

    Returns:
        A ``faiss.IndexFlatL2`` containing every row of ``embeddings``.

    Raises:
        ValueError: If ``embeddings`` is not 2-D, for example the result of
            encoding zero chunks.
    """
    # Coerce to C-contiguous float32 up front so the input is in a uniform,
    # FAISS-friendly layout regardless of what the encoder returned.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    if vectors.ndim != 2:
        raise ValueError(
            f"embeddings must be a 2-D array of shape (n_vectors, dimension), "
            f"got shape {vectors.shape}"
        )
    dimension = vectors.shape[1]
    index = faiss.IndexFlatL2(dimension)  # exact search by L2 (Euclidean) distance
    index.add(vectors)
    return index

# Example Usage: crawl a few sites, chunk their text, and index the embeddings.
urls = [
    "https://www.uchicago.edu/",
    "https://www.washington.edu/",
    "https://www.stanford.edu/",
    "https://und.edu/"
]

all_chunks = []
for url in urls:
    try:
        text = scrape_website(url)
    except requests.RequestException as exc:
        # One unreachable site should not abort the whole crawl.
        print(f"Skipping {url}: {exc}")
        continue
    chunks = segment_text(text)
    all_chunks.extend(chunks)

# Fail with a clear message instead of an opaque error further down the pipeline.
if not all_chunks:
    raise RuntimeError("No text could be scraped from any of the configured URLs.")

embeddings = generate_embeddings(all_chunks)
vector_database = store_embeddings(embeddings)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [3]:
def search_query(query, vector_database, chunks, k=5, model=None):
    """Return the stored chunks closest to ``query`` in embedding space.

    Args:
        query: The user's question.
        vector_database: Index exposing ``search(embeddings, k=...)`` and
            returning ``(distances, indices)``, e.g. the FAISS index built above.
        chunks: The chunk strings, in the same order they were added to the index.
        k: Maximum number of chunks to retrieve.
        model: Optional already-loaded encoder exposing ``encode(list_of_str)``.
            Pass one to avoid reloading the model on every query. When omitted,
            the 'all-MiniLM-L6-v2' SentenceTransformer is loaded.

    Returns:
        Up to ``k`` chunk strings, in the order the index returned them.
    """
    # Embed the user query
    if model is None:
        model = SentenceTransformer('all-MiniLM-L6-v2')
    query_embedding = model.encode([query])

    # Perform similarity search
    _distances, indices = vector_database.search(query_embedding, k=k)

    # FAISS pads the result with -1 when the index holds fewer than k vectors.
    # Indexing chunks[-1] would silently return the last chunk as a bogus hit.
    return [chunks[i] for i in indices[0] if i >= 0]


In [5]:
pip install openai

Note: you may need to restart the kernel to use updated packages.


In [None]:
import openai

def generate_response(query, context, model="gpt-3.5-turbo"):
    """Ask an OpenAI chat model to answer ``query`` using ``context``.

    The API key is read from the ``OPENAI_API_KEY`` environment variable so
    that no secret is stored in the notebook.

    Args:
        query: The user's question.
        context: Retrieved text to ground the answer in.
        model: Name of the chat model to use.

    Returns:
        The text content of the first choice in the completion.

    Raises:
        RuntimeError: If ``OPENAI_API_KEY`` is not set.
    """
    import os  # local import keeps this cell self-contained

    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError(
            "Set the OPENAI_API_KEY environment variable before calling generate_response()."
        )

    prompt = f"Context:\n{context}\n\nQuestion: {query}\n\nAnswer:"
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt}
    ]

    # openai>=1.0 removed openai.ChatCompletion; use the client API when present.
    if hasattr(openai, "OpenAI"):
        client = openai.OpenAI(api_key=api_key)
        response = client.chat.completions.create(model=model, messages=messages)
        return response.choices[0].message.content

    # Legacy (openai<1.0) interface.
    openai.api_key = api_key
    response = openai.ChatCompletion.create(model=model, messages=messages)
    return response['choices'][0]['message']['content']

# Example Usage: answer one hard-coded question from the indexed content.
query = "What programs does Stanford University offer?"
relevant_chunks = search_query(query, vector_database, all_chunks)
context = ' '.join(relevant_chunks)
response = generate_response(query, context)
print(response)

# Streamlit front end: the same retrieve-then-answer flow, driven by a text box.
import streamlit as st

st.title("RAG Pipeline - Chat with Websites")
st.write("Ask questions based on scraped website content!")

query = st.text_input("Enter your question:")
if query:
    relevant_chunks = search_query(query, vector_database, all_chunks)
    context = ' '.join(relevant_chunks)
    response = generate_response(query, context)
    st.write("### Response:")
    st.write(response)

# To launch the UI, save this code as app.py and run the following from a
# terminal. It is a shell command, not Python, so it must not be executed here:
#     streamlit run app.py
