# Enhanced RAG (Retrieval Augmented Generation)

This notebook demonstrates an improved RAG system capable of retrieving information from multiple sources and providing accurate responses based on the retrieved context.

In [38]:
# Install required packages
# !pip install faiss-cpu mistralai beautifulsoup4 requests numpy

## Load API key

In [39]:
import os
import dotenv

# Load from .env file if it exists
dotenv.load_dotenv()

# Set API key (replace with your own key or use environment variable)
os.environ["MISTRAL_API_KEY"] = ""  # Your key
print(f"MISTRAL_API_KEY: {os.environ.get('MISTRAL_API_KEY')}")
api_key = os.getenv("MISTRAL_API_KEY")

MISTRAL_API_KEY: EyT0g8QsBRkMBaB5P9xinoG1hgXty8fy


## Import required libraries

In [40]:
import requests
from bs4 import BeautifulSoup
import re
import numpy as np
import faiss
from mistralai import Mistral, UserMessage
import time
from urllib.parse import urlparse

## Define utilities for fetching and processing web content

In [41]:
def get_content_from_url(url):
    """
    Fetch content from a given URL and extract the main text.
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        
        html_doc = response.text
        soup = BeautifulSoup(html_doc, "html.parser")
        
        # Remove script and style elements
        for script in soup(["script", "style", "header", "footer", "nav"]):
            script.extract()
        
        # Extract text from main content areas
        main_content = soup.find("main") or soup.find("article") or soup.find("div", class_="content") or soup.find("body")
        
        if main_content:
            text = main_content.get_text(separator='\n', strip=True)
        else:
            text = soup.get_text(separator='\n', strip=True)
        
        # Clean up the text
        text = re.sub(r'\n+', '\n', text)
        text = re.sub(r'\s+', ' ', text)
        
        domain = urlparse(url).netloc
        source_info = f"Source: {domain} - {url}"
        
        return text, source_info
    except Exception as e:
        print(f"Error fetching content from {url}: {e}")
        return None, None

## Gather data from multiple sources

In [42]:
# List of URLs to fetch content from
urls = [
    "https://www.udst.edu.qa/about-udst/institutional-excellence-ie/policies-and-procedures/sport-and-wellness-facilities-and", # Sport and Wellness
    "https://www.udst.edu.qa/about-udst/institutional-excellence-ie/policies-and-procedures/student-attendance-policy", # Attendance
    "https://www.udst.edu.qa/about-udst/institutional-excellence-ie/policies-and-procedures/final-grade-policy", # Final Grade
    "https://www.udst.edu.qa/about-udst/institutional-excellence-ie/policies-and-procedures/student-conduct-policy", # Student Conduct,
    "https://www.udst.edu.qa/about-udst/institutional-excellence-ie/udst-policies-and-procedures/academic-schedule-policy", # Academic Schedule,
    "https://www.udst.edu.qa/about-udst/institutional-excellence-ie/policies-and-procedures/student-appeals-policy", # Student Appeals,
    "https://www.udst.edu.qa/about-udst/institutional-excellence-ie/policies-and-procedures/transfer-policy", # Transfer Policy,
    "https://www.udst.edu.qa/about-udst/institutional-excellence-ie/policies-and-procedures/admissions-policy", # Admissions Policy,
    "https://www.udst.edu.qa/about-udst/institutional-excellence-ie/policies-and-procedures/registration-policy", # Registration Policy,
    "https://www.udst.edu.qa/about-udst/institutional-excellence-ie/udst-policies-and-procedures/graduation-policy" # Graduation Policy
]

# Fetch content from each URL
all_texts = []
all_sources = []

for url in urls:
    text, source = get_content_from_url(url)
    if text and len(text) > 100:  # Ensure we have meaningful content
        all_texts.append(text)
        all_sources.append(source)

print(f"Processed {len(all_texts)} URLs successfully")

# Save the combined text to a file
combined_text = "\n\n---\n\n".join(all_texts)
file_name = "assets/combined_documents.txt"
with open(file_name, 'w', encoding='utf-8') as file:
    file.write(combined_text)

Error fetching content from https://www.udst.edu.qa/about-udst/institutional-excellence-ie/policies-and-procedures/academic-integrity-policy: 404 Client Error: Not Found for url: https://www.udst.edu.qa/about-udst/institutional-excellence-ie/policies-and-procedures/academic-integrity-policy
Processed 10 URLs successfully


## Chunk the text with overlap for better context preservation

In [43]:
def chunk_text(text, chunk_size=512, overlap=100):
    """
    Split the text into overlapping chunks to maintain context across chunks.
    """
    chunks = []
    chunk_sources = []
    
    for i, doc_text in enumerate(all_texts):
        # Ensure the document has enough content to be worthwhile
        if len(doc_text) < 50:
            continue
            
        # Create overlapping chunks
        doc_chunks = []
        start = 0
        while start < len(doc_text):
            end = min(start + chunk_size, len(doc_text))
            doc_chunks.append(doc_text[start:end])
            start += chunk_size - overlap
            
        chunks.extend(doc_chunks)
        chunk_sources.extend([all_sources[i]] * len(doc_chunks))
    
    return chunks, chunk_sources

chunks, chunk_sources = chunk_text(all_texts)

## Get embeddings using Mistral API

In [44]:
def get_text_embedding(list_txt_chunks, batch_size=20):
    """
    Get embeddings for text chunks, with batching to avoid rate limits
    """
    client = Mistral(api_key=api_key)
    all_embeddings = []
    
    # Process in batches to avoid rate limits
    for i in range(0, len(list_txt_chunks), batch_size):
        batch = list_txt_chunks[i:i+batch_size]
        try:
            embeddings_batch_response = client.embeddings.create(model="mistral-embed", inputs=batch)
            all_embeddings.extend(embeddings_batch_response.data)
            # Add a small delay to avoid hitting rate limits
            time.sleep(1)  
        except Exception as e:
            print(f"Error getting embeddings for batch {i}:{i+batch_size}: {e}")
            # Add empty embeddings as placeholders
            for _ in range(len(batch)):
                all_embeddings.append(None)
    
    return all_embeddings

In [45]:
# Get embeddings for all chunks
text_embeddings = get_text_embedding(chunks)

# Filter out any failed embeddings
valid_embeddings = []
valid_chunks = []
valid_sources = []

for i, embedding in enumerate(text_embeddings):
    if embedding is not None:
        valid_embeddings.append(embedding.embedding)
        valid_chunks.append(chunks[i])
        valid_sources.append(chunk_sources[i])

# Convert to numpy array for FAISS
print(valid_sources)
embeddings = np.array(valid_embeddings)

['Source: www.udst.edu.qa - https://www.udst.edu.qa/about-udst/institutional-excellence-ie/policies-and-procedures/sport-and-wellness-facilities-and', 'Source: www.udst.edu.qa - https://www.udst.edu.qa/about-udst/institutional-excellence-ie/policies-and-procedures/sport-and-wellness-facilities-and', 'Source: www.udst.edu.qa - https://www.udst.edu.qa/about-udst/institutional-excellence-ie/policies-and-procedures/sport-and-wellness-facilities-and', 'Source: www.udst.edu.qa - https://www.udst.edu.qa/about-udst/institutional-excellence-ie/policies-and-procedures/sport-and-wellness-facilities-and', 'Source: www.udst.edu.qa - https://www.udst.edu.qa/about-udst/institutional-excellence-ie/policies-and-procedures/sport-and-wellness-facilities-and', 'Source: www.udst.edu.qa - https://www.udst.edu.qa/about-udst/institutional-excellence-ie/policies-and-procedures/sport-and-wellness-facilities-and', 'Source: www.udst.edu.qa - https://www.udst.edu.qa/about-udst/institutional-excellence-ie/policies-

## Create and populate the vector database

In [46]:
# Build the FAISS index
d = len(valid_embeddings[0])  # Dimension of the embeddings
index = faiss.IndexFlatL2(d)
index.add(embeddings)

## RAG Query Function

In [47]:
def rag_query(question, k=3):
    """
    Perform a complete RAG query pipeline:
    1. Embed the question
    2. Retrieve relevant chunks
    3. Generate a response using the LLM
    """
    # Get embedding for the question
    question_embeddings = get_text_embedding([question])
    
    if not question_embeddings or question_embeddings[0] is None:
        return "Error: Could not generate embeddings for the question."
    
    query_embedding = np.array([question_embeddings[0].embedding])
    
    # Search for similar chunks
    D, I = index.search(query_embedding, k=k)
    
    # Get the retrieved chunks and their sources
    retrieved_chunks = [valid_chunks[i] for i in I.tolist()[0]]
    retrieved_sources = [valid_sources[i] for i in I.tolist()[0]]
    
    # Format the context with source information
    context = ""
    for i, chunk in enumerate(retrieved_chunks):
        context += f"\nChunk {i+1}:\n{chunk}\n{retrieved_sources[i]}\n---\n"
    
    # Build the prompt for the LLM
    prompt = f"""
    You are given the following context information. Use it to answer the user's question accurately.
    If the information needed is not in the context, please say "I don't have enough information to answer this question."
    
    Context information:
    ---------------------
    {context}
    ---------------------
    
    Question: {question}
    
    Please provide a comprehensive answer based solely on the context information provided.
    Include references to the sources when appropriate.
    """
    
    # Generate response using Mistral
    client = Mistral(api_key=api_key)
    messages = [UserMessage(content=prompt)]
    try:
        chat_response = client.chat.complete(
            model="mistral-large-latest",  # You can change to a different model if needed
            messages=messages,
        )
        response = chat_response.choices[0].message.content
    except Exception as e:
        response = f"Error generating response: {str(e)}"
    
    return response, context, retrieved_sources

## Test the RAG system

In [48]:
# Test with a sample question
question = "Can i not attend classes?"
response, context, sources = rag_query(question)

print(f"Question: {question}\n\nAnswer:\n{response}")

Question: Can i not attend classes?

Answer:
Based on the context information provided, students are expected to attend classes regularly. Here are the key points:

1. **Student Responsibility**: Students are responsible for regular and punctual attendance of all learning sessions and prescribed activities for the courses in which they are enrolled (Chunk 1).

2. **Attendance Importance**: The university recognizes that regular attendance and participation in class is fundamental to student success (Chunk 2).

3. **Attendance Standard**: The maximum allowable limit for absenteeism is 15% of learning sessions per course during a semester. Exceeding this limit may result in a failing grade for that course (Chunk 3).

Therefore, while you can miss some classes, not attending classes regularly may lead to consequences, including a failing grade. It is important to adhere to the attendance policy as outlined by the university.

Source:
- www.udst.edu.qa - Student Attendance Policy


## Save the RAG Components for Streamlit App

In [49]:
import pickle

# Save the index and associated data
faiss.write_index(index, "assets/rag_index.faiss")

with open("assets/rag_data.pkl", "wb") as f:
    pickle.dump({
        "chunks": valid_chunks,
        "sources": valid_sources,
        "api_key": api_key
    }, f)