In [1]:
import requests
from bs4 import BeautifulSoup
import os
import json
import faiss
import numpy as np
from bs4 import BeautifulSoup, NavigableString
from urllib.parse import urljoin
from sentence_transformers import SentenceTransformer
from langchain.document_loaders import PyPDFLoader, CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_groq import ChatGroq
from collections import defaultdict
from dotenv import load_dotenv
load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

In [2]:
def get_data_from_website(url, timeout=30):
    """
    Scrape Elementor tab titles and contents from a page into data/tab_data.json.

    Each ``div.elementor-tab-title`` is paired with the
    ``div.elementor-tab-content`` that carries the same ``data-tab`` attribute.
    The result is written as ``{tab title: tab text}``, replacing any existing
    ``data/tab_data.json``.

    Args:
        url (str): The URL of the page to scrape.
        timeout (float, optional): Seconds to wait for the server before
            giving up. Defaults to 30.

    Returns:
        None. On a fetch error, or when the page contains no tabs, a message is
        printed and the existing file is left untouched.
    """
    # Treat every HTTP/network failure as an error (not only status 500), so an
    # error page is never parsed and written out as if it were real content.
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL: {e}")
        return

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Pair every tab title with the content block sharing its data-tab id
    tab_data = {}
    for title_div in soup.find_all("div", class_="elementor-tab-title"):
        tab_id = title_div.get("data-tab")
        tab_title = title_div.get_text(strip=True)

        matching_content = soup.find("div", class_="elementor-tab-content", attrs={"data-tab": tab_id})
        tab_content = matching_content.get_text(separator="\n", strip=True) if matching_content else ""

        tab_data[tab_title] = tab_content

    # The file is opened in "w" mode below, so writing an empty result would
    # wipe previously collected data; bail out instead.
    if not tab_data:
        print("No tab data found; data/tab_data.json was not modified")
        return

    # Create 'data' folder if it doesn't exist
    os.makedirs("data", exist_ok=True)

    # Save as JSON file
    with open("data/tab_data.json", "w", encoding="utf-8") as f:
        json.dump(tab_data, f, ensure_ascii=False, indent=2)

    print("Data saved to data/tab_data.json")


In [3]:
# get_data_from_website("https://www.dinecollege.edu/academics/academic-policies/")

In [4]:
def append_ferpa_data(url, output_filename="data//tab_data.json", timeout=30):
    """
    Fetches data from a FERPA-related website, processes it, and appends it to a JSON file.

    Every ``<h3>`` is treated as a question. Its answer is the text of the
    consecutive ``p``/``div``/``ul``/``ol`` siblings that follow it.

    Args:
        url (str): The URL of the website to scrape.
        output_filename (str, optional): The name of the JSON file to save/append data to.
            Defaults to "data//tab_data.json".
        timeout (float, optional): Seconds to wait for the server before giving up,
            so a stalled connection cannot hang the notebook. Defaults to 30.

    Returns:
        None. On a fetch error a message is printed and nothing is written.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL: {e}")
        return

    soup = BeautifulSoup(response.content, 'html.parser')

    # Pass 1: collect raw question -> answer pairs.
    data = {}
    for h3_tag in soup.find_all('h3'):
        question = h3_tag.text.strip()
        answer_parts = []
        sibling = h3_tag.find_next_sibling()
        # Stop at the first sibling that is not body content (e.g. the next heading).
        while sibling and sibling.name in ('p', 'div', 'ul', 'ol'):
            answer_parts.append(sibling.text.strip())
            sibling = sibling.find_next_sibling()
        answer = " ".join(answer_parts).strip()
        if question and answer:
            data[question] = answer

    # Pass 2: tidy keys/values and drop answers shorter than five words.
    modified_data = {}
    for key, value in data.items():
        new_key = key.rstrip(":")
        modified_value = value.replace("Back to Top", "").strip()
        if len(modified_value.split()) >= 5:
            modified_data[new_key] = modified_value

    _append_to_json(modified_data, output_filename)

def append_civil_rights_data(url, output_filename="data//tab_data.json", timeout=30):
    """
    Fetches data from a civil rights laws website, processes it, and appends it to a JSON file.

    The hero heading is stored with the page body as its value. Each card is
    stored as ``{card title: "<summary> link :- <absolute url>"}``. A card
    without a usable link is stored with its summary only.

    Args:
        url (str): The URL of the website to scrape. Relative card links are
            resolved against this URL.
        output_filename (str, optional): The name of the JSON file to save/append data to.
            Defaults to "data//tab_data.json".
        timeout (float, optional): Seconds to wait for the server before giving up.
            Defaults to 30.

    Returns:
        None. On a fetch error a message is printed and nothing is written.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL: {e}")
        return

    soup = BeautifulSoup(response.content, 'html.parser')

    final_data = {}

    # Extract the main heading and description
    main_heading = soup.find('h1', class_='usa-hero__heading')
    main_desc = soup.find('div', class_='field--name-body')

    if main_heading and main_desc:
        final_data[main_heading.text.strip()] = main_desc.text.strip()

    # Now extract the cards information
    for card in soup.find_all('div', class_='card-image-top-txt'):
        card_title_tag = card.find('div', class_='field--name-field-ed-card-image-top-title')
        card_summary_tag = card.find('div', class_='field--name-field-ed-card-image-top-summary')
        card_link_tag = card.find('div', class_='field--name-field-ed-card-image-top-link')

        if not (card_title_tag and card_summary_tag):
            continue

        card_title = card_title_tag.text.strip()
        card_summary = card_summary_tag.text.strip()

        # Resolve the link against the page URL instead of a hard-coded host, so
        # path-relative and protocol-relative hrefs also become valid absolute URLs.
        # .get() avoids a KeyError on an <a> that has no href attribute.
        anchor = card_link_tag.find('a') if card_link_tag else None
        href = anchor.get('href') if anchor else None
        link = urljoin(url, href) if href else ""

        if link:
            final_data[card_title] = f"{card_summary} link :- {link}".strip()
        else:
            # No link available: store the summary without a dangling "link :-" label.
            final_data[card_title] = card_summary

    _append_to_json(final_data, output_filename)

def append_file_complaint_data(url, output_filename="data//tab_data.json", timeout=30):
    """
    Fetches data from the file a complaint website, processes it, and appends it to a JSON file.

    Page chrome (scripts, navigation, banners, footers) is stripped, and the
    remaining visible text is filtered against a list of unwanted phrases.
    The result is stored under the page's ``<h1>`` text, followed by absolute
    links to the electronic and PDF complaint forms when they are present.

    Args:
        url (str): The URL of the website to scrape. Relative form links are
            resolved against this URL.
        output_filename (str, optional): The name of the JSON file to save/append data to.
            Defaults to "data//tab_data.json".
        timeout (float, optional): Seconds to wait for the server before giving up.
            Defaults to 30.

    Returns:
        None. On a fetch error a message is printed and nothing is written.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL: {e}")
        return

    soup = BeautifulSoup(response.content, 'html.parser')

    # Remove unnecessary tags
    for tag in soup(["script", "style", "footer", "nav", "header", "aside"]):
        tag.decompose()

    # Remove known banners or sections
    for div in soup.find_all(['div', 'section'], class_=[
        'usa-banner', 'header', 'navigation', 'menu', 'site-header',
        'usa-footer', 'main-header', 'branding', 'footer-links'
    ]):
        div.decompose()

    for elem in soup.find_all(id=[
        'header', 'footer', 'navbar', 'skip-link', 'back-to-top'
    ]):
        elem.decompose()

    # Get heading
    heading = soup.find('h1')
    key = heading.get_text(strip=True) if heading else "No Heading Found"

    # Collect all visible text
    body_text = soup.get_text(separator='\n')
    lines = [line.strip() for line in body_text.splitlines() if line.strip()]

    # Keywords/phrases to exclude
    unwanted_keywords = [
        "Complaint Forms", "Electronic Complaint Form Learn how to file", "How OCR Evaluates Complaints",
        "FAQs on the Complaint Process", "Customer Service Standards for the Case Resolution Process",
        "Complainant and Interviewee Rights and Protections", "Rights and protections",
        "Office of Communications and Outreach", "Page Last Reviewed"
    ]
    # Lower-case the keywords once rather than once per line.
    unwanted_lowered = [keyword.lower() for keyword in unwanted_keywords]

    # Remove lines matching unwanted sections (case-insensitive substring match)
    filtered_lines = []
    for line in lines:
        lowered_line = line.lower()
        if not any(keyword in lowered_line for keyword in unwanted_lowered):
            filtered_lines.append(line)

    # Try to add Electronic Complaint Form and Fillable PDF Complaint Form links.
    # Links are made absolute so they stay usable outside the page, and an <a>
    # without href is skipped instead of being recorded as the text "None".
    extra_links_text = ""
    for label in ("Electronic Complaint Form", "Fillable PDF Complaint Form"):
        anchor = soup.find('a', string=lambda text, label=label: text and label in text)
        href = anchor.get('href') if anchor else None
        if href:
            extra_links_text += f"\n{label}: {urljoin(url, href)}"

    # Final value
    value = ' '.join(filtered_lines) + extra_links_text

    _append_to_json({key: value}, output_filename)

def _append_to_json(new_data, output_filename):
    """
    Merges a dictionary of data into a JSON file, creating the file if needed.

    Existing keys are overwritten by ``new_data`` (``dict.update`` semantics).
    The parent directory is created when missing. The merged document is
    serialised in memory before the file is opened for writing, so a
    serialisation error cannot leave a truncated file behind.

    Args:
        new_data (dict): The dictionary data to append.
        output_filename (str): The name of the JSON file.

    Raises:
        TypeError: If the existing file holds valid JSON whose top-level value
            is not an object; the file is left untouched in that case.
    """
    # Make sure the target directory exists (dirname is '' for a bare filename).
    parent_dir = os.path.dirname(output_filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    try:
        with open(output_filename, 'r', encoding='utf-8') as f:
            existing_data = json.load(f)
    except FileNotFoundError:
        existing_data = {}
    except json.JSONDecodeError:
        # Best-effort policy: an unreadable file is replaced by the new data.
        print("Error decoding existing JSON file. Overwriting with new data.")
        existing_data = {}

    if not isinstance(existing_data, dict):
        raise TypeError(
            f"Cannot append to {output_filename}: top-level JSON value is "
            f"{type(existing_data).__name__}, expected an object"
        )

    existing_data.update(new_data)

    # Serialise first, write second: the file is only touched once the full
    # payload is known to be valid.
    payload = json.dumps(existing_data, ensure_ascii=False, indent=4)
    with open(output_filename, 'w', encoding='utf-8') as f:
        f.write(payload)

In [5]:
# Source pages for the scrapers defined above.
ferpa_url = "https://studentprivacy.ed.gov/ferpa"
civil_rights_url = "https://www.ed.gov/laws-and-policy/civil-rights-laws"
file_complaint_url = "https://www.ed.gov/laws-and-policy/civil-rights-laws/file-complaint"

# The scraper calls are currently disabled; uncomment to refresh the JSON file.
# append_ferpa_data(ferpa_url)
# append_civil_rights_data(civil_rights_url)
# append_file_complaint_data(file_complaint_url)

# NOTE(review): this message is printed unconditionally, so with the calls above
# commented out it is shown even though nothing was appended in this run.
print("Data appended to data//tab_data.json")

Data appended to data//tab_data.json


In [6]:
def extract_table_as_text(table):
    """Render an HTML table as text: cells joined by tabs, rows joined by newlines."""
    def row_to_line(tr):
        # Header and data cells are treated alike; cell text is whitespace-stripped.
        cells = tr.find_all(['th', 'td'])
        return '\t'.join(cell.get_text(strip=True) for cell in cells)

    return '\n'.join(map(row_to_line, table.find_all('tr')))

def append_fafsa_data(url, output_filename="data//tab_data.json", timeout=30):
    """
    Fetch a page and group its body paragraphs and tables under their headings.

    Only the direct children of the Drupal body container are inspected. Text
    that appears before the first heading is ignored.

    This draft only builds and returns the mapping; ``output_filename`` is
    accepted but not used here. A later cell in this notebook defines another
    ``append_fafsa_data`` that replaces this one once it is executed.

    Args:
        url (str): The URL of the page to scrape.
        output_filename (str, optional): Unused in this version.
            Defaults to "data//tab_data.json".
        timeout (float, optional): Seconds to wait for the server before giving up.
            Defaults to 30.

    Returns:
        dict | None: ``{heading text: section text}`` for sections with more
        than one character of content, or None when the fetch fails or the
        content container is missing.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL: {e}")
        return

    soup = BeautifulSoup(response.content, 'html.parser')
    container = soup.find('div', class_='field field--name-body field--type-text-with-summary field--label-hidden field__item')
    if not container:
        print("No content container found.")
        return

    # Match real heading tags only. A plain startswith('h') test would also match
    # <hr>/<header>, which would start an empty-titled section and silently drop
    # every paragraph up to the next heading.
    heading_tags = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')

    result = defaultdict(str)
    current_header = None

    # Iterate through direct children of the container
    for tag in container.find_all(recursive=False):
        if tag.name in heading_tags:
            current_header = tag.get_text(strip=True)
            result[current_header] = ''

        elif tag.name == 'p' and current_header:
            paragraph_text = tag.get_text(strip=True)
            if paragraph_text:
                result[current_header] += paragraph_text + '\n'

        elif tag.name == 'table' and current_header:
            table_text = extract_table_as_text(tag)
            if table_text:
                result[current_header] += '\n' + table_text + '\n'

        elif tag.name == 'div' and current_header:
            # Look for any nested tables inside divs (e.g. grid or layout blocks)
            nested_table = tag.find('table')
            if nested_table:
                table_text = extract_table_as_text(nested_table)
                if table_text:
                    result[current_header] += '\n' + table_text + '\n'

    # Strip surrounding whitespace and drop sections that ended up (nearly) empty
    result = {k: v.strip() for k, v in result.items() if len(v.strip()) > 1}

    return result

In [7]:
# under development
# append_fafsa_data("https://www.ed.gov/higher-education/paying-college/better-fafsa")

In [10]:
def extract_table_as_text(table):
    """Convert an HTML table to text, one tab-separated line per <tr>."""
    lines = [
        '\t'.join(cell.get_text(strip=True) for cell in tr.find_all(['th', 'td']))
        for tr in table.find_all('tr')
    ]
    return '\n'.join(lines)

def extract_text_with_links(element, base_url):
    """
    Flatten an element's children into text, rendering links as ``text [url]``.

    Text nodes are copied as-is, ``<a>`` children become their stripped text
    followed by the link target in brackets (non-http(s) targets are resolved
    against ``base_url``), and any other tag is processed recursively. The
    combined string is stripped before it is returned.
    """
    pieces = []
    for node in element.children:
        if node.name == 'a':
            label = node.get_text(strip=True)
            target = node.get('href', '')
            needs_resolving = bool(target) and not target.startswith(('http://', 'https://'))
            if needs_resolving:
                target = urljoin(base_url, target)
            # An anchor without a usable target contributes its text only.
            pieces.append(f"{label} [{target}]" if target else label)
            continue
        if isinstance(node, str):
            pieces.append(node)
            continue
        if node.name:
            pieces.append(extract_text_with_links(node, base_url))
    return "".join(pieces).strip()

def extract_list_content(heading_element, base_url):
    """
    Return the items of the first ``<ul>`` that follows a heading.

    Siblings after ``heading_element`` are scanned in order, skipping text
    nodes. The scan stops at the first ``<ul>`` (whose ``<li>`` descendants are
    rendered with ``extract_text_with_links``) or at the next ``h2``/``h3``.
    An empty list is returned when no list is found first.
    """
    items = []
    node = heading_element.next_sibling

    while node:
        if not isinstance(node, NavigableString):
            if node.name == 'ul':
                items.extend(
                    extract_text_with_links(li, base_url)
                    for li in node.find_all('li', recursive=True)
                )
                return items
            if node.name in ('h2', 'h3'):
                # Reached the next section without finding a list.
                return items
        node = node.next_sibling

    return items

def extract_h3_with_paragraphs(soup):
    """
    Map each panel's ``<h3>`` title to the first paragraph of its body.

    Looks at every ``div.panel.panel-primary``; panels missing a heading div,
    a body div, an ``<h3>`` or a ``<p>`` are skipped.
    """
    collected = {}
    for panel in soup.find_all("div", class_="panel panel-primary"):
        heading_box = panel.find("div", class_="panel-heading")
        body_box = panel.find("div", class_="panel-body")
        if not (heading_box and body_box):
            continue

        title_tag = heading_box.find("h3")
        first_paragraph = body_box.find("p")
        if not (title_tag and first_paragraph):
            continue

        title_text = title_tag.get_text(strip=True)
        body_text = first_paragraph.get_text(strip=True)
        collected[title_text] = body_text
    return collected

def append_fafsa_data(url, output_filename="data/tab_data.json", timeout=30):
    """
    Scrape a FAFSA page into ``{heading: text}`` sections and append them to a JSON file.

    Three sources are merged, in this order:
      1. paragraphs and tables that are direct children of the body container,
         grouped under the heading that precedes them;
      2. the first ``<ul>`` following each ``h2``/``h3`` (links kept as
         ``text [url]``);
      3. ``panel panel-primary`` blocks anywhere on the page (``h3`` title ->
         first paragraph).

    Args:
        url (str): The URL of the page to scrape; also the base for relative links.
        output_filename (str, optional): JSON file to append to.
            Defaults to "data/tab_data.json".
        timeout (float, optional): Seconds to wait for the server before giving up.
            Defaults to 30.

    Returns:
        dict | None: The sections that were appended, or None when the fetch
        fails or the content container is missing (nothing is written then).
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL: {e}")
        return

    base_url = url
    soup = BeautifulSoup(response.content, 'html.parser')
    container = soup.find('div', class_='field field--name-body field--type-text-with-summary field--label-hidden field__item')
    if not container:
        print("No content container found.")
        return

    # Match real heading tags only. A plain startswith('h') test would also match
    # <hr>/<header>, which would start an empty-titled section and silently drop
    # every paragraph up to the next heading.
    heading_tags = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')

    result = defaultdict(str)
    current_header = None

    for tag in container.find_all(recursive=False):
        if tag.name in heading_tags:
            current_header = tag.get_text(strip=True)
            result[current_header] = ''

        elif tag.name == 'p' and current_header:
            paragraph_text = tag.get_text(strip=True)
            if paragraph_text:
                result[current_header] += paragraph_text + '\n'

        elif tag.name == 'table' and current_header:
            table_text = extract_table_as_text(tag)
            if table_text:
                result[current_header] += '\n' + table_text + '\n'

        elif tag.name == 'div' and current_header:
            # Layout divs may wrap a table; only the first nested table is used.
            nested_table = tag.find('table')
            if nested_table:
                table_text = extract_table_as_text(nested_table)
                if table_text:
                    result[current_header] += '\n' + table_text + '\n'

    # Drop sections that ended up (nearly) empty; from here on a plain dict.
    result = {k: v.strip() for k, v in result.items() if len(v.strip()) > 1}

    # Add bullet lists that follow each h2/h3.
    for header in container.find_all(['h2', 'h3']):
        header_text = header.get_text(strip=True)
        list_items = extract_list_content(header, base_url)
        if list_items:
            if header_text in result:
                result[header_text] += '\n' + '\n'.join(list_items)
            else:
                result[header_text] = '\n'.join(list_items)

    # Add panel blocks (searched page-wide, not just inside the container).
    panel_data = extract_h3_with_paragraphs(soup)
    for heading, paragraph in panel_data.items():
        if heading in result:
            result[heading] += '\n' + paragraph
        else:
            result[heading] = paragraph

    # os.makedirs('') raises, so only create a directory when the path has one.
    output_dir = os.path.dirname(output_filename)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    _append_to_json(result, output_filename)
    print(f"Data appended to {output_filename}")
    return result

# if __name__ == "__main__":
#     url = "https://www.ed.gov/higher-education/paying-college/better-fafsa"
#     result = append_fafsa_data(url)
#     print(json.dumps(result, indent=2))


In [11]:
# # Load JSON data
# with open('data/tab_data.json', 'r', encoding='utf-8') as f:
#     tab_data = json.load(f)

# # Combine tab title and content into a document
# documents = [f"{key}: {value}" for key, value in tab_data.items()]
# metadata = list(tab_data.keys())

# # Load a pre-trained embedding model from Hugging Face
# model = SentenceTransformer('all-MiniLM-L6-v2')

# # Generate embeddings
# embeddings = model.encode(documents, convert_to_numpy=True, show_progress_bar=True)

# # Create FAISS index
# embedding_dim = embeddings.shape[1]
# index = faiss.IndexFlatL2(embedding_dim)  # Using L2 similarity
# index.add(embeddings)

# # Save the index
# faiss.write_index(index, 'data/faiss_index.idx')

# # Save the metadata for reverse lookup
# with open('data/faiss_metadata.json', 'w', encoding='utf-8') as f:
#     json.dump(metadata, f, ensure_ascii=False, indent=2)

# print("✅ FAISS index and metadata saved.")

Batches: 100%|██████████| 2/2 [00:00<00:00,  2.07it/s]

✅ FAISS index and metadata saved.





In [12]:
# # Load the saved FAISS index and metadata
# index = faiss.read_index('data/faiss_index.idx')
# with open('data/faiss_metadata.json', 'r', encoding='utf-8') as f:
#     metadata = json.load(f)

# # Also load the tab_data to fetch full content for results
# with open('data/tab_data.json', 'r', encoding='utf-8') as f:
#     tab_data = json.load(f)

In [18]:
# metadata
# model
# tab_data

In [24]:
# def search_query(user_query, index, model, metadata, tab_data, top_k=2, distance_threshold=0.5):
#     def get_matches(threshold):
#         query_vector = model.encode([user_query], convert_to_numpy=True)
#         distances, indices = index.search(query_vector, top_k)

#         if threshold is None:
#             valid_indices = [
#                 i for i in range(len(indices[0]))
#                 if indices[0][i] < len(metadata)
#             ]
#         else:
#             valid_indices = [
#                 i for i, dist in enumerate(distances[0])
#                 if dist < threshold and indices[0][i] < len(metadata)
#             ]

#         retrieved_titles = [metadata[indices[0][i]] for i in valid_indices]
#         retrieved_docs = [tab_data[title] for title in retrieved_titles if title in tab_data]

#         return retrieved_titles, retrieved_docs, distances

#     # First attempt: use threshold
#     retrieved_titles, retrieved_docs, distances = get_matches(distance_threshold)

#     # Fallback attempt: no threshold if nothing found
#     if not retrieved_titles or not retrieved_docs:
#         retrieved_titles, retrieved_docs, distances = get_matches(None)

#     return retrieved_titles, retrieved_docs, distances


# search_query("Want to know about the academic appeals.", index, model, metadata, tab_data)


(['Grades', 'Academics'],
 ['General Grade Appeal\nGrades are determined solely by the individual faculty who taught the course for the session(s) or the semester(s). A student who wishes to contest a grade must first attempt to resolve the matter with the course faculty.\nIf the matter cannot be resolved with the instructor, the student may appeal to the appropriate Dean of School. The student must provide the evidence as to why the grade posted by the faculty is an error. if the matter is not resolved with the Dean of School, the student may appeal a final time to the Academic Standards Committee. The decision of the Academic Standards committee is final.\nGrades may be appealed within one academic year. The Grade Appeal Form can be obtained by Office of the Registrar and will guide students through each of the three steps.\nGrade Point Average (GPA) System\nGrade Point Average (GPA) refers to the average grade at any particular time during, or at the end of, any particular semester.

In [17]:
# llm = ChatGroq(
#         model="Llama3-8b-8192",
#         temperature=0,
#         max_tokens=8000,
#         timeout=30,
#         max_retries=2,
#     )

# def generate_answer(user_query, retrieved_titles, tab_data):
#     # Combine relevant tab content
#     retrieved_docs = "\n\n".join([f"{title}: {tab_data[title]}" for title in retrieved_titles if title in tab_data])

#     # Construct the prompt
#     prompt = f"""
#         You are an expert assistant helping users understand resources related to FAFSA. 
#         Answer the user's question **only** based on the information provided below. 
#         Do **not** use any external knowledge or make assumptions beyond the provided documents.

#         When answering:
#         - Use a friendly, guiding tone.
#         - Structure the answer in a **clear, easy-to-follow manner**.
#         - Use **storytelling** where appropriate to guide the user step-by-step.
#         - Highlight important tools or resources using bullet points or bold text.
#         - Group information by relevant audience (e.g., students, educators, officials) if applicable.

#         If the answer is **not present** in the information, reply with: 
#         "I'm sorry, but that question is outside the scope of the provided information."

#         Information:
#         {retrieved_docs}

#         Question: {user_query}

#         Answer:
#         """



#     # Generate response using ChatGroq (LLaMA3)
#     response = llm.invoke(prompt)
#     return response


In [None]:
# def search_and_generate(user_query, top_k=3):
#     query_vector = model.encode([user_query], convert_to_numpy=True)
#     distances, indices = index.search(query_vector, top_k)

#     retrieved_titles = [metadata[idx] for idx in indices[0]]

#     # Generate the answer using LLaMA (ChatGroq)
#     answer = generate_answer(user_query, retrieved_titles, tab_data)

#     print(f"\n🧠 Answer:\n{answer.content}")


# # question 1 :- Want to know about the academic appeals.
# # question 2 :- How to File A Complaint for civil rights.
# # question 3 :- Can you tell me the capital of france?
# # question 4 :- Want to understand about the Resources for students and families, high school educators and college access counselors, and college officials

# search_and_generate("Want to understand about the Resources for students and families, high school educators and college access counselors, and college officials")



🧠 Answer:
I'd be happy to help you understand the resources available for students and families, high school educators and college access counselors, and college officials.

**For Students and Families:**

* The **FAFSA Toolkit for Students and Families** is a great resource to get started with the FAFSA process. You can find it at [https://www.ed.gov/sites/ed/files/finaid/info/apply/fafsa-toolkit-students-families.pdf].
* Learn about the **Better FAFSA Form** and check out the slide deck at [https://www.ed.gov/sites/ed/files/finaid/info/apply/better-fafsa-slide-deck.pdf].
* Read the **FAFSA Pro Tips** at [https://studentaid.gov/announcements-events/fafsa-support/pro-tips] to help you successfully complete the FAFSA form.
* Watch the video on **Applying for Financial Aid with the FAFSA Form** at [https://www.youtube.com/watch?v=UupEQdS2VMY].
* Use the **Federal Student Aid Estimator** at [https://studentaid.gov/aid-estimator/] to receive an estimate of how much federal student aid you

In [1]:
import json
import tiktoken
import chromadb
from sentence_transformers import SentenceTransformer
from langchain_groq import ChatGroq
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Function to count tokens
def count_tokens(text, model="cl100k_base"):
    """
    Count the number of tokens in a text string using tiktoken.

    Args:
        text (str): The text to measure.
        model (str, optional): Name passed to ``tiktoken.get_encoding``.
            Defaults to "cl100k_base".

    Returns:
        int: The token count, or a rough estimate of about 1.3 tokens per
        whitespace-separated word, rounded to an integer, when tiktoken
        cannot be used.
    """
    try:
        encoder = tiktoken.get_encoding(model)
        return len(encoder.encode(text))
    except Exception as e:
        # Deliberately broad: token counting is diagnostic only and must not
        # abort the query pipeline.
        print(f"Error counting tokens: {e}")
        # Rough estimation if tiktoken fails; rounded so both paths return an int.
        return round(len(text.split()) * 1.3)

# Load JSON data
# tab_data maps a title to its text; it is the corpus chunked and indexed below.
with open('data/tab_data.json', 'r', encoding='utf-8') as f:
    tab_data = json.load(f)

# Load model for embeddings
# The same model instance is used for both document chunks and user queries.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Initialize ChromaDB
# NOTE(review): presumably chromadb.Client() is an in-memory client, so the
# collection would need to be repopulated in every kernel session - confirm.
chroma_client = chromadb.Client()
collection = chroma_client.get_or_create_collection(name="jericho_documents")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Text splitter for chunking documents
# length_function=len, so chunk_size and chunk_overlap are measured in characters.
# Separators are listed from coarsest (blank line) to finest (empty string).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
    length_function=len,
    separators=["\n\n", "\n", ". ", " ", ""]
)

In [3]:
# Process each document: chunk and embed (the next cell adds them to ChromaDB)
# The three lists below are parallel: position k in each describes the same chunk.
document_chunks = []
chunk_ids = []
chunk_metadata = []
chunk_count = 0  # running counter across all documents; makes chunk ids unique

for title, content in tab_data.items():
    # Create document with title and content
    document = f"{title}: {content}"

    # Split document into chunks
    chunks = text_splitter.split_text(document)

    # Process each chunk
    for i, chunk in enumerate(chunks):
        chunk_id = f"chunk_{chunk_count}"
        chunk_count += 1

        # Store chunk with its metadata
        # "chunk_index" is the chunk's position within its own document,
        # while the id is sequential over the whole corpus.
        document_chunks.append(chunk)
        chunk_ids.append(chunk_id)
        chunk_metadata.append({"title": title, "chunk_index": i, "source": "tab_data"})

# Generate embeddings for all chunks at once (more efficient)
embeddings = model.encode(document_chunks, convert_to_numpy=True, show_progress_bar=True)
# Bare expression: displays the embedding array as the cell output.
embeddings

Batches: 100%|██████████| 70/70 [00:15<00:00,  4.42it/s]


array([[-0.05162941,  0.03510767,  0.00345315, ..., -0.00883835,
         0.02973488, -0.00312516],
       [-0.0566844 ,  0.07717527,  0.00406552, ...,  0.03000312,
         0.0135109 , -0.08125546],
       [-0.03494455,  0.12837943, -0.01350101, ..., -0.01108158,
         0.04252497, -0.04689284],
       ...,
       [-0.06428131,  0.00833308,  0.05722607, ..., -0.05581113,
        -0.02154832,  0.041064  ],
       [-0.05172734,  0.02989433,  0.09451348, ...,  0.02358157,
         0.01147298, -0.0493726 ],
       [-0.0467954 ,  0.07046118,  0.09045599, ...,  0.00788155,
         0.03002216, -0.07994787]], shape=(2223, 384), dtype=float32)

In [4]:
# Add documents and embeddings to ChromaDB collection
# The embeddings array is converted with .tolist() before being passed in; the
# four arguments are parallel sequences built in the chunking cell.
collection.add(
    embeddings=embeddings.tolist(),
    documents=document_chunks,
    metadatas=chunk_metadata,
    ids=chunk_ids
)

print(f"✅ Added {len(document_chunks)} chunks to ChromaDB collection")

✅ Added 2223 chunks to ChromaDB collection


In [5]:
# Shared chat model used by generate_answer().
# temperature=0 is set for this client; max_tokens was lowered from an
# earlier value of 8000 (see inline note).
llm = ChatGroq(
    model="Llama3-8b-8192",
    temperature=0,
    max_tokens=4000,  # Reduced from 8000
    timeout=30,
    max_retries=2,
)

def search_query(user_query, top_k=3):
    """
    Search ChromaDB for the chunks most relevant to a user query.

    Uses the notebook-level ``model`` to embed the query and the notebook-level
    ``collection`` to run the search.

    Args:
        user_query (str): The question to search for.
        top_k (int, optional): Number of chunks to retrieve. Defaults to 3.

    Returns:
        tuple: ``(retrieved_titles, retrieved_chunks, distances)`` where
        ``retrieved_titles`` holds each source title once, in the order of its
        first appearance in the results; the other two are the per-chunk lists
        returned by the collection for this query.
    """
    # Generate embedding for the query
    query_embedding = model.encode([user_query], convert_to_numpy=True)[0].tolist()

    # Query the collection
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=top_k
    )

    # One query was sent, so take the first (only) result list of each field.
    retrieved_chunks = results['documents'][0]
    result_metadatas = results['metadatas'][0]  # named to avoid shadowing the global chunk_metadata
    distances = results['distances'][0]

    # De-duplicate titles while keeping result order. A set would make the
    # order arbitrary and could differ from run to run.
    retrieved_titles = list(dict.fromkeys(metadata['title'] for metadata in result_metadatas))

    return retrieved_titles, retrieved_chunks, distances

def generate_answer(user_query, retrieved_chunks, tab_data):
    """Generate an answer from the retrieved chunks using the notebook-level ``llm``.

    Args:
        user_query: The user's question, inserted into the prompt.
        retrieved_chunks: Text chunks joined with blank lines to form the
            context section of the prompt.
        tab_data: Not used by this function.

    Returns:
        The object returned by ``llm.invoke(prompt)``.
    """
    # Only the retrieved chunks are used as context (full documents are not included)
    chunk_context = "\n\n".join(retrieved_chunks)

    # Construct the prompt
    # The literal's indentation and line breaks are part of the text sent to the model.
    prompt = f"""
        You are an expert assistant helping users understand resources related to FAFSA. 
        Answer the user's question **only** based on the information provided below. 
        Do **not** use any external knowledge or make assumptions beyond the provided documents.

        When answering:
        - Use a friendly, guiding tone.
        - Structure the answer in a **clear, easy-to-follow manner**.
        - Use **storytelling** where appropriate to guide the user step-by-step.
        - Highlight important tools or resources using bullet points or bold text.
        - Group information by relevant audience (e.g., students, educators, officials) if applicable.

        If the answer is **not present** in the information, reply with: 
        "I'm sorry, but that question is outside the scope of the provided information."

        Most relevant chunks:
        {chunk_context}

        Question: {user_query}

        Answer:
        """

    # Count tokens in the prompt
    token_count = count_tokens(prompt)
    print(f"📊 Sending {token_count} tokens to the LLM")

    # Check if we're close to the limit
    # NOTE(review): 6000 is hard-coded here and described in the message as
    # Groq's TPM limit - confirm the value. This only warns; the request is still sent.
    if token_count > 6000:
        print(f"⚠️ WARNING: Token count ({token_count}) is approaching or exceeding Groq's limit of 6000 TPM")

    # Generate response using ChatGroq (LLaMA3)
    response = llm.invoke(prompt)
    return response

def search_and_generate(user_query, top_k=3):
    """
    Run retrieval and answer generation for one query, printing diagnostics.

    Prints the token count of every retrieved chunk, the token count of the
    response and the answer itself, then returns a summary dict with the keys
    "retrieved_titles", "chunks_count", "total_input_tokens" (tokens of the
    joined chunks plus the query), "response_tokens" and "answer".
    """
    # Retrieval step; the distances are not needed here.
    titles, chunks, _distances = search_query(user_query, top_k)

    print("📏 Token counts for each retrieved chunk:")
    for position, chunk_text in enumerate(chunks, start=1):
        chunk_tokens = count_tokens(chunk_text)
        print(f"  Chunk {position}: {chunk_tokens} tokens")

    # Generation step: only the retrieved chunks are used as context.
    llm_reply = generate_answer(user_query, chunks, tab_data)
    reply_text = llm_reply.content

    response_tokens = count_tokens(reply_text)
    print(f"📊 Response contains {response_tokens} tokens")

    print(f"\n🧠 Answer:\n{reply_text}")

    # Summary for debugging or further use
    combined_input = "\n\n".join(chunks) + user_query
    summary = {
        "retrieved_titles": titles,
        "chunks_count": len(chunks),
        "total_input_tokens": count_tokens(combined_input),
        "response_tokens": response_tokens,
        "answer": reply_text,
    }
    return summary

In [6]:
# Test query
# The call is the cell's last expression, so its returned summary dict is
# displayed below the printed diagnostics.
query = "Want to understand about the Resources for students and families, high school educators and college access counselors, and college officials"
search_and_generate(query)

📏 Token counts for each retrieved chunk:
  Chunk 1: 118 tokens
  Chunk 2: 88 tokens
  Chunk 3: 106 tokens
📊 Sending 518 tokens to the LLM
📊 Response contains 515 tokens

🧠 Answer:
I'd be happy to help you understand the resources related to FAFSA!

**For Students and Families:**

* The Free Application for Federal Student Aid (FAFSA) is a crucial step in applying for financial aid for higher education.
* The FAFSA website ([www.fafsa.gov](http://www.fafsa.gov)) provides a comprehensive guide on how to complete the application, including a step-by-step process and a list of required documents.
* The FAFSA4caster tool ([www.fafsa.gov/FAFSA4caster](http://www.fafsa.gov/FAFSA4caster)) helps students and families estimate their Expected Family Contribution (EFC) and plan for college expenses.

**For High School Educators and College Access Counselors:**

* The FAFSA Toolkit for High School Educators and College Access Counselors ([https://www.ed.gov/sites/ed/files/finaid/info/apply/fafsa-to

{'retrieved_titles': ['data\\hr_policies\\PPPM - 2021 - Updated 02.23.2024 HR.pdf',
  'For high school educators and college access counselors'],
 'chunks_count': 3,
 'total_input_tokens': 335,
 'response_tokens': 515,
 'answer': "I'd be happy to help you understand the resources related to FAFSA!\n\n**For Students and Families:**\n\n* The Free Application for Federal Student Aid (FAFSA) is a crucial step in applying for financial aid for higher education.\n* The FAFSA website ([www.fafsa.gov](http://www.fafsa.gov)) provides a comprehensive guide on how to complete the application, including a step-by-step process and a list of required documents.\n* The FAFSA4caster tool ([www.fafsa.gov/FAFSA4caster](http://www.fafsa.gov/FAFSA4caster)) helps students and families estimate their Expected Family Contribution (EFC) and plan for college expenses.\n\n**For High School Educators and College Access Counselors:**\n\n* The FAFSA Toolkit for High School Educators and College Access Counselors (