In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Function to recursively scrape links
def scrape_links(url, root_url, visited=None):
    """
    Crawl through all the sublinks of a root URL, keeping track of what has been visited
    Returns a list of strings (URLs)
    """
    if visited is None:
        visited = set()

    # Skip if the URL has already been visited
    if url in visited:
        return []

    # Mark the URL as visited
    visited.add(url)

    try:
        # Fetch the page content
        response = requests.get(url)
        response.raise_for_status()  # Raise an error for bad status codes
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return []

    # Parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract all links from the page
    links = []
    for a_tag in soup.find_all('a', href=True):
        href = a_tag['href']
        full_url = urljoin(root_url, href)

        # Filter out fragment links, we'll deal with this later
        if full_url.startswith("https://wehi-researchcomputing.github.io/faq#") or full_url.startswith("https://wehi-researchcomputing.github.io/email"):
            continue  # Skip adding this link

        # Only process links that start with the root URL
        if full_url.startswith(root_url):
            links.append(full_url)
            
            # Recursively scrape links from the sublink
            links.extend(scrape_links(full_url, root_url, visited))

    return links

# Root URL
root_url = "https://wehi-researchcomputing.github.io/"

# Scrape and store all the links
all_links = scrape_links(root_url, root_url)

Error fetching https://wehi-researchcomputing.github.io/[url](https:/www.eventbrite.com.au/o/the-rse-association-of-australia-and-new-zealand-65201929823): 404 Client Error: Not Found for url: https://wehi-researchcomputing.github.io/%5Burl%5D(https:/www.eventbrite.com.au/o/the-rse-association-of-australia-and-new-zealand-65201929823)
Error fetching https://wehi-researchcomputing.github.io/11-Summer-2024-2025: 404 Client Error: Not Found for url: https://wehi-researchcomputing.github.io/11-Summer-2024-2025


In [2]:
# Remove duplicates 
urls = list(set(all_links))

# Checking the URLs that were scraped
print(urls)

['https://wehi-researchcomputing.github.io/faq', 'https://wehi-researchcomputing.github.io/intakes/', 'https://wehi-researchcomputing.github.io/student-schex', 'https://wehi-researchcomputing.github.io/explanation_about_ohs', 'https://wehi-researchcomputing.github.io/student-genomics-invoicing', 'https://wehi-researchcomputing.github.io/expectations_open_source_contributors', 'https://wehi-researchcomputing.github.io/software_maturity_model', 'https://wehi-researchcomputing.github.io/RDM-0220-RCP-Student-Internship-Handbook.pdf', 'https://wehi-researchcomputing.github.io/student-immunology-web-application', 'https://wehi-researchcomputing.github.io/[url](https:/www.eventbrite.com.au/o/the-rse-association-of-australia-and-new-zealand-65201929823)', 'https://wehi-researchcomputing.github.io/student-aive', 'https://wehi-researchcomputing.github.io/social_media_policy', 'https://wehi-researchcomputing.github.io/student-haemosphere', 'https://wehi-researchcomputing.github.io/student-clinica

In [3]:
"""
Add additional files that should be entered into the vector database
"""

# List of PDF files to load
pdf_files = [
    "RCP0032 Intake 10 Student Internship Summary Reports.pdf",
    "Research Computing Platform Student Internship Handbook.pdf",
    "Student Projects Outline - Summer 2425.pdf"
]

# Additional URLs + bad practice strategy to add "weighting" to the FAQ page
additional_urls = [
    "https://wehi-researchcomputing.github.io/faq"
    "https://wehi-researchcomputing.github.io/intake_dates"
]

urls += additional_urls

In [4]:
from langchain.document_loaders import PyPDFLoader, WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Function to scrape page content and extract headers with section IDs
def scrape_page(url):
    """
    Scrape individual pages to retrieve all of their section ID within the URL.

    ie.         https://wehi-researchcomputing.github.io/faq#how-should-i-ask-for-help-to-solve-a-problem
    instead of: https://wehi-researchcomputing.github.io/faq
    """
    
    try:
        response = requests.get(url)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None, None

    soup = BeautifulSoup(response.text, "html.parser")
    
    # Extract main text content
    page_text = soup.get_text(separator="\n")
    
    # Extract headers with IDs
    section_ids = {}
    for header in soup.find_all(["h2"]):
        section_text = header.get_text(strip=True)
        section_id = header.get("id")  # Extract ID if available
        if section_id and section_id not in section_ids.values():
            section_ids[section_text] = section_id  # Store mapping of header text -> ID

    return page_text, section_ids

"""
Load PDF content and webpage content + metadata of all of the sources into all_docs
"""

# Initialize an empty list to store all documents
all_docs = []

# Load PDFs with metadata
for pdf_file in pdf_files:
    pdf_loader = PyPDFLoader(pdf_file)
    pdf_docs = pdf_loader.load()
    for doc in pdf_docs:
        doc.metadata["source"] = pdf_file  # Add source metadata
    all_docs.extend(pdf_docs)

for url in urls:
    # Scrape page to extract section headers
    _, section_ids = scrape_page(url)  # Get section mapping

    # Load webpage content using WebBaseLoader
    url_loader = WebBaseLoader(url)
    web_docs = url_loader.load()
    
    for doc in web_docs:
        doc.metadata["source"] = url  # Store the source URL
        doc.metadata["section_ids"] = section_ids  # Store extracted section headers

    all_docs.extend(web_docs)

USER_AGENT environment variable not set, consider setting it to identify your requests.
Ignoring wrong pointing object 6 0 (offset 0)
Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 10 0 (offset 0)
Ignoring wrong pointing object 18 0 (offset 0)
Ignoring wrong pointing object 28 0 (offset 0)
Ignoring wrong pointing object 31 0 (offset 0)
Ignoring wrong pointing object 36 0 (offset 0)
Ignoring wrong pointing object 41 0 (offset 0)
Ignoring wrong pointing object 58 0 (offset 0)
Ignoring wrong pointing object 88 0 (offset 0)
Ignoring wrong pointing object 101 0 (offset 0)
Ignoring wrong pointing object 110 0 (offset 0)
Ignoring wrong pointing object 119 0 (offset 0)
Ignoring wrong pointing object 136 0 (offset 0)


Error fetching https://wehi-researchcomputing.github.io/[url](https:/www.eventbrite.com.au/o/the-rse-association-of-australia-and-new-zealand-65201929823): 404 Client Error: Not Found for url: https://wehi-researchcomputing.github.io/%5Burl%5D(https:/www.eventbrite.com.au/o/the-rse-association-of-australia-and-new-zealand-65201929823)
Error fetching https://wehi-researchcomputing.github.io/11-Summer-2024-2025: 404 Client Error: Not Found for url: https://wehi-researchcomputing.github.io/11-Summer-2024-2025
Error fetching https://wehi-researchcomputing.github.io/faqhttps://wehi-researchcomputing.github.io/intake_dates: 404 Client Error: Not Found for url: https://wehi-researchcomputing.github.io/faqhttps://wehi-researchcomputing.github.io/intake_dates


In [5]:
"""
Split documents into chunks, so that when retrieving context, the entire document is not loaded (wasted context window),
but instead only the relevant chunks are added. Experiment with chunk_size and chunk_overlap parameters
"""

# Split documents into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=256, chunk_overlap=50)
chunks = text_splitter.split_documents(all_docs)

In [6]:
# Function to find the most relevant section ID for a chunk
def find_section_id(chunk_text, section_ids):
    """
    Find the most relevant section ID for a specific chunk.

    Returns the section_id (ie. how-should-i-ask-for-help-to-solve-a-problem)
    """
    if not section_ids:
        return None
        
    for header, section_id in section_ids.items():
        if header in chunk_text:  # If the header appears in the chunk, use its ID
            return section_id
    return None  # No matching section ID found

# Assign section IDs to chunks
for chunk in chunks:
    """
    Find the relevant section ID's and add to metadata
    Testing adding section ID to page content as well for better retrieval if query resembles section ID
    """
    section_id = find_section_id(chunk.page_content, chunk.metadata.get("section_ids", {}))
    if section_id:
        chunk.metadata["section_id"] = section_id  # Store section ID
        chunk.page_content += f"\nSection ID: {section_id.replace('-', ' ')}"

# Ensure sources include section IDs in retrieval
for chunk in chunks:
    """
    Add section Id's to source, clean up chunk metadata
    """
    if "section_id" in chunk.metadata:
        chunk.metadata["source"] = f"{chunk.metadata['source']}#{chunk.metadata['section_id']}"
    chunk.metadata.pop("section_ids", None)  

In [7]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

"""
Initialise embedding model + vector store using ChromaDB
Create new vector store/load existing vector store
"""

# Initialize embedding model "all-MiniLM-L6-v2"
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Create vector store
vector_store = Chroma.from_documents(chunks, embedding_model, persist_directory="./chroma_db")

# Load the existing ChromaDB database
vector_store = Chroma(persist_directory="./chroma_db", embedding_function=embedding_model)

  vector_store = Chroma(persist_directory="./chroma_db", embedding_function=embedding_model)


In [8]:
from langchain_ollama.llms import OllamaLLM

"""
Iniitalise LLM, bling-phi-3-gguf

For local deployment, Ollama is recommended.
For external deployment, it is possible to download the chosen LLM model via hf_hub_download
and load the model using GPT4ALL
"""

# LLM Model
MODEL = "hf.co/llmware/bling-phi-3-gguf"
llm = OllamaLLM(model=MODEL)

"""
from huggingface_hub import hf_hub_download
from langchain_community.llms import GPT4All

model_path = "models"
model_name = "bling-phi-3.gguf"
hf_hub_download(repo_id="llmware/bling-phi-3-gguf", filename=model_name, local_dir=model_path)

llm = GPT4All(model="./models/bling-phi-3.gguf")
"""

'\nfrom huggingface_hub import hf_hub_download\nfrom langchain_community.llms import GPT4All\n\nmodel_path = "models"\nmodel_name = "bling-phi-3.gguf"\nhf_hub_download(repo_id="llmware/bling-phi-3-gguf", filename=model_name, local_dir=model_path)\n\nllm = GPT4All(model="./models/bling-phi-3.gguf")\n'

In [9]:
from langchain_core.runnables import RunnableParallel
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate

"""
Create Langchain RAG Pipeline, combining prompt, vector store retrieval and LLM
"""

template = """{context}
        
        Question: {question}
        
        Please provide a clear and concise answer based only on the information provided above. 
        If the information is not sufficient to answer the question, please say so."""

prompt = PromptTemplate.from_template(template)


# Define a new chain to return both the answer and the sources
qa_chain_with_sources = (
    RunnableParallel(
        {
            "context": vector_store.as_retriever(),
            "question": RunnablePassthrough(),
        }
    )
    | {
        "answer": prompt | llm | StrOutputParser(),
        "sources": lambda x: [doc.metadata.get("source", "Unknown") for doc in x["context"]],
    }
)

In [10]:
# Function to call a RAG LLM query
def rag_query(query, history):
    """
    2 Parameters: Query --> string user query, history --> chat history
    
    Invoke the Langchain RAG sequence, and retrieve sources. History is currently unused due to limited context window available,
    may implement in the future
    
    Returns formatted string output for the chatbot
    """
    # Invoke the chain
    response = qa_chain_with_sources.invoke(query)
    
    answer = response["answer"]
    unique_sources = list(set(response["sources"]))

    # Print answers + sources
    output = f"Answer: {answer}\n\nSources:\n" + "\n".join(unique_sources)
    return output


In [11]:
import gradio as gr

"""
Gradio chat interface calling the rag_query as the function
Added example queries to give users an idea of what type of queries can be asked/answered
"""

# Create Gradio ChatInterface
iface = gr.ChatInterface(
    fn=rag_query,  # Function to call for generating responses
    title="WEHI Student Intern Chatbot Demo",
    type='messages',
    description="Ask questions related to your WEHI internship and get answers with sources.",
    examples=[
        "What flexibility is there for the internship?",
        "What are the tasks for the REDMANE Data Ingestion team?",
        "What are the key things to do before the weekly meetings?", 
        "How do I tackle complex and ambiguous projects?",
        "What happens over Easter break at WEHI?",
        "When is the final presentation due?",
        "What is Nectar?",
        "Is the internship remote or in person?"
    ],
)

# Launch Gradio interface, share=True opens a public URL for 72-hour demo
iface.launch()



* Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


