In [None]:
! pip install langchain langchain-openai langchain-community langchain-chroma

In [None]:
# Provide the OpenAI API key without hardcoding it in the notebook.
# If OPENAI_API_KEY is already set in the environment it is reused; otherwise
# it is requested with a hidden-input prompt, so the key is never written into
# the notebook file itself.
import getpass
import os

if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")

# Download the roster information from the Terps Men's Basketball website. Note that this takes more than 3 minutes, so I have already done it for us, but here is the Python code you could use to download it yourself:



```
# Starting URL for the crawl: the men's basketball roster page on umterps.com
url = "https://umterps.com/sports/mens-basketball/roster"

# Import the RecursiveUrlLoader class from the langchain_community.document_loaders.recursive_url_loader module
# This class is used to recursively load documents from a given URL
from langchain_community.document_loaders.recursive_url_loader import RecursiveUrlLoader

# Import the BeautifulSoup class from the bs4 module and alias it as Soup
# BeautifulSoup is a library used for parsing HTML and XML documents
from bs4 import BeautifulSoup as Soup

def clean_html_text(html):
    """Return the text content of an HTML document as one cleaned string.

    The markup is parsed with BeautifulSoup's "html.parser". Each text
    fragment is stripped of surrounding whitespace and the fragments are
    joined with a single space.
    """
    return Soup(html, "html.parser").get_text(separator=' ', strip=True)

# Build the recursive loader
# - url: the page where loading starts
# - max_depth: how many levels of links to follow from the starting page (2 here)
# - extractor: callable that turns each page's raw HTML into cleaned text
loader = RecursiveUrlLoader(
    url=url,
    max_depth=2,
    extractor=clean_html_text,
)

# Run the loader; 'docs' receives the cleaned content of the starting page and
# of the linked pages reached within max_depth
docs = loader.load()

# Import the pickle module, which is used for serializing and deserializing Python objects
import pickle

# Serialize 'docs' into 'terps_mens_basketball_roster_info_cleaned.pkl'
# ('wb' opens the file in write-binary mode, which pickle requires)
with open('terps_mens_basketball_roster_info_cleaned.pkl', 'wb') as pkl_file:
    pickle.dump(docs, pkl_file)
```

In [None]:
# Load the terps_mens_basketball_roster_info.pkl file as docs
! wget https://github.com/abmcmillan/RAG_Tutorial/raw/main/terps_mens_basketball_roster_info_cleaned.pkl

# Import the pickle module, which is used for serializing and deserializing Python objects
import pickle

# Open 'terps_mens_basketball_roster_info_cleaned.pkl' in read-binary mode
# ('rb' stands for read-binary)
# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# come from a source you trust.
with open('terps_mens_basketball_roster_info_cleaned.pkl', 'rb') as pkl_file:
    # pickle.load() deserializes the object stored in the file and binds it to 'docs'
    docs = pickle.load(pkl_file)

In [None]:
# Import the RecursiveCharacterTextSplitter class from the langchain_text_splitters module
# This class is used to split text into chunks based on character count
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Chunking parameters, in characters. They are named constants so the values
# used by the splitter and the values described in the comments stay in sync.
CHUNK_SIZE = 2000     # desired size of each text chunk
CHUNK_OVERLAP = 300   # characters that overlap between consecutive chunks

# Create an instance of RecursiveCharacterTextSplitter with those parameters
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)

# Split the 'docs' collection of documents into smaller chunks according to the
# chunk size and overlap configured above
splits = text_splitter.split_documents(docs)

# 'splits' now contains the documents divided into chunks of approximately
# CHUNK_SIZE (2000) characters each, with an overlap of CHUNK_OVERLAP (300)
# characters between consecutive chunks

In [None]:
# Import the Chroma class from the langchain_chroma module
# Chroma is a vector database that stores and retrieves documents based on their vector embeddings
from langchain_chroma import Chroma

# Import the OpenAIEmbeddings class from the langchain_openai module
# OpenAIEmbeddings is used to compute embeddings for documents using OpenAI's embedding model
from langchain_openai import OpenAIEmbeddings

# Embedding model used to compute the vector for each document chunk
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Build the Chroma vector database from the chunks
# - documents: the list of document chunks to store
# - embedding: the embeddings object used to vectorize them
vectorstore = Chroma.from_documents(documents=splits, embedding=embeddings)

# Expose the vector database through a retriever, the interface used to look up
# the stored chunks that are relevant to a query
retriever = vectorstore.as_retriever()

In [None]:
# System prompt for the Retrieval-Augmented Generation (RAG) chain.
# It tells the assistant how to answer and ends with the {context} placeholder,
# which is filled with the retrieved documents.
# NOTE: adjacent string literals are concatenated with no separator, so every
# fragment that ends mid-sentence must carry its own trailing space.
system_prompt = (
    "You are an assistant for question-answering tasks about the University of Maryland "
    "Terrapins Men's Basketball team. Use the following pieces of retrieved context to "
    "answer the question. If you don't know the answer, say that you don't know. Use "
    "three sentences maximum and keep the answer concise.\n\n"
    "{context}"
)

# Import the ChatPromptTemplate class from the langchain_core.prompts module
# ChatPromptTemplate is used to create a structured template for chat-based prompts
from langchain_core.prompts import ChatPromptTemplate

# Messages that make up the chat prompt:
# - the system message carries the instructions (and the {context} placeholder)
# - the human message is the user's question, substituted for {input} at run time
prompt_messages = [
    ("system", system_prompt),
    ("human", "{input}"),
]

# Turn the message list into a reusable ChatPromptTemplate for the RAG chain
prompt = ChatPromptTemplate.from_messages(prompt_messages)

In [None]:
# Import the ChatOpenAI class from the langchain_openai module
# ChatOpenAI is used to define and interact with OpenAI's chat-based language models
from langchain_openai import ChatOpenAI

# Instantiate the chat model
# - model: the OpenAI model version to use ("gpt-3.5-turbo-0125")
# - temperature: sampling temperature, set to 0 here
llm = ChatOpenAI(
    model="gpt-3.5-turbo-0125",
    temperature=0,
)

# Import the create_stuff_documents_chain function from the langchain.chains.combine_documents module
# This function is used to create a chain that processes documents and generates answers
from langchain.chains.combine_documents import create_stuff_documents_chain

# Build the question-answer chain from the language model and the prompt template
# - llm: the language model that generates the answer
# - prompt: the structured prompt template the documents are inserted into
question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)

# Import the create_retrieval_chain function from the langchain.chains module
# This function is used to create a retrieval chain that combines document retrieval and question answering
from langchain.chains import create_retrieval_chain

# Build the retrieval-augmented generation (RAG) chain
# - retriever: fetches the documents relevant to the question
# - combine_docs_chain: the question-answer chain that generates the answer from them
rag_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=question_answer_chain,
)

# Summary of the objects defined in this cell:
# 'llm' is the language model instance used for generating answers
# 'question_answer_chain' generates answers from documents using the language model and prompt template
# 'rag_chain' first retrieves relevant documents and then generates an answer based on them


In [None]:
# Create a utility function to wrap print and wrap text for Colab

# Import the textwrap module from the standard library
# textwrap is used to wrap text into paragraphs with a specified width
import textwrap

def print_wrapped(text, width=50):
    """Print ``text`` wrapped so that no line is longer than ``width`` characters.

    Args:
        text: The string to display.
        width: Maximum line length passed to ``textwrap.fill`` (default 50).
    """
    wrapped = textwrap.fill(text, width=width)
    print(wrapped)

In [None]:
# Baseline: use the language model (LLM) directly, without retrieval-augmented generation (RAG)
# The LLM is invoked with a system message describing its role and a human
# message containing the user's question.
no_rag_response = llm.invoke([
    # System message: tells the assistant its role.
    # Adjacent string literals are joined with no separator, so the first
    # fragment needs its trailing space ("Maryland " + "Terrapins").
    ("system",
     "You are an assistant for question-answering tasks about the University of Maryland "
     "Terrapins Men's Basketball team. If you don't know the answer, say that you don't "
     "know. Use three sentences maximum and keep the answer concise.\n\n"),
    # Human message: the user's question
    ("human", "Who is the current head coach of the Maryland Terrapins Men's Basketball Team?"),
])

# Display the model's reply, wrapped for readability
print_wrapped( no_rag_response.content )

In [None]:
# Ask the head-coach question through the RAG chain instead of the bare LLM
head_coach_question = "Who is the current head coach of the Maryland Terrapins Men's Basketball Team?"
response = rag_chain.invoke({"input": head_coach_question})

# The generated text is stored under the "answer" key; print it wrapped for display
print_wrapped(response["answer"])

In [None]:
# Follow-up question about Kevin Willard's college playing career
question = "Where did Kevin Willard play college basketball and what position did he play?"
response = rag_chain.invoke({"input": question})
print_wrapped(response["answer"])

In [None]:
# Impressive, but how do we know this is accurate? Can we get a source?
# Indeed we can: the chain's response includes the retrieved documents under
# the "context" key. Display the first one.
retrieved_docs = response["context"]
retrieved_docs[0]

In [None]:
# Ask which player on the roster is the shortest
question = "Who is the shortest player on the Terps Men's Basketball Team?"
response = rag_chain.invoke({"input": question})
print_wrapped(response["answer"])

In [None]:
# Ask a subjective question about the team
question = "Who is the biggest jokester on the team?"
response = rag_chain.invoke({"input": question})
print_wrapped(response["answer"])

In [None]:
# Ask about Julian Reese's height
question = "How short is Julian Reese?"
response = rag_chain.invoke({"input": question})
print_wrapped(response["answer"])

In [None]:
# Ask about a player's weight, giving only the last name
question = "How much does Reese weigh?"
response = rag_chain.invoke({"input": question})
print_wrapped(response["answer"], width=50)

In [None]:
# Ask which players come from outside the United States
question = "What players are not from the United States?"
response = rag_chain.invoke({"input": question})
print_wrapped(response["answer"], width=50)

In [None]:
# Ask a two-part question about the video coordinator
question = "Who is the video coordinator and how many years of experience do they have? "
response = rag_chain.invoke({"input": question})
print_wrapped(response["answer"], width=50)

In [None]:
# Display every document retrieved as context for the most recent question
retrieved_context = response["context"]
retrieved_context