In [None]:
'''
Simple RAG Implementation
Author: Christian Sarmiento
Purpose: This notebook is intended to get a simple implementation of RAG set up with LangChain.
Date Created: 10/1/24
Last Updated: 10/1/24
Data: https://archive.ics.uci.edu/dataset/450/sports+articles+for+objectivity+analysis
Sources:
- https://python.langchain.com/docs/tutorials/rag/
- https://python.langchain.com/docs/tutorials/llm_chain/
- https://medium.com/@dinabavli/rag-basics-basic-implementation-of-retrieval-augmented-generation-rag-e80e0791159d
- ChatGPT: o1-preview
-----------------------------------------------------------------------------------------------------------------------
RAG Research             |               Machine Learning Independent Study             |              DR. EITEL LAURIA
'''

In [1]:
# Imports
import sys
sys.path.append("/Users/christiansarmiento/Library/CloudStorage/OneDrive-MaristCollege/Machine Learning/Private Code")
from api_keys import openAIKey
from api_keys import langchainKey
from langchain_openai import ChatOpenAI
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain import hub  # for RAG prompt
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain.docstore.document import Document
from langchain_core.messages import AIMessage, HumanMessage
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_history_aware_retriever
from langchain_core.prompts import MessagesPlaceholder
from langchain_core.prompts import ChatPromptTemplate
import pandas as pd
import os

In [2]:
# LangChain / OpenAI environment variables (tracing flag plus both API keys)
os.environ.update(
    {
        "LANGCHAIN_TRACING_V2": "true",
        "LANGCHAIN_API_KEY": langchainKey(),
        "OPENAI_API_KEY": openAIKey(),
    }
)

In [3]:
# Chat model handed to every chain built in this notebook
modelName = "gpt-4o-mini"
llm = ChatOpenAI(model=modelName)

In [6]:
# Load Data
# Each regular, non-hidden file in the corpus folder is loaded with TextLoader
# and its documents are appended to sportsArticles.
folderPath = "/Users/christiansarmiento/Library/CloudStorage/OneDrive-MaristCollege/Machine Learning/Data/sports_articles_corpus/Raw data"
sportsArticles = []

# os.listdir() returns names in arbitrary order and also lists sub-directories
# and hidden files (e.g. macOS ".DS_Store", which latin1 would decode into
# garbage without raising). Sort for reproducible runs and skip those entries.
for fileName in sorted(os.listdir(folderPath)):
    filePath = os.path.join(folderPath, fileName)
    if fileName.startswith(".") or not os.path.isfile(filePath):
        continue
    loader = TextLoader(filePath, encoding='latin1')  # UTF-8 not working for the files
    doc = loader.load()
    sportsArticles.extend(doc)

In [12]:
# Chunk the loaded sports articles: chunk_size 1000 with an overlap of 200,
# and add_start_index=True so the splitter records where each chunk starts.
textSplitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    add_start_index=True,
)
texts = textSplitter.split_documents(sportsArticles)

In [13]:
# Store Documents in Vector DB (Chroma)
# A dedicated collection name keeps these sports chunks apart from any other
# Chroma.from_documents() call made in the same kernel; without one, every
# call writes to Chroma's default collection.
# NOTE(review): confirm against the installed chromadb version that in-memory
# clients share collections within one process.
vectorDB = Chroma.from_documents(
    documents=texts,
    embedding=OpenAIEmbeddings(),
    collection_name="sports_articles",
)

In [15]:
# Retrieval system: similarity search over vectorDB, 3 documents per query
retriever = vectorDB.as_retriever(
    search_type="similarity",
    search_kwargs=dict(k=3),
)

# To get retrieved documents:
# retrievedDocuments = retriever.invoke("query")

In [21]:
# Setting up the RAG Chain

# Function to format documents into the prompt
def formatDocs(docs):
    """Return the page_content of every item in docs, joined by blank lines."""
    pageTexts = [document.page_content for document in docs]
    return "\n\n".join(pageTexts)

# Setup RAG Chain
# Inputs: "context" is the retriever output run through formatDocs, and
# "question" is the incoming query passed through unchanged. Those feed the
# hub prompt, then the chat model, then a string output parser.
prompt = hub.pull("rlm/rag-prompt")
chainInputs = {
    "context": retriever | formatDocs,
    "question": RunnablePassthrough(),
}
ragChain = chainInputs | prompt | llm | StrOutputParser()

In [22]:
# Results
# Print each streamed piece as it arrives, without adding newlines between them.
question = "Explain the offside rule in soccer."
for piece in ragChain.stream(question):
    print(piece, end="", flush=True)

The offside rule in soccer states that a player is in an offside position if they are nearer to the opponent's goal line than both the ball and the second-to-last opponent when the ball is played to them, unless they are in their own half or level with the second-to-last opponent. Being in an offside position is not an offense in itself; the player must become involved in active play to be penalized. The rule aims to prevent players from gaining an unfair advantage by lingering near the opponent's goal.

In [2]:
# Test if answers are coming from the llm or from the documents
# Try indexing made-up documents, then asking questions that only those documents can answer
# This keeps the model from falling back on what it learned in training
# Play with the system prompt

# Next step after QA - feed answers into the system to make it more conversational
# Implement Gradio
# Knowledge Graph 
# Identifying Metrics - do research!!


In [None]:
'''
Implement with Marist Data
'''

In [4]:
# Load Data
csvPath = "/Users/christiansarmiento/Library/CloudStorage/OneDrive-MaristCollege/Machine Learning/Data/Marist_QA.csv"
maristQA = pd.read_csv(csvPath, header=None)

# RecursiveCharacterTextSplitter.split_documents() takes a list of Document
# objects, so wrap each entry of column 1 in one (presumably the answer text;
# confirm against the CSV). Empty cells are read as NaN, which is not valid
# page_content, so drop them and coerce the remaining values to str.
maristContext = [
    Document(page_content=text)
    for text in maristQA[1].dropna().astype(str).tolist()
]

In [5]:
# Chunk the Marist documents: chunk_size 1000 with an overlap of 200, and
# add_start_index=True so the splitter records where each chunk starts.
textSplitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    add_start_index=True,
)
texts = textSplitter.split_documents(maristContext)

In [6]:
# Store Documents in Vector DB (Chroma)
# A dedicated collection name keeps the Marist chunks apart from any other
# Chroma.from_documents() call made in the same kernel; without one, every
# call writes to Chroma's default collection.
# NOTE(review): confirm against the installed chromadb version that in-memory
# clients share collections within one process.
vectorDB = Chroma.from_documents(
    documents=texts,
    embedding=OpenAIEmbeddings(),
    collection_name="marist_qa",
)

In [7]:
# Retrieval system: similarity search over vectorDB, 3 documents per query
retriever = vectorDB.as_retriever(
    search_type="similarity",
    search_kwargs=dict(k=3),
)

In [9]:
# Prompts
# System instructions for the answering step; the text ends with a {context}
# placeholder for the retrieved passages.
systemPrompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer the question. "
    "If you don't know the answer, say that you don't know. "
    "Use three sentences maximum and keep the answer concise."
    "\n\n{context}"
)

# Single-turn prompt (no chat history slot): system instructions + user input
promptMessages = [
    ("system", systemPrompt),
    ("human", "{input}"),
]
prompt = ChatPromptTemplate.from_messages(promptMessages)

# System instructions for turning a follow-up question into a standalone one
contextualizeSystemPrompt = (
    "Given a chat history and the latest user question which might reference "
    "context in the chat history, formulate a standalone question which can be "
    "understood without the chat history. "
    "Do NOT answer the question, just reformulate it if needed and otherwise "
    "return it as is."
)

# Question-rewriting prompt (system text, chat history, latest input) and the
# history-aware retriever built from llm, retriever and that prompt.
contextualizeMessages = [
    ("system", contextualizeSystemPrompt),
    MessagesPlaceholder("chat_history"),
    ("human", "{input}"),
]
contextualizePrompt = ChatPromptTemplate.from_messages(contextualizeMessages)
historyAwareRetriever = create_history_aware_retriever(
    llm=llm,
    retriever=retriever,
    prompt=contextualizePrompt,
)

# Answering prompt: system instructions, then the chat history, then the
# latest user input.
qaMessages = [
    ("system", systemPrompt),
    MessagesPlaceholder("chat_history"),
    ("human", "{input}"),
]
qaPrompt = ChatPromptTemplate.from_messages(qaMessages)

In [10]:
# Make chains
# questionAnswerChain combines llm with qaPrompt; ragChain pairs it with the
# history-aware retriever.
questionAnswerChain = create_stuff_documents_chain(llm=llm, prompt=qaPrompt)
ragChain = create_retrieval_chain(
    retriever=historyAwareRetriever,
    combine_docs_chain=questionAnswerChain,
)

In [11]:
# Talking to ChatGPT, making RAG conversational
# Reads prompts until the user enters 0. Every question/answer pair is appended
# to conversationHistory, which is passed back in as "chat_history" on the
# next turn.
conversationHistory = []
while True:

    # A single input() call serves every turn. Surrounding whitespace is
    # stripped so that "0 " still quits.
    try:
        userQuery = input("Prompt (0 to quit): ").strip()
    except (EOFError, KeyboardInterrupt):
        # stdin closed or the cell was interrupted: stop without a traceback
        print()
        break

    if userQuery == '0':
        break
    if not userQuery:
        # Blank line: ask again instead of sending an empty question to the LLM
        continue

    # Print input - this is just for a VSCode environment to see I/O together, feel free to comment out in Jupyter
    print(f"User: {userQuery}")

    # Call ChatGPT using RAG chain
    llmResponse = ragChain.invoke({"input": userQuery, "chat_history": conversationHistory})
    answer = llmResponse["answer"]
    print(f"LLM: {answer}")
    print()

    # Record this turn for the following calls
    conversationHistory.extend([
        HumanMessage(content=userQuery),
        AIMessage(content=answer),
    ])

Marist College is located on the banks of the Hudson River and also has a campus in Florence, Italy.

Marist College is in Poughkeepsie, New York, situated along the Hudson River. The Florence campus is located in Florence, Italy.

