In [2]:
# obtain homemade search engine
!wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py

--2024-10-17 01:13:00--  https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3832 (3.7K) [text/plain]
Saving to: ‘minsearch.py.3’


2024-10-17 01:13:00 (16.8 MB/s) - ‘minsearch.py.3’ saved [3832/3832]



In [3]:
# import libraries
import minsearch
import json
import os
from dotenv import load_dotenv
from openai import OpenAI
import pandas as pd

## Ingestion

In [4]:
# load the cleaned up json file
# The encoding is passed explicitly so the read does not depend on the
# platform's default locale encoding (JSON text is expected to be UTF-8).
with open('cleaned_Data.json', 'rt', encoding='utf-8') as f_in:
    docs_raw = json.load(f_in)

In [5]:
# add the actual course (only one is ASU online) to the question-level info
# Each record is copied into a new dict rather than mutated inside docs_raw,
# and the counter is called doc_id so the builtin id() is not shadowed for the
# rest of the kernel session.
documents = [
    {**doc, 'id': doc_id, 'course': docs_raw['course']}  # doc_id: unique, position-based id
    for doc_id, doc in enumerate(docs_raw['documents'])
]

In [6]:
# spot-check one record to confirm the 'id' and 'course' keys were attached
documents[10]

{'text': 'Textbook costs are not included in tuition.',
 'section': 'Common questions about ASU Online',
 'question': 'Are textbook costs included in tuition?',
 'id': 10,
 'course': 'ASU Online'}

In [7]:
# setup data indexing using minsearch
# text_fields: the free-text fields of each document (question, answer text, section).
# keyword_fields: fields passed as keywords; 'course' is the one used in
# filter_dict by search() below.
# NOTE(review): exact matching semantics are defined by minsearch.Index — confirm there.
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course","id"]
)

In [8]:
# actually index the data: fit the index on the prepared documents list
index.fit(documents)

<minsearch.Index at 0x1680d4d40>

## RAG flow

In [9]:
# setup API key
# Load environment variables from the local '.envrc' file so the key is not
# hardcoded in the notebook.
load_dotenv('.envrc') 
# NOTE(review): openai_api_key is not referenced by the cells shown here;
# presumably OpenAI() reads OPENAI_API_KEY from the environment itself — confirm.
openai_api_key = os.getenv('OPENAI_API_KEY')

In [10]:
# start an openAI client; this module-level `client` is what llm() calls
# NOTE(review): no api_key is passed explicitly — presumably taken from the
# OPENAI_API_KEY environment variable; confirm against the openai library docs.
client = OpenAI()

In [11]:
# set up RAG definitions
def search(query, num_results=10):
    '''
    Retrieve the best-matching FAQ documents for a query from the search index.

    We are using a homemade engine called 'minsearch' which has been
    developed by Alexey Grigorev; the lookup goes through the module-level
    ``index`` object.

    Parameters
    ----------
    query : str
        The user's free-text question.
    num_results : int, optional
        Maximum number of documents requested from the index. Defaults to 10,
        the value this function previously hard-coded.

    Returns
    -------
    The value returned by ``index.search``; build_prompt() iterates over it
    and reads the 'section', 'question' and 'text' keys of each item.
    '''
    boost = {'question': 3.0, 'section': 0.5}  # per-field weights

    results = index.search(
        query=query,
        # Filtering on course is moot with a single course, but kept for continuity.
        filter_dict={'course': 'ASU Online'},
        boost_dict=boost,
        num_results=num_results,
    )

    return results

def build_prompt(query, search_results):
    '''
    Create an LLM-friendly prompt that uses search results as background context.

    Parameters
    ----------
    query : str
        The user's question; inserted after "QUESTION:".
    search_results : iterable of dict
        Retrieved documents; each must provide the 'section', 'question' and
        'text' keys (a missing key raises KeyError).

    Returns
    -------
    str
        The filled-in prompt with leading and trailing whitespace removed.
    '''
    prompt_template = """
    You are a course teaching assistant. Please answer the QUESTION based on the CONTEXT from the FAQ database.
    Use only the facts from the CONTEXT when answering the QUESTION.

    QUESTION: {question}

    CONTEXT: {context}

    """.strip()

    # Concatenate the search results into one text block, one entry per document.
    # The f-string is double-quoted so the single-quoted dict keys inside it
    # parse on every Python 3 version (reusing the outer quote needs 3.12+).
    context = "".join(
        f"section: {doc['section']} \nquestion: {doc['question']} \nanswer: {doc['text']}\n\n"
        for doc in search_results
    )

    # Fill out the template; the final strip drops the trailing blank lines
    # left by the last context entry.
    prompt = prompt_template.format(question=query, context=context).strip()

    return prompt

def llm(prompt):
    '''
    Send the formatted prompt to the chat model and return the reply text.

    The prompt is submitted as a single user message to 'gpt-4o-mini' through
    the module-level ``client``; the message content of the first returned
    choice is handed back to the caller.
    '''
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[user_message],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content
    


In [12]:
# set up the RAG using the 3 steps above
def rag(query):
    '''
    Answer a query with a Retrieval-Augmented Generation pipeline.

    Three steps are chained: search() retrieves documents for the query,
    build_prompt() turns the query and those documents into a prompt, and
    llm() returns the model's answer to that prompt.
    '''
    return llm(build_prompt(query, search(query)))

In [13]:
# try out a query: run one question through the full RAG pipeline
# (rag() ends in llm(), which calls the OpenAI client)
query = 'what do I need to enroll to online graduate classes?'
answer = rag(query)

In [14]:
# print so the newlines in the answer are rendered rather than shown as '\n'
print(answer)

To enroll in online graduate classes at ASU Online, you need to complete the following steps:

1. **Application**: You can apply to a graduate program while in the final year of your undergraduate degree. You will need to provide your junior-senior GPA and submit unofficial transcripts as part of your application.

2. **Transcripts**: If accepted, you will need to send official transcripts later.

3. **Application Deadlines**: Be aware of the graduate application deadlines, which vary by program. It's usually necessary to apply at least a month or two in advance of your chosen start date.

4. **Login to My ASU**: Once you are ready to enroll in classes, log in to My ASU using your ASURITE ID and password.

5. **Class Search**: Locate the 'Class Search' feature at the bottom of the 'My Classes' tab. Choose your subject and the online campus.

6. **Enrollment**: After finding your class, click 'Add' and follow the enrollment steps.

Ensure you have the necessary tools, such as a standard

## Retrieval Evaluation

In [15]:
# load the ground-truth questions used for the retrieval evaluation
# NOTE(review): presumably each row's 'id' refers to the matching documents[...]['id'] — confirm.
df_question = pd.read_csv('ground_truth_data.csv')


In [16]:
# preview the ground-truth table (pandas shortens long frames in the display)
df_question

Unnamed: 0,id,course,question
0,0,ASU Online,Are the credits earned through ASU Online the ...
1,0,ASU Online,How do other four-year universities view ASU O...
2,0,ASU Online,Do transcripts show whether courses were taken...
3,0,ASU Online,Who decides if ASU Online credits can be trans...
4,0,ASU Online,Can I find out how my ASU Online credits apply...
...,...,...,...
260,52,ASU Online,What resources does ASU Online provide for aca...
261,52,ASU Online,Is there a specific process to locate my advisor?
262,52,ASU Online,How can I contact my advisor once I find them?
263,52,ASU Online,What support services are available for online...
