In [1]:
# obtain homemade search engine
!wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py

--2024-10-20 22:40:33--  https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.109.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3832 (3.7K) [text/plain]
Saving to: ‘minsearch.py.5’


2024-10-20 22:40:33 (15.4 MB/s) - ‘minsearch.py.5’ saved [3832/3832]



In [2]:
# import libraries
import minsearch
import json
import os
from dotenv import load_dotenv
from openai import OpenAI
import pandas as pd
from tqdm.auto import tqdm


## Ingestion

In [3]:
# Load the cleaned-up FAQ JSON file. The encoding is given explicitly so the
# result does not depend on the platform's default text encoding.
with open('cleaned_Data.json', 'rt', encoding='utf-8') as f_in:
    docs_raw = json.load(f_in)

In [4]:
# Flatten the raw data into a list of documents, attaching to each one a unique
# integer id and the course name stored at the top level of the file.
course_name = docs_raw['course']

# Build new dicts instead of modifying the entries of docs_raw in place, and
# avoid naming the loop variable `id`, which would shadow the built-in.
documents = [
    {**doc, 'id': doc_id, 'course': course_name}
    for doc_id, doc in enumerate(docs_raw['documents'])
]

In [5]:
documents[10]

{'text': 'Textbook costs are not included in tuition.',
 'section': 'Common questions about ASU Online',
 'question': 'Are textbook costs included in tuition?',
 'id': 10,
 'course': 'ASU Online'}

In [6]:
# Configure the minsearch index: "question", "text" and "section" are declared
# as text fields, and "course" and "id" as keyword fields (search() filters
# on "course").
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course","id"]
)

In [7]:
#actually index the data
index.fit(documents)

<minsearch.Index at 0x14b357170>

## RAG flow

In [8]:
# Load environment variables from the local '.envrc' file, then read the
# OpenAI API key from the environment.
# NOTE(review): openai_api_key is not referenced again in the visible cells;
# presumably OpenAI() picks the key up from the environment itself - confirm.
load_dotenv('.envrc') 
openai_api_key = os.getenv('OPENAI_API_KEY')

In [9]:
# start an openAI client
client = OpenAI()

In [10]:
# set up RAG definitions
def search(query, boost=None, num_results=10):
    '''
    Retrieve the best-matching FAQ documents for a query.

    Uses the module-level `index`, built with 'minsearch', a small homemade
    search engine developed by Alexey Grigorev.

    Parameters
    ----------
    query : str
        Free-text search query.
    boost : dict, optional
        Per-field weights passed to the index as `boost_dict`. When None, the
        defaults {'question': 3.0, 'section': 0.5} are used. An empty dict is
        passed through unchanged.
    num_results : int, optional
        Number of results requested from the index (default 10).

    Returns
    -------
    The value returned by `index.search(...)`: the matching documents.
    '''
    if boost is None:
        boost = {'question': 3.0, 'section': 0.5}

    return index.search(
        query=query,
        # Only one course exists in the data, so this filter is moot; it is
        # kept for continuity.
        filter_dict={'course': 'ASU Online'},
        boost_dict=boost,
        num_results=num_results,
    )

def build_prompt(query,search_results):
    '''
    Build the LLM prompt for a question, using search results as context.

    Parameters
    ----------
    query : str
        The user's question; substituted for QUESTION in the prompt.
    search_results : iterable of dict
        Retrieved documents. Each must provide the keys 'section', 'question'
        and 'text'.

    Returns
    -------
    str
        The filled-in prompt with surrounding whitespace stripped.
    '''
    prompt_template = """ 
    You are a course teaching assistant. Please answer the QUESTION based on the CONTEXT from the FAQ database.
    Use only the facts from the CONTEXT when answering the QUESTION.

    QUESTION: {question}

    CONTEXT: {context}

    """.strip()

    # One text entry per retrieved document. The subscripts inside the braces
    # use double quotes: reusing the f-string's own quote character there is a
    # SyntaxError before Python 3.12. join() also avoids repeated string
    # concatenation in a loop.
    context = "".join(
        f'section: {doc["section"]} \nquestion: {doc["question"]} \nanswer: {doc["text"]}\n\n'
        for doc in search_results
    )

    # Fill out the prompt template
    return prompt_template.format(question=query, context=context).strip()

def llm(prompt):
    '''
    Send the prompt to the chat model as a single user message and return the
    text content of the first completion choice.
    '''
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[user_message],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content
    


In [11]:
# set up the RAG using the 3 steps above
def rag(query):
    '''
    Answer a query with the retrieval-augmented generation (RAG) pipeline:
    retrieve documents with search(), turn them into a prompt with
    build_prompt(), and return the LLM's reply to that prompt.
    '''
    retrieved_docs = search(query)
    return llm(build_prompt(query, retrieved_docs))

In [12]:
# try out a query
# test a search
query = 'what do I need to enroll to online graduate classes?'
answer = rag(query)

In [13]:
print(answer)

To enroll in online graduate classes at ASU Online, you need to complete the following steps:

1. **Application**: You can apply to a graduate program while in the final year of your undergraduate degree. Provide your junior-senior GPA and unofficial transcripts as part of your application. Official transcripts will be required if accepted.

2. **Enrollment Steps**: Once accepted, log in to My ASU with your ASURITE ID and password. Use the 'Class Search' feature to find your desired classes and follow the enrollment steps.

3. **Meeting Deadlines**: Be sure to apply at least a month or two in advance of your chosen start date. 

If you are an international student, note that you should select the visa type "JN" when applying, as student visas are not available for online students.


## Retrieval Evaluation
Set up functions for retrieval evaluation metrics, and apply to ground truth data.

In [14]:
# set up functions for evaluation metrics
def hit_rate(relevance_total):
    '''
    Compute the hit rate: the fraction of queries whose relevance list
    contains at least one True.

    Parameters
    ----------
    relevance_total : sequence of sequences of bool
        One relevance list per query; each entry marks whether the result at
        that position is the expected document.

    Returns
    -------
    float
        A value in [0, 1]. Returns 0.0 for an empty input instead of raising
        ZeroDivisionError.
    '''
    if len(relevance_total) == 0:
        return 0.0

    # Count the queries for which at least one result was relevant
    hits = sum(1 for line in relevance_total if True in line)
    return hits / len(relevance_total)

def mrr(relevance_total):
    '''
    Compute the mean reciprocal rank (MRR).

    For each query, the score is 1 / rank of the first True entry in its
    relevance list (ranks start at 1), or 0 when no entry is True. The result
    is the mean of these scores over all queries.

    Parameters
    ----------
    relevance_total : sequence of sequences of bool
        One relevance list per query, ordered by result rank.

    Returns
    -------
    float
        A value in [0, 1]. Returns 0.0 for an empty input instead of raising
        ZeroDivisionError.
    '''
    if len(relevance_total) == 0:
        return 0.0

    total_score = 0.0
    for line in relevance_total:
        for position, is_relevant in enumerate(line, start=1):
            # Equality test kept on purpose so the matching rule is the same
            # as in the original implementation.
            if is_relevant == True:
                total_score += 1 / position
                break  # only the first relevant result counts

    return total_score / len(relevance_total)


In [15]:
def evaluate(ground_truth, search_function):
    '''
    Score a search function against ground-truth records.

    Each record is passed to `search_function`; a returned document counts as
    relevant when its 'id' equals the record's 'id'. Returns a dict holding
    the resulting 'hit_rate' and 'mrr'.
    '''
    relevance_total = []
    for record in tqdm(ground_truth):
        expected_id = record['id']
        retrieved = search_function(record)
        relevance_total.append([hit['id'] == expected_id for hit in retrieved])

    scores = {}
    scores['hit_rate'] = hit_rate(relevance_total)
    scores['mrr'] = mrr(relevance_total)
    return scores

In [16]:
# Read in the ground-truth questions
df_question = pd.read_csv('ground_truth_data.csv')

# Convert to a list of dicts, one per row
ground_truth = df_question.to_dict(orient='records')

# Show the first record
ground_truth[0]

{'id': 0,
 'course': 'ASU Online',
 'question': 'Are the credits earned through ASU Online the same as those from on-campus classes?'}

In [17]:
# Baseline: evaluate the search engine on the full ground truth with an empty
# boost dict, i.e. without supplying any per-field boosts.
evaluate(ground_truth, lambda q: search(q['question'],boost={}))

  0%|          | 0/265 [00:00<?, ?it/s]

{'hit_rate': 0.7396226415094339, 'mrr': 0.4746466007786763}

### Finding the best parameters
Let us find the boost parameters that maximize the evaluation metrics.

In [18]:
from hyperopt import fmin, tpe, hp, Trials
import numpy as np
# If hyperopt fails to import, run: pip install --upgrade setuptools

In [23]:
# Take the first 100 ground-truth questions as the validation subset used for
# tuning the boosts.
# NOTE(review): these rows are also part of the full `ground_truth` list that
# the final evaluation runs on, so the tuned score is not measured on
# held-out data.
df_validation = df_question[:100]
data_val = df_validation.to_dict(orient='records')

In [20]:
# Define the search space for the boosts
search_space = {
    'question_boost': hp.uniform('question_boost', 0.0, 5.0),  # Boost for 'question', sampled uniformly between 0.0 and 5.0
    'section_boost': hp.uniform('section_boost', 0.0, 2.0)    # Boost for 'section', sampled uniformly between 0.0 and 2.0
}

def objective(params):
    '''
    Hyperopt objective: evaluate the boosts in `params` on the validation data
    and return the negated average of hit rate and MRR (negated because the
    objective is minimized).
    '''
    candidate_boost = {
        'question': params['question_boost'],
        'section': params['section_boost'],
    }

    def run_search(record):
        # Search on the record's question text with the candidate boosts
        return search(record['question'], boost=candidate_boost)

    metrics = evaluate(data_val, run_search)

    # Equal weighting of the two metrics; adjust the weights as needed
    score = 0.5 * metrics['hit_rate'] + 0.5 * metrics['mrr']
    return -score


# Set up hyperopt search
trials = Trials()  # To store information about each trial

# Run 50 evaluations of `objective` over `search_space`. The result holds the
# best parameter values found, keyed by the labels used in `search_space`.
best_boost = fmin(
    fn=objective,           # Objective function to minimize
    space=search_space,      # The parameter space
    algo=tpe.suggest,        # Tree of Parzen Estimators algorithm
    max_evals=50,            # Number of trials
    trials=trials,           # To store the trials
    rstate=np.random.default_rng(42)  # Set a random seed for reproducibility
)

# Print the best boost values found by hyperopt
print("Best boost values:", best_boost)


  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  4%|▍         | 2/50 [00:00<00:04, 10.80trial/s, best loss: -0.6953452380952383]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  8%|▊         | 4/50 [00:00<00:04,  9.91trial/s, best loss: -0.845561507936508] 

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

 12%|█▏        | 6/50 [00:00<00:04,  9.54trial/s, best loss: -0.845561507936508]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

 16%|█▌        | 8/50 [00:00<00:04, 10.31trial/s, best loss: -0.845561507936508]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

 20%|██        | 10/50 [00:00<00:03, 10.02trial/s, best loss: -0.845561507936508]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

 24%|██▍       | 12/50 [00:01<00:03, 10.38trial/s, best loss: -0.8862559523809523]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

 28%|██▊       | 14/50 [00:01<00:03,  9.72trial/s, best loss: -0.8862559523809523]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

 32%|███▏      | 16/50 [00:01<00:03, 10.08trial/s, best loss: -0.8862559523809523]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

 36%|███▌      | 18/50 [00:01<00:03,  9.54trial/s, best loss: -0.8862559523809523]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

 40%|████      | 20/50 [00:02<00:03,  9.47trial/s, best loss: -0.8862559523809523]

  0%|          | 0/100 [00:00<?, ?it/s]

 42%|████▏     | 21/50 [00:02<00:03,  9.53trial/s, best loss: -0.8862559523809523]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

 46%|████▌     | 23/50 [00:02<00:02, 10.09trial/s, best loss: -0.8862559523809523]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

 50%|█████     | 25/50 [00:02<00:02,  9.81trial/s, best loss: -0.8862559523809523]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

 54%|█████▍    | 27/50 [00:02<00:02,  9.52trial/s, best loss: -0.8862559523809523]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

 58%|█████▊    | 29/50 [00:02<00:02,  9.99trial/s, best loss: -0.8862559523809523]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

 62%|██████▏   | 31/50 [00:03<00:01,  9.51trial/s, best loss: -0.8862559523809523]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

 66%|██████▌   | 33/50 [00:03<00:01,  9.98trial/s, best loss: -0.8862559523809523]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

 70%|███████   | 35/50 [00:03<00:01,  9.50trial/s, best loss: -0.8862559523809523]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

 74%|███████▍  | 37/50 [00:03<00:01,  9.17trial/s, best loss: -0.8862559523809523]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

 78%|███████▊  | 39/50 [00:04<00:01,  9.59trial/s, best loss: -0.8862559523809523]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

 82%|████████▏ | 41/50 [00:04<00:00,  9.43trial/s, best loss: -0.8862559523809523]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

 86%|████████▌ | 43/50 [00:04<00:00,  9.99trial/s, best loss: -0.8862559523809523]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

 90%|█████████ | 45/50 [00:04<00:00,  9.69trial/s, best loss: -0.8862559523809523]

  0%|          | 0/100 [00:00<?, ?it/s]

 92%|█████████▏| 46/50 [00:04<00:00,  9.68trial/s, best loss: -0.8862559523809523]

  0%|          | 0/100 [00:00<?, ?it/s]

 94%|█████████▍| 47/50 [00:04<00:00,  9.05trial/s, best loss: -0.8862559523809523]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

 98%|█████████▊| 49/50 [00:05<00:00,  9.60trial/s, best loss: -0.8862559523809523]

  0%|          | 0/100 [00:00<?, ?it/s]

100%|██████████| 50/50 [00:05<00:00,  9.65trial/s, best loss: -0.8862559523809523]
Best boost values: {'question_boost': np.float64(0.20630178685456357), 'section_boost': np.float64(0.0694068717136398)}


In [21]:
# Check the final evaluation using the optimized boosts. The values are read
# from `best_boost` (the hyperopt result) rather than re-typed by hand, so this
# cell stays in sync when the optimization is re-run.
tuned_boost = {
    'question': best_boost['question_boost'],
    'section': best_boost['section_boost'],
}
evaluate(ground_truth, lambda q: search(q['question'], boost=tuned_boost))


  0%|          | 0/265 [00:00<?, ?it/s]

{'hit_rate': 0.9433962264150944, 'mrr': 0.7722926025756213}