In [1]:
import numpy as np
from sklearn.model_selection import train_test_split

from ragbooster import BingRetriever, HuggingfaceQAGenerator, RAGModel, RAGBooster, score
from ragbooster.demo import load_imputation_dataset

In [2]:
# One seed for the whole notebook: it seeds NumPy's global RNG (for any library
# code that draws from it) and is also passed explicitly to the split below.
SEED = 42
np.random.seed(SEED)

# Load the restaurant data as imputation questions: 'city' is the value to
# impute, and 'name', 'address' and 'phone' are the columns it is based on.
questions = load_imputation_dataset('demo_data/restaurant.csv',
                                    impute='city',
                                    based_on=['name', 'address', 'phone'])

# Pin the split with an explicit random_state instead of relying on the global
# RNG still being in its freshly-seeded state. Otherwise any RNG draw between
# the seed call and the split would silently change which questions land in
# validation vs. test (and therefore every index-based example further down).
validation_questions, test_questions = train_test_split(
    questions, test_size=0.5, random_state=SEED)
validation_questions[0]

Question(text='name: border grill; address: 4th st.; phone: 310/451-1655', correct_answers=['los angeles'], metadata={})

In [3]:
class MyGenerator(HuggingfaceQAGenerator):
    """Question-answering generator asking for the city of a restaurant."""

    def _model_name(self):
        """Return the name of the QA model to use."""
        return "deepset/minilm-uncased-squad2"

    def _qa_question(self):
        """Return the fixed question that is asked for every context."""
        return "What is the name of the city in which this restaurant is located?"

    def _create_context(self, question, snippet=None):
        """Build the context string for `question`.

        Without a snippet the context is just the question text; with one,
        the snippet is placed in front of the question text, separated by ';'.
        """
        record = question.text
        if snippet is not None:
            return f'{snippet};{record}'
        return record

    def _extract_answer(self, response):
        """Return the lower-cased 'answer' entry of `response`."""
        answer = response['answer']
        return answer.lower()

In [4]:
# Baseline: score the generator directly on the test questions, without
# wrapping it in a retrieval-augmented model.
llm = MyGenerator()

accuracy_baseline = score(test_questions, llm)
accuracy_baseline

  0%|          | 0/432 [00:00<?, ?it/s]

0.05555555555555555

In [5]:
class MyBingWebsearch(BingRetriever):
    """Bing retriever that uses the question text itself as the query."""

    def create_query(self, question):
        """Return the query for `question`: its text, unmodified."""
        query = question.text
        return query


bing_websearch = MyBingWebsearch()

In [6]:
# Pick a single validation question to inspect the retrieval results for.
EXAMPLE_INDEX = 11
example_question = validation_questions[EXAMPLE_INDEX]
example_question

Question(text="name: scala's bistro; address: 432 powell st.; phone: 415/395-8555", correct_answers=['san francisco'], metadata={})

In [7]:
# Retrieve results for the example question and print the first three.
# Each result unpacks as (snippet, url); the url is printed first.
retrieved = bing_websearch.retrieve(example_question)
top_results = retrieved[:3]
for snippet, url in top_results:
    print(url, '-', snippet, '\n')

https://tableagent.com/san-francisco/scalas-bistro/ - Reservations Scala's Bistro Reservations Date Time Party Size Business Info + − Leaflet | © OpenStreetMap Address: 432 Powell Street, San Francisco CA 94102 Cross Street: Post Street Location: San Francisco | Union Square Cuisine: French | Italian | Pasta | Cost: | Moderate Category: Fine Dining Star Rating: Reservations: Unknown 

https://www.yellowpages.com/san-francisco-ca/mip/scalas-bistro-4887204 - ﻿ $$$ Italian Restaurants, Bars, Continental Restaurants (2) (2076) 7.1 OPEN NOW Today: 8:00 am - 11:00 pm 21 YEARS IN BUSINESS Amenities: (415) 395-8555 Map & Directions 432 Powell StSan Francisco, CA 94102 Write a Review Is this your business? Customize this page. Claim This Business Hours Regular Hours Scala's Bistro 432 Powell St, San Francisco 

https://www.chamberofcommerce.com/united-states/california/san-francisco/italian-restaurant/2006879304-scala-s-bistro - Scala's Bistro at 432 Powell St, San Francisco, CA 94102. Get Scal

In [8]:
# Combine the Bing retriever and the QA generator into one RAG model.
# NOTE(review): k=10 is presumably the number of retrieved snippets used per
# question — confirm against RAGModel.
rag10 = RAGModel(bing_websearch, llm, k=10)

# Score on the test questions; kept in a variable because the final cell
# compares against it.
accuracy_rag_10 = score(test_questions, rag10)

# The "k=10" in this message is hard-coded; keep it in sync with the k above.
f'The accuracy with retrieval augmentation and k=10 on the test set is {accuracy_rag_10}'

  0%|          | 0/432 [00:00<?, ?it/s]

'The accuracy with retrieval augmentation and k=10 on the test set is 0.8009259259259259'

In [9]:
# Wrap the k=10 RAG model in RAGBooster, giving it the validation questions.
# NOTE(review): judging by the log this cell prints, construction itself
# computes the validation corpus, learns source weights and tunes a pruning
# threshold, so this call is expensive — confirm against RAGBooster.
refined_rag_model = RAGBooster(rag10, validation_questions)

Computing validation corpus...


  0%|          | 0/432 [00:00<?, ?it/s]

Learning importance weights for data sources...
Tuning threshold for corpus pruning...
Achieved accuracy of 0.870 with a pruning threshold of 0.57714 on the validation set.


In [10]:
# Score the refined model on the same test questions and report the gain over
# the plain k=10 RAG model.
accuracy_refined = score(test_questions, refined_rag_model)
improvement = accuracy_refined - accuracy_rag_10

(
    f'RAGBooster improved the accuracy with retrieval augmentation by {improvement:.3f}'
    f' to {accuracy_refined}!'
)

  0%|          | 0/432 [00:00<?, ?it/s]

'RAGBooster improved the accuracy with retrieval augmentation by 0.044 to 0.8449074074074074!'