In [1]:
pwd

'/mnt/f/LLM_Evaluation/notebooks'

In [2]:
import os
import requests

In [3]:
# Absolute path of the "local_only" directory that sits one level above the
# current working directory; it is used as the dataset cache location.
local_only = os.path.abspath(os.path.join(os.getcwd(), os.pardir, 'local_only'))

In [4]:
local_only

'/mnt/f/LLM_Evaluation/local_only'

In [5]:
# Load the "google/boolq" dataset through the `datasets` library, keeping the
# downloaded files under `local_only` so repeated runs reuse the same cache.
from datasets import load_dataset
ds = load_dataset("google/boolq", cache_dir=local_only)

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Only the 'train' split of the loaded dataset is used from here on.
train = ds['train']

In [7]:
train

Dataset({
    features: ['question', 'answer', 'passage'],
    num_rows: 9427
})

In [8]:
# Convert the split to a pandas DataFrame so rows can be iterated one by one.
train_df = train.to_pandas()

In [9]:
train_df.head()

Unnamed: 0,question,answer,passage
0,do iran and afghanistan speak the same language,True,"Persian (/ˈpɜːrʒən, -ʃən/), also known by its ..."
1,do good samaritan laws protect those who help ...,True,Good Samaritan laws offer legal protection to ...
2,is windows movie maker part of windows essentials,True,Windows Movie Maker (formerly known as Windows...
3,is confectionary sugar the same as powdered sugar,True,"Powdered sugar, also called confectioners' sug..."
4,is elder scrolls online the same as skyrim,False,As with other games in The Elder Scrolls serie...


In [10]:
def generate_prompt(question: str, passage: str) -> str:
    """Build the one-shot True/False prompt for a single passage/question pair.

    The prompt has three sections separated by blank lines: a fixed
    instruction, a fixed worked example (Eiffel Tower), and the actual
    ``passage``/``question`` followed by an open ``Answer:`` line for the
    model to complete.

    Args:
        question: The yes/no question to ask about the passage.
        passage: The context text the question refers to.

    Returns:
        The complete prompt string, ending with ``Answer:`` (no trailing newline).
    """
    instruction = (
        "You are a helpful assistant. "
        "Given a passage, answer the question with True or False."
    )
    worked_example = "\n".join((
        "Context: The Eiffel Tower is in Paris.",
        "Question: Is the Eiffel Tower located in Rome?",
        "Answer: False",
    ))
    task = f"Context: {passage}\nQuestion: {question}\nAnswer:"
    return "\n\n".join((instruction, worked_example, task))

import requests
class LLM:
    """Minimal callable client for an HTTP text-generation endpoint.

    Calling an instance POSTs the JSON payload
    ``{"model": ..., "prompt": ..., "stream": False}`` to ``url`` and returns
    the raw ``requests.Response``. Status checking and JSON decoding are left
    to the caller.

    Args:
        url: Endpoint to POST to. When omitted (or falsy) it falls back to
            ``http://localhost:11434/api/generate``
            (presumably a local Ollama server -- confirm for your setup).
        model: Model identifier placed in the payload. No default is applied;
            if left as ``None`` the payload carries ``"model": null``.
        timeout: Forwarded to ``requests.post`` (seconds, or a
            ``(connect, read)`` tuple). The default ``None`` keeps the
            original behaviour of waiting indefinitely; pass a value to stop
            a stalled server from hanging a long batch run.
    """

    def __init__(self, url=None, model=None, timeout=None):
        self.url = url or 'http://localhost:11434/api/generate'
        self.model = model
        self.timeout = timeout
        print(f'LLM used -> {self.model}, requests sent to URL -> {self.url}')

    def __call__(self, prompt):
        """POST ``prompt`` to the endpoint and return the ``requests.Response``."""
        payload = dict(model=self.model, prompt=prompt, stream=False)
        return requests.post(self.url, json=payload, timeout=self.timeout)

In [12]:
# Model under evaluation. The same string is sent to the LLM endpoint and is
# also used as the MongoDB collection name below.
model = 'mistral_7b_instruct_v0.3_gguf:latest'
llm = LLM(model=model)

from pymongo import MongoClient
from pymongo.errors import DuplicateKeyError

client = MongoClient("mongodb://localhost:27017/")

# Database name = text before the first underscore of the model name
# ('mistral' here); collection name = the full model string.
db_name = model.partition('_')[0]
db = client[db_name]
collection = db[model]

# Unique compound index on (question, prompt): inserting the same pair twice
# raises DuplicateKeyError, so each prompt is stored at most once per model.
collection.create_index(
    [("question", 1), ("prompt", 1)],
    unique=True,
)

LLM used -> mistral_7b_instruct_v0.3_gguf:latest, requests sent to URL -> http://localhost:11434/api/generate


'question_1_prompt_1'

In [13]:
%%time
print('Num records -> ', len(train_df))
for row_num, *row in train_df.iloc[:,:].itertuples(index=True):
    if row_num % 5 == 0:
        print(row_num, end='\t')
    question,ground_truth, passage = row
    prompt = generate_prompt(question = question, passage = passage)
    
    match_query = {
        "question": question,
        "prompt": prompt
    }
    
    # Check if the result already exists
    if collection.find_one(match_query):
        #print("Already exists", end = '\t')
        continue
        
    try:
        llm_response = llm(prompt).json()
        answer = llm_response['response']
        #llm_response = 'DummyLLM called'
    except Exception as e:
        print(f"Error calling LLM for row {row_num}: {e}")
        continue
        
    result = dict(question=question, prompt=prompt, llm_response=llm_response, ground_truth=ground_truth,answer=answer)
    try:
        collection.insert_one(result)
    except DuplicateKeyError:
        print("Duplicate detected on insert", end='\t')
    except Exception as e:
        print(f"Insert error at row {row_num}: {e}")

Num records ->  9427
0	5	10	15	20	25	30	35	40	45	50	55	60	65	70	75	80	85	90	95	100	105	110	115	120	125	130	135	140	145	150	155	160	165	170	175	180	185	190	195	200	205	210	215	220	225	230	235	240	245	250	255	260	265	270	275	280	285	290	295	300	305	310	315	320	325	330	335	340	345	350	355	360	365	370	375	380	385	390	395	400	405	410	415	420	425	430	435	440	445	450	455	460	465	470	475	480	485	490	495	500	505	510	515	520	525	530	535	540	545	550	555	560	565	570	575	580	585	590	595	600	605	610	615	620	625	630	635	640	645	650	655	660	665	670	675	680	685	690	695	700	705	710	715	720	725	730	735	740	745	750	755	760	765	770	775	780	785	790	795	800	805	810	815	820	825	830	835	840	845	850	855	860	865	870	875	880	885	890	895	900	905	910	915	920	925	930	935	940	945	950	955	960	965	970	975	980	985	990	995	1000	1005	1010	1015	1020	1025	1030	1035	1040	1045	1050	1055	1060	1065	1070	1075	1080	1085	1090	1095	1100	1105	1110	1115	1120	1125	1130	1135	1140	1145	1150	1155	1160	1165	1170	1175	1180	1185	1190	1195	1

%%time
print('Num records -> ', len(train_df))
for row_num, *row in train_df.iloc[:100,:].itertuples(index=True):
    if row_num % 5 == 0:
        print(row_num, end='\t')
    question,ground_truth, passage = row
    prompt = generate_prompt(question = question, passage = passage)
    
    match_query = {
        "question": question,
        "prompt": prompt
    }
    
    # Check if the result already exists
    if collection.find_one(match_query):
        print("Already exists", end = '\t')
        continue
        
    try:
        llm_response = llm(prompt).json()['response']
        #llm_response = 'DummyLLM called'
    except Exception as e:
        print(f"Error calling LLM for row {row_num}: {e}")
        continue
        
    result = dict(question=question, prompt=prompt, llm_response=llm_response, ground_truth=ground_truth)
    try:
        collection.insert_one(result)
    except DuplicateKeyError:
        print("Duplicate detected on insert", end='\t')
    except Exception as e:
        print(f"Insert error at row {row_num}: {e}")

In [15]:
# List the database names visible through the MongoDB client and display them.
databases = client.list_database_names()
databases


['admin', 'config', 'local', 'mistral', 'phi4']