In [1]:
%load_ext autoreload
%autoreload 2
    
from llama_index import download_loader
from ragas.testset import TestsetGenerator
from dotenv import load_dotenv
import pandas as pd
from string import punctuation
# Load variables from the project's .env file into the process environment;
# override=True is passed so values from the file take precedence.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
load_dotenv('/home/elastic/notebooks/vector_search_applications/.env', override=True)
import openai
import os
# Read the OpenAI key from the environment; raises KeyError when it is not set.
openai.api_key = os.environ['OPENAI_API_KEY']

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [410]:
# Obtain the SemanticScholarReader loader class through llama_index's download_loader.
SemanticScholarReader = download_loader("SemanticScholarReader")
loader = SemanticScholarReader()
# Search query handed to the reader; limit=100 is passed to cap the results.
query_space = "large language models"
# NOTE(review): `documents` is not referenced by any other visible cell; the
# test set below is generated from `docs`, which is built from local JSON files.
documents = loader.load_data(query=query_space, limit=100)

In [427]:
# ragas test-set generator created with its default configuration.
testsetgenerator = TestsetGenerator.from_default()
# Requested test-set size, passed as `test_size` to generate() in a later cell.
test_size = 30

In [421]:
len(docs)

9

In [428]:
# Generate the synthetic test set from the first three documents only.
# NOTE(review): `docs` is built in a cell that appears further down in this
# notebook, so this cell depends on out-of-order execution.
testset = testsetgenerator.generate(docs[:3], test_size=test_size)
# Convert to a DataFrame and preview the first rows.
test_df = testset.to_pandas()
test_df.head()

Parsing documents into nodes:   0%|          | 0/3 [00:00<?, ?it/s]

Text Chunks: 100%|██████████████████████████████████████████████████████| 1/1 [00:00<00:00, 12985.46it/s]




Text Chunks: 100%|██████████████████████████████████████████████████████| 2/2 [00:00<00:00, 29026.33it/s]
Text Chunks: 100%|██████████████████████████████████████████████████████| 2/2 [00:00<00:00, 24244.53it/s]




Text Chunks: 100%|██████████████████████████████████████████████████████| 1/1 [00:00<00:00, 13315.25it/s]
Text Chunks: 100%|██████████████████████████████████████████████████████| 1/1 [00:00<00:00, 12595.51it/s]




Text Chunks: 100%|██████████████████████████████████████████████████████| 2/2 [00:00<00:00, 29228.60it/s]
Text Chunks: 100%|██████████████████████████████████████████████████████| 1/1 [00:00<00:00, 10754.63it/s]




Text Chunks: 100%|██████████████████████████████████████████████████████| 2/2 [00:00<00:00, 25575.02it/s]
Text Chunks: 100%|██████████████████████████████████████████████████████| 1/1 [00:00<00:00, 13751.82it/s]




Text Chunks: 100%|████████

Unnamed: 0,question,context,answer,question_type,episode_done
0,What advice would the Stoics give to someone s...,And so cognitive dissonance is a powerful forc...,The Stoics would advise someone struggling wit...,conditional,False
1,How would the Stoics approach the issue of ill...,And so cognitive dissonance is a powerful forc...,The Stoics would likely approach the issue of ...,conditional,True
2,Should individuals have the right to choose CO...,So you felt that it was obviously wrong to for...,The answer is not explicitly stated in the giv...,conditional,True
3,How does the concern about AI bias and its imp...,But what scares me is if you plug in AI bias i...,The concern about AI bias and its impact on sh...,reasoning,True
4,How does the proliferation of AI impact the dy...,"- ""you've got a dynamic between the US and Chi...",The proliferation of AI impacts the dynamic be...,simple,True


In [432]:
test_df['question'][2]

'Should individuals have the right to choose COVID vaccination, considering risks to others, like children?'

In [434]:
print(test_df['context'][2])

So you felt that it was obviously wrong to force people to get the mRNA vaccine for COVID, right?
But if we change a few of the variables, I think your ethical intuitions and certainly political intuitions would totally change.
So now we're in an environment where you're deciding not to get vaccinated is putting my kids at risk, right?
Do you get to make that choice, right?
And you might say, oh, yes, yeah, I should be able to make that choice.


In [429]:
def show(df: pd.DataFrame, num: int):
    """Print the question, context and answer stored under index label ``num``.

    Args:
        df: Frame with 'question', 'context' and 'answer' columns.
        num: Index label of the row to print.
    """
    for column in ('question', 'context', 'answer'):
        print(df[column][num])

In [17]:
from ragas.metrics import (
    context_precision,
    context_recall,
)

metrics = [
    context_precision,
    context_recall,
]

In [3]:
from ragas.llama_index import evaluate
import json

In [4]:
from llama_index import SimpleDirectoryReader
from llama_index.node_parser import SimpleNodeParser
from llama_index import download_loader
from llama_index import Document
from transformers import AutoTokenizer
import tiktoken # bad ass tokenizer library for use with OpenAI LLMs 
from llama_index.text_splitter import SentenceSplitter #one of the best on the market

# Instantiate the Hugging Face tokenizer for our embedding model of choice
model_name = 'sentence-transformers/all-MiniLM-L6-v2'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Instantiate the tiktoken encoding that matches the 'gpt-3.5-turbo' model name
encoding = tiktoken.encoding_for_model('gpt-3.5-turbo')

### Splitting the combined JSON file into one JSON file per episode

In [56]:
# Split the combined episode file into one JSON file per episode (first 9 only).
with open('vector_search_applications/data/impact_theory_data.json') as f:
    data = json.load(f)

# Punctuation is stripped BEFORE spaces become underscores: '_' is itself part
# of string.punctuation, so stripping afterwards (as the previous version did)
# also removed the word separators and glued the title words together.
strip_punctuation = str.maketrans('', '', punctuation)
for file in data[:9]:
    # split()/join also collapses the double spaces left behind by removed
    # symbols such as '|'.
    filename = '_'.join(file['title'].translate(strip_punctuation).split())
    # NOTE(review): files land in practice_data/, while the loading cell reads
    # practice_data/ragas_json_files/ — confirm which directory is intended.
    with open(f'./vector_search_applications/practice_data/{filename}.json', 'w') as f:
        json.dump(file, f)

In [258]:
# Directory holding the per-episode JSON files used to build the test set.
dir_path = './vector_search_applications/practice_data/ragas_json_files/'
# os.listdir returns entries in arbitrary order; sorting makes positional
# slices taken later (e.g. docs[:3]) pick the same files on every run.
# Match the real '.json' extension rather than any name ending in "json".
paths = sorted(
    os.path.join(dir_path, file)
    for file in os.listdir(dir_path)
    if file.endswith('.json')
)

In [263]:
# Fields removed from every episode before the rest becomes Document metadata.
unwanted_fields = ['keywords', 'age_restricted', 'description', 'publish_date']

# Build one llama_index Document per JSON file: the transcript ('content') is
# the text, every remaining field is kept as metadata.
docs = []
for path in paths:
    with open(path) as f:
        data = json.load(f)
    for field in unwanted_fields:
        # pop() with a default tolerates episodes that lack a field, where
        # `del data[field]` would abort the whole loop with KeyError.
        data.pop(field, None)
    doc = Document(text=data['content'], metadata={k: v for k, v in data.items() if k != 'content'})
    docs.append(doc)

In [266]:
# Sentence splitter configured with chunk_size=256 and no overlap; token counts
# come from the gpt-3.5-turbo tiktoken encoding (`encoding.encode`).
gpt35_txt_splitter = SentenceSplitter(chunk_size=256, tokenizer=encoding.encode, chunk_overlap=0)

In [267]:
# Node parser that chunks with gpt35_txt_splitter; the prev/next relationship
# and metadata options are both switched off.
parser = SimpleNodeParser.from_defaults(text_splitter=gpt35_txt_splitter, include_prev_next_rel=False, include_metadata=False)

In [423]:
nodes = parser.get_nodes_from_documents(docs, show_progress=True)

Parsing documents into nodes:   0%|          | 0/9 [00:00<?, ?it/s]

In [270]:
nodes[0].metadata

{}

In [271]:
# The parser was created with include_metadata=False and nodes[0].metadata is
# shown as {} before this cell; copy the metadata over from each node's first
# relationship instead.
# NOTE(review): assumes the first relationship is the source document — confirm.
for node in nodes:
    node.metadata = list(node.relationships.values())[0].metadata

In [272]:
nodes[0].metadata

{'author': 'Tom Bilyeu',
 'title': 'The Stoic Advice Every Man Learns Too Late In Life | Ryan Holiday',
 'video_id': 'gzNLzqI5oTE',
 'playlist_id': 'PL8qcvQ7Byc3OJ02hbWJbHWePh4XEg3cvo',
 'channel_id': 'UCnYMOamNKLGVlJgRUbamveA',
 'length': 10158,
 'thumbnail_url': 'https://i.ytimg.com/vi/gzNLzqI5oTE/hq720.jpg',
 'views': 120714,
 'episode_url': 'https://www.youtube.com/watch?v=gzNLzqI5oTE&list=PL8qcvQ7Byc3OJ02hbWJbHWePh4XEg3cvo'}

In [273]:
from vector_search_applications.weaviate_interface import WeaviateClient

In [274]:
# Read the Weaviate credentials from environment variables (KeyError if unset)
api_key = os.environ['WEAVIATE_API_KEY']
url = os.environ['WEAVIATE_ENDPOINT']

# Instantiate the project's Weaviate client wrapper
client = WeaviateClient(api_key, url)

# Check that the Weaviate instance reports itself live and ready
client.is_live(), client.is_ready()

(True, True)

In [275]:
from llama_index.vector_stores import WeaviateVectorStore
from llama_index import VectorStoreIndex, StorageContext
from sentence_transformers import SentenceTransformer
# construct vector store


In [276]:
model = SentenceTransformer(model_name)
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)

In [277]:
# Collect the text of every node, in node order.
texts = [node.text for node in nodes]
# Embed all texts with the SentenceTransformer model, 64 texts per batch.
embeddings = model.encode(texts, batch_size=64, show_progress_bar=True)

Batches:   0%|          | 0/22 [00:00<?, ?it/s]

In [278]:
# Attach each embedding to the node at the same position, converted with
# .tolist() so the node stores a plain Python list.
for i, emb in enumerate(embeddings):
    nodes[i].embedding = emb.tolist()

In [280]:
# Sanity check: every node carries a list embedding of length 384, the
# word_embedding_dimension reported in the model repr.
for node in nodes:
    assert isinstance(node.embedding, list)
    assert len(node.embedding) == 384

In [None]:
# Name of the Weaviate class used throughout this experiment.
index_name ='TestRagas'
# Delete the class of that name from the Weaviate schema.
# NOTE(review): presumably destructive for the stored objects as well — confirm before re-running.
client.schema.delete_class(index_name)

In [354]:
# Set up the storage for the embeddings
from llama_index.vector_stores.types import VectorStoreQuery as vquery
# NOTE(review): the literal "TestRagas" duplicates the value held in `index_name`.
vector_store = WeaviateVectorStore(weaviate_client = client, index_name="TestRagas", text_key="content")
storage_context = StorageContext.from_defaults(vector_store = vector_store)
# NOTE(review): `ServiceContext` is imported and `minilm` is defined in a cell
# that appears later in this notebook, so this line fails on a fresh
# top-to-bottom run.
service_context = ServiceContext.from_defaults(embed_model=minilm)

In [None]:
vector_store.query(query=vquery(

In [327]:
# Set up the index over the already-embedded nodes, using the Weaviate-backed
# storage context and the MiniLM service context.
weaviate_index = VectorStoreIndex(nodes, storage_context = storage_context, service_context=service_context, embed_model = minilm)

In [449]:
# client.schema.delete_class('TestRagas')


In [450]:
# Generated questions as a flat Python list.
test_questions = test_df['question'].values.tolist()
# Each generated answer wrapped in its own single-element list.
test_answers = [[item] for item in test_df['answer'].values.tolist()]

In [12]:
import nest_asyncio
from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext,OpenAIEmbedding
from langchain.embeddings import HuggingFaceEmbeddings
from llama_index.indices.query.schema import QueryBundle
import pandas as pd
from ragas.metrics import (
    context_precision,
    context_recall,
)

# ragas metrics to compute during evaluation.
metrics = [
    context_precision,
    context_recall,
]
# Patch asyncio so the event loop already running in the notebook can be re-entered.
nest_asyncio.apply()

# LangChain embedding wrapper around the same MiniLM checkpoint (`model_name`).
minilm = HuggingFaceEmbeddings(model_name=model_name)
def build_query_engine(vector_index, embedding_model):
    """Create a query engine over ``vector_index`` with ``similarity_top_k=3``.

    Args:
        vector_index: Index object exposing ``as_query_engine``.
        embedding_model: Embedding model forwarded to ``as_query_engine``.

    Returns:
        Whatever ``vector_index.as_query_engine`` returns.
    """
    # Forward the caller's model. The previous version ignored this argument
    # and silently passed the module-level ``model`` instead.
    return vector_index.as_query_engine(similarity_top_k=3, embedding_model=embedding_model)

"""Async utils."""
import asyncio
from itertools import zip_longest
from typing import Any, Coroutine, Iterable, List


def run_async_tasks(
    tasks: List[Coroutine],
    show_progress: bool = False,
    progress_bar_desc: str = "Running async tasks",
) -> List[Any]:
    """Run every coroutine in ``tasks`` to completion and return the gathered results.

    Args:
        tasks: Coroutines to execute.
        show_progress: When True, try to gather through ``tqdm.asyncio`` so a
            progress bar is shown; any failure on that path falls back to the
            plain gather.
        progress_bar_desc: Description passed to the progress bar.

    Returns:
        The list of results produced by the gather call.
    """
    pending: List[Any] = tasks

    async def _collect_plain() -> List[Any]:
        return await asyncio.gather(*pending)

    if show_progress:
        try:
            import nest_asyncio
            from tqdm.asyncio import tqdm

            # A notebook kernel already has an event loop running; patch
            # asyncio so that loop can be reused instead of creating a new one.
            nest_asyncio.apply()

            async def _collect_with_bar() -> List[Any]:
                return await tqdm.gather(*pending, desc=progress_bar_desc)

            return asyncio.get_event_loop().run_until_complete(_collect_with_bar())
        except Exception:
            # Deliberate best effort: tqdm.asyncio may be unsupported in some
            # environments, so continue without the progress bar.
            pass

    return asyncio.run(_collect_plain())


def chunks(iterable: Iterable, size: int) -> Iterable:
    """Group ``iterable`` into tuples of length ``size``.

    The final tuple is padded with ``None`` when the items do not divide
    evenly into groups of ``size``.
    """
    # One shared iterator handed to zip_longest `size` times: each output
    # tuple therefore consumes `size` consecutive items.
    shared = iter(iterable)
    return zip_longest(*(shared for _ in range(size)), fillvalue=None)


async def batch_gather(
    tasks: List[Coroutine], batch_size: int = 10, verbose: bool = False
) -> List[Any]:
    """Await ``tasks`` in consecutive batches of at most ``batch_size``.

    Args:
        tasks: Awaitables to run.
        batch_size: Maximum number of awaitables gathered concurrently.
        verbose: When True, print a progress line after every batch.

    Returns:
        Results of all tasks, in the order the tasks were given.
    """
    output: List[Any] = []
    if batch_size < 1:
        # No batch can be formed; as before, nothing is awaited and the
        # result is empty.
        return output

    pending = list(tasks)
    # Slice real batches. The previous version padded the last batch with
    # None, and asyncio.gather raises TypeError on a None argument whenever
    # the task count is not a multiple of batch_size.
    for start in range(0, len(pending), batch_size):
        output_chunk = await asyncio.gather(*pending[start:start + batch_size])
        output.extend(output_chunk)
        if verbose:
            print(f"Completed {len(output)} out of {len(pending)} tasks")
    return output

async def aquery(query: str, client, class_name: "str | None" = None) -> dict:
    """Run a hybrid search (alpha=0.25) for ``query`` and return the response.

    Args:
        query: Search text.
        client: Object exposing ``hybrid_search(query, class_name=..., alpha=...)``.
        class_name: Class to search; defaults to the module-level ``index_name``.

    Returns:
        Whatever ``client.hybrid_search`` produces.
    """
    import inspect

    if class_name is None:
        class_name = index_name
    response = client.hybrid_search(query, class_name=class_name, alpha=0.25)
    # Elsewhere in this notebook hybrid_search is called as a plain synchronous
    # method; awaiting its return value unconditionally fails for such clients.
    # Only await when the call actually returned an awaitable.
    if inspect.isawaitable(response):
        response = await response
    return response

In [379]:
minilm

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False)

In [331]:
minilm_query_engine = build_query_engine(weaviate_index, minilm)

In [333]:
# result = evaluate(minilm_query_engine, metrics, test_questions, test_answers)

In [355]:
query = 'what are the effects of Aspartame'
emb = model.encode(query)
bundle = QueryBundle(query_str=query, 
                     embedding=emb)

In [358]:
res = minilm_query_engine.query(bundle)

In [382]:
from tqdm.notebook import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

In [438]:
def get_response(query: str, class_name: str, client: WeaviateClient):
    """Run a hybrid search for ``query`` and pair the query with its response.

    Args:
        query: Search text.
        class_name: Class to search in.
        client: Client whose ``hybrid_search`` is called with limit=3 and alpha=0.25.

    Returns:
        Tuple ``(query, response)``.
    """
    return query, client.hybrid_search(query, class_name, limit=3, alpha=0.25)

In [451]:
%%time

results = [get_response(q, index_name, client) for q in tqdm(test_questions)]
# progress = tqdm(unit="Retriever", total=len(test_questions))

# # with ThreadPoolExecutor(max_workers=os.cpu_count()) as exec:
# #     futures = [exec.submit(get_response, query, index_name, client) for query in test_questions]
# #     for future in as_completed(futures):
# #         results.append(future.result())
# #         progress.update(1)

  0%|          | 0/34 [00:00<?, ?it/s]

CPU times: user 460 ms, sys: 41.8 ms, total: 501 ms
Wall time: 4.92 s


In [15]:
from ragas import evaluate as rag_eval
from prompt_templates import question_answering_prompt
from retriever_pipeline import generate_prompt
from openai_interface import GPT_Turbo

ModuleNotFoundError: No module named 'prompt_templates'

In [486]:
gpt = GPT_Turbo()
context = [d['content'] for d in results[0][1]]

In [491]:
def get_answers(results: List[dict]):
    """Generate an LLM answer for every retrieved result.

    Args:
        results: Items holding the question at index 0 and, at index 1, the
            retrieved hits (each hit exposing a 'content' key).

    Returns:
        Tuple ``(responses, contexts)``: one LLM response per item, and one
        list of raw hit contents per item.
    """
    responses, contexts = [], []
    for res in tqdm(results):
        # Raw passages, kept per question.
        contexts.append([hit['content'] for hit in res[1]])
        # The same passages rendered as numbered blocks for the prompt.
        context = ''.join(
            f'Context Block: {i}\n{hit["content"]}\n' for i, hit in enumerate(res[1], 1)
        )
        prompt = question_answering_prompt.format(context=context, question=res[0])
        responses.append(gpt.get_completion_from_messages(prompt))
    return responses, contexts

In [492]:
answers, contexts = get_answers(results)

  0%|          | 0/34 [00:00<?, ?it/s]

In [502]:

# for r in responses:
#     answers.append(r.response)
#     contexts.append([c.node.get_content() for c in r.source_nodes])
# Column-oriented payload later handed to Dataset.from_dict: the questions,
# the generated answers and the retrieved contexts.
dataset_dict = {
    "question": test_questions,
    "answer": answers,
    "contexts": contexts,
}

In [7]:
from datasets import Dataset
# ground_truths = test_answers

In [506]:
# `ground_truths` was only ever assigned in a commented-out line, so this cell
# raised NameError on a fresh kernel. Bind it explicitly to the generated
# reference answers (one single-element list per question).
ground_truths = test_answers
if ground_truths is not None:
    dataset_dict["ground_truths"] = ground_truths
# Build the Hugging Face Dataset from the assembled columns.
ds = Dataset.from_dict(dataset_dict)
# result = ragas_evaluate(ds, metrics)

In [513]:
# Persist the dataset with Dataset.save_to_disk.
# NOTE(review): the reload cell reads
# './vector_search_applications/practice_data/ragas_dict.parquet/', a different
# relative path — the two only agree if the working directory differs between
# runs. The '.parquet' suffix may also mislead; confirm the on-disk format.
ds.save_to_disk('./practice_data/ragas_dict.parquet')

Saving the dataset (0/1 shards):   0%|          | 0/34 [00:00<?, ? examples/s]

In [10]:
ds = Dataset.load_from_disk('./vector_search_applications/practice_data/ragas_dict.parquet/')
ds

Dataset({
    features: ['question', 'answer', 'contexts', 'ground_truths'],
    num_rows: 34
})

In [16]:
rag_metrics = rag_eval(ds, metrics)

evaluating with [context_precision]


  0%|                                                                        | 0/3 [00:00<?, ?it/s]


AuthenticationError: Incorrect API key provided: sk-wJ4r3***************************************I17Y. You can find your API key at https://platform.openai.com/account/api-keys.