In [148]:
%load_ext autoreload 
%autoreload 2
    
import json
import os
import re
from preprocessing import FileIO
from typing import List, Dict, Tuple, Union
from llama_index import SimpleDirectoryReader
from llama_index.node_parser import NodeParser, SimpleNodeParser
from llama_index import Document
from tqdm.notebook import tqdm
from openai_interface import GPT_Turbo
import uuid

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# 1. Data Ingest and Parsing

In [32]:
data_path = './data/impact_theory_data.json'
parquet_path = './data/impact_theory_minilm_256.parquet'

In [23]:
with open(data_path) as f:
    data = json.load(f)

In [33]:
parquet = FileIO().load_parquet(parquet_path)

Shape of data: (26448, 12)
Memory Usage: 2.42+ MB


In [68]:
def create_documents(data: List[dict]) -> List[Document]:
    '''
    Given a dataset of a list of dictionaries converts each dict 
    to a llama_index Document and returns a List of Documents.
    '''
    docs = []
    for d in tqdm(data):
        unwanted_fields = ['content', 'content_embedding']
        emb = d['content_embedding']
        content=d['content']
        meta = {k:v for k,v in d.items() if k not in unwanted_fields}
        doc = Document(embedding=emb, metadata=meta, text=content)
        docs.append(doc)
    return docs

In [69]:
unwanted_fields = ['content', 'content_embedding']
# Document(text=parquet[0]['content'], embedding=parquet[0]['content_embedding'], metadata={k:v for k,v in parquet[0].items() if k not in unwanted_fields}).dict()

In [70]:
docs = create_documents(parquet)

  0%|          | 0/26448 [00:00<?, ?it/s]

In [72]:
output_dir = '/home/elastic/notebooks/vsa_practice/practice_data/individual_jsons_vectors/'

In [73]:
def create_individual_json_files(data: List[dict], output_dir: str=output_dir) -> None:
    '''
    Given a dataset consisting of a list of dicts i.e. one dict
    per pdocast episode, function will save each episode (dict)
    to disk in json format.
    '''
    for i, d in enumerate(data, 1):
        try:
            video_id = d['video_id']
            filename = f'{video_id}_Episode_{i}.json'
            path = os.path.join(output_dir, filename)
            with open(path, 'w') as f:
                json.dump(d, f)
        except Exception:
            print(Exception)
            continue
            
    print(f'Completed saving {i} json files')
        

In [74]:
# create_individual_json_files(parquet)

In [None]:
def load_corpus(docs, for_training=False, verbose=False):
    parser = SimpleNodeParser.from_defaults()
    if for_training:
        nodes = parser.get_nodes_from_documents(docs[:90], show_progress=verbose)
    else:
        nodes = parser.get_nodes_from_documents(docs[91:], show_progress=verbose)

    if verbose:
        print(f'Parsed {len(nodes)} nodes')

    return nodes


reader = SimpleDirectoryReader(input_files=SEC_FILE)
docs = reader.load_data()
print(f'Loaded {len(docs)} docs')

train_nodes = load_corpus(docs, for_training=True, verbose=True)
val_nodes = load_corpus(docs, for_training=False, verbose=True)

In [81]:
parser = SimpleNodeParser.from_defaults(chunk_size=800, chunk_overlap=0, include_prev_next_rel=False)

In [82]:
nodes = parser.get_nodes_from_documents(docs, show_progress=True)

Parsing documents into nodes:   0%|          | 0/26448 [00:00<?, ?it/s]

# 2. Question Answer Dataset Generation

In [93]:
from llama_index.finetuning import (
    generate_qa_embedding_pairs,
    EmbeddingQAFinetuneDataset,
)
from llama_index.llms import OpenAI
import openai
import random
from dotenv import load_dotenv
load_dotenv('.env', override=True)

True

In [90]:
openai.api_key = os.environ["OPENAI_API_KEY"]

In [96]:
random.shuffle(nodes)

In [157]:
train_nodes = nodes[:125]
val_nodes = nodes[125:175]
test_nodes = nodes[300:310]

In [158]:
llm = OpenAI(model='gpt-3.5-turbo-0613')

### 250 Questions Experiment

In [None]:
# train_dataset = generate_qa_embedding_pairs(train_nodes, llm=llm, num_questions_per_chunk=1)
# val_dataset = generate_qa_embedding_pairs(val_nodes, num_questions_per_chunk=1)

# train_dataset.save_json("train_dataset.json")
# val_dataset.save_json("val_dataset.json")

# train_dataset = EmbeddingQAFinetuneDataset.from_json("train_dataset.json")
# val_dataset = EmbeddingQAFinetuneDataset.from_json("val_dataset.json")

In [102]:
test = generate_qa_embedding_pairs(test_nodes, llm=llm, num_questions_per_chunk=1)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:45<00:00,  4.51s/it]


In [156]:
# test.dict()['queries']

In [110]:
from llama_index.schema import MetadataMode, TextNode

In [129]:
nodes[0].dict()['metadata']['summary']

'In this episode, Donald Hoffman discusses the concept of cause and effect and its relationship to our perception of reality. He argues that cause and effect is a useful fiction created by evolution, comparing it to the fictional causality in video games. He suggests that our perception of cause and effect is an illusion within the "headset" of space and time, and that outside of this headset, cause and effect may not exist. He also explores the idea of consciousness and its relationship to mathematical structure, referencing Gödel\'s incompleteness theorem to suggest that there is an endless exploration of mathematical structure. Hoffman\'s theory of conscious agents proposes that consciousness is fundamental and that our perception of reality is a construction created by these conscious agents. He also discusses the implications of this theory on the concept of self and the nature of consciousness after death. Throughout the conversation, Hoffman emphasizes the importance of using ma

In [117]:
qa_generation_prompt = '''
Show summary and show guest are below.

---------------------
Summary: {summary}
---------------------
Guest: {guest}
---------------------
Given the show Summary and Guest of the show as context \
use the following randomly selected transcript section \  
of the show and not prior knowledge, generate questions that can \
be answered by the transcript section: 

---------------------
{transcript}
---------------------

Your task is to create {num_questions_per_chunk} questions that can \
only be answered given the previous context and transcript details. \
When possible try to use questions that start with How or Why.  
'''

In [161]:
def generate_qa_embedding_pairs(
    nodes: List[TextNode],
    llm: GPT_Turbo,
    qa_generate_prompt_tmpl: str,
    num_questions_per_chunk: int = 2,
) -> EmbeddingQAFinetuneDataset:
    """Generate examples given a set of nodes."""
   
    queries = {}
    relevant_docs = {}
    corpus = {node.node_id: node.get_text() for node in nodes}
    for node in tqdm(nodes):
        node_dict = node.dict()
        summary = node_dict['metadata']['summary']
        guest = node_dict['metadata']['guest']
        transcript = node_dict['text']
        node_id = node_dict['id_']
        query = qa_generate_prompt_tmpl.format(summary=summary, 
                                               guest=guest,
                                               transcript=transcript,
                                               num_questions_per_chunk=num_questions_per_chunk)
        try:
            response = llm.get_completion_from_messages(prompt=query, temperature=0.1, max_tokens=100)
        except Exception:
            print(Exception)
            continue
        result = str(response).strip().split("\n")
        questions = [
            re.sub(r"^\d+[\).\s]", "", question).strip() for question in result
        ]
        questions = [question for question in questions if len(question) > 0]

        for question in questions:
            question_id = str(uuid.uuid4())
            queries[question_id] = question
            relevant_docs[question_id] = [node_id]

    # construct dataset
    return EmbeddingQAFinetuneDataset(
        queries=queries, corpus=corpus, relevant_docs=relevant_docs
    )

In [162]:
gpt = GPT_Turbo()

In [163]:
%%time
testqa = generate_qa_embedding_pairs(train_nodes, gpt, qa_generation_prompt)

  0%|          | 0/125 [00:00<?, ?it/s]

CPU times: user 716 ms, sys: 30.4 ms, total: 747 ms
Wall time: 3min 3s


In [180]:
trainqa = testqa

In [165]:
testqa.save_json('train_dataset.json')

In [168]:
validqa = generate_qa_embedding_pairs(val_nodes, gpt , qa_generation_prompt)

  0%|          | 0/50 [00:00<?, ?it/s]

In [178]:
validqa.save_json('valid_dataset.json')

# 3. Fine Tune Embedding Model

In [179]:
from llama_index.finetuning import SentenceTransformersFinetuneEngine
model_path = 'sentence-transformers/all-MiniLM-L6-v2'

In [186]:
finetune_engine = SentenceTransformersFinetuneEngine(
    trainqa,
    batch_size=32,
    model_id=model_path,
    model_output_path="fine_tuned_minilm",
    val_dataset=validqa,
    epochs=15
)

In [187]:
%%time
finetune_engine.finetune()

Epoch:   0%|          | 0/15 [00:00<?, ?it/s]

Iteration:   0%|          | 0/8 [00:00<?, ?it/s]

Iteration:   0%|          | 0/8 [00:00<?, ?it/s]

Iteration:   0%|          | 0/8 [00:00<?, ?it/s]

Iteration:   0%|          | 0/8 [00:00<?, ?it/s]

Iteration:   0%|          | 0/8 [00:00<?, ?it/s]

Iteration:   0%|          | 0/8 [00:00<?, ?it/s]

Iteration:   0%|          | 0/8 [00:00<?, ?it/s]

Iteration:   0%|          | 0/8 [00:00<?, ?it/s]

Iteration:   0%|          | 0/8 [00:00<?, ?it/s]

Iteration:   0%|          | 0/8 [00:00<?, ?it/s]

Iteration:   0%|          | 0/8 [00:00<?, ?it/s]

Iteration:   0%|          | 0/8 [00:00<?, ?it/s]

Iteration:   0%|          | 0/8 [00:00<?, ?it/s]

Iteration:   0%|          | 0/8 [00:00<?, ?it/s]

Iteration:   0%|          | 0/8 [00:00<?, ?it/s]

CPU times: user 18.9 s, sys: 1.21 s, total: 20.1 s
Wall time: 17.7 s


In [188]:
embed_model = finetune_engine.get_finetuned_model()

In [196]:
finetune_engine.

15

In [197]:
from sentence_transformers import SentenceTransformer

In [198]:
ft_model = SentenceTransformer('./fine_tuned_minilm/')

# Evaluate Model on Dataset

In [221]:
from llama_index.embeddings import OpenAIEmbedding, HuggingFaceEmbedding
from llama_index import ServiceContext, VectorStoreIndex
from llama_index.schema import TextNode
from tqdm.notebook import tqdm
import pandas as pd

# function for hit rate evals
def evaluate(
    dataset: EmbeddingQAFinetuneDataset,
    embed_model,
    top_k=1,
    verbose=False,
):
    corpus = dataset.corpus
    queries = dataset.queries
    relevant_docs = dataset.relevant_docs

    service_context = ServiceContext.from_defaults(embed_model=embed_model)
    nodes = [TextNode(id_=id_, text=text) for id_, text in corpus.items()]
    index = VectorStoreIndex(nodes, service_context=service_context, show_progress=True)
    retriever = index.as_retriever(similarity_top_k=top_k)

    eval_results = []
    for query_id, query in tqdm(queries.items()):
        retrieved_nodes = retriever.retrieve(query)
        retrieved_ids = [node.node.node_id for node in retrieved_nodes]
        expected_id = relevant_docs[query_id][0]
        is_hit = expected_id in retrieved_ids  # assume 1 relevant doc

        eval_result = {
            "is_hit": is_hit,
            "retrieved": retrieved_ids,
            "expected": expected_id,
            "query": query_id,
        }
        eval_results.append(eval_result)
    return eval_results

In [217]:
minilm = SentenceTransformer(model_path)
minilm = HuggingFaceEmbedding(model_path)
minilm

HuggingFaceEmbedding(model_name='sentence-transformers/all-MiniLM-L6-v2', embed_batch_size=10, callback_manager=<llama_index.callbacks.base.CallbackManager object at 0x7f4e9a31b370>, tokenizer_name='sentence-transformers/all-MiniLM-L6-v2', max_length=512, pooling='cls', query_instruction=None, text_instruction=None, cache_folder=None)

In [222]:
eval_results = evaluate(validqa, minilm, verbose=True)

Generating embeddings:   0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

In [223]:
len(eval_results)

100

In [224]:
sum([d['is_hit'] for d in eval_results])

69

In [225]:
trainqa + validqa

TypeError: unsupported operand type(s) for +: 'EmbeddingQAFinetuneDataset' and 'EmbeddingQAFinetuneDataset'