In [14]:
%load_ext autoreload 
%autoreload 2
    
import json
import os
import re
from preprocessing import FileIO
from typing import List, Dict, Tuple, Union
from llama_index import SimpleDirectoryReader
from llama_index.node_parser import NodeParser, SimpleNodeParser
from llama_index import Document
from llama_index.schema import TextNode
from tqdm.notebook import tqdm
from openai_interface import GPT_Turbo
import uuid

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# 1. Data Ingest and Parsing

In [2]:
data_path = './data/impact_theory_data.json'
parquet_path = './data/impact_theory_minilm_256.parquet'

In [3]:
# Load the raw records from the JSON file at `data_path`.
# NOTE(review): `data` is not referenced again in the visible cells (the later
# cells operate on `parquet`) — confirm whether this load is still needed.
with open(data_path) as f:
    data = json.load(f)

In [4]:
parquet = FileIO().load_parquet(parquet_path)

Shape of data: (26448, 12)
Memory Usage: 2.42+ MB


In [16]:
def create_documents(data: List[dict]) -> List[Document]:
    '''
    Builds one llama_index Document per record in `data`.

    For every record the 'content' value becomes the Document text,
    the 'content_embedding' value becomes its embedding, and all
    remaining key/value pairs are stored as metadata. Every metadata
    key is listed in excluded_embed_metadata_keys.
    '''
    payload_keys = ('content', 'content_embedding')
    documents = []
    for record in tqdm(data):
        embedding = record['content_embedding']
        text = record['content']
        metadata = {
            key: value for key, value in record.items() if key not in payload_keys
        }
        documents.append(
            Document(
                embedding=embedding,
                metadata=metadata,
                text=text,
                excluded_embed_metadata_keys=list(metadata.keys()),
            )
        )
    return documents

In [27]:
def create_text_nodes(data: List[dict]) -> List[TextNode]:
    '''
    Given a dataset of a list of dictionaries converts each dict
    to a llama_index TextNode and returns a List of TextNodes.

    For each dict, 'content' becomes the node text, 'content_embedding'
    becomes the node embedding, and all remaining key/value pairs are
    stored as metadata. Every metadata key is listed in
    excluded_embed_metadata_keys.
    '''
    # Loop-invariant: fields carried on the node itself rather than in metadata.
    unwanted_fields = ('content', 'content_embedding')
    text_nodes = []
    for d in tqdm(data):
        emb = d['content_embedding']
        content = d['content']
        meta = {k: v for k, v in d.items() if k not in unwanted_fields}
        text_node = TextNode(embedding=emb, metadata=meta, text=content,
                             excluded_embed_metadata_keys=list(meta.keys()))
        text_nodes.append(text_node)
    return text_nodes

In [17]:
docs = create_documents(parquet)

  0%|          | 0/26448 [00:00<?, ?it/s]

In [28]:
nodes = create_text_nodes(parquet)

  0%|          | 0/26448 [00:00<?, ?it/s]

In [72]:
# NOTE(review): absolute, machine-specific path. It is also captured as the
# default `output_dir` argument of create_individual_json_files at the moment
# that function is defined, so reassigning this variable later does not change
# that default unless the function cell is re-run.
output_dir = '/home/elastic/notebooks/vsa_practice/practice_data/individual_jsons_vectors/'

In [73]:
def create_individual_json_files(data: List[dict], output_dir: str=output_dir) -> None:
    '''
    Given a dataset consisting of a list of dicts i.e. one dict
    per podcast episode, function will save each episode (dict)
    to disk in json format.

    Files are written into `output_dir` (created if it does not exist)
    and named '<video_id>_Episode_<i>.json', where i is the 1-based
    position of the episode in `data`. An episode that cannot be
    written is reported and skipped so the remaining episodes are
    still saved. The final message reports how many files were
    actually written.

    Raises:
        OSError: if `output_dir` cannot be created.
    '''
    os.makedirs(output_dir, exist_ok=True)

    saved = 0
    for i, d in enumerate(data, 1):
        try:
            video_id = d['video_id']
            filename = f'{video_id}_Episode_{i}.json'
            path = os.path.join(output_dir, filename)
            with open(path, 'w') as f:
                json.dump(d, f)
        except Exception as e:
            # Report the actual error instance (not the Exception class) and keep going.
            print(f'Failed to save episode {i}: {e!r}')
            continue
        saved += 1

    # Count successes explicitly: the loop index is unbound for empty input and
    # over-reports when some episodes failed.
    print(f'Completed saving {saved} json files')
        

In [74]:
# create_individual_json_files(parquet)

# 2. Question Answer Dataset Generation

In [29]:
from llama_index.finetuning import (
    generate_qa_embedding_pairs,
    EmbeddingQAFinetuneDataset,
)
from llama_index.llms import OpenAI
import openai
import random
from dotenv import load_dotenv
load_dotenv('.env', override=True)

True

In [30]:
openai.api_key = os.environ["OPENAI_API_KEY"]

In [36]:
from copy import deepcopy
def train_val_split(n_train_questions: int, 
                    n_val_questions: int, 
                    nodes: List[TextNode], 
                    n_questions_per_chunk: int=2,
                    seed: Union[int, None]=None) -> Tuple[List[TextNode], List[TextNode]]:
    '''
    Randomly selects disjoint training and validation nodes.

    The number of nodes in each split is the requested number of
    questions integer-divided by `n_questions_per_chunk`. If `nodes`
    holds fewer items than requested, the splits are simply shorter.

    Args:
        n_train_questions: target number of training questions.
        n_val_questions: target number of validation questions.
        nodes: pool of nodes to sample from; it is not modified.
        n_questions_per_chunk: questions expected per node.
        seed: optional seed for a reproducible split. When None the
            module-level `random` state is used.

    Returns:
        (train_nodes, valid_nodes) as deep copies of the selected nodes.

    Raises:
        ValueError: if `n_questions_per_chunk` < 1 or a question count
            is negative.
    '''
    if n_questions_per_chunk < 1:
        raise ValueError('n_questions_per_chunk must be a positive integer')
    if n_train_questions < 0 or n_val_questions < 0:
        raise ValueError('question counts must be non-negative')

    train_count = n_train_questions // n_questions_per_chunk
    valid_count = n_val_questions // n_questions_per_chunk

    # Shuffle a shallow copy: the caller's list keeps its order and we avoid
    # deep-copying the entire pool just to pick a small subset.
    shuffled = list(nodes)
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(shuffled)

    # Deep-copy only the selected nodes so the returned splits are independent
    # of the originals.
    selected = deepcopy(shuffled[:train_count + valid_count])
    train_nodes = selected[:train_count]
    valid_nodes = selected[train_count:]
    print(f'Length Training Nodes: {len(train_nodes)}')
    print(f'Length Validation Nodes: {len(valid_nodes)}')
    return train_nodes, valid_nodes

In [35]:
num_train_questions = 250
num_valid_questions = 100

In [37]:
# Use the question-count constants defined in the previous cell instead of
# repeating their literal values here.
train_set, valid_set = train_val_split(num_train_questions, num_valid_questions, nodes)

Length Training Nodes: 125
Length Validation Nodes: 50


In [42]:
llm = OpenAI(model='gpt-3.5-turbo-0613')

### 250 Questions Experiment

In [None]:
# train_dataset = generate_qa_embedding_pairs(train_nodes, llm=llm, num_questions_per_chunk=1)
# val_dataset = generate_qa_embedding_pairs(val_nodes, num_questions_per_chunk=1)

# train_dataset.save_json("train_dataset.json")
# val_dataset.save_json("val_dataset.json")

# train_dataset = EmbeddingQAFinetuneDataset.from_json("train_dataset.json")
# val_dataset = EmbeddingQAFinetuneDataset.from_json("val_dataset.json")

In [43]:
from llama_index.schema import MetadataMode, TextNode

In [44]:
# Prompt template for question generation. Placeholders: {summary}, {guest},
# {transcript}, {num_questions_per_chunk}. A trailing backslash joins a line
# with the next one, so nothing may follow it on the same line.
qa_generation_prompt = '''
Show summary and show guest are below.

---------------------
Summary: {summary}
---------------------
Guest: {guest}
---------------------
Given the show Summary and Guest of the show as context \
use the following randomly selected transcript section \
of the show and not prior knowledge, generate questions that can \
be answered by the transcript section: 

---------------------
{transcript}
---------------------

Your task is to create {num_questions_per_chunk} questions that can \
only be answered given the previous context and transcript details. \
When possible try to use questions that start with How or Why.  
'''

In [47]:
train_set[0].metadata['doc_id']

'PeK9EeKNXDM_11'

In [48]:
def generate_qa_embedding_pairs(
    nodes: List[TextNode],
    llm: GPT_Turbo,
    qa_generate_prompt_tmpl: str,
    num_questions_per_chunk: int = 2,
) -> EmbeddingQAFinetuneDataset:
    """Generate question / relevant-chunk pairs for a set of nodes.

    NOTE: this definition shadows the `generate_qa_embedding_pairs` imported
    from llama_index.finetuning earlier in the notebook.

    For every node the prompt template is filled with the node's 'summary'
    and 'guest' metadata, its text, and `num_questions_per_chunk`, then sent
    to `llm.get_completion_from_messages`. Each non-empty line of the response
    (with a leading "1." / "1)" style enumerator removed) is stored as one
    question under a fresh UUID and linked to the node's 'doc_id'.

    Args:
        nodes: nodes whose metadata contains 'doc_id', 'summary' and 'guest'.
        llm: client exposing `get_completion_from_messages`.
        qa_generate_prompt_tmpl: template with {summary}, {guest},
            {transcript} and {num_questions_per_chunk} placeholders.
        num_questions_per_chunk: number of questions requested per node.

    Returns:
        EmbeddingQAFinetuneDataset with `queries` (question id -> question),
        `corpus` (doc_id -> text for ALL input nodes) and `relevant_docs`
        (question id -> [doc_id]).

    A node whose LLM call raises is reported and skipped; it stays in
    `corpus` but contributes no questions.
    """
    queries = {}
    relevant_docs = {}
    corpus = {node.metadata['doc_id']: node.get_text() for node in nodes}
    for node in tqdm(nodes):
        summary = node.metadata['summary']
        guest = node.metadata['guest']
        transcript = node.get_text()
        node_id = node.metadata['doc_id']
        query = qa_generate_prompt_tmpl.format(summary=summary,
                                               guest=guest,
                                               transcript=transcript,
                                               num_questions_per_chunk=num_questions_per_chunk)
        try:
            response = llm.get_completion_from_messages(prompt=query, temperature=0.1, max_tokens=100)
        except Exception as e:
            # Surface the real error and the affected node instead of printing
            # the Exception class itself.
            print(f'Question generation failed for {node_id}: {e!r}')
            continue

        # One candidate question per response line; drop a leading enumerator.
        result = str(response).strip().split("\n")
        questions = [
            re.sub(r"^\d+[\).\s]", "", question).strip() for question in result
        ]
        questions = [question for question in questions if len(question) > 0]

        for question in questions:
            question_id = str(uuid.uuid4())
            queries[question_id] = question
            relevant_docs[question_id] = [node_id]

    # construct dataset
    return EmbeddingQAFinetuneDataset(
        queries=queries, corpus=corpus, relevant_docs=relevant_docs
    )

In [49]:
gpt = GPT_Turbo()

In [53]:
%%time
train_qa = generate_qa_embedding_pairs(train_set, gpt, qa_generation_prompt)

  0%|          | 0/125 [00:00<?, ?it/s]

CPU times: user 618 ms, sys: 58.1 ms, total: 676 ms
Wall time: 2min 41s


In [58]:
train_qa.save_json('training_dataset.json')

In [59]:
valid_qa = generate_qa_embedding_pairs(valid_set, gpt , qa_generation_prompt)

  0%|          | 0/50 [00:00<?, ?it/s]

In [60]:
valid_qa.save_json('validation_dataset.json')

# 3. Fine Tune Embedding Model

In [61]:
from llama_index.finetuning import SentenceTransformersFinetuneEngine
model_path = 'sentence-transformers/all-MiniLM-L6-v2'

In [63]:
# NOTE(review): `model_path` defined in the previous cell is not passed here.
# Presumably `model_id` names the base model to fine-tune — confirm that
# "fine_tuned_minilm" (rather than `model_path`) is the intended starting point.
finetune_engine = SentenceTransformersFinetuneEngine(
    train_qa,
    batch_size=32,
    model_id="fine_tuned_minilm",
    model_output_path="new_ft_model",  # later cells load the model from './new_ft_model/'
    val_dataset=valid_qa,
    epochs=15
)

In [64]:
%%time
finetune_engine.finetune()

Epoch:   0%|          | 0/15 [00:00<?, ?it/s]

Iteration:   0%|          | 0/8 [00:00<?, ?it/s]

Iteration:   0%|          | 0/8 [00:00<?, ?it/s]

Iteration:   0%|          | 0/8 [00:00<?, ?it/s]

Iteration:   0%|          | 0/8 [00:00<?, ?it/s]

Iteration:   0%|          | 0/8 [00:00<?, ?it/s]

Iteration:   0%|          | 0/8 [00:00<?, ?it/s]

Iteration:   0%|          | 0/8 [00:00<?, ?it/s]

Iteration:   0%|          | 0/8 [00:00<?, ?it/s]

Iteration:   0%|          | 0/8 [00:00<?, ?it/s]

Iteration:   0%|          | 0/8 [00:00<?, ?it/s]

Iteration:   0%|          | 0/8 [00:00<?, ?it/s]

Iteration:   0%|          | 0/8 [00:00<?, ?it/s]

Iteration:   0%|          | 0/8 [00:00<?, ?it/s]

Iteration:   0%|          | 0/8 [00:00<?, ?it/s]

Iteration:   0%|          | 0/8 [00:00<?, ?it/s]

CPU times: user 20 s, sys: 1.1 s, total: 21.1 s
Wall time: 18.3 s


In [188]:
embed_model = finetune_engine.get_finetuned_model()

In [196]:
finetune_engine.

15

In [66]:
from sentence_transformers import SentenceTransformer

In [67]:
ft_model = SentenceTransformer('./new_ft_model/')

# 4. Evaluate Model on Dataset

In [74]:
testing = [TextNode(id_=node.metadata['doc_id'], text=node.get_text()) for node in tqdm(nodes)]

  0%|          | 0/26448 [00:00<?, ?it/s]

In [75]:
testing[0]

TextNode(id_='nXJBccSwtB8_0', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='18b2e122be4b09e3d0cc93aa4f09b594b0c2e7343d3de626575cf49e906de4b7', text="You said these are dangerous times. The world order is shifting before our eyes. We also both know that with hyper disruptive technologies like AI on the horizon, a good outcome is not guaranteed. Why do you think big tech will become the third superpower and what are the dangers and opportunities if it does? Big tech is essentially sovereign over the digital world. The fact that former President Trump was de-platformed from Facebook and from Twitter when he was president, you know, most powerful political figure on the planet. And he's just taken off of those networks and as a consequence, hundreds of millions of people that would be regularly engaging with him in real time suddenly can't see it. That wasn't a decision that was made by a government. It wasn't a decisio

In [95]:
from llama_index.embeddings import OpenAIEmbedding, HuggingFaceEmbedding
from llama_index import ServiceContext, VectorStoreIndex
from llama_index.schema import TextNode
from tqdm.notebook import tqdm
import pandas as pd

# function for hit rate evals
def evaluate(
    dataset: EmbeddingQAFinetuneDataset,
    full_corpus: List[TextNode],
    embed_model: HuggingFaceEmbedding,
    top_k=3,
    verbose=False,
) -> List[Dict]:
    """Compute per-query hit results for `embed_model` over `full_corpus`.

    Every node in `full_corpus` is re-wrapped as a TextNode whose id is the
    node's 'doc_id' metadata value, indexed in a VectorStoreIndex built with
    `embed_model`, and each query in `dataset.queries` is retrieved with
    `similarity_top_k=top_k`.

    Args:
        dataset: supplies `queries` (query id -> text) and `relevant_docs`
            (query id -> list of doc ids); only the first relevant doc id
            of each query is checked.
        full_corpus: nodes to index; each must carry 'doc_id' in its metadata.
        embed_model: embedding model used to build the index.
        top_k: number of nodes retrieved per query.
        verbose: when True, print the overall hit count and hit rate.

    Returns:
        One dict per query with keys "is_hit", "retrieved" (retrieved node
        ids), "expected" (the expected doc id) and "query" (the query id).
    """
    queries = dataset.queries
    relevant_docs = dataset.relevant_docs

    service_context = ServiceContext.from_defaults(embed_model=embed_model)
    nodes = [TextNode(id_=node.metadata['doc_id'], text=node.get_text()) for node in tqdm(full_corpus, 'Text Nodes')]
    index = VectorStoreIndex(nodes, service_context=service_context, show_progress=True)
    retriever = index.as_retriever(similarity_top_k=top_k)

    eval_results = []
    for query_id, query in tqdm(queries.items(), "Submitting Queries"):
        retrieved_nodes = retriever.retrieve(query)
        retrieved_ids = [node.node.node_id for node in retrieved_nodes]
        expected_id = relevant_docs[query_id][0]
        is_hit = expected_id in retrieved_ids  # assume 1 relevant doc

        eval_result = {
            "is_hit": is_hit,
            "retrieved": retrieved_ids,
            "expected": expected_id,
            "query": query_id,
        }
        eval_results.append(eval_result)

    if verbose:
        # Previously `verbose` was accepted but ignored; report a summary.
        n_total = len(eval_results)
        n_hits = sum(result["is_hit"] for result in eval_results)
        hit_rate = n_hits / n_total if n_total else 0.0
        print(f'Hit rate @ top_k={top_k}: {n_hits}/{n_total} ({hit_rate:.2%})')
    return eval_results

In [90]:
minilm = HuggingFaceEmbedding('./new_ft_model/')
minilm

HuggingFaceEmbedding(model_name='./new_ft_model/', embed_batch_size=10, callback_manager=<llama_index.callbacks.base.CallbackManager object at 0x7feb681a5d30>, tokenizer_name='./new_ft_model/', max_length=512, pooling='cls', query_instruction=None, text_instruction=None, cache_folder=None)

In [96]:
eval_results = evaluate(valid_qa, nodes, minilm, top_k=5, verbose=True)

Text Nodes:   0%|          | 0/26448 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/26448 [00:00<?, ?it/s]

Submitting Queries:   0%|          | 0/100 [00:00<?, ?it/s]

In [97]:
len(eval_results)

100

In [98]:
sum([d['is_hit'] for d in eval_results])

51

Bad pipe message: %s [b'`\x14:\x0e+\xf8\xea,\xe6\xea]c\xe8DD7q\x07 \xafT\xbb\x8d\x0f\xa3\xdea\xabl\xf7h\\.t{\xcb\xf5i`\x07\x9bN7<\x02\xa1\x08S\x03Z~\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00\x8f\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00\x1d\x00\x17\x00\x1e\x00\x19\x00\x18\x00#\x00\x00\x00\x16\x00\x00\x00\x17\x00\x00\x00\r\x00\x1e\x00\x1c\x04\x03\x05\x03\x06\x03\x08\x07', b'\x08\t\x08\n\x08\x0b\x08']
Bad pipe message: %s [b'\x05\x08\x06']
Bad pipe message: %s [b'\x05\x01\x06', b'']
Bad pipe message: %s [b'\xdc\x92\x83\xeb\x97\xf7(Oa\x8a\xc8\xa0?\r&\xd28\xf6 \x1c\xfb\t+\xd6\xa4\x9cu\x98\xcd\x13\xbbX\xef\xe8$o\xb2>t\x15\x87\xfez\xcf\xe1~\x04C\xc4\xbeL\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00\x8f\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00\x1d\x00\x17\x00\x1e\x00\x19\x00\x18\x00#\x00\x00\x00\x16\x00\x00\x00\x17\x00\x00\x00\r\x00\x1e\x00\x1c\x04\x03\x05\x03\x0