In [3]:
from llama_index.finetuning import SentenceTransformersFinetuneEngine, EmbeddingQAFinetuneDataset, generate_qa_embedding_pairs

In [174]:
import os
import json
import openai
from llama_index import SimpleDirectoryReader
from llama_index.node_parser import SimpleNodeParser
from llama_index.schema import MetadataMode
from dotenv import load_dotenv
load_dotenv('./.env', override=True)
from preprocessing import FileIO
from retrieval_evaluation import generate_dataset
from typing import List, Dict, Tuple, Union, Callable

In [20]:
openai.api_key = os.environ['OPENAI_API_KEY']

In [None]:
# Inspect the dataset class (constructor signature and fields) before building one by hand.
help(EmbeddingQAFinetuneDataset)

In [6]:
TRAIN_FILES = ["./practice_data/10k/lyft_2021.pdf"]
VAL_FILES = ["./practice_data/10k/uber_2021.pdf"]

# TRAIN_CORPUS_FPATH = "./data/train_corpus.json"
# VAL_CORPUS_FPATH = "./data/val_corpus.json"

In [7]:
def load_corpus(files, verbose=False):
    """Read the given files and split the resulting documents into nodes.

    Args:
        files: Paths passed to ``SimpleDirectoryReader`` as ``input_files``.
        verbose: When truthy, print a message after each stage; the value is
            also forwarded to the node parser as ``show_progress``.

    Returns:
        The nodes returned by ``SimpleNodeParser.get_nodes_from_documents``.
    """
    if verbose:
        print(f"Loading files {files}")

    # Stage 1: read every input file into documents.
    documents = SimpleDirectoryReader(input_files=files).load_data()
    if verbose:
        print(f"Loaded {len(documents)} docs")

    # Stage 2: split the documents into nodes with the default parser settings.
    node_parser = SimpleNodeParser.from_defaults()
    parsed_nodes = node_parser.get_nodes_from_documents(documents, show_progress=verbose)
    if verbose:
        print(f"Parsed {len(parsed_nodes)} nodes")

    return parsed_nodes


In [8]:
train_nodes = load_corpus(TRAIN_FILES, verbose=True)
valid_nodes = load_corpus(VAL_FILES, verbose=True)

Loading files ['./practice_data/10k/lyft_2021.pdf']
Loaded 238 docs


Parsing documents into nodes:   0%|          | 0/238 [00:00<?, ?it/s]

Parsed 352 nodes
Loading files ['./practice_data/10k/uber_2021.pdf']
Loaded 307 docs


Parsing documents into nodes:   0%|          | 0/307 [00:00<?, ?it/s]

Parsed 427 nodes


In [21]:
val_dataset = generate_qa_embedding_pairs(valid_nodes[:10], num_questions_per_chunk=2)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:22<00:00,  2.21s/it]


In [22]:
type(val_dataset)

llama_index.finetuning.embeddings.common.EmbeddingQAFinetuneDataset

In [67]:
val_dataset.queries

{'cd620b3f-c1e0-4f60-8d11-95fd77ec852e': 'According to the context information, what is the exact name of the registrant as specified in its charter?',
 'fb7e8466-ea63-448f-9ec1-7497a66f8718': 'Based on the context information, has the registrant submitted every Interactive Data File required to be submitted pursuant to Rule 405 of Regulation S-T during the preceding 12 months?',
 'd4f29374-2df6-41f0-bdd9-980485e53f6a': 'According to the context information, what is the aggregate market value of the voting and non-voting common equity held by non-affiliates of the registrant as of June 30, 2021?',
 '4d9c3b9a-d024-4a72-a1d7-0cb9d98a40e2': "Based on the context information, how many shares of the registrant's common stock were outstanding as of February 22, 2022?",
 '06a55fd3-0deb-46b5-8cc2-72e0d8d7f389': "In the context of Uber Technologies, Inc.'s financial statements, what is the purpose of Item 9A - Controls and Procedures? How does it relate to the company's overall financial disclo

In [68]:
val_dataset.corpus

{'ec4b14d1-1f05-4054-a62e-bd969e1bb12b': 'UNITED STATESSECURITIES AND EXCHANGE COMMISSION\nWashington, D.C. 20549\n____________________________________________ \nFORM\n 10-K____________________________________________ \n(Mark One)\n☒\n ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934For the fiscal year ended\n December 31, 2021OR\n☐\n TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934For the transition period from_____ to _____            \nCommission File Number: 001-38902\n____________________________________________ \nUBER TECHNOLOGIES, INC.\n(Exact name of registrant as specif\nied in its charter)____________________________________________ \nDelaware\n45-2647441 (State or other jurisdiction of inco\nrporation or organization)(I.R.S. Employer Identification No.) 1515 3rd Street\nSan Francisco, California 94158\n(Address of principal executive offic\nes, including zip code)(415) 612-8582\n(Registrant’s telepho

In [35]:
# Look for a node whose id equals one of the keys of val_dataset.queries.
# The recorded run printed nothing, i.e. no node in valid_nodes carried this id.
target_id = 'cd620b3f-c1e0-4f60-8d11-95fd77ec852e'
for node in valid_nodes:
    if node.dict()['id_'] != target_id:
        continue
    print(node)

In [162]:
val_dataset.save_json('./delme_train.json')

In [48]:
# Same lookup, this time with one of the keys of val_dataset.corpus;
# the matching node is printed as a plain dict.
target_id = 'ec4b14d1-1f05-4054-a62e-bd969e1bb12b'
for node in valid_nodes:
    node_fields = node.dict()
    if node_fields['id_'] != target_id:
        continue
    print(node_fields)

{'id_': 'ec4b14d1-1f05-4054-a62e-bd969e1bb12b', 'embedding': None, 'metadata': {'page_label': '1', 'file_name': 'uber_2021.pdf'}, 'excluded_embed_metadata_keys': [], 'excluded_llm_metadata_keys': [], 'relationships': {<NodeRelationship.SOURCE: '1'>: {'node_id': 'f612fc93-837b-46f2-b63a-36a80afadc95', 'node_type': None, 'metadata': {'page_label': '1', 'file_name': 'uber_2021.pdf'}, 'hash': '8236c7b95b366f678fd506883779c1f7b8349b0d0b466a9da6694959abfab3c6'}}, 'hash': '43981224f0807fa7ad41a8d41ce55a45eea6f8e905ee30fc003295256829d36f', 'text': 'UNITED STATESSECURITIES AND EXCHANGE COMMISSION\nWashington, D.C. 20549\n____________________________________________ \nFORM\n 10-K____________________________________________ \n(Mark One)\n☒\n ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934For the fiscal year ended\n December 31, 2021OR\n☐\n TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934For the transition period from____

In [54]:
data = FileIO().load_parquet('./practice_data/impact_theory_minilm_196.parquet')

Shape of data: (37007, 17)
Memory Usage: 4.55+ MB


In [57]:
# Strip the stored embedding from every record; the cells below read only
# the 'doc_id' and 'content' fields.
# pop() with a default keeps this cell re-runnable: `del` raises KeyError
# once the key has already been removed.
for d in data:
    d.pop('content_embedding', None)

In [59]:
corpus = {d['doc_id']:d['content'] for d in data}

In [88]:
val_dataset.queries

{'cd620b3f-c1e0-4f60-8d11-95fd77ec852e': 'According to the context information, what is the exact name of the registrant as specified in its charter?',
 'fb7e8466-ea63-448f-9ec1-7497a66f8718': 'Based on the context information, has the registrant submitted every Interactive Data File required to be submitted pursuant to Rule 405 of Regulation S-T during the preceding 12 months?',
 'd4f29374-2df6-41f0-bdd9-980485e53f6a': 'According to the context information, what is the aggregate market value of the voting and non-voting common equity held by non-affiliates of the registrant as of June 30, 2021?',
 '4d9c3b9a-d024-4a72-a1d7-0cb9d98a40e2': "Based on the context information, how many shares of the registrant's common stock were outstanding as of February 22, 2022?",
 '06a55fd3-0deb-46b5-8cc2-72e0d8d7f389': "In the context of Uber Technologies, Inc.'s financial statements, what is the purpose of Item 9A - Controls and Procedures? How does it relate to the company's overall financial disclo

In [92]:
dataset = {'queries': {}, 'corpus': corpus, 'relevant_docs': {}}

In [70]:
import uuid

In [71]:
random_uuid = uuid.uuid4()

In [85]:
random_uuid.hex, random_uuid.bytes

('786bc78b2ee049eda34243bbb365bc3b',
 b'xk\xc7\x8b.\xe0I\xed\xa3BC\xbb\xb3e\xbc;')

In [86]:
with open('./practice_data/100_questions_2023-10-24:19:42:33.json') as f:
    questions = json.load(f)

In [87]:
questions

{'LCHPSo79rB4_39': ['What is the hypothesis behind founding Impact Theory?',
  'What is the leading cause of death among young men?'],
 'Dzlg17y0IMM_100': ['What is one of the powers of modern medicine?',
  'What kind of patients does the doctor often see?'],
 'J8Gg9twRyJ4_82': ['How does the podcast cater to a diverse range of interests?',
  'What kind of topics are covered in the podcast?'],
 'rKByaM5asU8_106': ['What is the moral composition of society based on acquiring wealth?',
  'Why should one be ashamed of taking value from someone else?'],
 '0-kvH8Zv8Bs_13': ['How does setting low goals initially affect performance in the high jump?',
  'What comparison does the speaker make between setting goals and the high jump?'],
 'd2pFo5C5KwE_111': ['Why did people come to the United States in the past?',
  "What was the religious background of the speaker's family?"],
 'J8Gg9twRyJ4_27': ['What was the motivation for the speaker to focus on a specific thing?',
  'Is the speaker contempl

In [97]:
# First attempt at filling relevant_docs: one fresh random id per question,
# each mapped to a one-element list holding the question's doc_id.
# NOTE(review): the generated id is not stored anywhere else, so no matching
# entry is written to dataset['queries'] (it is still empty when displayed
# further down). These ids stay in dataset['relevant_docs'] unless that dict
# is rebuilt -- TODO confirm whether they should be kept.
for k, value in questions.items():
    for q in value:
        dataset['relevant_docs'][uuid.uuid4().hex] = [k]

In [102]:
dataset['relevant_docs']

{'66bf9d0acc1546adab8478b20419b46f': ['LCHPSo79rB4_39'],
 '34a5b6460e7b44629749ad3acb35d7f4': ['LCHPSo79rB4_39'],
 '6d673831620944ffaba0bc99497282ed': ['Dzlg17y0IMM_100'],
 '643021e2bbb74198aad23976355ea46e': ['Dzlg17y0IMM_100'],
 'fbb4dae8610242aba391fd047c7de165': ['J8Gg9twRyJ4_82'],
 'd688c1b269f14bfaa186cac3dcaccb8e': ['J8Gg9twRyJ4_82'],
 'ec26d3d6b03d44b7a86fc33676d2c818': ['rKByaM5asU8_106'],
 'b17142e4f6cb4ad2a5b63e7c0e98a6fa': ['rKByaM5asU8_106'],
 '2c515084c12a4846b477976fadd5fc7b': ['0-kvH8Zv8Bs_13'],
 'ab81e73bacac4d87a49d748f109a2043': ['0-kvH8Zv8Bs_13'],
 '329554db42dc4f7aadb6ea6d15dfe504': ['d2pFo5C5KwE_111'],
 '70d4e7fc0bde4600b4bff25de1116844': ['d2pFo5C5KwE_111'],
 'e01fad57a6cb4f998316176060323a3e': ['J8Gg9twRyJ4_27'],
 '304b4cb15b08410c9df95fa59c75246b': ['J8Gg9twRyJ4_27'],
 '2e59316a90754fce954eaa8a8ccf503a': ['Q0tHddpViWM_13'],
 '73fd7eeea5374bcfbf84f1fc59c96978': ['Q0tHddpViWM_13'],
 '14af812055c54bedab27d110aa5a978c': ['SuZ8lEHtDI8_78'],
 'eb2533f9d2bd4c7cb9001f9

In [104]:
dataset['queries']

{}

In [114]:
# for hash_id in dataset['relevant_docs'].keys():
#     doc_id = dataset['relevant_docs'][hash_id]
#     qs = questions[doc_id[0]]
#     print(qs)

In [115]:
dataset['relevant_docs']

{'66bf9d0acc1546adab8478b20419b46f': ['LCHPSo79rB4_39'],
 '34a5b6460e7b44629749ad3acb35d7f4': ['LCHPSo79rB4_39'],
 '6d673831620944ffaba0bc99497282ed': ['Dzlg17y0IMM_100'],
 '643021e2bbb74198aad23976355ea46e': ['Dzlg17y0IMM_100'],
 'fbb4dae8610242aba391fd047c7de165': ['J8Gg9twRyJ4_82'],
 'd688c1b269f14bfaa186cac3dcaccb8e': ['J8Gg9twRyJ4_82'],
 'ec26d3d6b03d44b7a86fc33676d2c818': ['rKByaM5asU8_106'],
 'b17142e4f6cb4ad2a5b63e7c0e98a6fa': ['rKByaM5asU8_106'],
 '2c515084c12a4846b477976fadd5fc7b': ['0-kvH8Zv8Bs_13'],
 'ab81e73bacac4d87a49d748f109a2043': ['0-kvH8Zv8Bs_13'],
 '329554db42dc4f7aadb6ea6d15dfe504': ['d2pFo5C5KwE_111'],
 '70d4e7fc0bde4600b4bff25de1116844': ['d2pFo5C5KwE_111'],
 'e01fad57a6cb4f998316176060323a3e': ['J8Gg9twRyJ4_27'],
 '304b4cb15b08410c9df95fa59c75246b': ['J8Gg9twRyJ4_27'],
 '2e59316a90754fce954eaa8a8ccf503a': ['Q0tHddpViWM_13'],
 '73fd7eeea5374bcfbf84f1fc59c96978': ['Q0tHddpViWM_13'],
 '14af812055c54bedab27d110aa5a978c': ['SuZ8lEHtDI8_78'],
 'eb2533f9d2bd4c7cb9001f9

In [116]:
questions

{'LCHPSo79rB4_39': ['What is the hypothesis behind founding Impact Theory?',
  'What is the leading cause of death among young men?'],
 'Dzlg17y0IMM_100': ['What is one of the powers of modern medicine?',
  'What kind of patients does the doctor often see?'],
 'J8Gg9twRyJ4_82': ['How does the podcast cater to a diverse range of interests?',
  'What kind of topics are covered in the podcast?'],
 'rKByaM5asU8_106': ['What is the moral composition of society based on acquiring wealth?',
  'Why should one be ashamed of taking value from someone else?'],
 '0-kvH8Zv8Bs_13': ['How does setting low goals initially affect performance in the high jump?',
  'What comparison does the speaker make between setting goals and the high jump?'],
 'd2pFo5C5KwE_111': ['Why did people come to the United States in the past?',
  "What was the religious background of the speaker's family?"],
 'J8Gg9twRyJ4_27': ['What was the motivation for the speaker to focus on a specific thing?',
  'Is the speaker contempl

In [117]:
def hash_func():
    """Return a newly generated random UUID as a 32-character hex string."""
    new_id = uuid.uuid4()
    return new_id.hex

## 1.) Create a temporary list of {query_id: query} mappings, one per doc_id

In [123]:
# One single-key dict per document: {doc_id: {query_id: query, ...}},
# with a fresh id drawn from hash_func() for every question.
temp_list = [
    {doc_id: {hash_func(): query for query in doc_queries}}
    for doc_id, doc_queries in questions.items()
]

## 2.) Build relevant_docs key first

In [137]:
# Map every query id in temp_list to a one-element list with the doc_id its
# question belongs to.
# A fresh dict is assigned instead of updating in place, so ids written by
# earlier cells or earlier runs (which have no entry in dataset['queries'])
# are dropped and the cell gives the same result every time it is run.
dataset['relevant_docs'] = {
    query_id: [doc_id]
    for entry in temp_list
    for doc_id, id_to_query in entry.items()
    for query_id in id_to_query
}

In [144]:
dataset['queries']

{'7551e8cebacb48e39bfe2a3712611c69': {'7551e8cebacb48e39bfe2a3712611c69': 'What is the hypothesis behind founding Impact Theory?',
  'b19aa82c9dc84be8915b5d1175f89520': 'What is the leading cause of death among young men?'},
 'b19aa82c9dc84be8915b5d1175f89520': {'7551e8cebacb48e39bfe2a3712611c69': 'What is the hypothesis behind founding Impact Theory?',
  'b19aa82c9dc84be8915b5d1175f89520': 'What is the leading cause of death among young men?'},
 'ecaab4e64d3d4f95ac130ff9e9448d0d': {'ecaab4e64d3d4f95ac130ff9e9448d0d': 'What is one of the powers of modern medicine?',
  '6671e362c0e04bdeab65a482c86f9aae': 'What kind of patients does the doctor often see?'},
 '6671e362c0e04bdeab65a482c86f9aae': {'ecaab4e64d3d4f95ac130ff9e9448d0d': 'What is one of the powers of modern medicine?',
  '6671e362c0e04bdeab65a482c86f9aae': 'What kind of patients does the doctor often see?'},
 '44c0f4550c454097a3eda3a54c98546b': {'44c0f4550c454097a3eda3a54c98546b': 'How does the podcast cater to a diverse range o

## 3.) Build queries key next

In [151]:
# Flatten temp_list into a single {query_id: query_text} mapping.
# A fresh dict is assigned instead of updating in place, so values left over
# from earlier runs of this cell cannot survive and re-running is safe.
dataset['queries'] = {
    query_id: query
    for entry in temp_list
    for id_to_query in entry.values()
    for query_id, query in id_to_query.items()
}

In [156]:
dataset['queries']['dc6f4c7a0e564a688fdaf880bd227acb']

'How does the person feel about not being productive in bed?'

In [153]:
dataset['relevant_docs']['dc6f4c7a0e564a688fdaf880bd227acb']

['Tigt75AcLLA_106']

In [155]:
dataset['corpus']['Tigt75AcLLA_106']

"I can't allow myself to. Immediately as soon as I wake up. Well, if I'm doing something in bed, I suppose I should change it to working. That I have 10 minutes to be productive is probably the right way to think about it. And because there are times where there's something that I can do in bed, like this morning, when I could start researching you, the second I wake up, then I would still call that a win. But if I don't do the thing that I said I was going to do, then I force myself to acknowledge to myself with no, I don't let myself run. So I don't let myself be distracted. I'm just like, you said you were going to do something. You did not do this thing. And therefore you should not feel good about the behaviors that you enacted in this moment."

In [157]:
llama_dataset = EmbeddingQAFinetuneDataset(queries=dataset['queries'], corpus=dataset['corpus'], relevant_docs=dataset['relevant_docs'])

In [165]:
len(llama_dataset.queries)

200

In [167]:
len(data)

37007

In [169]:
# generate_dataset(data, dir_path='./practice_data/', num_questions=25)

In [172]:
with open('./practice_data/25_questions_2023-11-07:02:50:58.json') as f:
    valid_questions = json.load(f)

In [185]:
def convert_data_to_llama_format(corpus: List[dict],
                                 questions: Dict[str, List],
                                 hash_fx: Callable,
                                 doc_id_field: str='doc_id',
                                 content_field: str='content',
                                 ) -> EmbeddingQAFinetuneDataset:
    """Assemble an ``EmbeddingQAFinetuneDataset`` from records and per-document questions.

    Args:
        corpus: Records (dicts); each must contain ``doc_id_field`` and
            ``content_field``.
        questions: Mapping of doc_id to the list of questions written for
            that document.
        hash_fx: Zero-argument callable invoked once per question to produce
            its query id. If it returns the same id twice, the later question
            overwrites the earlier one.
        doc_id_field: Key of the document id inside each corpus record.
        content_field: Key of the document text inside each corpus record.

    Returns:
        A dataset whose ``corpus`` maps doc_id to text, whose ``queries`` maps
        query id to question text, and whose ``relevant_docs`` maps query id
        to a one-element list containing the question's doc_id.

    Raises:
        KeyError: If a corpus record lacks ``doc_id_field`` or ``content_field``.
    """
    # Use a separate name so the `corpus` argument is not shadowed.
    corpus_by_id = {record[doc_id_field]: record[content_field] for record in corpus}

    queries = {}
    relevant_docs = {}
    # Single pass: each question gets one id, recorded in both mappings so the
    # two always share exactly the same keys.
    for doc_id, doc_questions in questions.items():
        for question in doc_questions:
            query_id = hash_fx()
            queries[query_id] = question
            relevant_docs[query_id] = [doc_id]

    return EmbeddingQAFinetuneDataset(queries=queries, corpus=corpus_by_id, relevant_docs=relevant_docs)

In [186]:
valid = convert_data_to_llama_format(data, valid_questions, hash_func)

### Launch Fine Tuning

In [189]:
from llama_index.finetuning import SentenceTransformersFinetuneEngine

In [190]:
finetune_engine = SentenceTransformersFinetuneEngine(dataset=llama_dataset, model_id='sentence-transformers/all-MiniLM-L6-v2', model_output_path='./test_model', val_dataset=valid)

In [195]:
finetune_engine.finetune()

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/20 [00:00<?, ?it/s]

Iteration:   0%|          | 0/20 [00:00<?, ?it/s]

In [196]:
ft_embed_model = finetune_engine.get_finetuned_model()

In [198]:
ft_embed_model.model_name

'./test_model'