# Generate Synthetic Dataset with LLM

In this notebook, we generate a synthetic dataset of (query, relevant documents) pairs from a corpus of documents *without labelers* by leveraging an LLM.

### Generate Corpus

In [1]:
# Install the libraries used by the corpus- and query-generation cells below.
# NOTE(review): pypdf is the only unpinned package here — consider pinning it
# like the other two so a re-run resolves the same versions.
!pip install llama-index==0.8.5.post2
!pip install sentence-transformers==2.2.2
!pip install pypdf


Collecting llama-index==0.8.5.post2
  Downloading llama_index-0.8.5.post2-py3-none-any.whl (686 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m686.8/686.8 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tiktoken (from llama-index==0.8.5.post2)
  Downloading tiktoken-0.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dataclasses-json (from llama-index==0.8.5.post2)
  Downloading dataclasses_json-0.6.4-py3-none-any.whl (28 kB)
Collecting langchain<=0.0.266,>=0.0.262 (from llama-index==0.8.5.post2)
  Downloading langchain-0.0.266-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m41.5 MB/s[0m eta [36m0:00:00[0m
Collecting openai>=0.26.4 (from llama-index==0.8.5.post2)
  Downloading openai-1.14.2-py3-none-any.whl (262 kB)
[2K     [90m━

In [67]:
# Pin openai to 0.28.1 (pip's log for this cell shows 1.14.2 being replaced).
# NOTE(review): langchain==0.0.330 lies outside the range llama-index 0.8.5.post2
# declares (>=0.0.262,<=0.0.266) according to pip's conflict message — confirm
# that this pin is intended.
!pip install openai==0.28.1
!pip install langchain==0.0.330

Collecting openai==0.28.1
  Downloading openai-0.28.1-py3-none-any.whl (76 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.0/77.0 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: openai
  Attempting uninstall: openai
    Found existing installation: openai 1.14.2
    Uninstalling openai-1.14.2:
      Successfully uninstalled openai-1.14.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
llama-index 0.8.5.post2 requires langchain<=0.0.266,>=0.0.262, but you have langchain 0.1.13 which is incompatible.[0m[31m
[0mSuccessfully installed openai-0.28.1


Collecting langchain==0.0.330
  Downloading langchain-0.0.330-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
Collecting langsmith<0.1.0,>=0.0.52 (from langchain==0.0.330)
  Using cached langsmith-0.0.92-py3-none-any.whl (56 kB)
Installing collected packages: langsmith, langchain
  Attempting uninstall: langsmith
    Found existing installation: langsmith 0.1.31
    Uninstalling langsmith-0.1.31:
      Successfully uninstalled langsmith-0.1.31
  Attempting uninstall: langchain
    Found existing installation: langchain 0.1.13
    Uninstalling langchain-0.1.13:
      Successfully uninstalled langchain-0.1.13
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
langchain-community 0.0.29 requires langsmith<0.2.0,>=0.1.0, but you have langsmith 0.0.92 which is incompat

First, we create the corpus of text chunks by leveraging LlamaIndex to load some financial PDFs, and parsing/chunking into plain text chunks.

In [11]:
import json
import os
from llama_index import SimpleDirectoryReader
from llama_index.node_parser import SimpleNodeParser
from llama_index.schema import MetadataMode
# Keep a key that is already exported in the environment instead of overwriting
# it with the placeholder. Only when no key is set does the placeholder apply;
# replace 'YOUR_VALUE' locally and never commit a real key.
os.environ.setdefault('OPENAI_API_KEY', 'YOUR_VALUE')


In [2]:
# Input PDFs, one list per split.
TRAIN_FILES, VAL_FILES = ['AAPL.pdf'], ['PAYX.pdf']

# JSON files that receive each split's parsed corpus (node id -> chunk text).
TRAIN_CORPUS_FPATH, VAL_CORPUS_FPATH = './train_corpus.json', './val_corpus.json'

In [3]:
def load_corpus(files, verbose=False):
    """Read the given files and return their parsed chunks as a dict.

    Args:
        files: Paths handed to ``SimpleDirectoryReader`` as ``input_files``.
        verbose: When true, print progress messages and ask the node parser
            to show its progress bar.

    Returns:
        Dict mapping each node's ``node_id`` to its content, requested with
        ``metadata_mode=MetadataMode.NONE``.
    """
    if verbose:
        print(f"Loading files {files}")
    documents = SimpleDirectoryReader(input_files=files).load_data()
    if verbose:
        print(f'Loaded {len(documents)} docs')

    node_parser = SimpleNodeParser.from_defaults()
    parsed_nodes = node_parser.get_nodes_from_documents(documents, show_progress=verbose)
    if verbose:
        print(f'Parsed {len(parsed_nodes)} nodes')

    chunk_by_id = {}
    for parsed in parsed_nodes:
        chunk_by_id[parsed.node_id] = parsed.get_content(metadata_mode=MetadataMode.NONE)
    return chunk_by_id

We do a very naive train/val split by using the `AAPL.pdf` corpus as the train dataset and the `PAYX.pdf` corpus as the val dataset.

In [4]:
# Parse each split's PDFs into a {node_id: text} dict, with progress output enabled.
train_corpus = load_corpus(TRAIN_FILES, verbose=True)
val_corpus = load_corpus(VAL_FILES, verbose=True)

Loading files ['AAPL.pdf']
Loaded 80 docs


Parsing documents into nodes:   0%|          | 0/80 [00:00<?, ?it/s]

Parsed 102 nodes
Loading files ['PAYX.pdf']
Loaded 95 docs


Parsing documents into nodes:   0%|          | 0/95 [00:00<?, ?it/s]

Parsed 135 nodes


In [5]:
# Persist each split's corpus dict to its JSON path (train first, then val).
for corpus_path, corpus_dict in (
    (TRAIN_CORPUS_FPATH, train_corpus),
    (VAL_CORPUS_FPATH, val_corpus),
):
    with open(corpus_path, 'w+') as f:
        json.dump(corpus_dict, f)

### Generate synthetic queries

Now, we use an LLM (gpt-3.5-turbo) to generate questions using each text chunk in the corpus as context.

Each pair of (generated question, text chunk used as context) becomes a datapoint in the finetuning dataset (either for training or evaluation).

In [6]:
import re
import uuid

from llama_index.llms import OpenAI
from llama_index.schema import MetadataMode
from tqdm.notebook import tqdm

In [7]:
# Output paths for the generated questions and for the question -> source-chunk
# mapping, one pair per split.
TRAIN_QUERIES_FPATH, TRAIN_RELEVANT_DOCS_FPATH = (
    './data/train_queries.json',
    './data/train_relevant_docs.json',
)
VAL_QUERIES_FPATH, VAL_RELEVANT_DOCS_FPATH = (
    './data/val_queries.json',
    './data/val_relevant_docs.json',
)

In [8]:
# Re-read both corpora from disk so this section can start from the saved JSON.
with open(TRAIN_CORPUS_FPATH, 'r+') as f:
    train_corpus = json.loads(f.read())

with open(VAL_CORPUS_FPATH, 'r+') as f:
    val_corpus = json.loads(f.read())

In [12]:
def generate_queries(
    corpus,
    num_questions_per_chunk=2,
    prompt_template=None,
    verbose=False,
    llm=None,
):
    """
    Automatically generate hypothetical questions that could be answered with
    doc in the corpus.

    Args:
        corpus: Mapping of node id -> text chunk.
        num_questions_per_chunk: Number of questions requested from the LLM for
            each chunk. The LLM's answer is not truncated, so a chunk may yield
            more or fewer questions than requested.
        prompt_template: Optional format string with ``{context_str}`` and
            ``{num_questions_per_chunk}`` placeholders; a default prompt is
            used when omitted.
        verbose: When true, print how many questions were kept for each chunk.
        llm: Optional object exposing ``complete(prompt)``. Defaults to an
            ``OpenAI`` gpt-3.5-turbo client built from ``OPENAI_API_KEY``.

    Returns:
        ``(queries, relevant_docs)``: ``queries`` maps a fresh UUID string to a
        question; ``relevant_docs`` maps the same id to a one-element list
        holding the id of the chunk the question was generated from.
    """
    if llm is None:
        llm = OpenAI(model='gpt-3.5-turbo', api_key=os.environ.get("OPENAI_API_KEY"))

    # The default prompt previously ended with a stray double quote after
    # "provided."; it has been removed so it is no longer sent to the model.
    prompt_template = prompt_template or """\
    Context information is below.

    ---------------------
    {context_str}
    ---------------------

    Given the context information and not prior knowledge.
    generate only questions based on the below query.

    You are a Teacher/ Professor. Your task is to setup \
    {num_questions_per_chunk} questions for an upcoming \
    quiz/examination. The questions should be diverse in nature \
    across the document. Restrict the questions to the \
    context information provided.
    """

    # Leading list numbering such as "1." / "2)" / "3 ".
    numbering = re.compile(r"^\d+[\).\s]")

    queries = {}
    relevant_docs = {}
    for node_id, text in tqdm(corpus.items()):
        query = prompt_template.format(context_str=text, num_questions_per_chunk=num_questions_per_chunk)
        response = llm.complete(query)

        questions = []
        for line in str(response).strip().split("\n"):
            # Strip first so an indented "  1. ..." line loses its numbering too.
            question = numbering.sub("", line.strip()).strip()
            if question:
                questions.append(question)

        if verbose:
            print(f"Generated {len(questions)} questions for node {node_id}")

        for question in questions:
            question_id = str(uuid.uuid4())
            queries[question_id] = question
            relevant_docs[question_id] = [node_id]
    return queries, relevant_docs

In [13]:
# One LLM completion per chunk of the train corpus, using the default prompt
# and the default of 2 requested questions per chunk.
train_queries, train_relevant_docs = generate_queries(train_corpus)

  0%|          | 0/102 [00:00<?, ?it/s]

In [14]:
# Same generation step for the validation corpus (default prompt and question count).
val_queries, val_relevant_docs = generate_queries(val_corpus)

  0%|          | 0/135 [00:00<?, ?it/s]

In [16]:
# Write the generated questions and their question -> source-chunk mappings.
# The target directory is not created anywhere else in this notebook, so make
# sure it exists first; otherwise open() raises FileNotFoundError on a fresh
# environment.
for out_path, payload in (
    (TRAIN_QUERIES_FPATH, train_queries),
    (TRAIN_RELEVANT_DOCS_FPATH, train_relevant_docs),
    (VAL_QUERIES_FPATH, val_queries),
    (VAL_RELEVANT_DOCS_FPATH, val_relevant_docs),
):
    os.makedirs(os.path.dirname(out_path) or '.', exist_ok=True)
    with open(out_path, 'w') as f:
        json.dump(payload, f)

### Merge data

Finally, we do some minor re-organization to make it easier to access the dataset for training and evaluation.

In [17]:
# Destination of the merged (queries + corpus + relevant_docs) JSON for each split.
TRAIN_DATASET_FPATH, VAL_DATASET_FPATH = (
    './data/train_dataset.json',
    './data/val_dataset.json',
)

In [18]:
# Bundle the three per-split mappings under the keys 'queries', 'corpus' and
# 'relevant_docs'.
train_dataset = dict(
    queries=train_queries,
    corpus=train_corpus,
    relevant_docs=train_relevant_docs,
)

val_dataset = dict(
    queries=val_queries,
    corpus=val_corpus,
    relevant_docs=val_relevant_docs,
)

In [19]:
# Write the merged datasets. Create the target directory first: nothing else in
# this notebook creates it, and open() would raise FileNotFoundError without it.
for dataset_path, dataset in (
    (TRAIN_DATASET_FPATH, train_dataset),
    (VAL_DATASET_FPATH, val_dataset),
):
    os.makedirs(os.path.dirname(dataset_path) or '.', exist_ok=True)
    with open(dataset_path, 'w') as f:
        json.dump(dataset, f)

In [29]:
!pip uninstall llama-index
!pip install llama-index --upgrade --no-cache-dir --force-reinstall

Found existing installation: llama-index 0.8.5.post2
Uninstalling llama-index-0.8.5.post2:
  Would remove:
    /usr/local/lib/python3.10/dist-packages/llama_index-0.8.5.post2.dist-info/*
    /usr/local/lib/python3.10/dist-packages/llama_index/*
    /usr/local/lib/python3.10/dist-packages/tests/*
  Would not remove (might be manually added):
    /usr/local/lib/python3.10/dist-packages/llama_index/core/__init__.py
    /usr/local/lib/python3.10/dist-packages/llama_index/core/_static/.gitignore
    /usr/local/lib/python3.10/dist-packages/llama_index/core/_static/nltk_cache/.gitignore
    /usr/local/lib/python3.10/dist-packages/llama_index/core/_static/nltk_cache/corpora/stopwords/README
    /usr/local/lib/python3.10/dist-packages/llama_index/core/_static/nltk_cache/corpora/stopwords/arabic
    /usr/local/lib/python3.10/dist-packages/llama_index/core/_static/nltk_cache/corpora/stopwords/azerbaijani
    /usr/local/lib/python3.10/dist-packages/llama_index/core/_static/nltk_cache/corpora/stopw

In [2]:
!pip install llama-index
!pip install llama-index-llms-openai
!pip install llama-index-embeddings-openai
!pip install llama-index-finetuning
!pip install llama-index-embeddings-huggingface!

Collecting llama-index
  Downloading llama_index-0.10.22-py3-none-any.whl (6.9 kB)
Collecting llama-index-agent-openai<0.2.0,>=0.1.4 (from llama-index)
  Downloading llama_index_agent_openai-0.1.7-py3-none-any.whl (12 kB)
Collecting llama-index-cli<0.2.0,>=0.1.2 (from llama-index)
  Downloading llama_index_cli-0.1.11-py3-none-any.whl (26 kB)
Collecting llama-index-core<0.11.0,>=0.10.22 (from llama-index)
  Downloading llama_index_core-0.10.22-py3-none-any.whl (15.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.4/15.4 MB[0m [31m60.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting llama-index-embeddings-openai<0.2.0,>=0.1.5 (from llama-index)
  Downloading llama_index_embeddings_openai-0.1.7-py3-none-any.whl (6.0 kB)
Collecting llama-index-indices-managed-llama-cloud<0.2.0,>=0.1.2 (from llama-index)
  Downloading llama_index_indices_managed_llama_cloud-0.1.4-py3-none-any.whl (6.6 kB)
Collecting llama-index-legacy<0.10.0,>=0.9.48 (from llama-index)
  Downloading 

In [10]:
# Installs the llama-index HuggingFace embeddings integration; the fine-tuned
# model retrieved later in this notebook is displayed as a HuggingFaceEmbedding.
!pip install llama-index-embeddings-huggingface

Collecting llama-index-embeddings-huggingface
  Downloading llama_index_embeddings_huggingface-0.1.4-py3-none-any.whl (7.7 kB)
Installing collected packages: llama-index-embeddings-huggingface
Successfully installed llama-index-embeddings-huggingface-0.1.4


In [4]:
from llama_index.core.evaluation import EmbeddingQAFinetuneDataset
# Read the merged datasets from the paths the merge step writes them to
# (./data/...). The previous paths pointed at the notebook root, where this
# notebook never writes these files.
train_dataset = EmbeddingQAFinetuneDataset.from_json("./data/train_dataset.json")
val_dataset = EmbeddingQAFinetuneDataset.from_json("./data/val_dataset.json")

In [5]:
from llama_index.finetuning import SentenceTransformersFinetuneEngine


In [6]:
# Build the fine-tuning engine: start from the BAAI/bge-small-en checkpoint,
# train on train_dataset, and write the resulting model to ./test_model.
# val_dataset is supplied for evaluation — presumably run during training, since
# the saved model directory later contains an eval/ results CSV; confirm against
# the library documentation.
finetune_engine = SentenceTransformersFinetuneEngine(
    train_dataset,
    model_id="BAAI/bge-small-en",
    model_output_path="test_model",
    val_dataset=val_dataset,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/90.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [7]:
# Run training with the engine's default settings (the recorded progress bars
# show 2 epochs of 21 iterations each).
finetune_engine.finetune()


Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/21 [00:00<?, ?it/s]

Iteration:   0%|          | 0/21 [00:00<?, ?it/s]

In [11]:
# Fetch the fine-tuned model from the engine; in the recorded run this is a
# HuggingFaceEmbedding whose model_name is 'test_model'.
embed_model = finetune_engine.get_finetuned_model()


In [14]:
# Display the embedding wrapper's configuration (bare last expression -> repr).
embed_model

HuggingFaceEmbedding(model_name='test_model', embed_batch_size=10, callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x7dcbc89a55a0>, tokenizer_name='test_model', max_length=512, pooling=<Pooling.CLS: 'cls'>, normalize=True, query_instruction=None, text_instruction=None, cache_folder=None)

In [13]:
# NOTE(review): this cell fails as written — the recorded output is
# "AttributeError: 'HuggingFaceEmbedding' object has no attribute 'save_pretrained'".
# The fine-tuned model is already on disk under ./test_model (the engine's
# model_output_path). TODO: save/push from an object that actually provides
# these methods, or remove this cell so Run All completes.
embed_model.save_pretrained("baai-sm-1")

# push to the hub
embed_model.push_to_hub("baai-sm-1")

AttributeError: 'HuggingFaceEmbedding' object has no attribute 'save_pretrained'

In [15]:
# NOTE(review): `evaluate` is not defined anywhere in the visible notebook, and
# the recorded output is a NameError for 'HuggingFaceEmbeddings' — presumably
# raised inside a definition from a cell that no longer exists. TODO: define
# `evaluate` (and the names it needs) before this cell so it survives
# Restart & Run All.
bge = "local:test_model"
bge_val_results = evaluate(val_dataset, bge)

NameError: name 'HuggingFaceEmbeddings' is not defined

In [16]:
from google.colab import files
# NOTE(review): 'test_model' is the model output directory, not a single file —
# confirm that files.download accepts a directory; the notebook also zips the
# directory and downloads the archive instead.
files.download('test_model')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [21]:
# Recursively pack the model directory into a single archive (./file.zip).
!zip -r ./file.zip ./test_model

  adding: test_model/ (stored 0%)
  adding: test_model/modules.json (deflated 62%)
  adding: test_model/special_tokens_map.json (deflated 80%)
  adding: test_model/1_Pooling/ (stored 0%)
  adding: test_model/1_Pooling/config.json (deflated 57%)
  adding: test_model/tokenizer_config.json (deflated 75%)
  adding: test_model/README.md (deflated 56%)
  adding: test_model/model.safetensors (deflated 16%)
  adding: test_model/vocab.txt (deflated 53%)
  adding: test_model/2_Normalize/ (stored 0%)
  adding: test_model/eval/ (stored 0%)
  adding: test_model/eval/Information-Retrieval_evaluation_results.csv (deflated 81%)
  adding: test_model/tokenizer.json (deflated 71%)
  adding: test_model/config.json (deflated 48%)
  adding: test_model/config_sentence_transformers.json (deflated 30%)
  adding: test_model/sentence_bert_config.json (deflated 4%)


In [22]:
from google.colab import files
# Download the archive of the fine-tuned model directory.
files.download("./file.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>