In [1]:
import openai

In [2]:
import os
import builtins
from getpass import getpass
import glob

In [3]:
# Ask for the API key at run time (typed input is not echoed) and publish it
# through the process environment instead of hardcoding it in the notebook.
os.environ["OPENAI_API_KEY"] = getpass()

In [4]:
# Hand the key stored in the environment to the openai module
# (None if the variable is not set).
openai.api_key = os.environ.get("OPENAI_API_KEY")

In [20]:
# Instruction block sent to the model after the use-case document.
# It asks for sample prompts in a fixed Preface / Instruction / Response layout,
# each delimited by a "---" line; later cells split the model output on that delimiter.
# NOTE: {instruction} and {response} are literal placeholders shown to the model;
# this string is never passed through str.format().
generation_prompt = """Think step-by-step, and do the following tasks referring to the use case document -
1. Analyze the use case and understand the workings of langchain library
2. The prompt format is below and starts and ends with a three-dash line - "---"
3. The prompt format has 3 sections Preface, Instruction and Response
4. The Preface is always - "Below is an instruction that describes a task. Write a response that appropriately completes the request."
5. In the Instruction section, detail the task you are trying to accomplish using langchain
6. In the Response section, write valid python code that would accomplish the task in the Instruction
7. Each prompt sample ends with a line containing a three-dash line - "---"
8. Follow the prompt format religiously
9. Ensure the Instruction is complex enough to warrant a response that is at least 3 lines long python code
10. Ensure the response is valid python code
11. Ensure the response is moderately complex code
12. Ensure all the necessary classes are imported in the response
13. Assume langchain library is already installed. Provide instructions to install any other library dependencies.
14. Combine multiple information from the use case document to create a single prompt
15. Generate sample prompts by replacing the entities in the use case with similar entities from knowledge base
16. Generate at least 10 sample prompts, if possible more

prompt format:
---
Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Using the python langchain library, {instruction}

### Response:
{response}

---
"""

In [3]:
# Root folder used as the prefix when building data file paths.
base_dir = "."

In [23]:
# Re-split previously saved model responses into one training file per example,
# so the split step can be redone without calling the API again.
files = ["response-data/multiple_retrieval.md", "response-data/qa_citations.md"]

# The output folder may not exist yet; opening a file for writing inside a
# missing folder would raise FileNotFoundError.
os.makedirs(f"{base_dir}/training-data", exist_ok=True)

for file in files:
    filename = os.path.basename(file)
    name = os.path.splitext(filename)[0]
    # Read through a context manager so the handle is closed right away.
    with builtins.open(f"{base_dir}/{file}", 'r') as f:
        response = f.read()
    # Examples are delimited by "---"; trim each piece and drop empty ones.
    examples = response.split("---")
    examples = [e.strip() for e in examples]
    examples = [e for e in examples if len(e) > 0]
    print(f"generated {len(examples)} prompts")
    # One file per example, named <use-case>-<index>.md, re-terminated with "---".
    for i, ex in enumerate(examples):
        with builtins.open(f"{base_dir}/training-data/{name}-{i}.md", "w") as f:
            f.write(ex)
            f.write("\n---\n")

generated 2 prompts
generated 1 prompts


In [21]:
# For each use-case document: ask the model for sample prompts, save the raw
# response, then split it into one training file per example.
import glob

# To process every use case instead of the two listed below:
# files = glob.glob("use-cases-md/*.md")
files = ["use-cases-md/multiple_retrieval.md", "use-cases-md/qa_citations.md"]

# Output folders may be missing on a fresh checkout; create them up front so
# the writes below cannot fail after the (slow, billed) API call has been made.
os.makedirs(f"{base_dir}/response-data", exist_ok=True)
os.makedirs(f"{base_dir}/training-data", exist_ok=True)

for file in files:
    print(f"processing use-case: {file}")
    # Context manager closes the handle instead of leaving it to the GC.
    with builtins.open(file, "r") as f:
        doc = f.read()
    # split file into name and extension
    filename = os.path.basename(file)
    name, _ = os.path.splitext(filename)
    # Messages: system role, a short description of langchain, the use-case
    # document itself, and finally the generation instructions.
    completion = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are an expert programmer."},
            {
                "role": "user",
                "content": "Below is the usage for the python library langchain."
                "langchain is used to interact with Large Language Models and accomplish a variety of tasks. "
                "The code below covers how to use langchain library to make API calls",
            },
            {"role": "user", "content": doc},
            {"role": "user", "content": generation_prompt},
        ],
    )
    response = completion.choices[0].message["content"]
    # Keep the raw response so it can be re-split later without another API call.
    with builtins.open(f"{base_dir}/response-data/{name}.md", "w") as f:
        f.write(response)
    # Examples are delimited by "---"; trim each piece and drop empty ones.
    examples = response.split("---")
    examples = [e.strip() for e in examples]
    examples = [e for e in examples if len(e) > 0]
    print(f"generated {len(examples)} prompts")
    # One file per example, named <use-case>-<index>.md, re-terminated with "---".
    for i, ex in enumerate(examples):
        with builtins.open(f"{base_dir}/training-data/{name}-{i}.md", "w") as f:
            f.write(ex)
            f.write("\n---\n")

processing use-case: use-cases-md/chatbots.md
generated 10 prompts
processing use-case: use-cases-md/summarization.md
generated 3 prompts
processing use-case: use-cases-md/document-context-aware-QA.md
generated 3 prompts
processing use-case: use-cases-md/extraction.md
generated 5 prompts
processing use-case: use-cases-md/apis.md
generated 3 prompts
processing use-case: use-cases-md/local_retrieval_qa.md
generated 9 prompts
processing use-case: use-cases-md/sql.md
generated 4 prompts
processing use-case: use-cases-md/tagging.md
generated 4 prompts
processing use-case: use-cases-md/conversational_retrieval_agents.md
generated 3 prompts
processing use-case: use-cases-md/vector_db_text_generation.md
generated 20 prompts
processing use-case: use-cases-md/multiple_retrieval.md
generated 2 prompts
processing use-case: use-cases-md/qa_citations.md
generated 1 prompts


In [25]:
# remove incomplete prompt files
# A usable example must contain all three of these marker lines.
required_markers = {"### Instruction:", "### Response:", "```python"}

# NOTE: searched relative to the working directory.
files = glob.glob("training-data/*.md")
for file in files:
    with builtins.open(file, "r") as f:
        lines = f.readlines()
    # Compare without trailing whitespace so a marker on the last line of a
    # file that lacks a final newline (or has stray spaces) still counts.
    present = {line.rstrip() for line in lines}
    # Delete only after the handle is closed: removing a file that is still
    # open raises PermissionError on Windows.
    if not required_markers <= present:
        os.remove(file)
        print(f"removed file {file}")

removed file training-data/vector_db_text_generation-18.md
removed file training-data/vector_db_text_generation-8.md
removed file training-data/vector_db_text_generation-12.md
removed file training-data/tagging-3.md
removed file training-data/vector_db_text_generation-16.md
removed file training-data/extraction-0.md
removed file training-data/vector_db_text_generation-2.md
removed file training-data/extraction-4.md
removed file training-data/vector_db_text_generation-6.md
removed file training-data/vector_db_text_generation-14.md
removed file training-data/vector_db_text_generation-10.md
removed file training-data/vector_db_text_generation-4.md
removed file training-data/extraction-2.md
removed file training-data/vector_db_text_generation-0.md
removed file training-data/apis-2.md
removed file training-data/sql-3.md
removed file training-data/conversational_retrieval_agents-2.md
removed file training-data/multiple_retrieval-1.md


In [4]:
import glob
import builtins

In [14]:
# Turn every training-data/*.md example into one record of final_data:
#   - "instruction": the text between "### Instruction:" and "### Response:"
#   - "output":      everything after "### Response:", minus the "---" delimiter line
#   - "input":       always the empty string
# Both texts are stripped of leading/trailing whitespace.

# List the files under base_dir (the same root they are opened from) and sort
# them: glob order depends on the filesystem, which would otherwise make the
# record order differ between runs and machines.
files = sorted(glob.glob(f"{base_dir}/training-data/*.md"))
final_data = []
for file in files:
    with builtins.open(file, "r") as f:
        lines = f.readlines()
    current_section = ""
    instruction = ""
    response = ""
    for line in lines:
        if "### Response:" in line:
            current_section = "python"
            continue
        # `and` binds tighter than `or`: a line holding the "```python" fence is
        # always added to the response; any other line is added only while in
        # the response section and only if it is not the "---" delimiter.
        if "```python" in line or (current_section == "python" and line.rstrip() != "---"):
            response = response + line
            continue
        if "### Instruction:" in line:
            current_section = "instruction"
            continue
        if current_section == "instruction":
            instruction = instruction + line
    final_data.append({"instruction": instruction.strip(), "input": "", "output": response.strip()})

In [16]:
import json

In [17]:
# Write the collected records to disk as JSON indented by 4 spaces.
# The folder is created if missing, and the context manager guarantees the
# file is flushed and closed before any later cell reads it back.
os.makedirs(f"{base_dir}/final-data", exist_ok=True)
with open(f"{base_dir}/final-data/training-data.json", "w") as f:
    json.dump(final_data, f, indent=4)

In [4]:
import json
# Load the saved dataset back from disk; `with` closes the handle once parsed.
with open(f"{base_dir}/final-data/training-data.json", "r") as f:
    j = json.load(f)

In [5]:
# Number of records loaded back from training-data.json.
len(j)

48