In [3]:
import os
import re

from dotenv import load_dotenv
from openai import OpenAI

In [14]:
# Seed records handed to the model: records are separated by ';' and each
# record is a comma-separated list of key:value pairs. The trailing newline is
# part of the value.
examples_of_data = (
    "flower:True, flower_type:sunflower, colour:yellow; "
    "flower:False, flower_type:n/a, colour:n/a\n"
)
# How many sample records the model is asked to produce.
gen_sample_size = 100

In [5]:
# environment

# load_dotenv() comes from python-dotenv; presumably it populates os.environ
# from a local .env file when one exists.
load_dotenv()
# Guarantee both provider keys are present in os.environ: an existing value is
# kept untouched, a missing one is filled with the placeholder string.
os.environ.setdefault('OPENAI_API_KEY', 'your-key-if-not-using-env')
os.environ.setdefault('ANTHROPIC_API_KEY', 'your-key-if-not-using-env')

In [6]:
# initialize

# Model name passed to the chat-completions call in gen_sample_data__gpt.
OPENAI_MODEL = "gpt-4o-mini"
# Client built with default arguments; no key is passed explicitly here.
openai = OpenAI()


In [8]:
# System prompt sent with every request. The two sentences are joined with a
# space so the model does not receive "purposes.Make ..." as one run-on token.
system_message = (
    "You are an assistant that generates sample data for testing purposes. "
    "Make python datastructures to best represent the data and generate a variety of examples of data.\n\n"
)

In [10]:
def user_prompt_for(examples_of_data, gen_sample_size):
    """Build the user prompt asking the model for generated sample data.

    Args:
        examples_of_data: Example records to show the model; formatted into
            the prompt as text.
        gen_sample_size: Number of samples to request.

    Returns:
        The prompt string: the request with the examples on the first line,
        followed by the data-structure instruction and a blank line.
    """
    # Trim trailing whitespace and append exactly one newline so the examples
    # never run straight into the following sentence, whether or not the
    # caller's text already ended with a newline.
    examples_text = f"{examples_of_data}".rstrip()
    user_prompt = f"Generate {gen_sample_size} number of sample data, here are some examples: {examples_text}\n"
    user_prompt += "Make a python datastructure to efficiently and usefully store the data\n\n"
    return user_prompt

In [11]:
def messages_for(examples_of_data, gen_sample_size):
    """Assemble the chat payload for one generation request.

    Returns a two-element list: a system message carrying the module-level
    ``system_message`` and a user message whose content is produced by
    ``user_prompt_for(examples_of_data, gen_sample_size)``.
    """
    system_turn = {"role": "system", "content": system_message}
    user_turn = {
        "role": "user",
        "content": user_prompt_for(examples_of_data, gen_sample_size),
    }
    return [system_turn, user_turn]

In [12]:
# write the generated Python code to a file (generated_dataset.py by default)

def write_output(output, path="generated_dataset.py"):
    """Save the Python code contained in a model reply to ``path``.

    When the reply contains Markdown code fences, only the contents of the
    untagged or Python-tagged fenced blocks are written, so explanatory prose
    around the code does not end up in the .py file. A final fence that was
    never closed (e.g. a truncated reply) is read up to the end of the text.
    When no such block is found, the reply is written with any fence markers
    stripped, as before.

    Args:
        output: Full text of the model reply.
        path: Destination file; overwritten if it already exists.
    """
    # Capture (language tag, body) for every fenced block; the body ends at the
    # closing fence or, for an unterminated block, at the end of the text.
    fenced = re.findall(r"```([\w+-]*)[ \t]*\n(.*?)(?:```|\Z)", output, flags=re.DOTALL)
    python_blocks = [
        body.strip("\n")
        for tag, body in fenced
        if tag.lower() in ("", "python", "python3", "py")
    ]
    if python_blocks:
        code = "\n\n".join(python_blocks) + "\n"
    else:
        code = output.replace("```python", "").replace("```", "")
    # Explicit encoding so non-ASCII text in the reply is written the same way
    # on every platform.
    with open(path, "w", encoding="utf-8") as f:
        f.write(code)

In [17]:
def gen_sample_data__gpt(examples_of_data, gen_sample_size):
    """Request sample data from the OpenAI chat model and save the reply.

    Opens a streaming chat completion against ``OPENAI_MODEL`` using the
    messages from ``messages_for``, prints each text fragment as it arrives,
    and finally passes the concatenated reply to ``write_output``.
    """
    response_stream = openai.chat.completions.create(
        model=OPENAI_MODEL,
        messages=messages_for(examples_of_data, gen_sample_size),
        stream=True,
    )
    pieces = []
    for event in response_stream:
        # A chunk's delta content may be None; treat that as an empty fragment.
        text = event.choices[0].delta.content or ""
        pieces.append(text)
        print(text, end='', flush=True)
    write_output("".join(pieces))

In [18]:
# Run the generation with the notebook-level example text and sample count.
gen_sample_data__gpt(
    examples_of_data=examples_of_data,
    gen_sample_size=gen_sample_size,
)

To represent the sample data effectively in Python, we can use a list of dictionaries. Each dictionary will represent an individual record with the relevant attributes (`flower`, `flower_type`, and `colour`). This structure is easy to work with for tasks like filtering, modifying, and accessing data.

Below is the implementation with the generation of 100 example records:

```python
import random

# Sample flower types and colors
flower_types = ['sunflower', 'rose', 'daisy', 'tulip', 'lily', 'orchid']
colors = ['yellow', 'red', 'white', 'pink', 'purple', 'orange']

# Function to generate sample data
def generate_flower_data(num_records):
    data = []
    for _ in range(num_records):
        flower_present = random.choice([True, False])
        if flower_present:
            flower_data = {
                'flower': flower_present,
                'flower_type': random.choice(flower_types),
                'colour': random.choice(colors)
            }
        else:
            flower_d