# Data Generator Exploration
Generate a batch of sample (query, tool call) pairs, then inspect how the samples are distributed across tools and look at a few examples for each tool.


In [1]:
from pathlib import Path
import sys

# Work out the project root: when the kernel's working directory is the
# `notebooks` folder the root is its parent, otherwise the working directory
# itself is used as the root.
cwd = Path.cwd()
project_root = cwd.parent if cwd.name == 'notebooks' else cwd

# Put the project root at the front of the module search path so imports are
# resolved against it first. Only insert when it is not already the first
# entry, so re-running this cell does not stack up duplicate sys.path entries.
if sys.path[:1] != [str(project_root)]:
    sys.path.insert(0, str(project_root))


In [2]:
import pandas as pd

from training.data_generator import NaturalLanguageToolCallGenerator, OutputFormat


In [3]:
# Draw a batch of samples from the generator, configured with its PYTHON
# output format.
# NOTE(review): no random seed is set in this cell; if the generator is
# stochastic the tables below will differ from run to run. Confirm whether the
# generator accepts a seed.
num_samples = 300
generator = NaturalLanguageToolCallGenerator(output_format=OutputFormat.PYTHON)
data = [generator.generate_pair() for _ in range(num_samples)]

# One column per field of interest, read from every generated pair by key.
df = pd.DataFrame({
    field: [pair[field] for pair in data]
    for field in ('tool', 'query', 'tool_call')
})

# Preview the first ten generated pairs.
df.head(10)


Unnamed: 0,tool,query,tool_call
0,file_read,Open the log file,"file_read(path='/var/log/app.log', encoding='u..."
1,search,Search the web for climate change best practic...,"search(query='climate change best practices', ..."
2,web_fetch,Get posts from JSONPlaceholder,web_fetch(url='https://jsonplaceholder.typicod...
3,web_fetch,Get comments from JSONPlaceholder,web_fetch(url='https://jsonplaceholder.typicod...
4,search,Please search machine learning tutorials and r...,"search(query='machine learning tutorials', max..."
5,web_fetch,Get comments from JSONPlaceholder,web_fetch(url='https://jsonplaceholder.typicod...
6,calculate,What is the absolute value of -42?,calculate(expression='abs(-42)')
7,search,What can you find about AI papers? Show me 49 ...,"search(query='AI papers', max_results=49, date..."
8,search,Can you search for neural networks? I want 11 ...,"search(query='neural networks', max_results=11)"
9,search,Search for NLP overview and return up to 39 items,"search(query='NLP overview', max_results=39)"


In [4]:
# Count how many generated samples belong to each tool (most frequent first).
df['tool'].value_counts()


tool
file_read         63
search            53
web_fetch         51
calculate         46
database_query    45
send_email        42
Name: count, dtype: int64

In [5]:
# Show up to five example rows for every tool, ordered by tool name.
# groupby partitions the frame once and yields the groups sorted by key, so the
# whole frame is no longer re-scanned with a boolean mask for each tool.
# Rows whose `tool` is missing are skipped (groupby's default dropna=True).
for tool, tool_rows in df.groupby('tool', sort=True):
    print(f'Tool: {tool}')
    display(tool_rows.head(5))


Tool: calculate


Unnamed: 0,tool,query,tool_call
6,calculate,What is the absolute value of -42?,calculate(expression='abs(-42)')
20,calculate,What is the absolute value of -42?,calculate(expression='abs(-42)')
22,calculate,What's 88 to the power of 2?,calculate(expression='88 ** 2')
25,calculate,What is the ceiling of 3.2?,calculate(expression='ceil(3.2)')
27,calculate,Round 3.14159 to nearest integer,calculate(expression='round(3.14159)')


Tool: database_query


Unnamed: 0,tool,query,tool_call
11,database_query,List recent transactions over $1000,database_query(sql='SELECT * FROM transactions...
14,database_query,Count how many active products we have,database_query(sql='SELECT COUNT(*) FROM produ...
38,database_query,Top 10 customers by revenue,"database_query(sql='SELECT customer_id, SUM(am..."
39,database_query,Get all pending orders,database_query(sql='SELECT * FROM orders WHERE...
49,database_query,Get total revenue from last quarter (timeout: ...,database_query(sql='SELECT SUM(amount) FROM tr...


Tool: file_read


Unnamed: 0,tool,query,tool_call
0,file_read,Open the log file,"file_read(path='/var/log/app.log', encoding='u..."
10,file_read,Open the training log,"file_read(path='logs/train_intent.log', encodi..."
15,file_read,Read the settings file,"file_read(path='../config/settings.ini', encod..."
16,file_read,Read the exports report,file_read(path='./data/exports/report_2024.csv...
17,file_read,Open the log file,"file_read(path='/var/log/app.log', encoding='u..."


Tool: search


Unnamed: 0,tool,query,tool_call
1,search,Search the web for climate change best practic...,"search(query='climate change best practices', ..."
4,search,Please search machine learning tutorials and r...,"search(query='machine learning tutorials', max..."
7,search,What can you find about AI papers? Show me 49 ...,"search(query='AI papers', max_results=49, date..."
8,search,Can you search for neural networks? I want 11 ...,"search(query='neural networks', max_results=11)"
9,search,Search for NLP overview and return up to 39 items,"search(query='NLP overview', max_results=39)"


Tool: send_email


Unnamed: 0,tool,query,tool_call
13,send_email,Send a thank you email to johnsonjoshua@exampl...,"send_email(to='johnsonjoshua@example.org', sub..."
31,send_email,Email jillrhodes@example.net requesting feedba...,"send_email(to='jillrhodes@example.net', subjec..."
54,send_email,Email garzaanthony@example.org to follow up on...,"send_email(to='garzaanthony@example.org', subj..."
57,send_email,Send jesseguzman@example.net a quick status up...,"send_email(to='jesseguzman@example.net', subje..."
58,send_email,Email jennifermiles@example.com requesting fee...,"send_email(to='jennifermiles@example.com', sub..."


Tool: web_fetch


Unnamed: 0,tool,query,tool_call
2,web_fetch,Get posts from JSONPlaceholder,web_fetch(url='https://jsonplaceholder.typicod...
3,web_fetch,Get comments from JSONPlaceholder,web_fetch(url='https://jsonplaceholder.typicod...
5,web_fetch,Get comments from JSONPlaceholder,web_fetch(url='https://jsonplaceholder.typicod...
18,web_fetch,Fetch weather data from OpenWeatherMap,web_fetch(url='https://api.openweathermap.org/...
19,web_fetch,Get data from the example API,"web_fetch(url='https://api.example.com/data', ..."
