In [1]:
import json
from pathlib import Path

import numpy as np
import pandas as pd
from dotenv import load_dotenv
from langchain.chat_models import ChatOpenAI
# from langchain_deepseek import ChatDeepSeek
from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnablePassthrough
load_dotenv()

True

In [2]:
# Instruction text sent to the model. The {num_samples} placeholder is filled
# in from the input dict when the prompt is formatted.
PRODUCT_PROMPT_TEXT = """
    Generate {num_samples} synthetic grocery product records in JSON format.
    Each record should have:
    - "product_id" (random 6-digit number)
    - "product_name" (realistic product name)
    - "price" (random float between 10 and 100)
    - "lead_time" (integer between 3 and 7 days)
    - "current_stock" (current qty instock, integer between 10 and 100)
    - "daily_demand" (integer between 5 and 20)
    
    Output as a JSON list.
    """

# Prompt template with a single input variable: the number of records to request.
prompt = PromptTemplate(
    template=PRODUCT_PROMPT_TEXT,
    input_variables=["num_samples"],
)

In [5]:
# Chat model that generates the synthetic records.
llm = ChatOpenAI(model="gpt-4")

# Alternative backend (needs the commented-out langchain_deepseek import):
# llm = ChatDeepSeek(
#     model="deepseek-chat",
#     temperature=0,
#     max_retries=2
# )

# How many product records to ask the model for.
num_samples = 100

# Pipeline: input dict (passed through unchanged) -> formatted prompt -> chat model.
chain = RunnablePassthrough() | prompt | llm

# One call to the model; the reply is kept in `result` for later use.
result = chain.invoke({"num_samples": num_samples})

In [32]:
# Parse the model reply into Python objects and save it as formatted JSON.
raw_reply = result.content

try:
    output_data = json.loads(raw_reply)  # Convert string to Python object
except json.JSONDecodeError:
    # Chat models often wrap JSON in markdown fences or add commentary around
    # it. The prompt asks for a JSON list, so fall back to the outermost
    # [...] span of the reply.
    list_start = raw_reply.find("[")
    list_end = raw_reply.rfind("]")
    if list_start == -1 or list_end < list_start:
        raise  # nothing list-like in the reply: surface the original error
    output_data = json.loads(raw_reply[list_start : list_end + 1])

# Create the output directory if needed so the write does not fail with
# FileNotFoundError on a fresh checkout.
output_path = Path("data/products_data.json")
output_path.parent.mkdir(parents=True, exist_ok=True)

with output_path.open("w", encoding="utf-8") as f:
    json.dump(output_data, f, indent=4)  # Save as formatted JSON

print("JSON output saved to products_data.json")

JSON output saved to products_data.json


In [33]:
# Read the saved product records back from disk.
with open("data/products_data.json") as json_file:
    products_data = json.loads(json_file.read())

In [35]:
# Fixed seed so the randomly derived columns (and the CSV written below) are
# identical on every Restart & Run All.
RANDOM_SEED = 42
rng = np.random.default_rng(RANDOM_SEED)

# Fictitious suppliers generated by ChatGPT
FICTITIOUS_SUPPLIERS = ['FreshHarvest Suppliers', 'Golden Grain Distributors', 'Evergreen Provisions']

# One row per product record loaded from the JSON file.
products_df = pd.DataFrame(products_data)
n_products = len(products_df)

# cogs: a random 50-80% of the price, rounded to 2 decimals
# (presumably "cost of goods sold" -- confirm with the data owner).
products_df['cogs'] = (products_df['price'] * rng.uniform(0.5, 0.8, n_products)).round(2)

# expedited_price: cogs marked up by a random factor of 1.1-1.8, rounded to
# 2 decimals to match cogs.
products_df['expedited_price'] = (products_df['cogs'] * rng.uniform(1.1, 1.8, n_products)).round(2)

# Whole days of stock left at the current daily demand (floor division).
products_df['days_of_inventory'] = products_df['current_stock'] // products_df['daily_demand']

# Assign each product one of the fictitious suppliers at random.
products_df["supplier"] = rng.choice(FICTITIOUS_SUPPLIERS, size=n_products)

products_df.to_csv("data/products_data.csv", index=False)