The goal of this notebook is to generate synthetic inventory data (demand and supply) for a test environment, with data-quality issues (nulls, outliers and duplicate rows) injected on purpose.

In [1]:
import pandas as pd
import random

In [2]:

# Read the product catalogue that every synthetic inventory record draws from.
df_products = pd.read_csv("products_table.csv")
# Keep the product IDs as a plain Python list so one can be picked at random later.
product_ids = df_products.loc[:, "ProductID"].to_list()

def random_date(start, end):
    """Return ``start`` shifted forward by a random whole number of days.

    The offset is drawn with ``random.randint`` from ``0`` to
    ``(end - start).days`` (both bounds included), so the result never falls
    before ``start`` and never lands more than that many days after it.
    """
    span_in_days = (end - start).days
    day_offset = random.randint(0, span_in_days)
    shift = pd.to_timedelta(day_offset, unit="d")
    return start + shift

def generate_inventory(inv_id):
    """Build one synthetic inventory record, occasionally corrupted on purpose.

    A product ID is picked at random from the module-level ``product_ids`` and
    its name and warehouse are read from the first matching row of
    ``df_products``.  Demand is drawn from 50-500; availability is demand minus
    a random 0-500 amount, floored at zero.  Each of the two values is then
    independently replaced by ``None`` with 5% probability, and any value that
    is still populated is inflated (demand x100, availability x50) with 2%
    probability.

    Args:
        inv_id: Identifier stored under the record's ``"InvID"`` key.

    Returns:
        dict with the keys ``InvID``, ``ProductID``, ``ProductName``,
        ``WarehouseLocation``, ``Availability``, ``Demand`` and ``OrderDate``
        (a random date between 2022-01-01 and 2024-09-01).
    """
    chosen_id = random.choice(product_ids)
    matching_rows = df_products.loc[df_products["ProductID"] == chosen_id]
    product = matching_rows.iloc[0]
    location = product["WarehouseLocation"]

    # Baseline figures before any data-quality issues are injected.
    base_demand = random.randint(50, 500)
    stock = base_demand - random.randint(0, 500)
    if stock < 0:
        stock = 0

    # Missing values: each field gets its own independent 5% draw.
    demand = None if random.random() < 0.05 else base_demand
    availability = None if random.random() < 0.05 else stock

    # Outliers: the 2% draw is only made for a field that is still populated.
    if demand is not None:
        if random.random() < 0.02:
            demand = demand * 100
    if availability is not None:
        if random.random() < 0.02:
            availability = availability * 50

    window_start = pd.Timestamp("2022-01-01")
    window_end = pd.Timestamp("2024-09-01")

    return {
        "InvID": inv_id,
        "ProductID": chosen_id,
        "ProductName": product["ProductName"],
        "WarehouseLocation": location,
        "Availability": availability,
        "Demand": demand,
        "OrderDate": random_date(window_start, window_end),
    }


In [3]:
def generate_inventory_dataset(num_rows, filename, seed=None):
    """Generate a synthetic inventory dataset with duplicates and save it as CSV.

    ``num_rows`` records are produced by ``generate_inventory`` (with ``InvID``
    running from 1 to ``num_rows``).  A random 2% of those rows is then appended
    again as exact duplicates, so the saved file holds
    ``num_rows + int(0.02 * num_rows)`` rows.

    Args:
        num_rows: Number of distinct records to generate before duplication.
        filename: Path of the CSV file to write (written without the index).
        seed: Optional integer seed.  When given, the global ``random``
            generator used by ``generate_inventory`` is re-seeded and the same
            value is passed as ``random_state`` to the duplicate sampling, so
            the same seed reproduces the same dataset.  ``None`` (the default)
            leaves both sources of randomness unseeded, as before.

    Returns:
        None.  The dataset is written to ``filename`` and a one-line summary
        is printed.
    """
    if seed is not None:
        # NOTE: this re-seeds the process-wide `random` module state, because
        # the record generator draws from the module-level functions.
        random.seed(seed)

    inventory_records = [generate_inventory(inv_id) for inv_id in range(1, num_rows + 1)]
    df_inventory = pd.DataFrame(inventory_records)

    # Inject duplicates (2% of dataset); they are appended after the originals.
    num_duplicates = int(0.02 * len(df_inventory))
    duplicates = df_inventory.sample(n=num_duplicates, random_state=seed)
    df_inventory = pd.concat([df_inventory, duplicates], ignore_index=True)

    # Save dataset
    df_inventory.to_csv(filename, index=False)
    print(f"✅ Generated {len(df_inventory)} rows → {filename}")

In [4]:
# Small dataset for the test environment: 5,000 generated rows before duplicates are added.
generate_inventory_dataset(num_rows=5000, filename="inventory_test_with_issues.csv")

# Larger production-sized dataset: 50,000 generated rows before duplicates are added.
generate_inventory_dataset(num_rows=50000, filename="inventory_production_with_issues.csv")


✅ Generated 5100 rows → inventory_test_with_issues.csv
✅ Generated 51000 rows → inventory_production_with_issues.csv
