In [3]:
import datetime
import calendar
import random
import numpy
import pandas as pd
import uuid

products = {
  'iPhone': [700, 10],
  'Google Phone': [600, 8],
  'Vareebadd Phone': [400, 3],
  '20in Monitor': [109.99,6],
  '34in Ultrawide Monitor': [379.99, 9],
  '27in 4K Gaming Monitor': [389.99,9],
  '27in FHD Monitor': [149.99, 11],
  'Flatscreen TV': [300, 7],
  'Macbook Pro Laptop': [1700, 7],
  'ThinkPad Laptop': [999.99, 6],
  'AA Batteries (4-pack)': [3.84, 30],
  'AAA Batteries (4-pack)': [2.99, 30],
  'USB-C Charging Cable': [11.95, 30],
  'Lightning Charging Cable': [14.95, 30],
  'Wired Headphones': [11.99, 26],
  'Bose SoundSport Headphones': [99.99, 19],
  'Apple Airpods Headphones': [150, 22],
  'LG Washing Machine': [600.00, 1],
  'LG Dryer': [600.00, 1]
}

columns = ['Order ID', 'Product', 'Quantity Ordered', 'Price Each', 'Order Date', 'Purchase Address']

def generate_random_time(month):
  day = generate_random_day(month)
  if random.random() < 0.5:
    date = datetime.datetime(2019, month, day,12,00)
  else:
    date = datetime.datetime(2019, month, day,20,00)
  time_offset = numpy.random.normal(loc=0.0, scale=180)
  final_date = date + datetime.timedelta(minutes=time_offset)
  return final_date.strftime("%m/%d/%y %H:%M")

def generate_random_day(month):
  day_range = calendar.monthrange(2019,month)[1]
  return random.randint(1,day_range)

def generate_random_address():
  street_names = ['Main', '2nd', '1st', '4th', '5th', 'Park', '6th', '7th', 'Maple', 'Pine', 'Washington', '8th', 'Cedar', 'Elm', 'Walnut', '9th', '10th', 'Lake', 'Sunset', 'Lincoln', 'Jackson', 'Church', 'River', '11th', 'Willow', 'Jefferson', 'Center', '12th', 'North', 'Lakeview', 'Ridge', 'Hickory', 'Adams', 'Cherry', 'Highland', 'Johnson', 'South', 'Dogwood', 'West', 'Chestnut', '13th', 'Spruce', '14th', 'Wilson', 'Meadow', 'Forest', 'Hill', 'Madison']
  cities = ['San Francisco', 'Boston', 'New York City', 'Austin', 'Dallas', 'Atlanta', 'Portland', 'Portland', 'Los Angeles', 'Seattle']
  weights = [9,4,5,2,3,3,2,0.5,6,3]
  zips = ['94016', '02215', '10001', '73301', '75001', '30301', '97035', '04101', '90001', '98101']
  state = ['CA', 'MA', 'NY', 'TX', 'TX', 'GA', 'OR', 'ME', 'CA', 'WA']

  street = random.choice(street_names)
  index = random.choices(range(len(cities)), weights=weights)[0]

  return f"{random.randint(1,999)} {street} St, {cities[index]}, {state[index]} {zips[index]}"

def create_data_csv():
  pass

def write_row(order_number, product, order_date, address):
  product_price = products[product][0]
  quantity = numpy.random.geometric(p=1.0-(1.0/product_price), size=1)[0]
  output = [order_number, product, quantity, product_price, order_date, address]
  return output

if __name__ == '__main__':
  order_number = 141234
  total_rows = 0

  for month in range(1,13):
    if month <= 10:
      orders_amount = int(numpy.random.normal(loc=12000, scale=4000))
    elif month == 11:
      orders_amount = int(numpy.random.normal(loc=20000, scale=3000))
    else: # month == 12
      orders_amount = int(numpy.random.normal(loc=26000, scale=3000))

    product_list = [product for product in products]
    weights = [products[product][1] for product in products]

    df = pd.DataFrame(columns=columns)
    print(orders_amount)

    i = 0
    while orders_amount > 0 and total_rows < 2000:

      address = generate_random_address()
      order_date = generate_random_time(month)

      product_choice = random.choices(product_list, weights)[0]
      df.loc[i] = write_row(order_number, product_choice, order_date, address)
      i += 1
      total_rows += 1

      # Add some items to orders with random chance
      if product_choice == 'iPhone':
        if random.random() < 0.15:
          df.loc[i] = write_row(order_number, "Lightning Charging Cable", order_date, address)
          i += 1
          total_rows += 1
        if random.random() < 0.05:
          df.loc[i] = write_row(order_number, "Apple Airpods Headphones", order_date, address)
          i += 1
          total_rows += 1

        if random.random() < 0.07:
          df.loc[i] = write_row(order_number, "Wired Headphones", order_date, address)
          i += 1
          total_rows += 1 

      elif product_choice == "Google Phone" or product_choice == "Vareebadd Phone":
        if random.random() < 0.18:
          df.loc[i] = write_row(order_number, "USB-C Charging Cable", order_date, address)
          i += 1
          total_rows += 1
        if random.random() < 0.04:
          df.loc[i] = write_row(order_number, "Bose SoundSport Headphones", order_date, address)
          i += 1
          total_rows += 1
        if random.random() < 0.07:
          df.loc[i] = write_row(order_number, "Wired Headphones", order_date, address)
          i += 1 
          total_rows += 1

      if random.random() <= 0.02:
        product_choice = random.choices(product_list, weights)[0]
        df.loc[i] = write_row(order_number, product_choice, order_date, address)
        i += 1
        total_rows += 1

      if random.random() <= 0.002:
        df.loc[i] = columns
        i += 1
        total_rows += 1

      if random.random() <= 0.003:
        df.loc[i] = ["","","","","",""]
        i += 1
        total_rows += 1

      order_number += 1
      orders_amount -= 1

    month_name = calendar.month_name[month]
    df.to_csv(f"Sales_{month_name}_2019.csv", index=False)
    print(f"{month_name} Complete")

18838
January Complete
10983
February Complete
15595
March Complete
6568
April Complete
10617
May Complete
6616
June Complete
7034
July Complete
13479
August Complete
11773
September Complete
14477
October Complete
14992
November Complete
32135
December Complete


In [16]:
import datetime
import calendar
import random
import numpy
import pandas as pd

products = {
    'iPhone': [700, 10],
    'Google Phone': [600, 8],
    'Vareebadd Phone': [400, 3],
    '20in Monitor': [109.99, 6],
    '34in Ultrawide Monitor': [379.99, 9],
    '27in 4K Gaming Monitor': [389.99, 9],
    '27in FHD Monitor': [149.99, 11],
    'Flatscreen TV': [300, 7],
    'Macbook Pro Laptop': [1700, 7],
    'ThinkPad Laptop': [999.99, 6],
    'AA Batteries (4-pack)': [3.84, 30],
    'AAA Batteries (4-pack)': [2.99, 30],
    'USB-C Charging Cable': [11.95, 30],
    'Lightning Charging Cable': [14.95, 30],
    'Wired Headphones': [11.99, 26],
    'Bose SoundSport Headphones': [99.99, 19],
    'Apple Airpods Headphones': [150, 22],
    'LG Washing Machine': [600.00, 1],
    'LG Dryer': [600.00, 1]
}

columns = ['Order ID', 'Product', 'Quantity Ordered', 'Price Each', 'Order Date', 'Purchase Address']

def generate_random_time(year, month):
    day = generate_random_day(year, month)
    if random.random() < 0.5:
        date = datetime.datetime(year, month, day, 12, 00)
    else:
        date = datetime.datetime(year, month, day, 20, 00)
    time_offset = numpy.random.normal(loc=0.0, scale=180)
    final_date = date + datetime.timedelta(minutes=time_offset)
    return final_date.strftime("%m/%d/%y %H:%M")

def generate_random_day(year, month):
    day_range = calendar.monthrange(year, month)[1]
    return random.randint(1, day_range)

def generate_random_address():
    street_names = ['Soekarno Hatta', 'Jend. Sudirman', 'Imam Bonjol', 'Pangeran Diponegoro', 'Jend. Ahmad Yani', 'Raden Ajeng Kartini', 'Dewi Sartika', 'Ir. H. Juanda', 'Pemuda', 'Hayam Wuruk', 'Gajah Mada', 'M. H. Thamrin', 'Pahlawan', 'Asia Afrika', 'Gatot Subroto', 'Merdeka', 'Veteran', 'Kebon Sirih', 'Asia Tenggara', 'Panglima Polin', 'H. Agus Salim']
    cities = ['Bandung', 'Jakarta', 'Tangerang', 'Depok', 'Bogor', 'Bekasi', 'Surabaya', 'Yogyakarta', 'Cirebon', 'Semarang']
    weights = [9, 4, 5, 2, 3, 3, 2, 0.5, 6, 3]
    zips = ['94016', '02215', '10001', '73301', '75001', '30301', '97035', '04101', '90001', '98101']
    state = ['Jawa Barat', 'DKI Jakarta', 'Banten', 'Jawa Barat', 'Jawa Barat', 'Jawa Barat', 'Jawa Timur', 'DIY Yogyakarta', 'Jawa Barat', 'Jawa Tengah']

    street = random.choice(street_names)
    index = random.choices(range(len(cities)), weights=weights)[0]

    return f"{street} St, {cities[index]}, {state[index]} {zips[index]}"

def write_row(order_number, product, order_date, address):
    product_price = products[product][0]
    quantity = numpy.random.geometric(p=1.0-(1.0/product_price), size=1)[0]
    output = [order_number, product, quantity, product_price, order_date, address]
    return output

if __name__ == '__main__':
    current_order_number = 1001  # Start with the initial order number
    total_rows = 0

    df = pd.DataFrame(columns=columns)

    for month in range(1, 13):
        year = 2023
        month_name = calendar.month_name[month]
        orders_amount = random.randint(8000, 30000)  # Randomize the orders_amount]

        #df = pd.DataFrame(columns=columns)

        i = 0
        
        while orders_amount > 0 and total_rows < 2000:
            address = generate_random_address()
            order_date = generate_random_time(year, month)
            product_choice = random.choices(product_list, weights)[0]

            df.loc[total_rows] = write_row(current_order_number, product_choice, order_date, address)
            i += 1
            total_rows += 1

            if product_choice == 'iPhone':
                if random.random() < 0.15:
                    df.loc[i] = write_row(current_order_number, "Lightning Charging Cable", order_date, address)
                    i += 1
                    total_rows += 1
                if random.random() < 0.05:
                    df.loc[i] = write_row(current_order_number, "Apple Airpods Headphones", order_date, address)
                    i += 1
                    total_rows += 1
                if random.random() < 0.07:
                    df.loc[i] = write_row(current_order_number, "Wired Headphones", order_date, address)
                    i += 1
                    total_rows += 1

            elif product_choice == "Google Phone" or product_choice == "Vareebadd Phone":
                if random.random() < 0.18:
                    df.loc[i] = write_row(current_order_number, "USB-C Charging Cable", order_date, address)
                    i += 1
                    total_rows += 1
                if random.random() < 0.04:
                    df.loc[i] = write_row(current_order_number, "Bose SoundSport Headphones", order_date, address)
                    i += 1
                    total_rows += 1
                if random.random() < 0.07:
                    df.loc[i] = write_row(current_order_number, "Wired Headphones", order_date, address)
                    i += 1
                    total_rows += 1

            if random.random() <= 0.02:
                product_choice = random.choices(product_list, weights)[0]
                df.loc[i] = write_row(current_order_number, product_choice, order_date, address)
                i += 1
                total_rows += 1

            if random.random() <= 0.002:
                df.loc[i] = columns
                i += 1
                total_rows += 1

            if random.random() <= 0.003:
                df.loc[i] = ["", "", "", "", "", ""]
                i += 1
                total_rows += 1

            current_order_number += 1
            orders_amount -= 1

        month_name = calendar.month_name[month]
        df = df.sample(frac=1).reset_index(drop=True)
        df.to_csv(f"Sales_{month_name}_2023.csv", index=False)
        print(f"{month_name} Complete")
        


January Complete
February Complete
March Complete
April Complete
May Complete
June Complete
July Complete
August Complete
September Complete
October Complete
November Complete
December Complete


In [18]:
df1 = pd.read_csv('Sales_May_2023.csv')
df1

Unnamed: 0,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
0,1085,Apple Airpods Headphones,1,150,01/12/23 14:49,"Asia Afrika St, Bekasi, Jawa Barat 30301"
1,1423,AAA Batteries (4-pack),1,2.99,01/06/23 20:53,"Gatot Subroto St, Tangerang, Banten 10001"
2,1851,AAA Batteries (4-pack),2,2.99,01/14/23 19:46,"Merdeka St, Jakarta, DKI Jakarta 02215"
3,2432,Bose SoundSport Headphones,1,99.99,01/13/23 16:59,"Pangeran Diponegoro St, Bogor, Jawa Barat 75001"
4,1334,27in FHD Monitor,1,149.99,01/15/23 15:53,"Pemuda St, Bekasi, Jawa Barat 30301"
...,...,...,...,...,...,...
1995,2638,27in FHD Monitor,1,149.99,01/28/23 21:44,"Dewi Sartika St, Tangerang, Banten 10001"
1996,1702,Apple Airpods Headphones,1,150,01/06/23 22:04,"Panglima Polin St, Cirebon, Jawa Barat 90001"
1997,2349,iPhone,1,700,01/11/23 16:43,"Raden Ajeng Kartini St, Cirebon, Jawa Barat 90001"
1998,1258,27in 4K Gaming Monitor,1,389.99,01/06/23 16:09,"Pahlawan St, Jakarta, DKI Jakarta 02215"


## After several code iterations, the final code cell that I'm going to use to create the data is the following one:

#### Note that I'm going to limit the number of rows for each month to 2000, so that the final dataset is not too big.

In [4]:
import datetime
import calendar
import random
import numpy
import pandas as pd
import uuid

products = {
  'iPhone': [18000000, 10],
  'Google Pixel': [15000000, 8],
  'Xiaomi Phone': [8000000, 3],
  '20in Monitor': [2000000,6],
  '34in Ultrawide Monitor': [6000000, 9],
  '27in 4K Gaming Monitor': [6400000,9],
  '27in FHD Monitor': [3000000, 11],
  'Flatscreen TV': [4000000, 7],
  'Macbook Pro Laptop': [26000000, 7],
  'ThinkPad Laptop': [15000000, 6],
  'AA Batteries (4-pack)': [40000, 30],
  'AAA Batteries (4-pack)': [32000, 30],
  'USB-C Charging Cable': [20000, 30],
  'Lightning Charging Cable': [30000, 30],
  'Wired Headphones': [150000, 26],
  'Bose SoundSport Headphones': [1500000, 19],
  'Apple Airpods Headphones': [2500000, 22],
  'LG Washing Machine': [9000000, 1],
  'LG Dryer': [9000000, 1]
}

columns = ['Order_ID', 'Product', 'Quantity_Ordered', 'Price_Each', 'Order_Date', 'Purchase_Address']

def generate_random_time(month):
  day = generate_random_day(month)
  if random.random() < 0.5:
    date = datetime.datetime(2023, month, day,12,00)
  else:
    date = datetime.datetime(2023, month, day,20,00)
  time_offset = numpy.random.normal(loc=0.0, scale=180)
  final_date = date + datetime.timedelta(minutes=time_offset)
  return final_date.strftime("%m/%d/%y %H:%M")

def generate_random_day(month):
  day_range = calendar.monthrange(2023,month)[1]
  return random.randint(1,day_range)

def generate_random_address():
  street_names = ['Soekarno Hatta', 'Jend. Sudirman', 'Imam Bonjol', 'Pangeran Diponegoro', 'Jend. Ahmad Yani', 'Raden Ajeng Kartini', 'Dewi Sartika', 'Ir. H. Juanda', 'Pemuda', 'Hayam Wuruk', 'Gajah Mada', 'M. H. Thamrin', 'Pahlawan', 'Asia Afrika', 'Gatot Subroto', 'Merdeka', 'Veteran', 'Kebon Sirih', 'Asia Tenggara', 'Panglima Polin', 'H. Agus Salim']
  cities = ['Bandung', 'Jakarta', 'Tangerang', 'Depok', 'Bogor', 'Bekasi', 'Surabaya', 'Yogyakarta', 'Cirebon', 'Semarang']
  weights = [9, 4, 5, 2, 3, 3, 2, 0.5, 6, 3]
  zips = ['94016', '02215', '10001', '73301', '75001', '30301', '97035', '04101', '90001', '98101']
  state = ['Jawa Barat', 'DKI Jakarta', 'Banten', 'Jawa Barat', 'Jawa Barat', 'Jawa Barat', 'Jawa Timur', 'DIY Yogyakarta', 'Jawa Barat', 'Jawa Tengah']

  street = random.choice(street_names)
  index = random.choices(range(len(cities)), weights=weights)[0]

  return f"{street} St, {cities[index]}, {state[index]} {zips[index]}"

def create_data_csv():
  pass

def write_row(order_number, product, order_date, address):
  product_price = products[product][0]
  max_quantity = 10
  quantity = random.randint(1,max_quantity)
  output = [order_number, product, quantity, product_price, order_date, address]
  return output

if __name__ == '__main__':
  order_number = 141234
  for month in range(1,13):
    if month <= 10:
      orders_amount = int(numpy.random.normal(loc=12000, scale=4000))
    elif month == 11:
      orders_amount = int(numpy.random.normal(loc=20000, scale=3000))
    else: # month == 12
      orders_amount = int(numpy.random.normal(loc=26000, scale=3000))

    product_list = [product for product in products]
    weights = [products[product][1] for product in products]

    df = pd.DataFrame(columns=columns)
    print(orders_amount)

    i = 0
    while orders_amount > 0 and i < 2000:

      address = generate_random_address()
      order_date = generate_random_time(month)

      product_choice = random.choices(product_list, weights)[0]
      df.loc[i] = write_row(order_number, product_choice, order_date, address)
      i += 1

      # Add some items to orders with random chance
      if product_choice == 'iPhone':
        if random.random() < 0.15:
          df.loc[i] = write_row(order_number, "Lightning Charging Cable", order_date, address)
          i += 1
        if random.random() < 0.05:
          df.loc[i] = write_row(order_number, "Apple Airpods Headphones", order_date, address)
          i += 1

        if random.random() < 0.07:
          df.loc[i] = write_row(order_number, "Wired Headphones", order_date, address)
          i += 1 

      elif product_choice == "Google Phone" or product_choice == "Vareebadd Phone":
        if random.random() < 0.18:
          df.loc[i] = write_row(order_number, "USB-C Charging Cable", order_date, address)
          i += 1
        if random.random() < 0.04:
          df.loc[i] = write_row(order_number, "Bose SoundSport Headphones", order_date, address)
          i += 1
        if random.random() < 0.07:
          df.loc[i] = write_row(order_number, "Wired Headphones", order_date, address)
          i += 1 

      if random.random() <= 0.02:
        product_choice = random.choices(product_list, weights)[0]
        df.loc[i] = write_row(order_number, product_choice, order_date, address)
        i += 1

      if random.random() <= 0.002:
        df.loc[i] = columns
        i += 1

      if random.random() <= 0.003:
        df.loc[i] = ["","","","","",""]
        i += 1

      order_number += 1
      orders_amount -= 1

    month_name = calendar.month_name[month]
    df.to_csv(f"Sales_{month_name}_2023.csv", index=False)
    print(f"{month_name} Complete")

13289
January Complete
8012
February Complete
9881
March Complete
15516
April Complete
10035
May Complete
12480
June Complete
17992
July Complete
6430
August Complete
8585
September Complete
11243
October Complete
18679
November Complete
30173
December Complete
