In [1]:
# libraries
import pandas as pd
import numpy as np
import datetime
import calendar
import random
import uuid

In [2]:
# Product catalogue: name -> [unit price, sampling weight].
# Element 0 is read as the price when a row is written; element 1 is used as
# the relative weight when a product is drawn at random for an order.
products = {
    'nokia': [600, 70],
    'samsung': [800, 90],
    'iphone': [700, 100],
    'micromax': [400, 30],
    'sony': [200, 30],
    'sony tv': [500, 40],
    'sony laptop': [250, 30],
    'macbook': [1200, 110],
    'lg electonics': [555.25, 130],
    'asus': [125.50, 60],
    'usb-cables': [50.75, 500],
    'Wired Headphones': [500.75, 450],
    'washing machines': [600.00, 160],
    'Desktops': [900, 45],
    'wireless headphones': [8.99, 35],
    'Lightning Charging Cable': [14.95, 75],
    'Apple Airpods Headphones': [150, 32],
}

In [3]:
# Header of every generated CSV, in output order.
columns = [
    'Order ID',
    'Product',
    'Quantity Ordered',
    'Price Each',
    'Order Date',
    'Purchase Address',
]

In [4]:
def generate_random_time(month):
    """Return a random order timestamp within `month` of 2020.

    A random day of the month is picked, then one of two equally likely
    anchor times (12:00 or 20:00) is shifted by a normally distributed
    offset (mean 0, standard deviation 180 minutes).

    Args:
        month: Month number, 1-12.

    Returns:
        The timestamp formatted as "DD/MM/YY HH:MM".
    """
    day = generate_random_day(month)

    # Coin flip between the noon and the 8 pm anchor.
    anchor_hour = 12 if random.random() < 0.50 else 20
    anchor = datetime.datetime(2020, month, day, anchor_hour, 0)

    # The offset is unbounded, so a large draw can move the result onto an
    # adjacent calendar day.
    offset_minutes = np.random.normal(loc=0.0, scale=180)
    shifted = anchor + datetime.timedelta(minutes=offset_minutes)
    return shifted.strftime("%d/%m/%y %H:%M")

In [5]:
def generate_random_day(month):
  """Return a uniformly random valid day number for `month` in 2020.

  Args:
      month: Month number, 1-12.

  Returns:
      An int between 1 and the number of days in that month (inclusive).
  """
  # monthrange yields (weekday of the 1st, number of days); only the latter is needed.
  _first_weekday, days_in_month = calendar.monthrange(2020, month)
  return random.randint(1, days_in_month)

In [6]:
def generate_random_address():
  """Return a random address string "<number> <street> St, <city>, <state> <zip>".

  The street is drawn uniformly; the city is drawn with the weights listed
  below and always carries the state code and zip code paired with it. The
  house number is uniform in 1..999.
  """
  street_names = [
      'Main', '2nd', '1st', '4th', '5th', 'Park', '6th', '7th', 'Taj mahal',
      'charminar', '8th', '9th', '10th', 'beach', 'Sunset', 'Church', 'River',
      '11th', 'Center', '12th', 'North', 'Lakeview', 'Ridge', 'bridge',
      'temple', 'Garden', 'mountains', 'north', 'South', 'woods', 'West',
      'Forest', 'Hill',
  ]
  # (city, state code, zip code, sampling weight)
  # NOTE(review): some state codes look mismatched with their city
  # (e.g. Hyderabad -> 'TN', chennai -> 'T') - confirm whether intentional.
  locations = [
      ('Hyderabad', 'TN', '94016', 9),
      ('Delhi', 'UT', '02215', 4),
      ('Banglore', 'KA', '10001', 5),
      ('pune', 'M', '73301', 3.5),
      ('chennai', 'T', '75001', 3),
      ('vizag', 'AP', '30301', 6),
  ]

  street = random.choice(street_names)
  location_weights = [entry[3] for entry in locations]
  city, state_code, zip_code, _weight = random.choices(locations, weights=location_weights)[0]
  house_number = random.randint(1, 999)

  return f"{house_number} {street} St, {city}, {state_code} {zip_code}"

In [7]:
def create_data_csv():
  """Placeholder with no implementation; does nothing and returns None."""
  return None

def write_row(order_number, product, order_date, address):
  """Assemble one order row in the same order as the CSV header.

  Args:
      order_number: Identifier written to the 'Order ID' column.
      product: Key into the module-level `products` catalogue.
      order_date: Pre-formatted order timestamp.
      address: Pre-formatted purchase address.

  Returns:
      [order_number, product, quantity, unit price, order_date, address]
  """
  unit_price = products[product][0]
  # Quantity is geometric with success probability 1 - 1/price, so the
  # higher the price, the more likely the quantity is exactly 1.
  success_probability = 1.0 - (1.0 / unit_price)
  quantity = np.random.geometric(p=success_probability, size=1)[0]
  return [order_number, product, quantity, unit_price, order_date, address]


In [8]:
if __name__ == '__main__':
  # Generates one CSV of synthetic orders per month of 2020
  # ("Sales_<Month>_2020.csv"), with order IDs increasing across months.

  # Add-on rules: primary product -> (add-on product, probability) pairs.
  # Each add-on is rolled independently and shares the order's ID, date and
  # address. Keys must match `products` exactly: the previous check compared
  # against 'iPhone', which never equals the catalogue key 'iphone', so
  # iphone orders never received any add-ons.
  iphone_addons = [
      ("Lightning Charging Cable", 0.15),
      ("Apple Airpods Headphones", 0.05),
      ("Wired Headphones", 0.07),
      ("sony", 0.09),
  ]
  other_phone_addons = [
      ("usb-cables", 0.18),
      ("wireless headphones", 0.04),
      ("Wired Headphones", 0.06),
      ("sony tv", 0.15),
  ]
  addon_rules = {
      'iphone': iphone_addons,
      # "Google Phone" is not in `products` at present; the rule is kept so it
      # applies if that product is added to the catalogue.
      "Google Phone": other_phone_addons,
      "micromax": other_phone_addons,
  }

  # The catalogue and its sampling weights do not change between months.
  product_list = list(products)
  weights = [products[product][1] for product in product_list]

  order_number = 141234
  for month in range(1,13):
    # Number of orders for the month: November and December are drawn from
    # distributions with higher means than the other months.
    if month <= 10:
      orders_amount = int(np.random.normal(loc=12000, scale=4000))
    elif month == 11:
      orders_amount = int(np.random.normal(loc=20000, scale=3000))
    else: # month == 12
      orders_amount = int(np.random.normal(loc=26000, scale=3000))

    print(orders_amount)

    # Collect rows in a list and build the DataFrame once per month; growing
    # a DataFrame one row at a time via df.loc[i] copies data on every insert.
    rows = []
    while orders_amount > 0:

      address = generate_random_address()
      order_date = generate_random_time(month)

      product_choice = random.choices(product_list, weights)[0]
      rows.append(write_row(order_number, product_choice, order_date, address))

      # Add some items to orders with random chance
      for addon_product, probability in addon_rules.get(product_choice, ()):
        if random.random() < probability:
          rows.append(write_row(order_number, addon_product, order_date, address))

      # Small chance of one more randomly chosen product on the same order.
      if random.random() <= 0.02:
        product_choice = random.choices(product_list, weights)[0]
        rows.append(write_row(order_number, product_choice, order_date, address))

      # Occasionally insert a copy of the header as a data row
      # (presumably to simulate dirty data for later cleaning).
      if random.random() <= 0.002:
        rows.append(list(columns))

      # Occasionally insert an all-blank row (presumably for the same reason).
      if random.random() <= 0.009:
        rows.append(["","","","","",""])

      order_number += 1
      orders_amount -= 1

    # `df` stays bound to the most recent month's frame for later cells.
    df = pd.DataFrame(rows, columns=columns)

    month_name = calendar.month_name[month]
    df.to_csv(f"Sales_{month_name}_2020.csv", index=False)
    print(f"{month_name} Complete")


12579
January Complete
7892
February Complete
11392
March Complete
15170
April Complete
11135
May Complete
12738
June Complete
10374
July Complete
12931
August Complete
6741
September Complete
19756
October Complete
18045
November Complete
23948
December Complete


In [9]:
# Display the DataFrame currently bound to `df`.
df

Unnamed: 0,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
0,279987,usb-cables,1,50.75,13/12/20 13:35,"372 7th St, vizag, AP 30301"
1,279988,macbook,1,1200.00,08/12/20 13:18,"51 north St, Banglore, KA 10001"
2,279989,Wired Headphones,1,500.75,31/12/20 15:03,"284 2nd St, pune, M 73301"
3,279990,Lightning Charging Cable,1,14.95,18/12/20 11:51,"477 River St, Hyderabad, TN 94016"
4,279991,Wired Headphones,1,500.75,31/12/20 12:45,"558 Hill St, pune, M 73301"
...,...,...,...,...,...,...
24796,303930,usb-cables,1,50.75,19/12/20 15:37,"205 woods St, pune, M 73301"
24797,303931,iphone,1,700,13/12/20 06:50,"8 4th St, Delhi, UT 02215"
24798,303932,usb-cables,1,50.75,17/12/20 16:10,"202 Forest St, Banglore, KA 10001"
24799,303933,Wired Headphones,1,500.75,18/12/20 21:20,"265 4th St, Banglore, KA 10001"


In [10]:
# Count how often each distinct value occurs in the 'Quantity Ordered' column.
df['Quantity Ordered'].value_counts()

1                   24300
2                     250
                      200
Quantity Ordered       41
3                      10
Name: Quantity Ordered, dtype: int64

In [11]:
# List the distinct values present in the 'Quantity Ordered' column.
df['Quantity Ordered'].unique()

array([1, 2, '', 'Quantity Ordered', 3], dtype=object)