<a href="https://colab.research.google.com/github/cemsakarya/whitepaper-parkinglot/blob/main/Dunder_Mifflin_Data_Generator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import random
import re
# Seed both random sources the notebook draws from. Seeding only the stdlib
# `random` module is not enough: product prices, unit counts and the
# delivery / data-entry / warehouse latencies are drawn from `np.random`,
# which would otherwise start from a different state on every kernel start.
# (uuid.uuid4() identifiers are not affected by either seed.)
random.seed(2704)
np.random.seed(2704)
number_of_rows = 10_000
from datetime import datetime
from datetime import timedelta
import requests
from bs4 import BeautifulSoup
import uuid

## Parsing the Wikipedia pages for Paper Products and Pennsylvania Counties

In [2]:
def find_between( s, first, last ):
    """Return the part of ``s`` enclosed by ``first`` and ``last``.

    ``last`` is searched for only after the end of the first occurrence of
    ``first``. An empty string is returned when either delimiter is missing.
    """
    head = s.find( first )
    if head == -1:
        return ""
    begin = head + len( first )
    stop = s.find( last, begin )
    return "" if stop == -1 else s[begin:stop]

In [3]:
# Download the Wikipedia category page that lists paper products.
response = requests.get(
    url='https://en.wikipedia.org/wiki/Category:Paper_products',
    timeout=30,
)
# Fail loudly on an HTTP error instead of scraping an error page.
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

# Get all the links
allLinks = soup.find(id="bodyContent").find_all("a")
# Shuffle in place (driven by the seeded `random` module) so the products
# are not kept in page order.
random.shuffle(allLinks)
linkToScrape = 0  # not used anywhere in the visible notebook; kept so the global stays defined

list_of_products = []
for link in allLinks:
    # Keep only anchors whose serialized tag starts with an internal
    # /wiki/ href.
    if str(link)[:15] != """<a href="/wiki/""":
        continue
    # find_between returns "" when a delimiter is missing and does not raise
    # for string input, so no try/except is needed around it.
    s = find_between( str(link), """href="/wiki/""", """" title=""" )
    # Skip links that point at category pages.
    if "category" not in s.lower():
        list_of_products += [s]

In [4]:
# Quick sanity check of the scrape: total count plus the first ten entries.
print(f"Number of Products: {len(list_of_products)}")
print('\n10 Examples\n')
print('\n'.join(list_of_products[:10]))

Number of Products: 100

10 Examples

Chinet
Anaglypta
Holy_card
Receipt
Ticket_(admission)
Softwall
Duo-Tang
Coffee_cup_sleeve
Passbook
Container_compression_test


In [5]:
# Download the Wikipedia page that lists the counties of Pennsylvania.
response = requests.get(
    url='https://en.wikipedia.org/wiki/List_of_counties_in_Pennsylvania',
    timeout=30,
)
# Fail loudly on an HTTP error instead of scraping an error page.
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

# Get all the links
allLinks = soup.find(id="bodyContent").find_all("a")
# Shuffle in place (driven by the seeded `random` module) so the entries
# are not kept in page order.
random.shuffle(allLinks)
linkToScrape = 0  # not used anywhere in the visible notebook; kept so the global stays defined

list_of_counties = []
for link in allLinks:
    # Keep only anchors whose serialized tag starts with an internal
    # /wiki/ href.
    if str(link)[:15] != """<a href="/wiki/""":
        continue
    # find_between returns "" when a delimiter is missing and does not raise
    # for string input, so no try/except is needed around it.
    s = find_between( str(link), """href="/wiki/""", """" title=""" )
    lowered = s.lower()
    # Keep non-category links whose target contains "_pennsylvania".
    # NOTE(review): this substring test may also match non-county pages
    # (e.g. towns named "..., Pennsylvania") -- confirm against the live page.
    if "category" not in lowered and "_pennsylvania" in lowered:
        list_of_counties += [s]

## Datetime

In [6]:
import random
import time
    
def str_time_prop(start, end, time_format, prop):
    """Return the time lying a given fraction of the way from start to end.

    start and end are strings in the strftime-style ``time_format`` and
    bound the interval [start, end]. ``prop`` is the fraction of that
    interval to advance past ``start``. The result is formatted with the
    same ``time_format``.
    """
    # Convert both bounds to seconds since the epoch (local time).
    lower, upper = (
        time.mktime(time.strptime(bound, time_format)) for bound in (start, end)
    )
    moment = lower + prop * (upper - lower)
    return time.strftime(time_format, time.localtime(moment))


def random_date(start, end, prop):
    """Return the time at fraction ``prop`` between ``start`` and ``end``.

    Both bounds, and the returned string, use the month/day/year 12-hour
    clock format shown below.
    """
    timestamp_format = '%m/%d/%Y %I:%M %p'
    return str_time_prop(start, end, timestamp_format, prop)

In [7]:
# Sales staff that customers and orders are randomly assigned to.
list_of_salesman = [
    "Dwight Schrute",
    "Jim Halpert",
    "Stanley Hudson",
    "Phyllis Vance",
    "Andrew Bernard",
]

In [8]:
# Accountants who are randomly picked as the data entry officer of a sale.
list_of_accountants = [
    "Kevin Malone",
    "Angela Martin",
    "Oscar Martinez",
]

In [9]:
# Give every scraped product a random integer unit price in [10, 200).
# Exactly one price is drawn per product, so zip() pairs them up without
# dropping surplus draws or leaving a product unpriced.
prices_of_products = dict(
    zip(list_of_products, np.random.randint(10, 200, size=len(list_of_products)))
)

## Customers

In [10]:
# Word list used as the pool of business names (one word per line).
word_site = "https://www.mit.edu/~ecprice/wordlist.100000"

response = requests.get(word_site, timeout=30)
# Fail loudly on an HTTP error so an error page never becomes the word list.
response.raise_for_status()
# Each entry is a bytes object; it is decoded when a name is generated.
WORDS = response.content.splitlines()

def business_name_generator():
    """Return a random business name and retire the word it was built from.

    A word is picked at random from the global ``WORDS`` list and removed
    from it, then returned decoded with an ".inc" suffix.
    """
    picked = random.choice(WORDS)
    WORDS.remove(picked)
    suffix = ".inc"
    return picked.decode("utf-8") + suffix

In [11]:
!pip install names
import names

def name_generator():
    """Return a full name produced by ``names.get_full_name()``."""
    full_name = names.get_full_name()
    return full_name

Collecting names
  Downloading names-0.3.0.tar.gz (789 kB)
[?25l[K     |▍                               | 10 kB 25.9 MB/s eta 0:00:01[K     |▉                               | 20 kB 29.7 MB/s eta 0:00:01[K     |█▎                              | 30 kB 13.6 MB/s eta 0:00:01[K     |█▋                              | 40 kB 10.7 MB/s eta 0:00:01[K     |██                              | 51 kB 8.0 MB/s eta 0:00:01[K     |██▌                             | 61 kB 9.3 MB/s eta 0:00:01[K     |███                             | 71 kB 9.5 MB/s eta 0:00:01[K     |███▎                            | 81 kB 9.7 MB/s eta 0:00:01[K     |███▊                            | 92 kB 10.6 MB/s eta 0:00:01[K     |████▏                           | 102 kB 11.5 MB/s eta 0:00:01[K     |████▋                           | 112 kB 11.5 MB/s eta 0:00:01[K     |█████                           | 122 kB 11.5 MB/s eta 0:00:01[K     |█████▍                          | 133 kB 11.5 MB/s eta 0:00:01[K     |██

In [12]:
# Job titles drawn at random for customer representatives.
# "Supply Manager" is listed twice and the last entry is NaN (no title).
list_of_job_titles = [
    "General Manager",
    "Administrative Assistant",
    "Executive Assistant",
    "Marketing Manager",
    "Customer Service Representative",
    "Nurse Practitioner",
    "Sales Manager",
    "Data Entry Clerk",
    "Office Assistant",
    "Supply Manager",
    "Supply Manager",
    np.nan,
]

In [13]:
class _customer:
    """A randomly generated customer company with three contact people."""

    def __init__(self):
        self.name = business_name_generator()
        self.customer_id = uuid.uuid4().hex
        self.county = random.choice(list_of_counties)

        ranks = ("Main", "Secondary", "Tertiary")
        # All three names are generated before any job title is drawn, so the
        # sequence of random draws is: names first, then titles.
        for rank in ranks:
            setattr(self, f"{rank}_Customer_Representative", name_generator())
        for rank in ranks:
            setattr(
                self,
                f"{rank}_Customer_Representative_Job_Title",
                random.choice(list_of_job_titles),
            )

        # The salesman responsible for this customer.
        self.salesman = random.choice(list_of_salesman)

In [14]:
# Build the customer table: one row per generated customer.
# Rows are collected in a plain list and the DataFrame is created once,
# because DataFrame.append was removed in pandas 2.0 and growing a frame row
# by row copies it on every iteration. As a side effect the rows get a unique
# 0..n-1 index instead of every row being labelled 0.
customer_rows = []
for _ in range(500):
    customer = _customer()
    customer_rows.append([
        customer.name,
        customer.customer_id,
        customer.county,
        customer.Main_Customer_Representative,
        customer.Secondary_Customer_Representative,
        customer.Tertiary_Customer_Representative,
        customer.Main_Customer_Representative_Job_Title,
        customer.Secondary_Customer_Representative_Job_Title,
        customer.Tertiary_Customer_Representative_Job_Title,
        customer.salesman,
    ])

# Columns are positional (0..9) at this point; they are named in the next cell.
customer_df = pd.DataFrame(customer_rows)

In [15]:
# Name the ten positional columns; the order matches the values collected
# for each customer when the table was built.
customer_df.columns = [
    "name",
    "customer_id",
    "county",
    "Main_Customer_Representative",
    "Secondary_Customer_Representative",
    "Tertiary_Customer_Representative",
    "Main_Customer_Representative_Job_Title",
    "Secondary_Customer_Representative_Job_Title",
    "Tertiary_Customer_Representative_Job_Title",
    "Salesman",
]

## Sales

In [17]:
class _sales:
    """One randomly generated sales order.

    ``salesman`` and ``customer_representative`` are stored as given; the
    identifier, timestamp, product, quantity and follow-up dates are drawn
    at random.
    """

    def __init__(self, salesman, customer_representative):
        timestamp_format = '%m/%d/%Y %I:%M %p'
        date_format = "%m/%d/%Y"

        self.sales_id = uuid.uuid4().hex

        # The order time is drawn as formatted text between the two bounds
        # and then parsed back into a datetime.
        self.sales_datetime_str = random_date("1/1/2008 12:00 PM", "1/1/2010 12:00 PM", random.random())
        self.sales_datetime = datetime.strptime(self.sales_datetime_str, timestamp_format)

        self.product = random.choice(list_of_products)
        self.number_of_units = np.random.randint(100, 2000, size=1)[0]
        self.price_by_unit = prices_of_products[self.product]
        self.salesman = salesman

        # Delivery happens 1-29 days after the sale.
        delivery_days = int(np.random.randint(1, 30, size=1)[0])
        self.delivery_how_many_days_later = timedelta(days=delivery_days)
        self.delivery_date = (self.sales_datetime + self.delivery_how_many_days_later).strftime(date_format)

        # The sale is entered into the books 1-9 days after it was made.
        entry_days = int(np.random.randint(1, 10, size=1)[0])
        self.data_entry_how_many_days_later = timedelta(days=entry_days)
        self.data_entry_date = (self.sales_datetime + self.data_entry_how_many_days_later).strftime(date_format)
        self.data_entry_officer = random.choice(list_of_accountants)

        self.customer_representative = customer_representative

In [18]:
# Contact people available to each salesman (main, then secondary, then
# tertiary representatives of that salesman's customers). Built once up
# front instead of re-filtering customer_df three times per generated sale.
representatives_by_salesman = {}
for seller in list_of_salesman:
    assigned_customers = customer_df[customer_df["Salesman"] == seller]
    representatives_by_salesman[seller] = (
        assigned_customers["Main_Customer_Representative"].tolist()
        + assigned_customers["Secondary_Customer_Representative"].tolist()
        + assigned_customers["Tertiary_Customer_Representative"].tolist()
    )

# Build the sales table: one row per generated sale.
# Rows are collected in a plain list and the DataFrame is created once,
# because DataFrame.append was removed in pandas 2.0 and growing a frame row
# by row copies it on every iteration. As a side effect the rows get a unique
# 0..n-1 index instead of every row being labelled 0.
sales_rows = []
for _ in range(number_of_rows):
    salesman = random.choice(list_of_salesman)
    customer_representative = random.choice(representatives_by_salesman[salesman])

    sale = _sales(salesman, customer_representative)
    sales_rows.append([
        sale.sales_id,
        sale.sales_datetime_str,
        sale.sales_datetime,
        sale.product,
        sale.number_of_units,
        sale.price_by_unit,
        sale.salesman,
        sale.delivery_how_many_days_later,
        sale.delivery_date,
        sale.data_entry_how_many_days_later,
        sale.data_entry_date,
        sale.data_entry_officer,
        sale.customer_representative,
    ])

# Columns are positional (0..12) at this point; they are named in the next cell.
sales_df = pd.DataFrame(sales_rows)

In [19]:
# Name the thirteen positional columns; the order matches the values
# collected for each sale when the table was built.
sales_df.columns = [
    "Order ID",
    "Order Entry Datetime Str",
    "Order Entry Datetime",
    "Product",
    "# of Units",
    "Price by Unit",
    "Salesman",
    "Delivery How Many Dates Later",
    "Delivery Date",
    "Data Entry How Many Dates Later",
    "Data Entry Date",
    "Data Entry Officer",
    "Customer Representative",
]

## Warehouse

In [20]:
class _warehouse:
    """Warehouse log entry for one order: when it was loaded and delivered.

    ``order_date`` is a string in month/day/year 12-hour clock format and
    ``latency_customer`` is a time delta that is added to it.
    """

    def __init__(self, order_id, order_date, latency_customer):
        date_format = "%m/%d/%Y"

        self.order_id = order_id
        self.order_date = datetime.strptime(order_date, '%m/%d/%Y %I:%M %p')
        self.latency_customer = latency_customer
        self.quality_control = None

        # Loading happens 0-4 days after the customer latency has passed.
        load_days = int(np.random.randint(0, 5, size=1)[0])
        self.latency_load = timedelta(days=load_days)
        loaded_at = self.order_date + self.latency_customer + self.latency_load
        self.load_date = loaded_at.strftime(date_format)

        # Delivery follows 0-4 days after loading.
        transit_days = int(np.random.randint(0, 5, size=1)[0])
        self.latency_delivery = timedelta(days=transit_days)
        self.delivery_date = (loaded_at + self.latency_delivery).strftime(date_format)

In [None]:
# Build the warehouse log: one entry per sale.
# Rows are collected in a plain list and the DataFrame is created once,
# because DataFrame.append was removed in pandas 2.0 and growing a frame row
# by row copies it on every iteration. As a side effect the rows get a unique
# 0..n-1 index instead of every row being labelled 0.
warehouse_rows = []
for index, row in sales_df.iterrows():
    log = _warehouse(row["Order ID"], row["Order Entry Datetime Str"], row["Delivery How Many Dates Later"])

    warehouse_rows.append([
        log.order_id,
        log.order_date,
        log.latency_customer,
        log.quality_control,
        log.latency_load,
        log.load_date,
        log.latency_delivery,
        log.delivery_date,
    ])

# Columns are positional (0..7) at this point; they are named in the next cell.
warehouse_logs = pd.DataFrame(warehouse_rows)

In [None]:
# Name the eight positional columns; the order matches the values collected
# for each warehouse log entry when the table was built.
warehouse_logs.columns = [
    "Order ID", "Order Date",
    "Latency Customer",
    "Quality Control Check is Done",
    "Latency Load", "Load Date",
    "Latency Delivery", "Delivery Date",
]

In [None]:
# Show the generated sales table as this cell's output.
sales_df