# All material ©2019, Alex Siegman

---

### Below is the code used to generate the mock dataset for Stern Technologies

In [145]:
import pandas as pd
import random
import uuid # https://docs.python.org/3/library/uuid.html

In [146]:
# user ID creation

# Build 50,000 random UUIDs (uuid.uuid4), stored in their string form.
# A comprehension is used so that no loop variable named `id` is bound at
# notebook level, which would shadow the built-in id() for every later cell.
id_list = [str(uuid.uuid4()) for _ in range(50000)]

In [147]:
id_list[:5] # preview the first five UUIDs generated above

['081217b4-1cf5-4657-8287-6db1b75462e4',
 'd0b45a01-b73d-4f8e-bfa8-c53ea75397f1',
 '1dc2e636-e19b-4d42-b228-df09cd009acb',
 '5d09d6d4-023e-4fa1-9559-89526679e885',
 'b69e54e3-fc89-4c0f-8bdb-280409db173e']

In [148]:
# company size creation

# Candidate size labels, defined once instead of being rebuilt on every iteration.
sizes = ['startup', 'small', 'medium', 'large']

# One randomly chosen size label per entry, 50,000 entries in total.
company_size_list = [random.choice(sizes) for _ in range(50000)]

In [149]:
company_size_list[:5] # preview the first five company sizes generated above

['large', 'large', 'large', 'large', 'medium']

In [150]:
# age

# Draw 50,000 ages; random.randint includes both endpoints, so values span 18..99.
age_list = [random.randint(18, 99) for _ in range(50000)]

In [151]:
age_list[:5] # preview the first five ages generated above

[92, 56, 20, 55, 25]

In [152]:
# sex

sexes = ['M', 'F', 'N']  # 'N' stands for non-binary

# Pick one code from `sexes` at random for each of the 50,000 entries.
sex_list = [random.choice(sexes) for _ in range(50000)]

In [153]:
sex_list[:5] # preview the first five sexes generated above

['M', 'M', 'F', 'F', 'N']

In [154]:
# clicked on ad

actions = ['Yes', 'No']  # the two possible outcomes

# Record a random 'Yes' / 'No' outcome for each of the 50,000 entries.
clicked_on_ad_list = [random.choice(actions) for _ in range(50000)]

In [155]:
clicked_on_ad_list[:5] # preview the first five click outcomes generated above

['Yes', 'Yes', 'No', 'Yes', 'No']

In [156]:
# ad type

ad_types = [
    'Tech',
    'Luxury',
    'Culinary',
    'Fashion',
    'Travel',
    'Political',
    'Business',
    'Real Estate',
]

# Pick one ad category at random for each of the 50,000 entries.
ad_type_list = [random.choice(ad_types) for _ in range(50000)]

In [157]:
ad_type_list[:5] # preview the first five ad types generated above

['Business', 'Culinary', 'Business', 'Political', 'Tech']

In [158]:
# location

locations = [
    'US',
    'NorthEast',
    'SouthEast',
    'MidWest',
    'SouthWest',
    'NorthWest',
    'Canada',
    'Mexico',
    'SouthAmerica',
]

# Pick one location label at random for each of the 50,000 entries.
location_list = [random.choice(locations) for _ in range(50000)]

In [159]:
location_list[:5] # preview the first five locations generated above

['MidWest', 'SouthWest', 'SouthEast', 'NorthWest', 'US']

In [160]:
# timestamp

# NOTE(review): wildcard import made mid-notebook rather than in the import cell;
# gen_datetime below relies on the `datetime` class it brings into scope.
from datetime import *

timestamp_list = [] # populated further down in this cell

def gen_datetime(min_year=2000, max_year=None):
    """Return a random datetime within the calendar years min_year..max_year.

    The value is drawn from the half-open interval that starts at
    January 1 of ``min_year`` (00:00:00) and ends just before January 1 of
    ``max_year + 1``.

    Args:
        min_year: First calendar year that may be returned (default 2000).
        max_year: Last calendar year that may be returned. ``None`` (the
            default) means the current year, looked up on every call instead
            of being frozen when the function is defined.

    Returns:
        A naive ``datetime`` inside the requested span.

    Raises:
        ValueError: If ``min_year`` is greater than ``max_year``.
    """
    if max_year is None:
        max_year = datetime.now().year
    if min_year > max_year:
        raise ValueError(
            "min_year (%r) must not be greater than max_year (%r)" % (min_year, max_year)
        )
    start = datetime(min_year, 1, 1)
    # Use the real start of the following year as the upper bound. Approximating
    # the span as 365 days per year loses one day per leap year, which left the
    # final days of max_year unreachable.
    end = datetime(max_year + 1, 1, 1)
    return start + (end - start) * random.random()

# Append 50,000 random timestamps, each produced by a separate gen_datetime() call.
timestamp_list.extend(gen_datetime() for _ in range(50000))

timestamp_list[:5] # preview the first five random timestamps generated above

[datetime.datetime(2018, 8, 26, 6, 0, 27, 124290),
 datetime.datetime(2011, 6, 1, 18, 54, 34, 815634),
 datetime.datetime(2013, 7, 16, 0, 24, 47, 888180),
 datetime.datetime(2010, 6, 25, 12, 13, 51, 369878),
 datetime.datetime(2010, 9, 22, 7, 53, 12, 454909)]

In [161]:
# Combine the generated lists into a single DataFrame; each dict key becomes
# the column name and the column order follows the order of the keys.
user_data = pd.DataFrame(
    {
        'id': id_list,
        'company_size': company_size_list,
        'age': age_list,
        'sex': sex_list,
        'clicked_on_ad': clicked_on_ad_list,
        'ad_type': ad_type_list,
        'location': location_list,
        'timestamp': timestamp_list,
    }
)

In [162]:
# display the assembled mock dataset
user_data

Unnamed: 0,id,company_size,age,sex,clicked_on_ad,ad_type,location,timestamp
0,081217b4-1cf5-4657-8287-6db1b75462e4,large,92,M,Yes,Business,MidWest,2018-08-26 06:00:27.124290
1,d0b45a01-b73d-4f8e-bfa8-c53ea75397f1,large,56,M,Yes,Culinary,SouthWest,2011-06-01 18:54:34.815634
2,1dc2e636-e19b-4d42-b228-df09cd009acb,large,20,F,No,Business,SouthEast,2013-07-16 00:24:47.888180
3,5d09d6d4-023e-4fa1-9559-89526679e885,large,55,F,Yes,Political,NorthWest,2010-06-25 12:13:51.369878
4,b69e54e3-fc89-4c0f-8bdb-280409db173e,medium,25,N,No,Tech,US,2010-09-22 07:53:12.454909
5,26753d01-eaf3-4906-a0e8-ec3364e336ab,large,87,N,Yes,Political,Canada,2018-08-07 01:14:12.732566
6,8b449b66-55fb-411b-a27d-81f88d12d6e2,startup,39,N,No,Real Estate,SouthEast,2003-05-31 07:24:51.873042
7,ec5a6353-f816-4737-8fed-a8a620fe70c3,medium,39,N,No,Business,SouthAmerica,2015-08-26 08:07:55.115573
8,3d732520-cbb7-4e90-8d17-5dcd12ae3ea2,small,62,M,Yes,Tech,SouthAmerica,2017-06-15 10:03:34.420593
9,eedba0f5-0e4b-49b2-8409-62b24761a05a,large,76,F,No,Luxury,SouthEast,2008-10-10 16:28:52.002102


In [163]:
# summarize the frame: index range, per-column non-null counts and dtypes, memory usage
user_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 8 columns):
id               50000 non-null object
company_size     50000 non-null object
age              50000 non-null int64
sex              50000 non-null object
clicked_on_ad    50000 non-null object
ad_type          50000 non-null object
location         50000 non-null object
timestamp        50000 non-null datetime64[ns]
dtypes: datetime64[ns](1), int64(1), object(6)
memory usage: 3.1+ MB


In [164]:
# Write the dataset to a CSV file, using a path relative to the current working directory.
# NOTE(review): no `index` argument is passed, so the row index is presumably
# written as an extra unnamed first column (it would read back as "Unnamed: 0")
# — confirm whether index=False is wanted before changing the file format.
user_data.to_csv('./SternTech_UserData.csv')