/
mock_customer.py
81 lines (61 loc) · 3.76 KB
/
mock_customer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
from __future__ import division
from builtins import range
from past.utils import old_div
import pandas as pd
from numpy import random
from numpy.random import choice
import featuretools as ft
def load_mock_customer(n_customers=5, n_products=5, n_sessions=35, n_transactions=500,
                       random_seed=0, return_single_table=False, return_entityset=False):
    """Return dataframes of mock customer data.

    Builds four related tables (customers, products, sessions and
    transactions) filled with pseudo-random values. Timestamps are evenly
    spaced: customers join 50 days apart starting 2008-01-01, and
    transactions occur 65 seconds apart starting 2014-01-01.

    Args:
        n_customers (int): Number of rows in the customers table.
        n_products (int): Number of rows in the products table.
        n_sessions (int): Number of sessions to generate. A session that is
            assigned no transaction is dropped (see the merge below), so the
            returned sessions table may contain fewer rows than this.
        n_transactions (int): Number of rows in the transactions table.
        random_seed (int): Seed handed to ``numpy.random.seed``. This reseeds
            NumPy's *global* random state as a side effect.
        return_single_table (bool): If True, return a single DataFrame made by
            merging transactions, sessions, customers and products. Takes
            precedence over ``return_entityset`` when both are True.
        return_entityset (bool): If True (and ``return_single_table`` is
            False), return a featuretools ``EntitySet`` with id
            ``"transactions"`` holding the four tables and their
            relationships.

    Returns:
        By default a dict with the keys ``"customers"``, ``"sessions"``,
        ``"transactions"`` and ``"products"``, each mapping to a
        ``pandas.DataFrame``. See ``return_single_table`` and
        ``return_entityset`` for the alternative return values.
    """
    random.seed(random_seed)

    # NOTE: the order of the random draws below fixes the values produced for
    # a given seed, so the statements must not be reordered.
    customers_df = pd.DataFrame({"customer_id": range(1, n_customers + 1)})
    customers_df["zip_code"] = choice(["60091", "02139"], n_customers)
    # TODO: make these less regular
    customers_df["join_date"] = pd.date_range('1/1/2008', periods=n_customers, freq='50D')

    products_df = pd.DataFrame({"product_id": range(1, n_products + 1)})
    products_df["brand"] = choice(["A", "B", "C"], n_products)

    sessions_df = pd.DataFrame({"session_id": range(1, n_sessions + 1)})
    sessions_df["customer_id"] = choice(customers_df["customer_id"], n_sessions)
    sessions_df["device"] = choice(["desktop", "mobile", "tablet"], n_sessions)

    transactions_df = pd.DataFrame({"transaction_id": range(1, n_transactions + 1)})
    transactions_df["session_id"] = choice(sessions_df["session_id"], n_transactions)
    # Sort by session before assigning times so that each session's
    # transactions occupy one contiguous, increasing stretch of timestamps.
    transactions_df = transactions_df.sort_values("session_id").reset_index(drop=True)
    # TODO: make these less regular
    transactions_df["transaction_time"] = pd.date_range('1/1/2014', periods=n_transactions, freq='65s')
    transactions_df["product_id"] = pd.Categorical(choice(products_df["product_id"], n_transactions))
    # Integer cents in [500, 15000) converted to a two-decimal amount; the
    # float divisor makes this true division on every Python version.
    transactions_df["amount"] = random.randint(500, 15000, n_transactions) / 100.0

    # Calculate and merge in the session start, based on the times we came up
    # with for transactions: the first row kept per session is its earliest
    # transaction because the frame is ordered by session and then by time.
    first_transactions = transactions_df.drop_duplicates("session_id")
    session_starts = first_transactions[["session_id", "transaction_time"]].rename(
        columns={"transaction_time": "session_start"})
    # Default (inner) merge: sessions without any transaction are dropped.
    sessions_df = sessions_df.merge(session_starts)

    if return_single_table:
        single_table = (transactions_df.merge(sessions_df)
                        .merge(customers_df)
                        .merge(products_df))
        return single_table.reset_index(drop=True)

    if return_entityset:
        return _mock_customer_entityset(customers_df, sessions_df,
                                        transactions_df, products_df)

    return {"customers": customers_df,
            "sessions": sessions_df,
            "transactions": transactions_df,
            "products": products_df}


def _mock_customer_entityset(customers_df, sessions_df, transactions_df, products_df):
    """Assemble the "transactions" EntitySet from the four mock dataframes."""
    es = ft.EntitySet(id="transactions")
    es = es.entity_from_dataframe(entity_id="transactions",
                                  dataframe=transactions_df,
                                  index="transaction_id",
                                  time_index="transaction_time",
                                  variable_types={"product_id": ft.variable_types.Categorical})
    es = es.entity_from_dataframe(entity_id="products",
                                  dataframe=products_df,
                                  index="product_id")
    es = es.entity_from_dataframe(entity_id="sessions",
                                  dataframe=sessions_df,
                                  index="session_id",
                                  time_index="session_start")
    es = es.entity_from_dataframe(entity_id="customers",
                                  dataframe=customers_df,
                                  index="customer_id",
                                  time_index="join_date")

    # Each relationship is (parent variable, child variable).
    rels = [ft.Relationship(es["products"]["product_id"],
                            es["transactions"]["product_id"]),
            ft.Relationship(es["sessions"]["session_id"],
                            es["transactions"]["session_id"]),
            ft.Relationship(es["customers"]["customer_id"],
                            es["sessions"]["customer_id"])]
    es = es.add_relationships(rels)
    return es