In [1]:
import pandas as pd 
import numpy as np
import json
import matplotlib.pyplot as plt
import re
import sys
import os

pd.options.display.max_columns = 1000
pd.options.display.max_rows = 10

%matplotlib inline

base_path = "seeds/"

In [2]:
companies = pd.read_csv(base_path + "companylist.csv")
companies.head()

Unnamed: 0,Symbol,Name,LastSale,MarketCap,ADR TSO,IPOyear,Sector,Industry,Summary Quote,Unnamed: 9
0,YI,"111, Inc.",13.71,98369250.0,7175000.0,2018.0,Health Care,Medical/Nursing Services,https://www.nasdaq.com/symbol/yi,
1,PIH,"1347 Property Insurance Holdings, Inc.",6.25,37404790.0,,2014.0,Finance,Property-Casualty Insurers,https://www.nasdaq.com/symbol/pih,
2,PIHPP,"1347 Property Insurance Holdings, Inc.",25.3101,0.0,,,Finance,Property-Casualty Insurers,https://www.nasdaq.com/symbol/pihpp,
3,TURN,180 Degree Capital Corp.,2.25,70023510.0,,,Finance,Finance/Investors Services,https://www.nasdaq.com/symbol/turn,
4,FLWS,"1-800 FLOWERS.COM, Inc.",10.65,688032800.0,,1999.0,Consumer Services,Other Specialty Stores,https://www.nasdaq.com/symbol/flws,


In [3]:
names = pd.read_csv(base_path + "first_names.csv")
names.sample(10)

Unnamed: 0,Name,Gender,Count
7806,Ariane,M,5
93176,Terrisa,F,287
80849,Rosemery,F,28
90977,Talulla,F,38
45259,Jimmi,M,49
46313,Jontez,M,124
63413,Maloy,M,5
41761,Japree,M,5
80079,Rocco,F,5
52811,Khadjiah,F,10


In [4]:
addresses = pd.read_csv(base_path + "addresses.csv")
addresses.head()

Unnamed: 0,Name of Institution,Street Address,City,ZIP Code,County,Location 1
0,Abacus Federal Savings Bank,36-30 Main Street,Flushing,11354.0,Queens,"36 30 Main Street\nFlushing, NY 11354\n"
1,Adirondack Bank,448 Route 3,Plattsburgh,12901.0,Clinton,"448 Route\nPlattsburgh, NY 12901\n"
2,Adirondack Bank,13150 State Route 12,Boonville,13309.0,Oneida,"13150 State Route\nBoonville, NY 13309\n"
3,Adirondack Bank,Utica College,Utica,13501.0,Oneida,
4,"Adirondack Trust Company, The",112 Broadway,Saratoga Springs,12866.0,Saratoga,"112 Broadway\nSaratoga Springs, NY 12866\n"


In [5]:
domains = pd.read_csv(base_path + "top_domains.csv", header=None)[1]
domains.head(10)

0       google.com
1      youtube.com
2     facebook.com
3        baidu.com
4    wikipedia.org
5        yahoo.com
6           qq.com
7       taobao.com
8        tmall.com
9     google.co.in
Name: 1, dtype: object

In [6]:
class GenericData():
    
    _record_count_by_object = {}
    _code_by_object_name = {}
    _object_name_by_code = {}
    
    def __init__(self, object_name):
        count = GenericData._record_count_by_object.get(object_name, 0)
        object_code = GenericData._to_object_code(object_name)
        self.id =  "%s%010d" % (object_code, count)
        GenericData._record_count_by_object[object_name] = count + 1
            
    def __repr__(self):
        return self.id
    
    @staticmethod
    def save(records, filename):
        with open(filename, "w") as f:
            for r in records:
                f.write(str(r) + "\n")
    
    @staticmethod
    def reset():
        GenericData._record_count_by_object.clear()
        GenericData._code_by_object_name.clear()
        GenericData._object_name_by_code.clear()

    @staticmethod
    def sample(object_name, count):
        if not object_name in GenericData._code_by_object_name:
            raise ValueError("Invalid object name %s" % object_name) 
        
        object_code = GenericData._code_by_object_name[object_name]
        record_count = GenericData._record_count_by_object[object_name]
        indices = pd.Series(np.arange(record_count)).sample(count, replace = True)
        object_ids = ["%s%010d" % (object_code, i) for i in indices]
        return object_ids
    
    @staticmethod
    def rand_by_range(lower, upper, count): 
        return np.random.random(int(count)) * (upper - lower) + lower

    @staticmethod
    def generate_double(minimum, maximum, bins, count = None):
        bin_count = len(bins)
        
        if count is not None:
            bins = np.array(bins)
            bins = np.ceil((bins / np.sum(bins) * count))
        
        intervals = np.linspace(minimum, maximum, bin_count + 1)
        y = np.arange(0)
        for i in range(bin_count):
            lower, upper = intervals[i], intervals[i+1]
            count = bins[i]
            y = np.concatenate((y, GenericData.rand_by_range(lower, upper, count)))
        return list(np.array(y))
    
    @staticmethod
    def _to_object_code(s):
        import string
        
        if s in GenericData._code_by_object_name:
            return GenericData._code_by_object_name[s]
        
        alpha_digits =  string.digits + string.ascii_uppercase
        v = hash(s.strip().lower()) % (len(alpha_digits) ** 2)
        digit1 = v // 36
        digit0 = v % 36
        object_code = "".join([alpha_digits[digit1], alpha_digits[digit0]])
        
        GenericData._code_by_object_name[s] = object_code
        GenericData._object_name_by_code[object_code] = s
        return object_code

sample = GenericData("sample_object")
sample.id

'D90000000000'

In [7]:
class Address(GenericData):
    
    _addresses = pd.read_csv(base_path + "addresses.csv")
    
    def __init__(self, street, city, zipcode, state, county = None):
        GenericData.__init__(self, "address")
        self.street = street
        self.city = city 
        self.zipcode = zipcode
        self.state = state
        self.county = county
    
    def __repr__(self):
        record = (self.id, self.street, self.city, self.zipcode, self.state, self.county)
        return json.dumps(self.__dict__)
   

    @staticmethod
    def generate_addresses(count):
        addresses = Address._addresses
        addresses = addresses.sample(count, replace = True)
        records = []
        for i in range(count):
            record = Address(
                list(addresses["Street Address"])[i],
                list(addresses["City"])[i],
                list(addresses["ZIP Code"])[i],
                "NY",
                list(addresses["County"])[i],
            )
            records.append(record)
        return records
            
addresses = Address.generate_addresses(3)  
addresses

[{"id": "O40000000000", "street": "232 East Hartsdale Avenue", "city": "Hartsdale", "zipcode": 10530.0, "state": "NY", "county": "Westchester"},
 {"id": "O40000000001", "street": "40 Wolf Road", "city": "Colonie", "zipcode": 12205.0, "state": "NY", "county": "Albany"},
 {"id": "O40000000002", "street": "1 East Central Avenue", "city": "Pearl River", "zipcode": 10965.0, "state": "NY", "county": "Rockland"}]

In [8]:
GenericData._record_count_by_object

{'sample_object': 1, 'address': 3}

In [9]:
class Person(GenericData):
    
    _first_names = pd.Series(pd.read_csv(base_path + "first_names.csv").Name.unique())
    _last_names = pd.read_csv(base_path + "last_names.csv", header = None)
    
    def __init__(self, first_name, last_name, age, dob, email, gender, address):
        GenericData.__init__(self, "person")
        self.first_name = first_name
        self.last_name = last_name
        self.age = age
        self.dob = dob
        self.email = email
        self.address = address
        self.gender = gender
    
    def __repr__(self):
        record = (self.id, self.first_name, self.last_name
                      , self.dob, self.age, self.email, self.gender, self.address)
        return json.dumps(self.__dict__)
        
    @staticmethod
    def generate_records(count):
        
        first_names = list(Person._first_names.sample(count, replace=True))
        last_names = list(Person._last_names.sample(count, replace=True)[0])
        last_names = [s.capitalize() for s in last_names]
        
        ages = np.random.randint(low = 18, high = 90, size = count) 
        days_since = ages * 365 + np.random.randint(low = 0, high = 365, size = count)

        today = pd.to_datetime("today")
        dobs = today - pd.to_timedelta(days_since, unit="D")
        dobs = [d.strftime("%Y-%m-%d") for d in dobs.date]
        
        email_domains = ["gmail", "msn", "hotmail", "yahoo"]
        
        genders = list(pd.Series(np.array(["M", "F"])).sample(count, replace = True))
        
        emails = []
        
        for fname, lname in zip(first_names, last_names):
            domain = email_domains[np.random.randint(0, len(email_domains) - 1, 1)[0]]
            fname = re.sub(r"[^a-z]", "", fname.lower())
            lname = re.sub(r"[^a-z]", "", lname.lower())
            if np.random.random() > 0.5:
                emails.append("%s.%s@%s.com" % (fname, lname, domain))
            else:
                emails.append("%s%s@%s.com" % (fname[0], lname, domain))
        
        addresses = Address.generate_addresses(count)
        
        records = []
        for i in range(count):
            record = Person(first_names[i], last_names[i]
                            , int(ages[i]), dobs[i], emails[i]
                            , genders[i], addresses[i].__dict__)
            records.append(record)
        return records

persons = Person.generate_records(3)
persons

[{"id": "140000000000", "first_name": "Articia", "last_name": "Gnibba", "age": 85, "dob": "1933-06-24", "email": "articia.gnibba@hotmail.com", "address": {"id": "O40000000003", "street": "1139 Upper Front Street", "city": "Binghamton", "zipcode": 13901.0, "state": "NY", "county": "Broome"}, "gender": "F"},
 {"id": "140000000001", "first_name": "Birch", "last_name": "G\u00fcrkan", "age": 21, "dob": "1997-04-22", "email": "birch.grkan@msn.com", "address": {"id": "O40000000004", "street": " 20 Great Neck Rd", "city": " Great Neck", "zipcode": 11021.0, "state": "NY", "county": "Nassau"}, "gender": "F"},
 {"id": "140000000002", "first_name": "Akiyra", "last_name": "Bissinger", "age": 60, "dob": "1958-09-05", "email": "abissinger@hotmail.com", "address": {"id": "O40000000005", "street": "825 Fairport Road", "city": "East Rochester", "zipcode": 14445.0, "state": "NY", "county": "Monroe"}, "gender": "M"}]

In [10]:
GenericData._record_count_by_object

{'sample_object': 1, 'address': 6, 'person': 3}

In [11]:
pd.DataFrame.from_records([d.__dict__ for d in persons])

Unnamed: 0,address,age,dob,email,first_name,gender,id,last_name
0,"{'id': 'O40000000003', 'street': '1139 Upper F...",85,1933-06-24,articia.gnibba@hotmail.com,Articia,F,140000000000,Gnibba
1,"{'id': 'O40000000004', 'street': ' 20 Great Ne...",21,1997-04-22,birch.grkan@msn.com,Birch,F,140000000001,Gürkan
2,"{'id': 'O40000000005', 'street': '825 Fairport...",60,1958-09-05,abissinger@hotmail.com,Akiyra,M,140000000002,Bissinger


In [12]:
class Merchant(GenericData):
    
    _companies = pd.read_csv(base_path + "companylist.csv")
    
    def __init__(self, name):
        GenericData.__init__(self, "merchant")
        self.name = name
    
    def __repr__(self):
        return json.dumps(self.__dict__)

    @staticmethod
    def generate_records(count):
        count = min(count, len(Merchant._companies))
        names = list(Merchant._companies.sample(count, replace = False).Name)
        records = []
        for i in range(count):
            record = Merchant(names[i])
            records.append(record)
        return records
    
Merchant.generate_records(10)    

[{"id": "PJ0000000000", "name": "Koss Corporation"},
 {"id": "PJ0000000001", "name": "Sotherly Hotels Inc."},
 {"id": "PJ0000000002", "name": "Clearfield, Inc."},
 {"id": "PJ0000000003", "name": "Blink Charging Co."},
 {"id": "PJ0000000004", "name": "Five Star Senior Living Inc."},
 {"id": "PJ0000000005", "name": "Lexicon Pharmaceuticals, Inc."},
 {"id": "PJ0000000006", "name": "JD.com, Inc."},
 {"id": "PJ0000000007", "name": "Crocs, Inc."},
 {"id": "PJ0000000008", "name": "LPL Financial Holdings Inc."},
 {"id": "PJ0000000009", "name": "Iteris, Inc."}]

In [13]:
GenericData._record_count_by_object

{'sample_object': 1, 'address': 6, 'person': 3, 'merchant': 10}

In [14]:
class Transaction(GenericData):
    
    _types = ["pos", "atm", "mobile", "web"]
    
    def __init__(self, customer_id, merchant_id, amount, timestamp, category):
        super().__init__("transaction")
        self.customer_id = customer_id
        self.merchant_id = merchant_id
        self.amount = amount
        self.timestamp = timestamp
        self.category = category
    
    def __repr__(self):
        return json.dumps(self.__dict__)
    
    @staticmethod
    def generate_records(count):
        customer_ids = GenericData.sample("person", count)
        merchant_ids = GenericData.sample("merchant", count)
        amounts = GenericData.generate_double(0, 10000, [0.1, 0.3, 0.4, 0.4, 0.1, 0.1, 0.01], count)
        categories = list(pd.Series(Transaction._types).sample(count, replace = True))
        

        time_deltas = pd.to_timedelta(np.random.randint(10, 6 * 30 * 24 * 3600, count), "S")        
        timestamps = (pd.to_datetime("today") - time_deltas)
        timestamps = timestamps.astype(np.int32)
        timestamps = list(timestamps)
        
        records = []
        for i in range(count):
            record = Transaction(customer_ids[i], merchant_ids[i], amounts[i], timestamps[i], categories[i])
            records.append(record)
        return records
        
    
transactions = Transaction.generate_records(10)
transactions

[{"id": "DS0000000000", "customer_id": "140000000002", "merchant_id": "PJ0000000000", "amount": 801.2767021747588, "timestamp": 1532937831055385000, "category": "web"},
 {"id": "DS0000000001", "customer_id": "140000000001", "merchant_id": "PJ0000000009", "amount": 2791.5957342353304, "timestamp": 1534192078055385000, "category": "web"},
 {"id": "DS0000000002", "customer_id": "140000000001", "merchant_id": "PJ0000000001", "amount": 2347.0079786415185, "timestamp": 1525042586055385000, "category": "pos"},
 {"id": "DS0000000003", "customer_id": "140000000000", "merchant_id": "PJ0000000001", "amount": 2166.2150019818514, "timestamp": 1538182272055385000, "category": "pos"},
 {"id": "DS0000000004", "customer_id": "140000000002", "merchant_id": "PJ0000000000", "amount": 2883.800997064069, "timestamp": 1533275010055385000, "category": "web"},
 {"id": "DS0000000005", "customer_id": "140000000001", "merchant_id": "PJ0000000009", "amount": 3715.3775554511462, "timestamp": 1530610421055385000, "c

In [15]:
GenericData._record_count_by_object

{'sample_object': 1,
 'address': 6,
 'person': 3,
 'merchant': 10,
 'transaction': 10}

In [16]:
pd.DataFrame.from_records([d.__dict__ for d in transactions])

Unnamed: 0,amount,category,customer_id,id,merchant_id,timestamp
0,801.276702,web,140000000002,DS0000000000,PJ0000000000,1532937831055385000
1,2791.595734,web,140000000001,DS0000000001,PJ0000000009,1534192078055385000
2,2347.007979,pos,140000000001,DS0000000002,PJ0000000001,1525042586055385000
3,2166.215002,pos,140000000000,DS0000000003,PJ0000000001,1538182272055385000
4,2883.800997,web,140000000002,DS0000000004,PJ0000000000,1533275010055385000
5,3715.377555,pos,140000000001,DS0000000005,PJ0000000009,1530610421055385000
6,3631.839429,pos,140000000001,DS0000000006,PJ0000000004,1537264191055385000
7,5052.23333,web,140000000002,DS0000000007,PJ0000000004,1536503806055385000
8,5685.244053,web,140000000000,DS0000000008,PJ0000000006,1530771907055385000
9,4910.569128,web,140000000001,DS0000000009,PJ0000000003,1525512056055385000


In [17]:
%%time 

GenericData.reset()
persons = Person.generate_records(10000)
merchants = Merchant.generate_records(100)
transactions = Transaction.generate_records(10 ** 6)

os.makedirs("target", exist_ok=True)

GenericData.save(persons, "target/customers.json")
GenericData.save(merchants, "target/merchants.json")
GenericData.save(transactions, "target/transactions.json")

print("Record counts: ", GenericData._record_count_by_object)

print("Size in memory", sys.getsizeof(persons), sys.getsizeof(merchants), sys.getsizeof(transactions))

Record counts:  {'address': 10000, 'person': 10000, 'merchant': 100, 'transaction': 1000000}
Size in memory 87624 912 8697464
CPU times: user 13.6 s, sys: 236 ms, total: 13.9 s
Wall time: 12.4 s


In [18]:
persons_df = pd.DataFrame.from_records([d.__dict__ for d in persons])
persons_df.head()

Unnamed: 0,address,age,dob,email,first_name,gender,id,last_name
0,"{'id': 'O40000000000', 'street': '1510 West Ge...",24,1994-04-24,jninni@msn.com,Jennabelle,M,140000000000,Ninni
1,"{'id': 'O40000000001', 'street': '251 Saratoga...",47,1971-05-19,natayja.koudelka@gmail.com,Natayja,F,140000000001,Koudelka
2,"{'id': 'O40000000002', 'street': '10 South Mai...",32,1986-04-29,nilza.louis@msn.com,Nilza,F,140000000002,Louis
3,"{'id': 'O40000000003', 'street': '1281 North M...",39,1979-08-14,yaimee@gmail.com,Yameka,F,140000000003,Aimee
4,"{'id': 'O40000000004', 'street': '1414 Union A...",74,1944-09-22,jaycon.dayag@hotmail.com,Jaycon,M,140000000004,Dayag


In [19]:
merchants_df = pd.DataFrame.from_records([d.__dict__ for d in merchants])
merchants_df.head()

Unnamed: 0,id,name
0,PJ0000000000,"Papa Murphy&#39;s Holdings, Inc."
1,PJ0000000001,"ESSA Bancorp, Inc."
2,PJ0000000002,The Middleby Corporation
3,PJ0000000003,WisdomTree Japan Hedged SmallCap Equity Fund
4,PJ0000000004,FedNat Holding Company


In [20]:
transactions_df = pd.DataFrame.from_records([d.__dict__ for d in transactions])
transactions_df.head()

Unnamed: 0,amount,category,customer_id,id,merchant_id,timestamp
0,1171.569763,web,140000006672,DS0000000000,PJ0000000075,1535713514580663000
1,762.158257,pos,140000001456,DS0000000001,PJ0000000098,1537725574580663000
2,68.680391,web,140000003079,DS0000000002,PJ0000000088,1531430596580663000
3,444.150979,web,140000009313,DS0000000003,PJ0000000000,1538600570580663000
4,603.258859,mobile,140000007265,DS0000000004,PJ0000000086,1536505845580663000


In [21]:
transactions_df.customer_id.value_counts()

140000000904    141
140000007451    135
140000005793    134
140000009133    134
140000007973    133
               ... 
140000004738     67
140000006648     66
140000003975     66
140000009310     64
140000008409     63
Name: customer_id, Length: 10000, dtype: int64

In [22]:
transactions_df.customer_id.unique().shape

(10000,)