In [26]:
import pandas as pd 
import numpy as np
import json
import matplotlib.pyplot as plt
import re
import sys
import os
import time

pd.options.display.max_columns = 1000
pd.options.display.max_rows = 10

%matplotlib inline

base_path = "seeds/"

In [2]:
companies = pd.read_csv(base_path + "companylist.csv")
companies.head()

Unnamed: 0,Symbol,Name,LastSale,MarketCap,ADR TSO,IPOyear,Sector,Industry,Summary Quote,Unnamed: 9
0,YI,"111, Inc.",13.71,98369250.0,7175000.0,2018.0,Health Care,Medical/Nursing Services,https://www.nasdaq.com/symbol/yi,
1,PIH,"1347 Property Insurance Holdings, Inc.",6.25,37404790.0,,2014.0,Finance,Property-Casualty Insurers,https://www.nasdaq.com/symbol/pih,
2,PIHPP,"1347 Property Insurance Holdings, Inc.",25.3101,0.0,,,Finance,Property-Casualty Insurers,https://www.nasdaq.com/symbol/pihpp,
3,TURN,180 Degree Capital Corp.,2.25,70023510.0,,,Finance,Finance/Investors Services,https://www.nasdaq.com/symbol/turn,
4,FLWS,"1-800 FLOWERS.COM, Inc.",10.65,688032800.0,,1999.0,Consumer Services,Other Specialty Stores,https://www.nasdaq.com/symbol/flws,


In [3]:
names = pd.read_csv(base_path + "first_names.csv")
names.sample(10)

Unnamed: 0,Name,Gender,Count
11804,Berkli,F,29
45250,Jimm,M,22
37511,Hydiah,F,9
49156,Kanessha,F,6
48187,Kaiis,M,7
78118,Raydon,M,223
77247,Raiana,F,99
69962,Nahzier,M,13
70767,Natea,F,26
5277,Amondre,M,23


In [4]:
addresses = pd.read_csv(base_path + "addresses.csv")
addresses.head()

Unnamed: 0,Name of Institution,Street Address,City,ZIP Code,County,Location 1
0,Abacus Federal Savings Bank,36-30 Main Street,Flushing,11354.0,Queens,"36 30 Main Street\nFlushing, NY 11354\n"
1,Adirondack Bank,448 Route 3,Plattsburgh,12901.0,Clinton,"448 Route\nPlattsburgh, NY 12901\n"
2,Adirondack Bank,13150 State Route 12,Boonville,13309.0,Oneida,"13150 State Route\nBoonville, NY 13309\n"
3,Adirondack Bank,Utica College,Utica,13501.0,Oneida,
4,"Adirondack Trust Company, The",112 Broadway,Saratoga Springs,12866.0,Saratoga,"112 Broadway\nSaratoga Springs, NY 12866\n"


In [5]:
domains = pd.read_csv(base_path + "top_domains.csv", header=None)[1]
domains.head(10)

0       google.com
1      youtube.com
2     facebook.com
3        baidu.com
4    wikipedia.org
5        yahoo.com
6           qq.com
7       taobao.com
8        tmall.com
9     google.co.in
Name: 1, dtype: object

In [6]:
class GenericData():
    
    _record_count_by_object = {}
    _code_by_object_name = {}
    _object_name_by_code = {}
    
    def __init__(self, object_name):
        count = GenericData._record_count_by_object.get(object_name, 0)
        object_code = GenericData._to_object_code(object_name)
        self.id =  "%s%010d" % (object_code, count)
        GenericData._record_count_by_object[object_name] = count + 1
            
    def __repr__(self):
        return self.id
    
    @staticmethod
    def save(records, filename):
        with open(filename, "w") as f:
            for r in records:
                f.write(str(r) + "\n")
    
    @staticmethod
    def reset():
        GenericData._record_count_by_object.clear()
        GenericData._code_by_object_name.clear()
        GenericData._object_name_by_code.clear()

    @staticmethod
    def sample(object_name, count):
        if not object_name in GenericData._code_by_object_name:
            raise ValueError("Invalid object name %s" % object_name) 
        
        object_code = GenericData._code_by_object_name[object_name]
        record_count = GenericData._record_count_by_object[object_name]
        indices = pd.Series(np.arange(record_count)).sample(count, replace = True)
        object_ids = ["%s%010d" % (object_code, i) for i in indices]
        return object_ids
    
    @staticmethod
    def rand_by_range(lower, upper, count): 
        return np.random.random(int(count)) * (upper - lower) + lower

    @staticmethod
    def generate_double(minimum, maximum, bins, count = None):
        bin_count = len(bins)
        
        if count is not None:
            bins = np.array(bins)
            bins = np.ceil((bins / np.sum(bins) * count))
        
        intervals = np.linspace(minimum, maximum, bin_count + 1)
        y = np.arange(0)
        for i in range(bin_count):
            lower, upper = intervals[i], intervals[i+1]
            count = bins[i]
            y = np.concatenate((y, GenericData.rand_by_range(lower, upper, count)))
        return list(np.array(y))
    
    @staticmethod
    def _to_object_code(s):
        import string
        
        if s in GenericData._code_by_object_name:
            return GenericData._code_by_object_name[s]
        
        alpha_digits =  string.digits + string.ascii_uppercase
        v = hash(s.strip().lower()) % (len(alpha_digits) ** 2)
        digit1 = v // 36
        digit0 = v % 36
        object_code = "".join([alpha_digits[digit1], alpha_digits[digit0]])
        
        GenericData._code_by_object_name[s] = object_code
        GenericData._object_name_by_code[object_code] = s
        return object_code

sample = GenericData("sample_object")
sample.id

'2U0000000000'

In [7]:
class Address(GenericData):
    
    _addresses = pd.read_csv(base_path + "addresses.csv")
    
    def __init__(self, street, city, zipcode, state, county = None):
        GenericData.__init__(self, "address")
        self.street = street
        self.city = city 
        self.zipcode = zipcode
        self.state = state
        self.county = county
    
    def __repr__(self):
        record = (self.id, self.street, self.city, self.zipcode, self.state, self.county)
        return json.dumps(self.__dict__)
   

    @staticmethod
    def generate_addresses(count):
        addresses = Address._addresses
        addresses = addresses.sample(count, replace = True)
        records = []
        for i in range(count):
            record = Address(
                list(addresses["Street Address"])[i],
                list(addresses["City"])[i],
                list(addresses["ZIP Code"])[i],
                "NY",
                list(addresses["County"])[i],
            )
            records.append(record)
        return records
            
addresses = Address.generate_addresses(3)  
addresses

[{"id": "630000000000", "street": "1575 Mount Hope Avenue", "city": "Rochester", "zipcode": 14620.0, "state": "NY", "county": "Monroe"},
 {"id": "630000000001", "street": "407 Broadway", "city": "New York", "zipcode": 10013.0, "state": "NY", "county": "New York"},
 {"id": "630000000002", "street": "2911 Walden Avenue", "city": "Depew", "zipcode": 14043.0, "state": "NY", "county": "Erie"}]

In [8]:
GenericData._record_count_by_object

{'sample_object': 1, 'address': 3}

In [9]:
class Person(GenericData):
    
    _first_names = pd.Series(pd.read_csv(base_path + "first_names.csv").Name.unique())
    _last_names = pd.read_csv(base_path + "last_names.csv", header = None)
    
    def __init__(self, first_name, last_name, age, dob, email, gender, address):
        GenericData.__init__(self, "person")
        self.first_name = first_name
        self.last_name = last_name
        self.age = age
        self.dob = dob
        self.email = email
        self.address = address
        self.gender = gender
    
    def __repr__(self):
        record = (self.id, self.first_name, self.last_name
                      , self.dob, self.age, self.email, self.gender, self.address)
        return json.dumps(self.__dict__)
        
    @staticmethod
    def generate_records(count):
        
        first_names = list(Person._first_names.sample(count, replace=True))
        last_names = list(Person._last_names.sample(count, replace=True)[0])
        last_names = [s.capitalize() for s in last_names]
        
        ages = np.random.randint(low = 18, high = 90, size = count) 
        days_since = ages * 365 + np.random.randint(low = 0, high = 365, size = count)

        today = pd.to_datetime("today")
        dobs = today - pd.to_timedelta(days_since, unit="D")
        dobs = [d.strftime("%Y-%m-%d") for d in dobs.date]
        
        email_domains = ["gmail", "msn", "hotmail", "yahoo"]
        
        genders = list(pd.Series(np.array(["M", "F"])).sample(count, replace = True))
        
        emails = []
        
        for fname, lname in zip(first_names, last_names):
            domain = email_domains[np.random.randint(0, len(email_domains) - 1, 1)[0]]
            fname = re.sub(r"[^a-z]", "", fname.lower())
            lname = re.sub(r"[^a-z]", "", lname.lower())
            if np.random.random() > 0.5:
                emails.append("%s.%s@%s.com" % (fname, lname, domain))
            else:
                emails.append("%s%s@%s.com" % (fname[0], lname, domain))
        
        addresses = Address.generate_addresses(count)
        
        records = []
        for i in range(count):
            record = Person(first_names[i], last_names[i]
                            , int(ages[i]), dobs[i], emails[i]
                            , genders[i], addresses[i].__dict__)
            records.append(record)
        return records

persons = Person.generate_records(3)
persons

[{"id": "2A0000000000", "first_name": "Breelyn", "last_name": "Mckaig", "age": 18, "dob": "1999-12-09", "email": "bmckaig@gmail.com", "address": {"id": "630000000003", "street": "350 Jay Street", "city": "Brooklyn", "zipcode": 11201.0, "state": "NY", "county": "Kings"}, "gender": "M"},
 {"id": "2A0000000001", "first_name": "Ramzan", "last_name": "Lisa", "age": 68, "dob": "1950-01-24", "email": "rlisa@msn.com", "address": {"id": "630000000004", "street": "67 Jackson Street", "city": "Fishkill", "zipcode": 12524.0, "state": "NY", "county": "Dutchess"}, "gender": "F"},
 {"id": "2A0000000002", "first_name": "Lung", "last_name": "Hannauer", "age": 45, "dob": "1972-11-26", "email": "lhannauer@msn.com", "address": {"id": "630000000005", "street": "388 Greenwich Street", "city": "New York", "zipcode": 10013.0, "state": "NY", "county": "New York"}, "gender": "F"}]

In [10]:
GenericData._record_count_by_object

{'sample_object': 1, 'address': 6, 'person': 3}

In [11]:
pd.DataFrame.from_records([d.__dict__ for d in persons])

Unnamed: 0,address,age,dob,email,first_name,gender,id,last_name
0,"{'id': '630000000003', 'street': '350 Jay Stre...",18,1999-12-09,bmckaig@gmail.com,Breelyn,M,2A0000000000,Mckaig
1,"{'id': '630000000004', 'street': '67 Jackson S...",68,1950-01-24,rlisa@msn.com,Ramzan,F,2A0000000001,Lisa
2,"{'id': '630000000005', 'street': '388 Greenwic...",45,1972-11-26,lhannauer@msn.com,Lung,F,2A0000000002,Hannauer


In [12]:
class Merchant(GenericData):
    
    _companies = pd.read_csv(base_path + "companylist.csv")
    
    def __init__(self, name):
        GenericData.__init__(self, "merchant")
        self.name = name
    
    def __repr__(self):
        return json.dumps(self.__dict__)

    @staticmethod
    def generate_records(count):
        count = min(count, len(Merchant._companies))
        names = list(Merchant._companies.sample(count, replace = False).Name)
        records = []
        for i in range(count):
            record = Merchant(names[i])
            records.append(record)
        return records
    
Merchant.generate_records(10)    

[{"id": "0P0000000000", "name": "Rimini Street, Inc."},
 {"id": "0P0000000001", "name": "Investors Bancorp, Inc."},
 {"id": "0P0000000002", "name": "iShares Exponential Technologies ETF"},
 {"id": "0P0000000003", "name": "iShares MSCI Europe Small-Cap ETF"},
 {"id": "0P0000000004", "name": "Unico American Corporation"},
 {"id": "0P0000000005", "name": "RCI Hospitality Holdings, Inc."},
 {"id": "0P0000000006", "name": "iShares Global Infrastructure ETF"},
 {"id": "0P0000000007", "name": "FNCB Bancorp Inc."},
 {"id": "0P0000000008", "name": "Vanguard Russell 2000 Value ETF"},
 {"id": "0P0000000009", "name": "Astronics Corporation"}]

In [13]:
GenericData._record_count_by_object

{'sample_object': 1, 'address': 6, 'person': 3, 'merchant': 10}

In [14]:
class Transaction(GenericData):
    
    _types = ["pos", "atm", "mobile", "web"]
    
    def __init__(self, customer_id, merchant_id, amount, timestamp, category):
        super().__init__("transaction")
        self.customer_id = customer_id
        self.merchant_id = merchant_id
        self.amount = amount
        self.timestamp = timestamp
        self.category = category
    
    def __repr__(self):
        return json.dumps(self.__dict__)
    
    @staticmethod
    def generate_records(count):
        customer_ids = GenericData.sample("person", count)
        merchant_ids = GenericData.sample("merchant", count)
        amounts = GenericData.generate_double(0, 10000, [0.1, 0.3, 0.4, 0.4, 0.1, 0.1, 0.01], count)
        categories = list(pd.Series(Transaction._types).sample(count, replace = True))
        

        time_deltas = pd.to_timedelta(np.random.randint(10, 6 * 30 * 24 * 3600, count), "S")        
        timestamps = (pd.to_datetime("today") - time_deltas)
        timestamps = timestamps.astype(np.int32)
        timestamps = list(timestamps)
        
        records = []
        for i in range(count):
            record = Transaction(customer_ids[i], merchant_ids[i], amounts[i], timestamps[i], categories[i])
            records.append(record)
        return records
        
    
transactions = Transaction.generate_records(10)
transactions

[{"id": "3Y0000000000", "customer_id": "2A0000000002", "merchant_id": "0P0000000008", "amount": 1235.442003090407, "timestamp": 1530742268608225000, "category": "atm"},
 {"id": "3Y0000000001", "customer_id": "2A0000000000", "merchant_id": "0P0000000006", "amount": 1616.588782249943, "timestamp": 1534867356608225000, "category": "pos"},
 {"id": "3Y0000000002", "customer_id": "2A0000000001", "merchant_id": "0P0000000002", "amount": 1585.7322377409419, "timestamp": 1527755294608225000, "category": "atm"},
 {"id": "3Y0000000003", "customer_id": "2A0000000001", "merchant_id": "0P0000000004", "amount": 1741.3960658363128, "timestamp": 1523873886608225000, "category": "mobile"},
 {"id": "3Y0000000004", "customer_id": "2A0000000000", "merchant_id": "0P0000000004", "amount": 3460.568878601627, "timestamp": 1532647693608225000, "category": "mobile"},
 {"id": "3Y0000000005", "customer_id": "2A0000000002", "merchant_id": "0P0000000009", "amount": 3449.361435741932, "timestamp": 1537581223608225000

In [15]:
GenericData._record_count_by_object

{'sample_object': 1,
 'address': 6,
 'person': 3,
 'merchant': 10,
 'transaction': 10}

In [16]:
pd.DataFrame.from_records([d.__dict__ for d in transactions])

Unnamed: 0,amount,category,customer_id,id,merchant_id,timestamp
0,1235.442003,atm,2A0000000002,3Y0000000000,0P0000000008,1530742268608225000
1,1616.588782,pos,2A0000000000,3Y0000000001,0P0000000006,1534867356608225000
2,1585.732238,atm,2A0000000001,3Y0000000002,0P0000000002,1527755294608225000
3,1741.396066,mobile,2A0000000001,3Y0000000003,0P0000000004,1523873886608225000
4,3460.568879,mobile,2A0000000000,3Y0000000004,0P0000000004,1532647693608225000
5,3449.361436,atm,2A0000000002,3Y0000000005,0P0000000009,1537581223608225000
6,3030.236009,atm,2A0000000001,3Y0000000006,0P0000000006,1524213219608225000
7,5243.47499,mobile,2A0000000002,3Y0000000007,0P0000000003,1523463411608225000
8,4438.023481,web,2A0000000002,3Y0000000008,0P0000000005,1528452946608225000
9,5237.608726,mobile,2A0000000001,3Y0000000009,0P0000000000,1529327492608225000


In [17]:
%%time 

GenericData.reset()
persons = Person.generate_records(10000)
merchants = Merchant.generate_records(100)
transactions = Transaction.generate_records(10 ** 6)

os.makedirs("target", exist_ok=True)

GenericData.save(persons, "target/customers.json")
GenericData.save(merchants, "target/merchants.json")
GenericData.save(transactions, "target/transactions.json")

print("Record counts: ", GenericData._record_count_by_object)

print("Size in memory", sys.getsizeof(persons), sys.getsizeof(merchants), sys.getsizeof(transactions))

Record counts:  {'address': 10000, 'person': 10000, 'merchant': 100, 'transaction': 1000000}
Size in memory 87624 912 8697464
CPU times: user 14.5 s, sys: 275 ms, total: 14.8 s
Wall time: 13.3 s


In [18]:
persons_df = pd.DataFrame.from_records([d.__dict__ for d in persons])
persons_df.head()

Unnamed: 0,address,age,dob,email,first_name,gender,id,last_name
0,"{'id': '630000000000', 'street': '57 South Buf...",82,1936-08-15,tmarcelot@hotmail.com,Toddy,M,2A0000000000,Marcelot
1,"{'id': '630000000001', 'street': '294 Main Str...",45,1973-08-17,chemrick@gmail.com,Cub,M,2A0000000001,Hemrick
2,"{'id': '630000000002', 'street': '119 Temple H...",33,1985-05-06,skohlhase@msn.com,Satanya,M,2A0000000002,Kohlhase
3,"{'id': '630000000003', 'street': '3 Main Stree...",29,1989-03-21,aehrbach@gmail.com,Adalena,M,2A0000000003,Ehrbach
4,"{'id': '630000000004', 'street': ' 71-15 Austi...",46,1971-11-20,kdettenberg@hotmail.com,Kamla,F,2A0000000004,Dettenberg


In [19]:
merchants_df = pd.DataFrame.from_records([d.__dict__ for d in merchants])
merchants_df.head()

Unnamed: 0,id,name
0,0P0000000000,"People&#39;s United Financial, Inc."
1,0P0000000001,"Twenty-First Century Fox, Inc."
2,0P0000000002,Neogen Corporation
3,0P0000000003,Forum Merger II Corporation
4,0P0000000004,Qutoutiao Inc.


In [20]:
transactions_df = pd.DataFrame.from_records([d.__dict__ for d in transactions])
transactions_df.head()

Unnamed: 0,amount,category,customer_id,id,merchant_id,timestamp
0,1312.791577,atm,2A0000001236,3Y0000000000,0P0000000093,1533213879557725000
1,664.186471,web,2A0000008655,3Y0000000001,0P0000000066,1528803052557725000
2,572.588999,pos,2A0000003571,3Y0000000002,0P0000000035,1534755566557725000
3,127.502073,pos,2A0000003647,3Y0000000003,0P0000000099,1525219768557725000
4,1129.189154,pos,2A0000002198,3Y0000000004,0P0000000008,1533961866557725000


In [21]:
transactions_df.customer_id.value_counts()

2A0000006964    137
2A0000003526    135
2A0000000632    134
2A0000000262    134
2A0000005484    134
               ... 
2A0000006972     69
2A0000005435     69
2A0000003143     68
2A0000000216     66
2A0000005533     62
Name: customer_id, Length: 10000, dtype: int64

In [22]:
transactions_df.customer_id.unique().shape

(10000,)

In [24]:
rate_eps = 100 
wait_interval = 1.0 / rate_eps
for tnx in transactions[:10]:
    print(tnx)
    time.sleep(wait_interval)
    # call a method to send the tnx to kafka

{"id": "3Y0000000000", "customer_id": "2A0000001236", "merchant_id": "0P0000000093", "amount": 1312.791576784415, "timestamp": 1533213879557725000, "category": "atm"}
{"id": "3Y0000000001", "customer_id": "2A0000008655", "merchant_id": "0P0000000066", "amount": 664.1864711414687, "timestamp": 1528803052557725000, "category": "web"}
{"id": "3Y0000000002", "customer_id": "2A0000003571", "merchant_id": "0P0000000035", "amount": 572.5889994562172, "timestamp": 1534755566557725000, "category": "pos"}
{"id": "3Y0000000003", "customer_id": "2A0000003647", "merchant_id": "0P0000000099", "amount": 127.50207271677671, "timestamp": 1525219768557725000, "category": "pos"}
{"id": "3Y0000000004", "customer_id": "2A0000002198", "merchant_id": "0P0000000008", "amount": 1129.1891538275281, "timestamp": 1533961866557725000, "category": "pos"}
{"id": "3Y0000000005", "customer_id": "2A0000002885", "merchant_id": "0P0000000098", "amount": 191.434523148836, "timestamp": 1531097455557725000, "category": "web