In [2]:
import pandas as pd 
import numpy as np
import json
import matplotlib.pyplot as plt
import re
import sys

pd.options.display.max_columns = 1000
pd.options.display.max_rows = 10

%matplotlib inline

base_path = "seeds/"

In [3]:
companies = pd.read_csv(base_path + "companylist.csv")
companies.head()

Unnamed: 0,Symbol,Name,LastSale,MarketCap,ADR TSO,IPOyear,Sector,Industry,Summary Quote,Unnamed: 9
0,YI,"111, Inc.",13.71,98369250.0,7175000.0,2018.0,Health Care,Medical/Nursing Services,https://www.nasdaq.com/symbol/yi,
1,PIH,"1347 Property Insurance Holdings, Inc.",6.25,37404790.0,,2014.0,Finance,Property-Casualty Insurers,https://www.nasdaq.com/symbol/pih,
2,PIHPP,"1347 Property Insurance Holdings, Inc.",25.3101,0.0,,,Finance,Property-Casualty Insurers,https://www.nasdaq.com/symbol/pihpp,
3,TURN,180 Degree Capital Corp.,2.25,70023510.0,,,Finance,Finance/Investors Services,https://www.nasdaq.com/symbol/turn,
4,FLWS,"1-800 FLOWERS.COM, Inc.",10.65,688032800.0,,1999.0,Consumer Services,Other Specialty Stores,https://www.nasdaq.com/symbol/flws,


In [4]:
names = pd.read_csv(base_path + "first_names.csv")
names.sample(10)

Unnamed: 0,Name,Gender,Count
27955,Dymir,M,112
9190,Astin,M,382
55264,Kurtis,M,17744
102451,Zayed,M,228
22647,Darrisha,F,20
59498,Leyah,F,967
47870,Kadijah,F,1418
52015,Kentral,M,45
7084,Antonieo,M,10
100908,Yien,M,5


In [5]:
addresses = pd.read_csv(base_path + "addresses.csv")
addresses.head()

Unnamed: 0,Name of Institution,Street Address,City,ZIP Code,County,Location 1
0,Abacus Federal Savings Bank,36-30 Main Street,Flushing,11354.0,Queens,"36 30 Main Street\nFlushing, NY 11354\n"
1,Adirondack Bank,448 Route 3,Plattsburgh,12901.0,Clinton,"448 Route\nPlattsburgh, NY 12901\n"
2,Adirondack Bank,13150 State Route 12,Boonville,13309.0,Oneida,"13150 State Route\nBoonville, NY 13309\n"
3,Adirondack Bank,Utica College,Utica,13501.0,Oneida,
4,"Adirondack Trust Company, The",112 Broadway,Saratoga Springs,12866.0,Saratoga,"112 Broadway\nSaratoga Springs, NY 12866\n"


In [6]:
domains = pd.read_csv(base_path + "top_domains.csv", header=None)[1]
domains.head(10)

0       google.com
1      youtube.com
2     facebook.com
3        baidu.com
4    wikipedia.org
5        yahoo.com
6           qq.com
7       taobao.com
8        tmall.com
9     google.co.in
Name: 1, dtype: object

In [7]:
class GenericData():
    
    _record_count_by_object = {}
    _code_by_object_name = {}
    
    def __init__(self, object_name):
        self.object_name = object_name
        count = GenericData._record_count_by_object.get(object_name, 0)
        object_code = GenericData._to_object_code(object_name)
        self.id =  "%s%010d" % (object_code, count)
        GenericData._record_count_by_object[object_name] = count + 1
            
    def __repr__(self):
        return self.id
    
    @staticmethod
    def save(records, filename):
        with open(filename, "w") as f:
            for r in records:
                f.write(str(r) + "\n")
    
    @staticmethod
    def reset():
        GenericData._record_count_by_object.clear()
        GenericData._code_by_object_name.clear()

    @staticmethod
    def sample(object_name, count):
        if not object_name in GenericData._code_by_object_name:
            raise ValueError("Invalid object name %s" % object_name) 
        
        object_code = GenericData._code_by_object_name[object_name]
        record_count = GenericData._record_count_by_object[object_name]
        indices = pd.Series(np.arange(record_count)).sample(count, replace = True)
        object_ids = ["%s%010d" % (object_code, i) for i in indices]
        return object_ids
    
    @staticmethod
    def rand_by_range(lower, upper, count): 
        return np.random.random(int(count)) * (upper - lower) + lower

    @staticmethod
    def generate_double(minimum, maximum, bins, count = None):
        bin_count = len(bins)
        
        if count is not None:
            bins = np.array(bins)
            bins = np.ceil((bins / np.sum(bins) * count))
        
        intervals = np.linspace(minimum, maximum, bin_count + 1)
        y = np.arange(0)
        for i in range(bin_count):
            lower, upper = intervals[i], intervals[i+1]
            count = bins[i]
            y = np.concatenate((y, GenericData.rand_by_range(lower, upper, count)))
        return list(np.array(y))
    
    @staticmethod
    def _to_object_code(s):
        import string
        
        if s in GenericData._code_by_object_name:
            return GenericData._code_by_object_name[s]
        
        alpha_digits =  string.digits + string.ascii_uppercase
        v = hash(s.strip().lower()) % (len(alpha_digits) ** 2)
        digit1 = v // 36
        digit0 = v % 36
        object_code = "".join([alpha_digits[digit1], alpha_digits[digit0]])
        GenericData._code_by_object_name[s] = object_code
        return object_code

sample = GenericData("sample_object")
sample.id

'5S0000000000'

In [8]:
class Address(GenericData):
    
    _addresses = pd.read_csv(base_path + "addresses.csv")
    
    def __init__(self, street, city, zipcode, state, county = None):
        GenericData.__init__(self, "address")
        self.street = street
        self.city = city 
        self.zipcode = zipcode
        self.state = state
        self.county = county
    
    def __repr__(self):
        record = (self.id, self.street, self.city, self.zipcode, self.state, self.county)
        return json.dumps(self.__dict__)
   

    @staticmethod
    def generate_addresses(count):
        addresses = Address._addresses
        addresses = addresses.sample(count, replace = True)
        records = []
        for i in range(count):
            record = Address(
                list(addresses["Street Address"])[i],
                list(addresses["City"])[i],
                list(addresses["ZIP Code"])[i],
                "NY",
                list(addresses["County"])[i],
            )
            records.append(record)
        return records
            
addresses = Address.generate_addresses(3)  
addresses

[{"object_name": "address", "id": "6B0000000000", "street": "2752 Sunrise Highway", "city": "Bellmore", "zipcode": 11710.0, "state": "NY", "county": "Nassau"},
 {"object_name": "address", "id": "6B0000000001", "street": "782 Hoosick Road -- Route 7", "city": "Troy", "zipcode": 12180.0, "state": "NY", "county": "Rensselaer"},
 {"object_name": "address", "id": "6B0000000002", "street": "One Steuben Square", "city": "Hornell", "zipcode": 14843.0, "state": "NY", "county": "Steuben"}]

In [9]:
GenericData._record_count_by_object

{'sample_object': 1, 'address': 3}

In [10]:
class Person(GenericData):
    
    _first_names = pd.Series(pd.read_csv(base_path + "first_names.csv").Name.unique())
    _last_names = pd.read_csv(base_path + "last_names.csv", header = None)
    
    def __init__(self, first_name, last_name, age, dob, email, gender, address):
        GenericData.__init__(self, "person")
        self.first_name = first_name
        self.last_name = last_name
        self.age = age
        self.dob = dob
        self.email = email
        self.address = address
        self.gender = gender
    
    def __repr__(self):
        record = (self.id, self.first_name, self.last_name
                      , self.dob, self.age, self.email, self.gender, self.address)
        return json.dumps(self.__dict__)
        
    @staticmethod
    def generate_records(count):
        
        first_names = list(Person._first_names.sample(count, replace=True))
        last_names = list(Person._last_names.sample(count, replace=True)[0])
        last_names = [s.capitalize() for s in last_names]
        
        ages = np.random.randint(low = 18, high = 90, size = count) 
        days_since = ages * 365 + np.random.randint(low = 0, high = 365, size = count)

        today = pd.to_datetime("today")
        dobs = today - pd.to_timedelta(days_since, unit="D")
        dobs = [d.strftime("%Y-%m-%d") for d in dobs.date]
        
        email_domains = ["gmail", "msn", "hotmail", "yahoo"]
        
        genders = list(pd.Series(np.array(["M", "F"])).sample(count, replace = True))
        
        emails = []
        
        for fname, lname in zip(first_names, last_names):
            domain = email_domains[np.random.randint(0, len(email_domains) - 1, 1)[0]]
            fname = re.sub(r"[^a-z]", "", fname.lower())
            lname = re.sub(r"[^a-z]", "", lname.lower())
            if np.random.random() > 0.5:
                emails.append("%s.%s@%s.com" % (fname, lname, domain))
            else:
                emails.append("%s%s@%s.com" % (fname[0], lname, domain))
        
        addresses = Address.generate_addresses(count)
        
        records = []
        for i in range(count):
            record = Person(first_names[i], last_names[i]
                            , int(ages[i]), dobs[i], emails[i]
                            , genders[i], addresses[i].__dict__)
            records.append(record)
        return records

persons = Person.generate_records(3)
persons

[{"object_name": "person", "id": "N60000000000", "first_name": "Jacksin", "last_name": "C bellm", "age": 28, "dob": "1989-10-18", "email": "jacksin.cbellm@gmail.com", "address": {"object_name": "address", "id": "6B0000000003", "street": "75 NORTH CENTRAL AVENUE", "city": "VALLEY STREAM", "zipcode": 11580.0, "state": "NY", "county": "Nassau"}, "gender": "M"},
 {"object_name": "person", "id": "N60000000001", "first_name": "Char", "last_name": "Symonik", "age": 44, "dob": "1974-01-29", "email": "char.symonik@msn.com", "address": {"object_name": "address", "id": "6B0000000004", "street": "7296 SOUTH BROADWAY", "city": "RED HOOK", "zipcode": 12571.0, "state": "NY", "county": "Dutchess"}, "gender": "M"},
 {"object_name": "person", "id": "N60000000002", "first_name": "Jadey", "last_name": "Willing", "age": 27, "dob": "1990-11-23", "email": "jwilling@hotmail.com", "address": {"object_name": "address", "id": "6B0000000005", "street": "11 Maple Street", "city": "Cohocton", "zipcode": 14826.0, "s

In [11]:
GenericData._record_count_by_object

{'sample_object': 1, 'address': 6, 'person': 3}

In [12]:
pd.DataFrame.from_records([d.__dict__ for d in persons])

Unnamed: 0,address,age,dob,email,first_name,gender,id,last_name,object_name
0,"{'object_name': 'address', 'id': '6B0000000003...",28,1989-10-18,jacksin.cbellm@gmail.com,Jacksin,M,N60000000000,C bellm,person
1,"{'object_name': 'address', 'id': '6B0000000004...",44,1974-01-29,char.symonik@msn.com,Char,M,N60000000001,Symonik,person
2,"{'object_name': 'address', 'id': '6B0000000005...",27,1990-11-23,jwilling@hotmail.com,Jadey,M,N60000000002,Willing,person


In [13]:
class Merchant(GenericData):
    
    _companies = pd.read_csv(base_path + "companylist.csv")
    
    def __init__(self, name):
        GenericData.__init__(self, "merchant")
        self.name = name
    
    def __repr__(self):
        return json.dumps(self.__dict__)

    @staticmethod
    def generate_records(count):
        count = min(count, len(Merchant._companies))
        names = list(Merchant._companies.sample(count, replace = False).Name)
        records = []
        for i in range(count):
            record = Merchant(names[i])
            records.append(record)
        return records
    
Merchant.generate_records(10)    

[{"object_name": "merchant", "id": "ET0000000000", "name": "ImmuCell Corporation"},
 {"object_name": "merchant", "id": "ET0000000001", "name": "Digi International Inc."},
 {"object_name": "merchant", "id": "ET0000000002", "name": "First Trust Nasdaq Bank ETF"},
 {"object_name": "merchant", "id": "ET0000000003", "name": "VictoryShares US 500 Enhanced Volatility Wtd ETF"},
 {"object_name": "merchant", "id": "ET0000000004", "name": "Pacific Mercantile Bancorp"},
 {"object_name": "merchant", "id": "ET0000000005", "name": "Waterstone Financial, Inc."},
 {"object_name": "merchant", "id": "ET0000000006", "name": "Macatawa Bank Corporation"},
 {"object_name": "merchant", "id": "ET0000000007", "name": "Mudrick Capital Acquisition Corporation"},
 {"object_name": "merchant", "id": "ET0000000008", "name": "Good Times Restaurants Inc."},
 {"object_name": "merchant", "id": "ET0000000009", "name": "aTyr Pharma, Inc."}]

In [14]:
GenericData._record_count_by_object

{'sample_object': 1, 'address': 6, 'person': 3, 'merchant': 10}

In [15]:
class Transaction(GenericData):
    
    _types = ["pos", "atm", "mobile", "web"]
    
    def __init__(self, customer_id, merchant_id, amount, timestamp, category):
        super().__init__("transaction")
        self.customer_id = customer_id
        self.merchant_id = merchant_id
        self.amount = amount
        self.timestamp = timestamp
        self.category = category
    
    def __repr__(self):
        return json.dumps(self.__dict__)
    
    @staticmethod
    def generate_records(count):
        customer_ids = GenericData.sample("person", count)
        merchant_ids = GenericData.sample("merchant", count)
        amounts = GenericData.generate_double(0, 10000, [0.1, 0.3, 0.4, 0.4, 0.1, 0.1, 0.01], count)
        categories = list(pd.Series(Transaction._types).sample(count, replace = True))
        

        time_deltas = pd.to_timedelta(np.random.randint(10, 6 * 30 * 24 * 3600, count), "S")        
        timestamps = (pd.to_datetime("today") - time_deltas)
        timestamps = timestamps.astype(np.int32)
        timestamps = list(timestamps)
        
        records = []
        for i in range(count):
            record = Transaction(customer_ids[i], merchant_ids[i], amounts[i], timestamps[i], categories[i])
            records.append(record)
        return records
        
    
transactions = Transaction.generate_records(10)
transactions

[{"object_name": "transaction", "id": "U50000000000", "customer_id": "N60000000001", "merchant_id": "ET0000000005", "amount": 231.98504328702282, "timestamp": 1525608205413159000, "category": "mobile"},
 {"object_name": "transaction", "id": "U50000000001", "customer_id": "N60000000000", "merchant_id": "ET0000000002", "amount": 1801.5257736230167, "timestamp": 1533878763413159000, "category": "pos"},
 {"object_name": "transaction", "id": "U50000000002", "customer_id": "N60000000002", "merchant_id": "ET0000000003", "amount": 2547.376335835061, "timestamp": 1529889604413159000, "category": "atm"},
 {"object_name": "transaction", "id": "U50000000003", "customer_id": "N60000000001", "merchant_id": "ET0000000005", "amount": 1636.9796523131533, "timestamp": 1533141566413159000, "category": "atm"},
 {"object_name": "transaction", "id": "U50000000004", "customer_id": "N60000000000", "merchant_id": "ET0000000007", "amount": 4178.342667466838, "timestamp": 1531074379413159000, "category": "web"},

In [16]:
GenericData._record_count_by_object

{'sample_object': 1,
 'address': 6,
 'person': 3,
 'merchant': 10,
 'transaction': 10}

In [17]:
pd.DataFrame.from_records([d.__dict__ for d in transactions])

Unnamed: 0,amount,category,customer_id,id,merchant_id,object_name,timestamp
0,231.985043,mobile,N60000000001,U50000000000,ET0000000005,transaction,1525608205413159000
1,1801.525774,pos,N60000000000,U50000000001,ET0000000002,transaction,1533878763413159000
2,2547.376336,atm,N60000000002,U50000000002,ET0000000003,transaction,1529889604413159000
3,1636.979652,atm,N60000000001,U50000000003,ET0000000005,transaction,1533141566413159000
4,4178.342667,web,N60000000000,U50000000004,ET0000000007,transaction,1531074379413159000
5,3252.573566,mobile,N60000000000,U50000000005,ET0000000000,transaction,1528994529413159000
6,2957.849746,mobile,N60000000001,U50000000006,ET0000000001,transaction,1531278511413159000
7,5482.510966,atm,N60000000002,U50000000007,ET0000000000,transaction,1523526952413159000
8,5586.395921,pos,N60000000000,U50000000008,ET0000000001,transaction,1535893022413159000
9,4323.461263,web,N60000000000,U50000000009,ET0000000002,transaction,1533872101413159000


In [18]:
%%time 

GenericData.reset()
persons = Person.generate_records(10000)
merchants = Merchant.generate_records(100)
transactions = Transaction.generate_records(10 ** 6)



GenericData.save(persons, "target/customers.json")
GenericData.save(merchants, "target/merchants.json")
GenericData.save(transactions, "target/transactions.json")

print("Record counts: ", GenericData._record_count_by_object)

print(sys.getsizeof(persons), sys.getsizeof(merchants), sys.getsizeof(transactions))

FileNotFoundError: [Errno 2] No such file or directory: 'target/customers.json'

In [19]:
persons_df = pd.DataFrame.from_records([d.__dict__ for d in persons])
persons_df.head()

Unnamed: 0,address,age,dob,email,first_name,gender,id,last_name,object_name
0,"{'object_name': 'address', 'id': '6B0000000000...",65,1953-08-20,dbaltzer@gmail.com,Daishun,M,N60000000000,Baltzer,person
1,"{'object_name': 'address', 'id': '6B0000000001...",55,1962-12-21,sfazanaro@msn.com,Saquoia,F,N60000000001,Fazanaro,person
2,"{'object_name': 'address', 'id': '6B0000000002...",46,1972-06-15,omauri.breeden@hotmail.com,Omauri,M,N60000000002,Breeden,person
3,"{'object_name': 'address', 'id': '6B0000000003...",52,1966-05-11,mhya.lunnjr@hotmail.com,Mhya,F,N60000000003,Lunn jr,person
4,"{'object_name': 'address', 'id': '6B0000000004...",79,1938-12-07,jonquavious.eibeck@msn.com,Jonquavious,F,N60000000004,Eibeck,person


In [20]:
merchants_df = pd.DataFrame.from_records([d.__dict__ for d in merchants])
merchants_df.head()

Unnamed: 0,id,name,object_name
0,ET0000000000,"ICC Holdings, Inc.",merchant
1,ET0000000001,"VIVUS, Inc.",merchant
2,ET0000000002,InflaRx N.V.,merchant
3,ET0000000003,ClearBridge Dividend Strategy ESG ETF,merchant
4,ET0000000004,"People&#39;s United Financial, Inc.",merchant


In [21]:
transactions_df = pd.DataFrame.from_records([d.__dict__ for d in transactions])
transactions_df.head()

Unnamed: 0,amount,category,customer_id,id,merchant_id,object_name,timestamp
0,236.292555,atm,N60000008572,U50000000000,ET0000000059,transaction,1527974038699856000
1,1242.931217,mobile,N60000005230,U50000000001,ET0000000071,transaction,1525030965699856000
2,682.136872,pos,N60000001778,U50000000002,ET0000000052,transaction,1531619173699856000
3,126.790676,mobile,N60000008737,U50000000003,ET0000000085,transaction,1530380374699856000
4,110.359278,mobile,N60000003143,U50000000004,ET0000000097,transaction,1527639255699856000


In [22]:
transactions_df.customer_id.value_counts()

N60000009731    141
N60000005956    140
N60000001544    137
N60000005365    136
N60000006556    135
               ... 
N60000009178     69
N60000007088     68
N60000008923     68
N60000001349     66
N60000005562     60
Name: customer_id, Length: 10000, dtype: int64

In [23]:
transactions_df.customer_id.unique().shape

(10000,)