In [1]:
import pandas as pd 
import numpy as np
import json
import matplotlib.pyplot as plt
import re
import sys

pd.options.display.max_columns = 1000
pd.options.display.max_rows = 10

%matplotlib inline

base_path = "data/"

In [2]:
companies = pd.read_csv(base_path + "companylist.csv")
companies.head()

Unnamed: 0,Symbol,Name,LastSale,MarketCap,ADR TSO,IPOyear,Sector,Industry,Summary Quote,Unnamed: 9
0,YI,"111, Inc.",13.71,98369250.0,7175000.0,2018.0,Health Care,Medical/Nursing Services,https://www.nasdaq.com/symbol/yi,
1,PIH,"1347 Property Insurance Holdings, Inc.",6.25,37404790.0,,2014.0,Finance,Property-Casualty Insurers,https://www.nasdaq.com/symbol/pih,
2,PIHPP,"1347 Property Insurance Holdings, Inc.",25.3101,0.0,,,Finance,Property-Casualty Insurers,https://www.nasdaq.com/symbol/pihpp,
3,TURN,180 Degree Capital Corp.,2.25,70023510.0,,,Finance,Finance/Investors Services,https://www.nasdaq.com/symbol/turn,
4,FLWS,"1-800 FLOWERS.COM, Inc.",10.65,688032800.0,,1999.0,Consumer Services,Other Specialty Stores,https://www.nasdaq.com/symbol/flws,


In [3]:
names = pd.read_csv(base_path + "first_names.csv")
names.sample(10)

Unnamed: 0,Name,Gender,Count
27106,Dontrice,F,6
44250,Jerianne,F,116
47714,Kaamilah,F,10
93845,Ticarra,F,5
81808,Saboor,M,16
75723,Pierrette,F,212
36293,Hattie,M,113
77616,Ramyia,F,38
59976,Lindy,M,1153
57316,Lashai,F,210


In [4]:
addresses = pd.read_csv(base_path + "addresses.csv")
addresses.head()

Unnamed: 0,Name of Institution,Street Address,City,ZIP Code,County,Location 1
0,Abacus Federal Savings Bank,36-30 Main Street,Flushing,11354.0,Queens,"36 30 Main Street\nFlushing, NY 11354\n"
1,Adirondack Bank,448 Route 3,Plattsburgh,12901.0,Clinton,"448 Route\nPlattsburgh, NY 12901\n"
2,Adirondack Bank,13150 State Route 12,Boonville,13309.0,Oneida,"13150 State Route\nBoonville, NY 13309\n"
3,Adirondack Bank,Utica College,Utica,13501.0,Oneida,
4,"Adirondack Trust Company, The",112 Broadway,Saratoga Springs,12866.0,Saratoga,"112 Broadway\nSaratoga Springs, NY 12866\n"


In [5]:
domains = pd.read_csv(base_path + "top_domains.csv", header=None)[1]
domains.head(10)

0       google.com
1      youtube.com
2     facebook.com
3        baidu.com
4    wikipedia.org
5        yahoo.com
6           qq.com
7       taobao.com
8        tmall.com
9     google.co.in
Name: 1, dtype: object

In [6]:
class GenericData():
    
    _record_count_by_object = {}
    _code_by_object_name = {}
    
    def __init__(self, object_name):
        self.object_name = object_name
        count = GenericData._record_count_by_object.get(object_name, 0)
        object_code = GenericData._to_object_code(object_name)
        self.id =  "%s%010d" % (object_code, count)
        GenericData._record_count_by_object[object_name] = count + 1
            
    def __repr__(self):
        return self.id
    
    @staticmethod
    def save(records, filename):
        with open(filename, "w") as f:
            for r in records:
                f.write(str(r) + "\n")
    
    @staticmethod
    def reset():
        GenericData._record_count_by_object.clear()
        GenericData._code_by_object_name.clear()

    @staticmethod
    def sample(object_name, count):
        if not object_name in GenericData._code_by_object_name:
            raise ValueError("Invalid object name %s" % object_name) 
        
        object_code = GenericData._code_by_object_name[object_name]
        record_count = GenericData._record_count_by_object[object_name]
        indices = pd.Series(np.arange(record_count)).sample(count, replace = True)
        object_ids = ["%s%010d" % (object_code, i) for i in indices]
        return object_ids
    
    @staticmethod
    def rand_by_range(lower, upper, count): 
        return np.random.random(int(count)) * (upper - lower) + lower

    @staticmethod
    def generate_double(minimum, maximum, bins, count = None):
        bin_count = len(bins)
        
        if count is not None:
            bins = np.array(bins)
            bins = np.ceil((bins / np.sum(bins) * count))
        
        intervals = np.linspace(minimum, maximum, bin_count + 1)
        y = np.arange(0)
        for i in range(bin_count):
            lower, upper = intervals[i], intervals[i+1]
            count = bins[i]
            y = np.concatenate((y, GenericData.rand_by_range(lower, upper, count)))
        return list(np.array(y))
    
    @staticmethod
    def _to_object_code(s):
        import string
        
        if s in GenericData._code_by_object_name:
            return GenericData._code_by_object_name[s]
        
        alpha_digits =  string.digits + string.ascii_uppercase
        v = hash(s.strip().lower()) % (len(alpha_digits) ** 2)
        digit1 = v // 36
        digit0 = v % 36
        object_code = "".join([alpha_digits[digit1], alpha_digits[digit0]])
        GenericData._code_by_object_name[s] = object_code
        return object_code

sample = GenericData("sample_object")
sample.id

'7N0000000000'

In [7]:
class Address(GenericData):
    
    _addresses = pd.read_csv(base_path + "addresses.csv")
    
    def __init__(self, street, city, zipcode, state, county = None):
        GenericData.__init__(self, "address")
        self.street = street
        self.city = city 
        self.zipcode = zipcode
        self.state = state
        self.county = county
    
    def __repr__(self):
        record = (self.id, self.street, self.city, self.zipcode, self.state, self.county)
        return json.dumps(self.__dict__)
   

    @staticmethod
    def generate_addresses(count):
        addresses = Address._addresses
        addresses = addresses.sample(count, replace = True)
        records = []
        for i in range(count):
            record = Address(
                list(addresses["Street Address"])[i],
                list(addresses["City"])[i],
                list(addresses["ZIP Code"])[i],
                "NY",
                list(addresses["County"])[i],
            )
            records.append(record)
        return records
            
addresses = Address.generate_addresses(3)  
addresses

[{"object_name": "address", "id": "G60000000000", "street": "3875 Merrick Road", "city": "Seaford", "zipcode": 11783.0, "state": "NY", "county": "Nassau"},
 {"object_name": "address", "id": "G60000000001", "street": "2 Wall Street", "city": "New York", "zipcode": 10005.0, "state": "NY", "county": "New York"},
 {"object_name": "address", "id": "G60000000002", "street": "4031 Route 31", "city": "Liverpool", "zipcode": 13090.0, "state": "NY", "county": "Onondaga"}]

In [8]:
GenericData._record_count_by_object

{'sample_object': 1, 'address': 3}

In [9]:
class Person(GenericData):
    
    _first_names = pd.Series(pd.read_csv(base_path + "first_names.csv").Name.unique())
    _last_names = pd.read_csv(base_path + "last_names.csv", header = None)
    
    def __init__(self, first_name, last_name, age, dob, email, gender, address):
        GenericData.__init__(self, "person")
        self.first_name = first_name
        self.last_name = last_name
        self.age = age
        self.dob = dob
        self.email = email
        self.address = address
        self.gender = gender
    
    def __repr__(self):
        record = (self.id, self.first_name, self.last_name
                      , self.dob, self.age, self.email, self.gender, self.address)
        return json.dumps(self.__dict__)
        
    @staticmethod
    def generate_records(count):
        
        first_names = list(Person._first_names.sample(count, replace=True))
        last_names = list(Person._last_names.sample(count, replace=True)[0])
        last_names = [s.capitalize() for s in last_names]
        
        ages = np.random.randint(low = 18, high = 90, size = count) 
        days_since = ages * 365 + np.random.randint(low = 0, high = 365, size = count)

        today = pd.to_datetime("today")
        dobs = today - pd.to_timedelta(days_since, unit="D")
        dobs = [d.strftime("%Y-%m-%d") for d in dobs.date]
        
        email_domains = ["gmail", "msn", "hotmail", "yahoo"]
        
        genders = list(pd.Series(np.array(["M", "F"])).sample(count, replace = True))
        
        emails = []
        
        for fname, lname in zip(first_names, last_names):
            domain = email_domains[np.random.randint(0, len(email_domains) - 1, 1)[0]]
            fname = re.sub(r"[^a-z]", "", fname.lower())
            lname = re.sub(r"[^a-z]", "", lname.lower())
            if np.random.random() > 0.5:
                emails.append("%s.%s@%s.com" % (fname, lname, domain))
            else:
                emails.append("%s%s@%s.com" % (fname[0], lname, domain))
        
        addresses = Address.generate_addresses(count)
        
        records = []
        for i in range(count):
            record = Person(first_names[i], last_names[i]
                            , int(ages[i]), dobs[i], emails[i]
                            , genders[i], addresses[i].__dict__)
            records.append(record)
        return records

persons = Person.generate_records(3)
persons

[{"object_name": "person", "id": "VK0000000000", "first_name": "Alonte", "last_name": "Czoske", "age": 49, "dob": "1969-01-13", "email": "alonte.czoske@gmail.com", "address": {"object_name": "address", "id": "G60000000003", "street": "3767 75th Street", "city": "Jackson Heights", "zipcode": 11372.0, "state": "NY", "county": "Queens"}, "gender": "M"},
 {"object_name": "person", "id": "VK0000000001", "first_name": "Alizet", "last_name": "Mckitting", "age": 47, "dob": "1971-08-28", "email": "alizet.mckitting@hotmail.com", "address": {"object_name": "address", "id": "G60000000004", "street": "2 Park Avenue", "city": "Yonkers", "zipcode": 10703.0, "state": "NY", "county": "Westchester"}, "gender": "F"},
 {"object_name": "person", "id": "VK0000000002", "first_name": "Kearria", "last_name": "Nimmich", "age": 57, "dob": "1961-02-26", "email": "knimmich@msn.com", "address": {"object_name": "address", "id": "G60000000005", "street": "518 Brighton Beach Avenue", "city": "Brooklyn", "zipcode": 112

In [10]:
GenericData._record_count_by_object

{'sample_object': 1, 'address': 6, 'person': 3}

In [11]:
pd.DataFrame.from_records([d.__dict__ for d in persons])

Unnamed: 0,address,age,dob,email,first_name,gender,id,last_name,object_name
0,"{'object_name': 'address', 'id': 'G60000000003...",49,1969-01-13,alonte.czoske@gmail.com,Alonte,M,VK0000000000,Czoske,person
1,"{'object_name': 'address', 'id': 'G60000000004...",47,1971-08-28,alizet.mckitting@hotmail.com,Alizet,F,VK0000000001,Mckitting,person
2,"{'object_name': 'address', 'id': 'G60000000005...",57,1961-02-26,knimmich@msn.com,Kearria,F,VK0000000002,Nimmich,person


In [12]:
class Merchant(GenericData):
    
    _companies = pd.read_csv(base_path + "companylist.csv")
    
    def __init__(self, name):
        GenericData.__init__(self, "merchant")
        self.name = name
    
    def __repr__(self):
        return json.dumps(self.__dict__)

    @staticmethod
    def generate_records(count):
        count = min(count, len(Merchant._companies))
        names = list(Merchant._companies.sample(count, replace = False).Name)
        records = []
        for i in range(count):
            record = Merchant(names[i])
            records.append(record)
        return records
    
Merchant.generate_records(10)    

[{"object_name": "merchant", "id": "WN0000000000", "name": "Zogenix, Inc."},
 {"object_name": "merchant", "id": "WN0000000001", "name": "Canterbury Park Holding Corporation"},
 {"object_name": "merchant", "id": "WN0000000002", "name": "LexinFintech Holdings Ltd."},
 {"object_name": "merchant", "id": "WN0000000003", "name": "BMC Stock Holdings, Inc."},
 {"object_name": "merchant", "id": "WN0000000004", "name": "Diversified Restaurant Holdings, Inc."},
 {"object_name": "merchant", "id": "WN0000000005", "name": "Gores Holdings II, Inc."},
 {"object_name": "merchant", "id": "WN0000000006", "name": "Powell Industries, Inc."},
 {"object_name": "merchant", "id": "WN0000000007", "name": "FVCBankcorp, Inc."},
 {"object_name": "merchant", "id": "WN0000000008", "name": "Digirad Corporation"},
 {"object_name": "merchant", "id": "WN0000000009", "name": "Synopsys, Inc."}]

In [13]:
GenericData._record_count_by_object

{'sample_object': 1, 'address': 6, 'person': 3, 'merchant': 10}

In [14]:
class Transaction(GenericData):
    
    _types = ["pos", "atm", "mobile", "web"]
    
    def __init__(self, customer_id, merchant_id, amount, timestamp, category):
        super().__init__("transaction")
        self.customer_id = customer_id
        self.merchant_id = merchant_id
        self.amount = amount
        self.timestamp = timestamp
        self.category = category
    
    def __repr__(self):
        return json.dumps(self.__dict__)
    
    @staticmethod
    def generate_records(count):
        customer_ids = GenericData.sample("person", count)
        merchant_ids = GenericData.sample("merchant", count)
        amounts = GenericData.generate_double(0, 10000, [0.1, 0.3, 0.4, 0.4, 0.1, 0.1, 0.01], count)
        categories = list(pd.Series(Transaction._types).sample(count, replace = True))
        

        time_deltas = pd.to_timedelta(np.random.randint(10, 6 * 30 * 24 * 3600, count), "S")        
        timestamps = (pd.to_datetime("today") - time_deltas)
        timestamps = timestamps.astype(np.int32)
        timestamps = list(timestamps)
        
        records = []
        for i in range(count):
            record = Transaction(customer_ids[i], merchant_ids[i], amounts[i], timestamps[i], categories[i])
            records.append(record)
        return records
        
    
transactions = Transaction.generate_records(10)
transactions

[{"object_name": "transaction", "id": "TT0000000000", "customer_id": "VK0000000000", "merchant_id": "WN0000000008", "amount": 1359.6377565426803, "timestamp": 1534750387115181000, "category": "mobile"},
 {"object_name": "transaction", "id": "TT0000000001", "customer_id": "VK0000000000", "merchant_id": "WN0000000009", "amount": 2594.401172543816, "timestamp": 1532570113115181000, "category": "atm"},
 {"object_name": "transaction", "id": "TT0000000002", "customer_id": "VK0000000002", "merchant_id": "WN0000000005", "amount": 2188.7099092215312, "timestamp": 1530909070115181000, "category": "atm"},
 {"object_name": "transaction", "id": "TT0000000003", "customer_id": "VK0000000001", "merchant_id": "WN0000000004", "amount": 2106.4202775144504, "timestamp": 1523256964115181000, "category": "web"},
 {"object_name": "transaction", "id": "TT0000000004", "customer_id": "VK0000000001", "merchant_id": "WN0000000003", "amount": 3283.8036872558678, "timestamp": 1529009094115181000, "category": "pos"}

In [15]:
GenericData._record_count_by_object

{'sample_object': 1,
 'address': 6,
 'person': 3,
 'merchant': 10,
 'transaction': 10}

In [16]:
pd.DataFrame.from_records([d.__dict__ for d in transactions])

Unnamed: 0,amount,category,customer_id,id,merchant_id,object_name,timestamp
0,1359.637757,mobile,VK0000000000,TT0000000000,WN0000000008,transaction,1534750387115181000
1,2594.401173,atm,VK0000000000,TT0000000001,WN0000000009,transaction,1532570113115181000
2,2188.709909,atm,VK0000000002,TT0000000002,WN0000000005,transaction,1530909070115181000
3,2106.420278,web,VK0000000001,TT0000000003,WN0000000004,transaction,1523256964115181000
4,3283.803687,pos,VK0000000001,TT0000000004,WN0000000003,transaction,1529009094115181000
5,3837.6669,mobile,VK0000000000,TT0000000005,WN0000000006,transaction,1524834684115181000
6,4104.473256,atm,VK0000000002,TT0000000006,WN0000000006,transaction,1532121048115181000
7,5184.258758,web,VK0000000000,TT0000000007,WN0000000006,transaction,1534429845115181000
8,4455.888758,web,VK0000000001,TT0000000008,WN0000000004,transaction,1526239088115181000
9,4439.576126,atm,VK0000000001,TT0000000009,WN0000000000,transaction,1537714534115181000


In [17]:
%%time 

GenericData.reset()
persons = Person.generate_records(10000)
merchants = Merchant.generate_records(100)
transactions = Transaction.generate_records(10 ** 6)

GenericData.save(persons, "target/customers.json")
GenericData.save(merchants, "target/merchants.json")
GenericData.save(transactions, "target/transactions.json")

print("Record counts: ", GenericData._record_count_by_object)

print(sys.getsizeof(persons), sys.getsizeof(merchants), sys.getsizeof(transactions))

Record counts:  {'address': 10000, 'person': 10000, 'merchant': 100, 'transaction': 1000000}
87624 912 8697464
CPU times: user 14.2 s, sys: 297 ms, total: 14.5 s
Wall time: 13.1 s


In [18]:
persons_df = pd.DataFrame.from_records([d.__dict__ for d in persons])
persons_df.head()

Unnamed: 0,address,age,dob,email,first_name,gender,id,last_name,object_name
0,"{'object_name': 'address', 'id': 'G60000000000...",53,1965-01-06,dimitrios.klepcyk@gmail.com,Dimitrios,F,VK0000000000,Klepcyk,person
1,"{'object_name': 'address', 'id': 'G60000000001...",32,1986-10-05,kevis.wehrisch@msn.com,Kevis,F,VK0000000001,Wehrisch,person
2,"{'object_name': 'address', 'id': 'G60000000002...",41,1977-05-31,svaconsin@msn.com,Saif,M,VK0000000002,Vaconsin,person
3,"{'object_name': 'address', 'id': 'G60000000003...",63,1955-04-29,ikarnowski@gmail.com,Ion,M,VK0000000003,Karnowski,person
4,"{'object_name': 'address', 'id': 'G60000000004...",25,1993-01-27,mstenkamp@gmail.com,Medger,M,VK0000000004,Stenkamp,person


In [19]:
merchants_df = pd.DataFrame.from_records([d.__dict__ for d in merchants])
merchants_df.head()

Unnamed: 0,id,name,object_name
0,WN0000000000,KBL Merger Corp. IV,merchant
1,WN0000000001,Atossa Genetics Inc.,merchant
2,WN0000000002,Oconee Federal Financial Corp.,merchant
3,WN0000000003,Oaktree Strategic Income Corporation,merchant
4,WN0000000004,BiondVax Pharmaceuticals Ltd.,merchant


In [20]:
transactions_df = pd.DataFrame.from_records([d.__dict__ for d in transactions])
transactions_df.head()

Unnamed: 0,amount,category,customer_id,id,merchant_id,object_name,timestamp
0,660.649449,web,VK0000007884,TT0000000000,WN0000000035,transaction,1531125445687419000
1,1239.374513,pos,VK0000004080,TT0000000001,WN0000000058,transaction,1534957062687419000
2,421.601186,web,VK0000005437,TT0000000002,WN0000000052,transaction,1538544953687419000
3,5.058134,atm,VK0000006732,TT0000000003,WN0000000088,transaction,1525329583687419000
4,986.432518,mobile,VK0000006496,TT0000000004,WN0000000076,transaction,1526370274687419000


In [21]:
transactions_df.customer_id.value_counts()

VK0000000065    135
VK0000003423    135
VK0000009700    134
VK0000005048    134
VK0000003417    134
               ... 
VK0000002045     70
VK0000004214     70
VK0000006578     70
VK0000000437     67
VK0000007087     61
Name: customer_id, Length: 10000, dtype: int64

In [22]:
transactions_df.customer_id.unique().shape

(10000,)