In [3]:
import datetime as dt
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

In [5]:
## Getting all US postal codes
# Download the state/city/ZIP listing and keep only the first <table> on the page.
# The timeout and the status check turn a hung or failing request into an immediate,
# explicit HTTP error instead of a confusing parse failure further down.
zip_page = requests.get('https://www.phaster.com/zip_code.html', timeout = 30)
zip_page.raise_for_status()
zip_codes = bs(zip_page.text, 'lxml').find('table')
if zip_codes is None:
    raise ValueError('no <table> element found in the ZIP code page')
# read_html wants a file-like object; passing literal HTML is deprecated in pandas >= 2.1.
zip_codes = pd.read_html(StringIO(str(zip_codes)), header = 0)[0]
zip_codes.columns = ['State','City','ZIP']

# Rows whose ZIP cell contains the word 'thru' are split off and expanded separately.
has_thru = zip_codes.ZIP.str.contains('thru')
zip_codes_corrections = zip_codes[has_thru]
zip_codes = zip_codes[~has_thru]

corrections_dict = {'State': [],
                    'ZIP': []}

for state, zip_cell in zip(zip_codes_corrections.State, zip_codes_corrections.ZIP):
    # Entries inside one cell are separated by non-breaking spaces.
    for ziprange in zip_cell.split(u'\xa0'):
        if 'thru' in ziprange:
            # 'A thru B' is expanded to every integer from A to B inclusive.
            bounds = ziprange.split(' thru ')
            for code in range(int(bounds[0]), int(bounds[1]) + 1):
                corrections_dict['State'].append(state)
                corrections_dict['ZIP'].append(str(code).zfill(5))
        else:
            # Without 'thru', each ' - '-separated piece is stored as its own ZIP
            # (left-padded with zeros to five characters, no whitespace stripping).
            for code in ziprange.split(' - '):
                corrections_dict['State'].append(state)
                corrections_dict['ZIP'].append(code.zfill(5))

zip_codes_corrections = pd.DataFrame(corrections_dict)

zips_dict = {'State': [],
             'ZIP': []}

for state, zip_cell in zip(zip_codes.State, zip_codes.ZIP):
    for ziprange in zip_cell.split(u'\xa0'):
        # Each '-'-separated piece becomes one ZIP: the pieces are kept as-is,
        # not expanded into a range.
        # NOTE(review): confirm that 'A-B' here is a pair of codes and not a range.
        for code in ziprange.split('-'):
            zips_dict['State'].append(state)
            zips_dict['ZIP'].append(code.strip(' ').zfill(5))
zip_codes = pd.DataFrame(zips_dict)

# Final State/ZIP table: plain rows first, then the expanded 'thru' rows.
zip_codes = pd.concat([zip_codes, zip_codes_corrections], axis = 0).reset_index(drop = True)

In [26]:
## Generating Random Customers

# Number of synthetic customers. One name is shared by the ZIP sample and the
# Model draw so the two lengths cannot drift apart when the size is changed.
n_customers = 10000

# Each customer is a State/ZIP row drawn with replacement from zip_codes.
customers = zip_codes.sample(n_customers, replace = True).reset_index(drop = True)
# Model ids 0-9, drawn with the skewed probabilities below (they sum to 1).
customers['Model'] = np.random.choice(10, n_customers,
                                      p = [0.5, 0.25, 0.1, 0.05, 0.02, 0.02, 0.02, 0.02, 0.01, 0.01])
# Customer id is simply the row position after the index reset.
customers['cid'] = customers.index
# Keep the text that follows the first '(' and drop every ')'.
# Assumes State looks like 'Name (XX)' -- TODO confirm against the scraped table;
# a value without '(' raises IndexError here.
# regex = False makes ')' a literal on every pandas version instead of relying on
# the version-dependent default (an unbalanced ')' is not a valid regex).
customers['State'] = (customers.State.str.split('(')
                      .apply(lambda parts: parts[1])
                      .str.replace(')', '', regex = False))

In [27]:
## Generating random datetimes
def random_dates(start, end, n=10):
    """Draw ``n`` random timestamps from ``[start, end)`` at one-second resolution.

    Parameters
    ----------
    start, end : pandas.Timestamp or anything ``pd.Timestamp()`` accepts
        Lower bound (inclusive) and upper bound (exclusive) of the sampling window.
    n : int, default 10
        Number of timestamps to generate.

    Returns
    -------
    pandas.DatetimeIndex
        ``n`` timestamps drawn with ``np.random.randint`` on whole epoch seconds.

    Raises
    ------
    ValueError
        From ``np.random.randint`` when ``start`` is not strictly before ``end``
        (after truncation to whole seconds).
    """
    # pd.Timestamp() is a no-op for Timestamps and lets callers pass strings or datetimes.
    # .value is nanoseconds since the epoch; floor-divide down to whole seconds.
    start_u = pd.Timestamp(start).value // 10**9
    end_u = pd.Timestamp(end).value // 10**9

    # Ask for int64 explicitly: the platform default integer can be 32-bit
    # (Windows with NumPy < 2), which rejects epoch seconds past January 2038.
    return pd.to_datetime(np.random.randint(start_u, end_u, n, dtype=np.int64), unit='s')


In [28]:
## Generating fake calls
# Sampling window for call timestamps: start inclusive, end exclusive.
start = pd.to_datetime('2018-01-01')
end = pd.to_datetime('2019-01-01')

# Number of simulated customer contacts before filtering.
simulations = 10000000
# Draw customers with replacement; the same customer can appear many times.
calls = customers.sample(simulations, replace = True)

# Flag each simulated contact as an issue (1) with probability 0.2, then keep only those.
calls['Issue'] = np.random.choice(2, simulations, p = [0.8, 0.2])
calls = calls[calls.Issue == 1].reset_index(drop = True)
# One random timestamp per remaining call.
calls['CallDate'] = random_dates(start, end, len(calls))

In [29]:
# calls.to_csv('calls_sim.csv')
# customers.to_csv('cids.csv')