In [1]:
import random
import os
import csv 
from faker import Faker 
import itertools

In [2]:
# Root folder of the generated data sets, relative to the notebook's working directory.
data_dir = '../../../data'
table_name = 'fact_sales'

# Each table is stored in its own folder named "pybrew_<table_name>".
output_dir = os.path.join(data_dir, f'pybrew_{table_name}')

# exist_ok=True keeps the cell safe to re-run and removes the race between
# checking for the folder and creating it.
os.makedirs(output_dir, exist_ok=True)

# Destination file for the generated fact table.
csv_location = os.path.join(output_dir, f'{table_name}.csv')

## Part 1. Gather All Possible Foreign Key Values From Dim Tables

In [3]:
# Sanity check: each dim file name should resolve to the expected CSV path.
file_names = ['dim_customer.csv', 'dim_date.csv', 'dim_product.csv']
for file_name in file_names:
    # The folder is "pybrew_" followed by the file name up to its first dot.
    file_location = os.path.join(
        data_dir,
        "pybrew_" + file_name.partition('.')[0],
        file_name,
    )
    print(file_location)

../../../data/pybrew_dim_customer/dim_customer.csv
../../../data/pybrew_dim_date/dim_date.csv
../../../data/pybrew_dim_product/dim_product.csv


In [4]:

# Sample run: read the dim_customer column that fact_sales references via
# foreign key, before generalizing the approach to every dim table.
# The path is built from data_dir so it stays in sync with the other cells.
dim_customer_csv_location = os.path.join(data_dir, 'pybrew_dim_customer', 'dim_customer.csv')

# newline='' is how the csv module expects files to be opened, so that
# newlines embedded in quoted fields are parsed correctly.
with open(dim_customer_csv_location, 'r', newline='') as f:
    reader = csv.reader(f)

    # Skip the header row; the None default avoids StopIteration on an empty file.
    next(reader, None)

    # Despite its name, this list holds the first-column value of every data
    # row (the ids), not column names. Blank lines yield empty rows and are skipped.
    column_names = [row[0] for row in reader if row]

print(column_names[:10])
    

['100', '101', '102', '103', '104', '105', '106', '107', '108', '109']


In case of future additions, iterate over the existing foreign keys to collect the foreign key column values that need to be passed to **faker** to generate `fact_sales` table data. 

In [5]:
def read_first_column(csv_path):
    """Return the first-column value of every data row in a CSV file.

    The first line is treated as a header and skipped. Blank lines are ignored.

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        list[str]: The first field of each non-empty data row, in file order.
        An empty list is returned when the file has no data rows.

    Raises:
        OSError: If the file cannot be opened.
        csv.Error: If the csv module cannot parse the file.
    """
    # newline='' is how the csv module expects files to be opened.
    with open(csv_path, 'r', newline='') as csvfile:
        reader = csv.reader(csvfile)

        # Skip the header row; the None default tolerates a completely empty file.
        next(reader, None)

        return [row[0] for row in reader if row]


# One list per dim table, holding the ids that fact_sales references.
dim_customer_ids = list()
dim_date_ids = list()
dim_product_ids = list()

# Keep the lists in the same order as file_names below.
foreign_key_id_list = [dim_customer_ids, dim_date_ids, dim_product_ids]

file_names = ['dim_customer.csv', 'dim_date.csv', 'dim_product.csv']

try:
    for id_list, file_name in zip(foreign_key_id_list, file_names):
        # Folder is "pybrew_<file name without extension>".
        file_location = os.path.join(data_dir, f"pybrew_{file_name.split('.')[0]}", file_name)

        # Extend only after the whole file was read, so a failing file never
        # leaves its list half-filled.
        id_list.extend(read_first_column(file_location))
except Exception as e:
    # Best-effort reporting: the error is shown and the notebook keeps running.
    # NOTE: lists for files that were not loaded stay empty.
    print(e)
else:
    # Preview the first ids of every list.
    for id_list in foreign_key_id_list:
        print(id_list[:10])


['100', '101', '102', '103', '104', '105', '106', '107', '108', '109']
['1', '2', '3', '4', '5', '6', '7', '8', '9', '10']
['HOT:COF:CAP:1000', 'HOT:TEA:HER:1001', 'HOT:COF:MOC:1002', 'HOT:TEA:HER:1003', 'COL:ICE:ICE:1004', 'COL:ICE:ICE:1005', 'HOT:TEA:GRE:1006', 'RET:MUG:PYB:1007', 'SNA:PAC:CHE:1008', 'RET:MUG:PYB:1009']


## Part 2. Generate ***fact_sales*** Data.

In [6]:
# Create the Faker generator instance used to produce fake values in this notebook.
fake = Faker()

In [7]:
# Demo: ask Faker's pyfloat for one random value bounded by 5.00 and 50.00.
sales_amount = fake.pyfloat(
    min_value=5.00,
    max_value=50.00,
)

# Coerce the value to a built-in float, then show it.
float_sales_amount = float(sales_amount)
print(float_sales_amount)

38.626538


In [8]:
field_names = ['sales_id', 'sales_amount', 'customer_id', 'date_id', 'product_id']

# Generation parameters, named so they can be tuned in one place.
NUM_SALES_ROWS = 10_000
FIRST_SALES_ID = 100_000
MIN_SALES_AMOUNT = 5.00
MAX_SALES_AMOUNT = 50.00
RANDOM_SEED = 42  # fixed seed => identical file on every run; set to None for fresh data


def generate_fact_sales_rows(row_count, customer_ids, date_ids, product_ids, rng,
                             first_sales_id=100_000, min_amount=5.00, max_amount=50.00):
    """Build an iterator of random fact_sales rows.

    Args:
        row_count: Number of rows to produce.
        customer_ids: Non-empty sequence of values to draw customer_id from.
        date_ids: Non-empty sequence of values to draw date_id from.
        product_ids: Non-empty sequence of values to draw product_id from.
        rng: A random.Random instance used for every random draw.
        first_sales_id: sales_id of the first row; later rows count up by one.
        min_amount: Lower bound passed to rng.uniform for sales_amount.
        max_amount: Upper bound passed to rng.uniform for sales_amount.

    Returns:
        An iterator of dicts keyed by sales_id, sales_amount, customer_id,
        date_id and product_id. sales_amount is rounded to 2 decimals.

    Raises:
        ValueError: If any of the id sequences is empty. Raised immediately,
            before any row is produced.
    """
    # Validate eagerly so the caller learns about missing ids before it
    # opens (and truncates) the output file.
    pools = (
        ('customer_id', customer_ids),
        ('date_id', date_ids),
        ('product_id', product_ids),
    )
    for column, pool in pools:
        if not pool:
            raise ValueError(f'No values available for {column}; load the dim tables first.')

    def _rows():
        for offset in range(row_count):
            yield {
                'sales_id': first_sales_id + offset,
                'sales_amount': round(rng.uniform(min_amount, max_amount), 2),
                'customer_id': rng.choice(customer_ids),
                'date_id': rng.choice(date_ids),
                'product_id': rng.choice(product_ids),
            }

    return _rows()


try:
    # A dedicated generator keeps the output reproducible without touching
    # the global random state used elsewhere.
    rng = random.Random(RANDOM_SEED)

    # Built (and validated) before the file is opened so a failure here
    # leaves any existing CSV untouched.
    sales_rows = generate_fact_sales_rows(
        NUM_SALES_ROWS,
        dim_customer_ids,
        dim_date_ids,
        dim_product_ids,
        rng,
        first_sales_id=FIRST_SALES_ID,
        min_amount=MIN_SALES_AMOUNT,
        max_amount=MAX_SALES_AMOUNT,
    )

    with open(csv_location, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=field_names)
        writer.writeheader()
        writer.writerows(sales_rows)
except Exception as e:
    # Best-effort reporting: show the error instead of a traceback.
    print(e)
else:
    print('Success!')

Success!
