<h2><center>STAGE 2 - CREATING DIMENSIONS</center></h2>

---

In [1]:
import os
import pandas as pd
import itertools

In [2]:
from utils.dates import get_trimester, get_week_day, get_day_type, get_national_holiday_indicator

In [3]:
from utils.data_warehouse_mappings import add_surrogate_key_dimension_table, map_surrogate_to_natural_key, \
    map_surrogate_to_natural_key_junk_dimension, save_lookup_to_json

In [4]:
from utils.constants import state_mappings

In [5]:
from utils.constants import processed_dataset_folder, customers_processed_dataset, geolocation_processed_dataset, \
    items_processed_dataset, payments_processed_dataset, reviews_processed_dataset, orders_processed_dataset, \
        products_processed_dataset, sellers_processed_dataset, category_processed_dataset

In [6]:
from utils.constants import lookup_tables_folder, lookup_table_customer, lookup_table_seller, lookup_table_product, \
    lookup_table_geolocation, lookup_table_customer_geolocation, lookup_table_seller_geolocation, lookup_table_order_indicator, \
        lookup_table_date

In [7]:
from utils.constants import dimension_tables_folder, dim_customer_file, dim_seller_file, dim_product_file, dim_geolocation_file, \
    dim_customer_geolocation_file, dim_seller_geolocation_file, dim_order_indicator_file, dim_date_file

<h3>1. READING THE PROCESSED CSV FILES FROM STAGE 1</h3>

In [8]:
def _read_processed(file_name):
    """Load one Stage 1 CSV from the processed-dataset folder into a DataFrame."""
    return pd.read_csv(os.path.join(processed_dataset_folder, file_name))


# One DataFrame per processed source table; the file names come from utils.constants.
customers = _read_processed(customers_processed_dataset)
geolocation = _read_processed(geolocation_processed_dataset)
items = _read_processed(items_processed_dataset)
payments = _read_processed(payments_processed_dataset)
reviews = _read_processed(reviews_processed_dataset)
orders = _read_processed(orders_processed_dataset)
products = _read_processed(products_processed_dataset)
sellers = _read_processed(sellers_processed_dataset)
category = _read_processed(category_processed_dataset)

<h3>2. CREATING DATAFRAMES AND LOOKUP TABLES FOR EACH DIMENSION</h3>

<h4>Dimension <sup>dimGeolocation</sup></h4>

In [9]:
# Work on an independent copy so the Stage 1 `geolocation` frame stays untouched.
dim_geolocation = geolocation.copy(deep=True)

In [10]:
# Preview the first five rows before any reshaping.
dim_geolocation.head(n=5)

Unnamed: 0,zip_code_prefix,latitude,longitude,city,state
0,1037,-23.545411,-46.638924,Sao Paulo,SP
1,1046,-23.54593,-46.643998,Sao Paulo,SP
2,1041,-23.543905,-46.640046,Sao Paulo,SP
3,1035,-23.541614,-46.641549,Sao Paulo,SP
4,1012,-23.54779,-46.634859,Sao Paulo,SP


In [11]:
# Distinct values found in the `state` column.
state_names = dim_geolocation.loc[:, 'state'].unique()

In [12]:
# Display the distinct state codes so they can be compared with the keys of `state_mappings`.
state_names

array(['SP', 'RJ', 'ES', 'MG', 'BA', 'SE', 'PE', 'RN', 'AL', 'PB', 'CE',
       'PI', 'MA', 'PA', 'AP', 'AM', 'RR', 'AC', 'DF', 'GO', 'RO', 'TO',
       'MT', 'MS', 'PR', 'SC', 'RS'], dtype=object)

In [13]:
# Swap every state value for its entry in `state_mappings` (imported from utils.constants);
# values that have no entry in the mapping are left unchanged by `replace`.
dim_geolocation = dim_geolocation.assign(
    state=dim_geolocation['state'].replace(state_mappings)
)

In [14]:
# Positional relabelling: the labels must follow the current column order exactly.
dim_geolocation.columns = [
    'ZIP_CODE_PREFIX',  # zip_code_prefix
    'LATITUDE',         # latitude
    'LONGITUDE',        # longitude
    'CITY',             # city
    'STATE',            # state
]

In [15]:
# Order rows by zip code prefix, then renumber the index from 0 so it follows the new order.
dim_geolocation = dim_geolocation.sort_values(by='ZIP_CODE_PREFIX')
dim_geolocation = dim_geolocation.reset_index(drop=True)

In [16]:
# Final column order of the dimension: natural key, descriptive attributes, coordinates.
dim_geolocation = dim_geolocation.loc[
    :, ['ZIP_CODE_PREFIX', 'CITY', 'STATE', 'LATITUDE', 'LONGITUDE']
]

In [17]:
# Uniqueness check on the natural key: False means no ZIP_CODE_PREFIX appears more than once.
dim_geolocation['ZIP_CODE_PREFIX'].duplicated().any()

False

In [18]:
# Add the GEOLOCATION_KEY surrogate-key column through the project helper. The return value
# is discarded, so the helper is relied on to modify `dim_geolocation` in place (the displayed
# dimension shows the key as the first column, starting at 1).
# NOTE(review): running this cell twice may therefore not be safe - confirm in
# utils.data_warehouse_mappings.
add_surrogate_key_dimension_table(dim_geolocation, 'GEOLOCATION_KEY')

In [19]:
# Lookup relating the natural key ZIP_CODE_PREFIX to the surrogate key GEOLOCATION_KEY.
# It is written to JSON in the lookup-table export section; the exact structure of the
# returned object is defined by the project helper.
geolocation_key_mapping = map_surrogate_to_natural_key(dim_geolocation, 'ZIP_CODE_PREFIX', 'GEOLOCATION_KEY')

In [20]:
# Display the finished dimGeolocation table.
dim_geolocation

Unnamed: 0,GEOLOCATION_KEY,ZIP_CODE_PREFIX,CITY,STATE,LATITUDE,LONGITUDE
0,1,1001,Sao Paulo,São Paulo,-23.550190,-46.634024
1,2,1002,Sao Paulo,São Paulo,-23.548146,-46.634979
2,3,1003,Sao Paulo,São Paulo,-23.548994,-46.635731
3,4,1004,Sao Paulo,São Paulo,-23.549799,-46.634757
4,5,1005,Sao Paulo,São Paulo,-23.549456,-46.636733
...,...,...,...,...,...,...
19172,19173,99960,Charrua,Rio Grande do Sul,-27.953722,-52.025511
19173,19174,99965,Agua Santa,Rio Grande do Sul,-28.183372,-52.039850
19174,19175,99970,Ciriaco,Rio Grande do Sul,-28.343766,-51.874689
19175,19176,99980,David Canabarro,Rio Grande do Sul,-28.389129,-51.843836


In [21]:
# Sanity check: total number of null cells across the whole dimension.
print(
    f'The dimGeolocation has {dim_geolocation.isna().sum().sum()} missing values.'
)

The dimGeolocation has 0 missing values.


<h4>Dimension <sup>dimCustomer</sup></h4>

In [22]:
# Work on an independent copy so the Stage 1 `customers` frame stays untouched.
dim_customer = customers.copy(deep=True)

In [23]:
# Preview the first five customer rows.
dim_customer.head(n=5)

Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix
0,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409
1,18955e83d337fd6b2def6b18a428ac77,290c77bc529b7ac935b93aa66c333dc3,9790
2,4e7b3e00288586ebd08712fdd0374a03,060e732b5b29e8181a18229c7b0b2b5e,1151
3,b2b6027bc5c5109e529d4dc6358b12c3,259dac757896d24d7702b9acbbff3f3c,8775
4,4f2d8ab171c80ec8364f7c12e35b23ad,345ecd01c38d18a9036ed96c73b8d066,13056


In [24]:
# Null-cell count of the raw customer data, taken before keys are generated.
print(
    f'The customer dimension has {dim_customer.isna().sum().sum()} missing values.'
)

The customer dimension has 0 missing values.


In [25]:
# Positional relabelling: the labels must follow the current column order exactly.
dim_customer.columns = [
    'CUSTOMER_ID',               # customer_id
    'CUSTOMER_UNIQUE_ID',        # customer_unique_id
    'CUSTOMER_ZIP_CODE_PREFIX',  # customer_zip_code_prefix
]

In [26]:
# Uniqueness check on the natural key: False means no CUSTOMER_ID appears more than once.
dim_customer['CUSTOMER_ID'].duplicated().any()

False

In [27]:
# Add the CUSTOMER_KEY surrogate-key column through the project helper. The return value is
# discarded, so the helper is relied on to modify `dim_customer` in place (the displayed
# dimension shows the key as the first column, starting at 1).
add_surrogate_key_dimension_table(dim_customer, 'CUSTOMER_KEY')

In [28]:
# Lookup relating the natural key CUSTOMER_ID to the surrogate key CUSTOMER_KEY; it is
# written to JSON in the lookup-table export section.
customer_key_mapping = map_surrogate_to_natural_key(dim_customer, 'CUSTOMER_ID', 'CUSTOMER_KEY')

In [29]:
# Display the finished dimCustomer table.
dim_customer

Unnamed: 0,CUSTOMER_KEY,CUSTOMER_ID,CUSTOMER_UNIQUE_ID,CUSTOMER_ZIP_CODE_PREFIX
0,1,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409
1,2,18955e83d337fd6b2def6b18a428ac77,290c77bc529b7ac935b93aa66c333dc3,9790
2,3,4e7b3e00288586ebd08712fdd0374a03,060e732b5b29e8181a18229c7b0b2b5e,1151
3,4,b2b6027bc5c5109e529d4dc6358b12c3,259dac757896d24d7702b9acbbff3f3c,8775
4,5,4f2d8ab171c80ec8364f7c12e35b23ad,345ecd01c38d18a9036ed96c73b8d066,13056
...,...,...,...,...
99436,99437,17ddf5dd5d51696bb3d7c6291687be6f,1a29b476fee25c95fbafc67c5ac95cf8,3937
99437,99438,e7b71a9017aa05c9a7fd292d714858e8,d52a67c98be1cf6a5c84435bd38d095d,6764
99438,99439,5e28dfe12db7fb50a4b2f691faecea5e,e9f50caf99f032f0bf3c55141f019d99,60115
99439,99440,56b18e2166679b8a959d72dd06da27f9,73c2643a0a458b49f58cea58833b192e,92120


In [30]:
# Sanity check: total number of null cells across the whole dimension.
print(
    f'The dimCustomer has {dim_customer.isna().sum().sum()} missing values.'
)

The dimCustomer has 0 missing values.


<h4>Dimension <sup>dimCustomerGeolocation</sup></h4>

In [31]:
# Pair every customer with the geolocation row of its zip code prefix.
# Inner join (the pandas default): zip prefixes that appear on only one side are dropped.
dim_customer_geolocation = dim_geolocation.merge(
    dim_customer,
    how='inner',
    left_on='ZIP_CODE_PREFIX',
    right_on='CUSTOMER_ZIP_CODE_PREFIX',
)

In [32]:
# Preview the joined rows (one row per customer at this point).
dim_customer_geolocation.head(n=5)

Unnamed: 0,GEOLOCATION_KEY,ZIP_CODE_PREFIX,CITY,STATE,LATITUDE,LONGITUDE,CUSTOMER_KEY,CUSTOMER_ID,CUSTOMER_UNIQUE_ID,CUSTOMER_ZIP_CODE_PREFIX
0,3,1003,Sao Paulo,São Paulo,-23.548994,-46.635731,23915,7ae2a9337aa4bc799723511faa1d6830,0c1a20644f0dc126c3eaff8dbc1bd12c,1003
1,4,1004,Sao Paulo,São Paulo,-23.549799,-46.634757,11748,a09edf8c1e842e94805a206b3d73eed5,968f6d2f674977d88a4b445a5117ccd8,1004
2,4,1004,Sao Paulo,São Paulo,-23.549799,-46.634757,23991,ee9b73e88afb4904ee2322cfc89cf638,095e7c124c5c1ccb1eb9f731152eae6a,1004
3,5,1005,Sao Paulo,São Paulo,-23.549456,-46.636733,11379,5a8b64ee6ccdae09ea823e6aa00e9517,9c84e5193d6ee59b3870e0e4e3a2dad8,1005
4,5,1005,Sao Paulo,São Paulo,-23.549456,-46.636733,12308,6ec2b4682814cfdac8d92bad42b3ddab,57f0ea1c7f6b9ef8615c0a0b8f06fe57,1005


In [33]:
# Remove the per-customer columns and the redundant geolocation zip column; the customer-side
# zip prefix is the one that stays in this dimension.
dim_customer_geolocation = dim_customer_geolocation.drop(
    columns=['ZIP_CODE_PREFIX', 'CUSTOMER_KEY', 'CUSTOMER_ID', 'CUSTOMER_UNIQUE_ID']
)

In [34]:
# True is expected here: several customers can share a zip code prefix, so once the
# per-customer columns are gone the same location row repeats.
dim_customer_geolocation.duplicated().any()

True

In [35]:
# Collapse repeated location rows, then renumber the index from 0.
dim_customer_geolocation = dim_customer_geolocation.drop_duplicates()
dim_customer_geolocation = dim_customer_geolocation.reset_index(drop=True)

In [36]:
# The surrogate key values are carried over from dimGeolocation; only the column label changes.
dim_customer_geolocation = dim_customer_geolocation.rename(
    {'GEOLOCATION_KEY': 'CUSTOMER_GEOLOCATION_KEY'},
    axis='columns',
)

In [37]:
# Final column order of the dimension: surrogate key, natural key, attributes, coordinates.
dim_customer_geolocation = dim_customer_geolocation.loc[
    :,
    ['CUSTOMER_GEOLOCATION_KEY', 'CUSTOMER_ZIP_CODE_PREFIX',
     'CITY', 'STATE', 'LATITUDE', 'LONGITUDE'],
]

In [38]:
# Lookup relating CUSTOMER_ZIP_CODE_PREFIX to CUSTOMER_GEOLOCATION_KEY; it is written to JSON
# in the lookup-table export section.
customer_geolocation_key_mapping = map_surrogate_to_natural_key(
    dim_customer_geolocation,
    'CUSTOMER_ZIP_CODE_PREFIX',
    'CUSTOMER_GEOLOCATION_KEY',
)

In [39]:
# Display the finished dimCustomerGeolocation table. Its keys are not consecutive because
# they are the dimGeolocation keys of the zip prefixes that survived the join.
dim_customer_geolocation

Unnamed: 0,CUSTOMER_GEOLOCATION_KEY,CUSTOMER_ZIP_CODE_PREFIX,CITY,STATE,LATITUDE,LONGITUDE
0,3,1003,Sao Paulo,São Paulo,-23.548994,-46.635731
1,4,1004,Sao Paulo,São Paulo,-23.549799,-46.634757
2,5,1005,Sao Paulo,São Paulo,-23.549456,-46.636733
3,6,1006,Sao Paulo,São Paulo,-23.550102,-46.636137
4,7,1007,Sao Paulo,São Paulo,-23.550046,-46.637251
...,...,...,...,...,...,...
14989,19173,99960,Charrua,Rio Grande do Sul,-27.953722,-52.025511
14990,19174,99965,Agua Santa,Rio Grande do Sul,-28.183372,-52.039850
14991,19175,99970,Ciriaco,Rio Grande do Sul,-28.343766,-51.874689
14992,19176,99980,David Canabarro,Rio Grande do Sul,-28.389129,-51.843836


In [40]:
# Sanity check: total number of null cells across the whole dimension.
print(
    f'The dimCustomerGeolocation has {dim_customer_geolocation.isna().sum().sum()} missing values.'
)

The dimCustomerGeolocation has 0 missing values.


<h4>Dimension <sup>dimSeller</sup></h4>

In [41]:
# Work on an independent copy so the Stage 1 `sellers` frame stays untouched.
dim_seller = sellers.copy(deep=True)

In [42]:
# Preview the first five seller rows.
dim_seller.head(n=5)

Unnamed: 0,seller_id,seller_zip_code_prefix
0,3442f8959a84dea7ee197c632cb2df15,13023
1,d1b65fc7debc3361ea86b5f14c68d2e2,13844
2,ce3ad9de960102d0677a81f5d0bb7b2d,20031
3,c0f3eea2e14555b6faeea3dd58c1b1c3,4195
4,51a04a8a6bdcb23deccc82b0b80742cf,12914


In [43]:
# Positional relabelling: the labels must follow the current column order exactly.
dim_seller.columns = [
    'SELLER_ID',               # seller_id
    'SELLER_ZIP_CODE_PREFIX',  # seller_zip_code_prefix
]

In [44]:
# Uniqueness check on the natural key: False means no SELLER_ID appears more than once.
dim_seller['SELLER_ID'].duplicated().any()

False

In [45]:
# Add the SELLER_KEY surrogate-key column through the project helper. The return value is
# discarded, so the helper is relied on to modify `dim_seller` in place (the displayed
# dimension shows the key as the first column, starting at 1).
add_surrogate_key_dimension_table(dim_seller, 'SELLER_KEY')

In [46]:
# Lookup relating the natural key SELLER_ID to the surrogate key SELLER_KEY; it is written
# to JSON in the lookup-table export section.
seller_key_mapping = map_surrogate_to_natural_key(dim_seller, 'SELLER_ID', 'SELLER_KEY')

In [47]:
# Display the finished dimSeller table.
dim_seller

Unnamed: 0,SELLER_KEY,SELLER_ID,SELLER_ZIP_CODE_PREFIX
0,1,3442f8959a84dea7ee197c632cb2df15,13023
1,2,d1b65fc7debc3361ea86b5f14c68d2e2,13844
2,3,ce3ad9de960102d0677a81f5d0bb7b2d,20031
3,4,c0f3eea2e14555b6faeea3dd58c1b1c3,4195
4,5,51a04a8a6bdcb23deccc82b0b80742cf,12914
...,...,...,...
3090,3091,98dddbc4601dd4443ca174359b237166,87111
3091,3092,f8201cab383e484733266d1906e2fdfa,88137
3092,3093,74871d19219c7d518d0090283e03c137,4650
3093,3094,e603cf3fec55f8697c9059638d6c8eb5,96080


In [48]:
# Sanity check: total number of null cells across the whole dimension.
print(
    f'The dimSeller has {dim_seller.isna().sum().sum()} missing values.'
)

The dimSeller has 0 missing values.


<h4>Dimension <sup>dimSellerGeolocation</sup></h4>

In [49]:
# Pair every seller with the geolocation row of its zip code prefix.
# Inner join (the pandas default): zip prefixes that appear on only one side are dropped.
dim_seller_geolocation = dim_geolocation.merge(
    dim_seller,
    how='inner',
    left_on='ZIP_CODE_PREFIX',
    right_on='SELLER_ZIP_CODE_PREFIX',
)

In [50]:
# Preview the joined rows (one row per seller at this point).
dim_seller_geolocation.head(n=5)

Unnamed: 0,GEOLOCATION_KEY,ZIP_CODE_PREFIX,CITY,STATE,LATITUDE,LONGITUDE,SELLER_KEY,SELLER_ID,SELLER_ZIP_CODE_PREFIX
0,1,1001,Sao Paulo,São Paulo,-23.55019,-46.634024,849,8602a61d680a10a82cceeeda0d99ea3d,1001
1,21,1021,Sao Paulo,São Paulo,-23.543,-46.632143,600,e0487761face83d64fcada2408959a36,1021
2,21,1021,Sao Paulo,São Paulo,-23.543,-46.632143,2022,dd55f1bb788714a40e7954c3be6df745,1021
3,22,1022,Sao Paulo,São Paulo,-23.544922,-46.631931,1568,09bad886111255c5b5030314fc7f1a4a,1022
4,23,1023,Sao Paulo,São Paulo,-23.542331,-46.631373,1213,f049a72cf58fd31b11f8919cade515e7,1023


In [51]:
# Remove the per-seller columns and the redundant geolocation zip column; the seller-side
# zip prefix is the one that stays in this dimension.
dim_seller_geolocation = dim_seller_geolocation.drop(
    columns=['ZIP_CODE_PREFIX', 'SELLER_KEY', 'SELLER_ID']
)

In [52]:
# True is expected here: several sellers can share a zip code prefix, so once the
# per-seller columns are gone the same location row repeats.
dim_seller_geolocation.duplicated().any()

True

In [53]:
# Collapse repeated location rows, then renumber the index from 0.
dim_seller_geolocation = dim_seller_geolocation.drop_duplicates()
dim_seller_geolocation = dim_seller_geolocation.reset_index(drop=True)

In [54]:
# The surrogate key values are carried over from dimGeolocation; only the column label changes.
dim_seller_geolocation = dim_seller_geolocation.rename(
    {'GEOLOCATION_KEY': 'SELLER_GEOLOCATION_KEY'},
    axis='columns',
)

In [55]:
# Final column order of the dimension: surrogate key, natural key, attributes, coordinates.
dim_seller_geolocation = dim_seller_geolocation.loc[
    :,
    ['SELLER_GEOLOCATION_KEY', 'SELLER_ZIP_CODE_PREFIX',
     'CITY', 'STATE', 'LATITUDE', 'LONGITUDE'],
]

In [56]:
# Lookup relating SELLER_ZIP_CODE_PREFIX to SELLER_GEOLOCATION_KEY; it is written to JSON
# in the lookup-table export section.
seller_geolocation_key_mapping = map_surrogate_to_natural_key(
    dim_seller_geolocation,
    'SELLER_ZIP_CODE_PREFIX',
    'SELLER_GEOLOCATION_KEY',
)

In [57]:
# Display the finished dimSellerGeolocation table. Its keys are not consecutive because
# they are the dimGeolocation keys of the zip prefixes that survived the join.
dim_seller_geolocation

Unnamed: 0,SELLER_GEOLOCATION_KEY,SELLER_ZIP_CODE_PREFIX,CITY,STATE,LATITUDE,LONGITUDE
0,1,1001,Sao Paulo,São Paulo,-23.550190,-46.634024
1,21,1021,Sao Paulo,São Paulo,-23.543000,-46.632143
2,22,1022,Sao Paulo,São Paulo,-23.544922,-46.631931
3,23,1023,Sao Paulo,São Paulo,-23.542331,-46.631373
4,26,1026,Sao Paulo,São Paulo,-23.539657,-46.631884
...,...,...,...,...,...,...
2241,19080,99300,Soledade,Rio Grande do Sul,-28.827450,-52.509255
2242,19100,99500,Carazinho,Rio Grande do Sul,-28.291245,-52.790256
2243,19119,99670,Ronda Alta,Rio Grande do Sul,-27.782222,-52.806500
2244,19124,99700,Erechim,Rio Grande do Sul,-27.636046,-52.273049


In [58]:
# Sanity check: total number of null cells across the whole dimension.
print(
    f'The dimSellerGeolocation has {dim_seller_geolocation.isna().sum().sum()} missing values.'
)

The dimSellerGeolocation has 0 missing values.


<h4>Dimension <sup>dimProduct</sup></h4>

In [59]:
# Attach the English sub-category name and the main category name to every product.
# Left join: products whose category name has no match in `category` are still kept.
# validate='many_to_one' makes pandas raise MergeError when `category` lists the same
# product_category_name more than once, instead of silently duplicating product rows
# (which would also break the PRODUCT_ID uniqueness the dimension relies on).
dim_product = products.merge(
    category[['product_category_name', 'product_category_name_english',
              'product_main_category_name']],
    on='product_category_name',
    how='left',
    validate='many_to_one',
)

In [60]:
# Preview the first five rows of the joined product data.
dim_product.head(n=5)

Unnamed: 0,product_id,product_category_name,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm,product_category_name_english,product_main_category_name
0,1e9e8ef04dbcff4541ed26657ea517e5,perfumaria,1,225,16,10,14,perfumery,Health and Beauty
1,3aa071139cb16b67ca9e5dea641aaa2f,artes,1,1000,30,18,20,art,Music and Art
2,96bd76ec8810374ed1b65e291975717f,esporte_lazer,1,154,18,9,15,sports_leisure,Sports and Leisure
3,cef67bcfe19066a932b7673e239eb23d,bebes,1,371,26,4,26,baby,Children
4,9dc1a7de274444849c219cff195d0b71,utilidades_domesticas,4,625,20,17,13,housewares,Home and Decor


In [61]:
# The original category name was only needed as the join key; the joined category columns
# replace it.
dim_product = dim_product.drop(columns='product_category_name')

In [62]:
# Positional relabelling: the labels must follow the current column order exactly.
dim_product.columns = [
    'PRODUCT_ID',     # product_id
    'NUMBER_PHOTOS',  # product_photos_qty
    'WEIGHT',         # product_weight_g
    'LENGTH',         # product_length_cm
    'HEIGHT',         # product_height_cm
    'WIDTH',          # product_width_cm
    'SUB_CATEGORY',   # product_category_name_english
    'CATEGORY',       # product_main_category_name
]

In [63]:
# Final column order of the dimension: natural key, category hierarchy, measurements.
dim_product = dim_product.loc[
    :,
    ['PRODUCT_ID', 'CATEGORY', 'SUB_CATEGORY', 'WEIGHT', 'LENGTH', 'HEIGHT', 'WIDTH',
     'NUMBER_PHOTOS'],
]

In [64]:
# Uniqueness check on the natural key: False means no PRODUCT_ID appears more than once,
# i.e. the category join did not duplicate any product.
dim_product['PRODUCT_ID'].duplicated().any()

False

In [65]:
# Add the PRODUCT_KEY surrogate-key column through the project helper. The return value is
# discarded, so the helper is relied on to modify `dim_product` in place (the displayed
# dimension shows the key as the first column, starting at 1).
add_surrogate_key_dimension_table(dim_product, 'PRODUCT_KEY')

In [66]:
# Lookup relating the natural key PRODUCT_ID to the surrogate key PRODUCT_KEY; it is written
# to JSON in the lookup-table export section.
product_key_mapping = map_surrogate_to_natural_key(dim_product, 'PRODUCT_ID', 'PRODUCT_KEY')

In [67]:
# Display the finished dimProduct table.
dim_product

Unnamed: 0,PRODUCT_KEY,PRODUCT_ID,CATEGORY,SUB_CATEGORY,WEIGHT,LENGTH,HEIGHT,WIDTH,NUMBER_PHOTOS
0,1,1e9e8ef04dbcff4541ed26657ea517e5,Health and Beauty,perfumery,225,16,10,14,1
1,2,3aa071139cb16b67ca9e5dea641aaa2f,Music and Art,art,1000,30,18,20,1
2,3,96bd76ec8810374ed1b65e291975717f,Sports and Leisure,sports_leisure,154,18,9,15,1
3,4,cef67bcfe19066a932b7673e239eb23d,Children,baby,371,26,4,26,1
4,5,9dc1a7de274444849c219cff195d0b71,Home and Decor,housewares,625,20,17,13,4
...,...,...,...,...,...,...,...,...,...
32946,32947,a0b7d5a992ccda646f2d34e418fff5a0,Home and Decor,furniture_decor,12300,40,40,40,2
32947,32948,bf4538d88321d0fd4412a93c974510e6,Materials and Construction,construction_tools_lights,1700,16,19,16,1
32948,32949,9a7c6041fa9592d9d9ef6cfe62a71f8c,Home and Decor,bed_bath_table,1400,27,7,27,1
32949,32950,83808703fc0706a22e264b9d75f04a2e,Technology and Home Appliances,computers_accessories,700,31,13,20,2


In [68]:
# Sanity check: total number of null cells across the whole dimension.
print(
    f'The dimProduct has {dim_product.isna().sum().sum()} missing values.'
)

The dimProduct has 0 missing values.


<h4>Junk Dimension <sup>dimOrderIndicator</sup></h4>

In [69]:
# Distinct payment types present in the payments table; echoed as the cell output.
payment_type = payments.loc[:, 'payment_type'].unique()
payment_type

array(['credit_card', 'boleto', 'voucher', 'debit_card'], dtype=object)

In [70]:
# Distinct order statuses present in the orders table; echoed as the cell output.
order_status = orders.loc[:, 'order_status'].unique()
order_status

array(['delivered', 'canceled'], dtype=object)

In [71]:
# Cartesian product of the two indicator sets as (payment type, order status) tuples,
# with the payment type varying slowest.
combinations = [
    (payment, status)
    for payment in payment_type
    for status in order_status
]

In [72]:
# One row per (payment type, order status) pair; the columns are renamed in the next step.
dim_order_indicator = pd.DataFrame(
    data=combinations,
    columns=['Payment Type', 'Order Status'],
)

In [73]:
# Bring the column labels in line with the upper-case naming used by the other dimensions.
dim_order_indicator = dim_order_indicator.rename(
    {'Payment Type': 'PAYMENT_TYPE', 'Order Status': 'ORDER_STATUS'},
    axis='columns',
)

In [74]:
# Add the ORDER_INDICATOR_KEY surrogate-key column through the project helper. The return
# value is discarded, so the helper is relied on to modify `dim_order_indicator` in place
# (the displayed dimension shows the key as the first column, starting at 1).
add_surrogate_key_dimension_table(dim_order_indicator, 'ORDER_INDICATOR_KEY')

In [75]:
# Lookup for the junk dimension, built from the PAYMENT_TYPE / ORDER_STATUS column pair by
# the project helper; it is written to JSON in the lookup-table export section.
order_indicator_key_mapping = map_surrogate_to_natural_key_junk_dimension(
    dim_order_indicator,
    ['PAYMENT_TYPE', 'ORDER_STATUS'],
)

In [76]:
# Display the finished dimOrderIndicator junk dimension.
dim_order_indicator

Unnamed: 0,ORDER_INDICATOR_KEY,PAYMENT_TYPE,ORDER_STATUS
0,1,credit_card,delivered
1,2,credit_card,canceled
2,3,boleto,delivered
3,4,boleto,canceled
4,5,voucher,delivered
5,6,voucher,canceled
6,7,debit_card,delivered
7,8,debit_card,canceled


In [77]:
# Sanity check: total number of null cells across the whole dimension.
print(
    f'The dimOrderIndicator has {dim_order_indicator.isna().sum().sum()} missing values.'
)

The dimOrderIndicator has 0 missing values.


<h4>Dimension <sup>dimDate</sup></h4>

In [78]:
# Calendar window covered by dimDate: one entry per day, both bounds included.
start_date, end_date = '2016-01-01', '2018-12-31'
dates = pd.date_range(start_date, end_date, freq='D')

In [79]:
# One row per calendar day in `dates`; every column is derived from that DatetimeIndex.
# The get_* helpers are imported from utils.dates and applied to each date individually.
dim_date = pd.DataFrame(dict(
    FULL_DATE=dates.strftime('%Y-%m-%d'),
    YEAR=dates.year,
    MONTH=dates.month,
    MONTH_NAME=dates.strftime('%B'),
    # Day-of-month buckets: 1-7 -> 1, 8-15 -> 2, 16-23 -> 3, 24-31 -> 4.
    # NOTE(review): the first bucket spans 7 days and the others 8; confirm that capping at
    # four weeks is intended rather than the calendar-style (day - 1) // 7 + 1.
    WEEK_IN_MONTH=dates.day // 8 + 1,
    DAY=dates.day,
    DAY_OF_WEEK=dates.map(get_week_day),
    TRIMESTER=dates.map(get_trimester),
    NATIONAL_HOLIDAY_INDICATOR=dates.map(get_national_holiday_indicator),
    WEEKDAY_INDICATOR=dates.map(get_day_type),
))

In [80]:
# Add the DATE_KEY surrogate-key column through the project helper. The return value is
# discarded, so the helper is relied on to modify `dim_date` in place (the displayed
# dimension shows the key as the first column, starting at 1).
add_surrogate_key_dimension_table(dim_date, 'DATE_KEY')

In [81]:
# Lookup relating FULL_DATE (a 'YYYY-MM-DD' string) to the surrogate key DATE_KEY; it is
# written to JSON in the lookup-table export section.
date_key_mapping = map_surrogate_to_natural_key(dim_date, 'FULL_DATE', 'DATE_KEY')

In [82]:
# Display the finished dimDate table.
dim_date

Unnamed: 0,DATE_KEY,FULL_DATE,YEAR,MONTH,MONTH_NAME,WEEK_IN_MONTH,DAY,DAY_OF_WEEK,TRIMESTER,NATIONAL_HOLIDAY_INDICATOR,WEEKDAY_INDICATOR
0,1,2016-01-01,2016,1,January,1,1,Friday,Q1,Holiday,Weekday
1,2,2016-01-02,2016,1,January,1,2,Saturday,Q1,Non-Holiday,Weekend
2,3,2016-01-03,2016,1,January,1,3,Sunday,Q1,Non-Holiday,Weekend
3,4,2016-01-04,2016,1,January,1,4,Monday,Q1,Non-Holiday,Weekday
4,5,2016-01-05,2016,1,January,1,5,Tuesday,Q1,Non-Holiday,Weekday
...,...,...,...,...,...,...,...,...,...,...,...
1091,1092,2018-12-27,2018,12,December,4,27,Thursday,Q4,Non-Holiday,Weekday
1092,1093,2018-12-28,2018,12,December,4,28,Friday,Q4,Non-Holiday,Weekday
1093,1094,2018-12-29,2018,12,December,4,29,Saturday,Q4,Non-Holiday,Weekend
1094,1095,2018-12-30,2018,12,December,4,30,Sunday,Q4,Non-Holiday,Weekend


In [83]:
# Sanity check: total number of null cells across the whole dimension.
print(
    f'The dimDate has {dim_date.isna().sum().sum()} missing values.'
)

The dimDate has 0 missing values.


<h3>3. EXPORTING THE CSV FILES OF THE DIMENSIONS</h3>

In [84]:
# Make sure the output folder for the dimension CSVs exists; an existing folder is fine.
os.makedirs(name=dimension_tables_folder, exist_ok=True)

In [85]:
# Write every dimension to its own CSV in the dimension folder, without the pandas index.
for frame, file_name in (
    (dim_customer, dim_customer_file),
    (dim_seller, dim_seller_file),
    (dim_product, dim_product_file),
    (dim_geolocation, dim_geolocation_file),
    (dim_customer_geolocation, dim_customer_geolocation_file),
    (dim_seller_geolocation, dim_seller_geolocation_file),
    (dim_order_indicator, dim_order_indicator_file),
    (dim_date, dim_date_file),
):
    frame.to_csv(os.path.join(dimension_tables_folder, file_name), index=False)

<h3>4. EXPORTING THE LOOKUP TABLES WITH NATURAL AND SURROGATE KEY MAPPINGS</h3>

In [86]:
# Make sure the output folder for the lookup tables exists; an existing folder is fine.
os.makedirs(name=lookup_tables_folder, exist_ok=True)

In [87]:
# Persist every key mapping to its own file in the lookup folder through the project helper.
for mapping, file_name in (
    (customer_key_mapping, lookup_table_customer),
    (seller_key_mapping, lookup_table_seller),
    (product_key_mapping, lookup_table_product),
    (geolocation_key_mapping, lookup_table_geolocation),
    (seller_geolocation_key_mapping, lookup_table_seller_geolocation),
    (customer_geolocation_key_mapping, lookup_table_customer_geolocation),
    (order_indicator_key_mapping, lookup_table_order_indicator),
    (date_key_mapping, lookup_table_date),
):
    save_lookup_to_json(mapping, os.path.join(lookup_tables_folder, file_name))