<h2><center>STAGE 4 - LOADING DIMENSIONS</center></h2>

---

In [1]:
import os

import pandas as pd
import psycopg2 as pg
from psycopg2 import sql as pg_sql
from psycopg2.extras import execute_values

In [2]:
from utils.constants import host, user, password, database_name

In [3]:
from utils.constants import dimension_tables_folder, dim_customer_file, dim_seller_file, dim_product_file, dim_geolocation_file, \
    dim_customer_geolocation_file, dim_seller_geolocation_file, dim_order_indicator_file, dim_date_file

<h3>1. DATABASE CONNECTION</h3>

In [4]:
# libpq picks PGGSSENCMODE up from the environment, so it has to be set
# before the first connection is opened.
os.environ.update(PGGSSENCMODE="disable")

# Administrative session on the server's default "postgres" database; the
# target database is dropped and recreated through it in the next cell.
conn = pg.connect(
    host=host,
    database="postgres",
    user=user,
    password=password,
)

# DROP DATABASE / CREATE DATABASE cannot run inside a transaction block,
# so each statement on this session is committed immediately.
conn.autocommit = True

cursor = conn.cursor()

In [5]:
# Rebuild the target database from scratch so the load always starts empty.
# The database name is an identifier, not a value, so it cannot be passed as a
# query parameter; sql.Identifier quotes (and escapes) it instead of relying on
# hand-written double quotes inside an f-string.
drop_database_stmt = pg_sql.SQL("DROP DATABASE IF EXISTS {}").format(
    pg_sql.Identifier(database_name)
)
create_database_stmt = pg_sql.SQL("CREATE DATABASE {}").format(
    pg_sql.Identifier(database_name)
)

try:
    cursor.execute(drop_database_stmt)
    cursor.execute(create_database_stmt)
finally:
    # Release the administrative session even when one of the statements
    # fails (e.g. other sessions are still connected to the database).
    cursor.close()
    conn.close()

In [6]:
# Working session on the freshly created database. autocommit is left at its
# default here, so everything below is only persisted by the final
# conn.commit().
conn = pg.connect(
    host=host,
    database=database_name,
    user=user,
    password=password,
)
cursor = conn.cursor()

<h3>2. READING THE DIMENSION FILES FROM STAGE 2</h3>

In [7]:
# All dimension CSVs sit in `dimension_tables_folder`. Read them in one pass;
# each target name on the left is paired by position with the file constant
# on the right, in the same order the files were read before.
(
    dim_customer,
    dim_seller,
    dim_product,
    dim_geolocation,
    dim_customer_geolocation,
    dim_seller_geolocation,
    dim_order_indicator,
    dim_date,
) = (
    pd.read_csv(os.path.join(dimension_tables_folder, csv_name))
    for csv_name in (
        dim_customer_file,
        dim_seller_file,
        dim_product_file,
        dim_geolocation_file,
        dim_customer_geolocation_file,
        dim_seller_geolocation_file,
        dim_order_indicator_file,
        dim_date_file,
    )
)

<h3>3. CREATING THE TABLES FOR THE DIMENSIONS IN POSTGRESQL</h3>

In [8]:
# Drop order matters: dim_customer and dim_seller reference dim_geolocation,
# so they go before it. The fact tables go first of all (presumably because
# they reference the dimensions -- their DDL is not in this notebook).
tables_in_drop_order = (
    "fact_order_line",
    "fact_order_header",
    "dim_order_indicator",
    "dim_date",
    "dim_product",
    "dim_customer",
    "dim_seller",
    "dim_geolocation",
)

for table_name in tables_in_drop_order:
    cursor.execute(f"DROP TABLE IF EXISTS {table_name};")

<h4>Dimension <sup>dimGeolocation</sup></h4>

In [9]:
# Column dtypes pandas inferred from the CSV; used to pick the SQL column types below.
dim_geolocation.dtypes

GEOLOCATION_KEY      int64
ZIP_CODE_PREFIX      int64
CITY                object
STATE               object
LATITUDE           float64
LONGITUDE          float64
dtype: object

In [10]:
# Preview the first rows to sanity-check the values before defining the table.
dim_geolocation.head()

Unnamed: 0,GEOLOCATION_KEY,ZIP_CODE_PREFIX,CITY,STATE,LATITUDE,LONGITUDE
0,1,1001,Sao Paulo,São Paulo,-23.55019,-46.634024
1,2,1002,Sao Paulo,São Paulo,-23.548146,-46.634979
2,3,1003,Sao Paulo,São Paulo,-23.548994,-46.635731
3,4,1004,Sao Paulo,São Paulo,-23.549799,-46.634757
4,5,1005,Sao Paulo,São Paulo,-23.549456,-46.636733


In [11]:
# DDL for dim_geolocation.
# - geolocation_key is the primary key and must be positive.
# - zip_code_prefix is UNIQUE, which is what allows dim_customer and dim_seller
#   to declare foreign keys against it.
# - DECIMAL(17,15) leaves 2 integer digits for latitude, DECIMAL(18,15) leaves
#   3 for longitude; the CHECKs additionally bound them to [-90, 90] and
#   [-180, 180].
sql_dim_geolocation = """

CREATE TABLE dim_geolocation (
  geolocation_key              INTEGER,
  zip_code_prefix              INTEGER         NOT NULL,
  city                         VARCHAR(100)    NOT NULL,
  state                        VARCHAR(50)     NOT NULL,
  latitude                     DECIMAL(17,15)  NOT NULL,
  longitude                    DECIMAL(18,15)  NOT NULL,
--
  CONSTRAINT pk_geolocation
    PRIMARY KEY (geolocation_key),
--
  CONSTRAINT ck_geolocation_key
    CHECK (geolocation_key > 0),
--
  CONSTRAINT un_geolocation_zip_code_prefix
    UNIQUE (zip_code_prefix),
--
  CONSTRAINT ck_geolocation_latitude
    CHECK (latitude BETWEEN -90 AND 90),
--
  CONSTRAINT ck_geolocation_longitude
    CHECK (longitude BETWEEN -180 AND 180)
);
"""

# Must run before the dim_customer / dim_seller DDL, which reference this table.
cursor.execute(sql_dim_geolocation)

<h4>Dimension <sup>dimCustomer</sup></h4>

In [12]:
# Column dtypes pandas inferred from the CSV; used to pick the SQL column types below.
dim_customer.dtypes

CUSTOMER_KEY                 int64
CUSTOMER_ID                 object
CUSTOMER_UNIQUE_ID          object
CUSTOMER_ZIP_CODE_PREFIX     int64
dtype: object

In [13]:
# Preview the first rows to sanity-check the values before defining the table.
dim_customer.head()

Unnamed: 0,CUSTOMER_KEY,CUSTOMER_ID,CUSTOMER_UNIQUE_ID,CUSTOMER_ZIP_CODE_PREFIX
0,1,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409
1,2,18955e83d337fd6b2def6b18a428ac77,290c77bc529b7ac935b93aa66c333dc3,9790
2,3,4e7b3e00288586ebd08712fdd0374a03,060e732b5b29e8181a18229c7b0b2b5e,1151
3,4,b2b6027bc5c5109e529d4dc6358b12c3,259dac757896d24d7702b9acbbff3f3c,8775
4,5,4f2d8ab171c80ec8364f7c12e35b23ad,345ecd01c38d18a9036ed96c73b8d066,13056


In [14]:
# DDL for dim_customer.
# - customer_key is the primary key and must be positive.
# - customer_id is UNIQUE; customer_unique_id is not constrained, so several
#   rows may share one.
# - customer_zip_code_prefix is a foreign key to dim_geolocation.zip_code_prefix,
#   so every customer zip prefix must already exist in dim_geolocation.
sql_dim_customer = """

CREATE TABLE dim_customer (
  customer_key                 INTEGER,
  customer_id                  VARCHAR(100)    NOT NULL,
  customer_unique_id           VARCHAR(100)    NOT NULL,
  customer_zip_code_prefix     INTEGER         NOT NULL,
--
  CONSTRAINT pk_customer
    PRIMARY KEY (customer_key),
--
  CONSTRAINT ck_customer_key
    CHECK (customer_key > 0),
--
  CONSTRAINT un_customer_id
    UNIQUE (customer_id),
--
  CONSTRAINT fk_customer_zip_code_prefix
    FOREIGN KEY (customer_zip_code_prefix)
    REFERENCES dim_geolocation (zip_code_prefix)
);
"""

# Requires dim_geolocation to exist already because of the foreign key.
cursor.execute(sql_dim_customer)

<h4>Dimension <sup>dimSeller</sup></h4>

In [15]:
# Column dtypes pandas inferred from the CSV; used to pick the SQL column types below.
dim_seller.dtypes

SELLER_KEY                 int64
SELLER_ID                 object
SELLER_ZIP_CODE_PREFIX     int64
dtype: object

In [16]:
# Preview the first rows to sanity-check the values before defining the table.
dim_seller.head()

Unnamed: 0,SELLER_KEY,SELLER_ID,SELLER_ZIP_CODE_PREFIX
0,1,3442f8959a84dea7ee197c632cb2df15,13023
1,2,d1b65fc7debc3361ea86b5f14c68d2e2,13844
2,3,ce3ad9de960102d0677a81f5d0bb7b2d,20031
3,4,c0f3eea2e14555b6faeea3dd58c1b1c3,4195
4,5,51a04a8a6bdcb23deccc82b0b80742cf,12914


In [17]:
# DDL for dim_seller.
# - seller_key is the primary key and must be positive.
# - seller_id is UNIQUE.
# - seller_zip_code_prefix is a foreign key to dim_geolocation.zip_code_prefix,
#   so every seller zip prefix must already exist in dim_geolocation.
sql_dim_seller = """

CREATE TABLE dim_seller (
  seller_key                   INTEGER,
  seller_id                    VARCHAR(100)    NOT NULL,
  seller_zip_code_prefix       INTEGER         NOT NULL,
--
  CONSTRAINT pk_seller
    PRIMARY KEY (seller_key),
--
  CONSTRAINT ck_seller_key
    CHECK (seller_key > 0),
--
  CONSTRAINT un_seller_id
    UNIQUE (seller_id),
--
  CONSTRAINT fk_seller_zip_code_prefix
    FOREIGN KEY (seller_zip_code_prefix)
    REFERENCES dim_geolocation (zip_code_prefix)
);
"""

# Requires dim_geolocation to exist already because of the foreign key.
cursor.execute(sql_dim_seller)

<h4>Dimension <sup>dimProduct</sup></h4>

In [18]:
# Column dtypes pandas inferred from the CSV; used to pick the SQL column types below.
dim_product.dtypes

PRODUCT_KEY       int64
PRODUCT_ID       object
CATEGORY         object
SUB_CATEGORY     object
WEIGHT            int64
LENGTH            int64
HEIGHT            int64
WIDTH             int64
NUMBER_PHOTOS     int64
dtype: object

In [19]:
# Preview the first rows to sanity-check the values before defining the table.
dim_product.head()

Unnamed: 0,PRODUCT_KEY,PRODUCT_ID,CATEGORY,SUB_CATEGORY,WEIGHT,LENGTH,HEIGHT,WIDTH,NUMBER_PHOTOS
0,1,1e9e8ef04dbcff4541ed26657ea517e5,Health and Beauty,perfumery,225,16,10,14,1
1,2,3aa071139cb16b67ca9e5dea641aaa2f,Music and Art,art,1000,30,18,20,1
2,3,96bd76ec8810374ed1b65e291975717f,Sports and Leisure,sports_leisure,154,18,9,15,1
3,4,cef67bcfe19066a932b7673e239eb23d,Children,baby,371,26,4,26,1
4,5,9dc1a7de274444849c219cff195d0b71,Home and Decor,housewares,625,20,17,13,4


In [20]:
# DDL for dim_product.
# - product_key is the primary key and must be positive; product_id is UNIQUE.
# - The DataFrame columns WEIGHT / LENGTH / HEIGHT / WIDTH correspond to
#   weight_g / length_cm / height_cm / width_cm here (the suffixes presumably
#   denote grams and centimetres -- confirm against the source data).
# - All measurements and number_photos are constrained to be non-negative.
sql_dim_product = """

CREATE TABLE dim_product (
  product_key                  INTEGER,
  product_id                   VARCHAR(100)    NOT NULL,
  category                     VARCHAR(100)    NOT NULL,
  sub_category                 VARCHAR(100)    NOT NULL,
  weight_g                     INTEGER         NOT NULL,
  length_cm                    INTEGER         NOT NULL,
  height_cm                    INTEGER         NOT NULL,
  width_cm                     INTEGER         NOT NULL,
  number_photos                INTEGER         NOT NULL,
--
  CONSTRAINT pk_product
    PRIMARY KEY (product_key),
--
  CONSTRAINT ck_product_key
    CHECK (product_key > 0),
--
  CONSTRAINT un_product_id
    UNIQUE (product_id),
--
  CONSTRAINT ck_number_photos
    CHECK (number_photos >= 0),
--
  CONSTRAINT ck_weight_g
    CHECK (weight_g >= 0),
--
  CONSTRAINT ck_length_cm
    CHECK (length_cm >= 0),
--
  CONSTRAINT ck_height_cm
    CHECK (height_cm >= 0),
--
  CONSTRAINT ck_width_cm
    CHECK (width_cm >= 0)
);
"""

cursor.execute(sql_dim_product)

<h4>Dimension <sup>dimDate</sup></h4>

In [21]:
# Column dtypes pandas inferred from the CSV; FULL_DATE is read as a plain string (object).
dim_date.dtypes

DATE_KEY                       int64
FULL_DATE                     object
YEAR                           int64
MONTH                          int64
MONTH_NAME                    object
WEEK_IN_MONTH                  int64
DAY                            int64
DAY_OF_WEEK                   object
TRIMESTER                     object
NATIONAL_HOLIDAY_INDICATOR    object
WEEKDAY_INDICATOR             object
dtype: object

In [22]:
# Preview the first rows to sanity-check the values before defining the table.
dim_date.head()

Unnamed: 0,DATE_KEY,FULL_DATE,YEAR,MONTH,MONTH_NAME,WEEK_IN_MONTH,DAY,DAY_OF_WEEK,TRIMESTER,NATIONAL_HOLIDAY_INDICATOR,WEEKDAY_INDICATOR
0,1,2016-01-01,2016,1,January,1,1,Friday,Q1,Holiday,Weekday
1,2,2016-01-02,2016,1,January,1,2,Saturday,Q1,Non-Holiday,Weekend
2,3,2016-01-03,2016,1,January,1,3,Sunday,Q1,Non-Holiday,Weekend
3,4,2016-01-04,2016,1,January,1,4,Monday,Q1,Non-Holiday,Weekday
4,5,2016-01-05,2016,1,January,1,5,Tuesday,Q1,Non-Holiday,Weekday


In [23]:
# DDL for dim_date.
# - date_key is the primary key and must be positive; full_date is UNIQUE.
# - The text attributes (month_name, day_of_week, national_holiday_indicator,
#   weekday_indicator) are validated against fixed vocabularies; the CHECKs
#   compare UPPER(value), so the stored casing is free.
# - NOTE(review): day, week_in_month and trimester have no CHECK constraint,
#   unlike month and the name/indicator columns.
sql_dim_date = """

CREATE TABLE dim_date (
  date_key                     INTEGER,
  full_date                    DATE            NOT NULL,
  year                         INTEGER         NOT NULL,
  month                        INTEGER         NOT NULL,
  month_name                   VARCHAR(50)     NOT NULL,
  week_in_month                INTEGER         NOT NULL,
  day                          INTEGER         NOT NULL,
  day_of_week                  VARCHAR(10)     NOT NULL,
  trimester                    VARCHAR(10)     NOT NULL,
  national_holiday_indicator   VARCHAR(20)     NOT NULL,
  weekday_indicator            VARCHAR(10)     NOT NULL,
--
  CONSTRAINT pk_date
    PRIMARY KEY (date_key),
--
  CONSTRAINT ck_date_key
    CHECK (date_key > 0),
--
  CONSTRAINT un_date_full_date
    UNIQUE (full_date),
--
  CONSTRAINT ck_date_year
    CHECK (year > 0),
--
  CONSTRAINT ck_date_month
    CHECK (month BETWEEN 1 AND 12),
--
  CONSTRAINT ck_date_month_name
    CHECK (UPPER(month_name) 
    IN ('JANUARY', 'FEBRUARY', 'MARCH', 'APRIL', 'MAY', 'JUNE', 'JULY', 'AUGUST', 'SEPTEMBER', 'OCTOBER', 'NOVEMBER', 'DECEMBER')),
--
  CONSTRAINT ck_date_day_of_week
    CHECK (UPPER(day_of_week) 
    IN ('MONDAY', 'TUESDAY',  'WEDNESDAY', 'THURSDAY', 'FRIDAY', 'SATURDAY', 'SUNDAY')),
--
  CONSTRAINT ck_date_national_holiday_indicator
    CHECK (UPPER(national_holiday_indicator) 
    IN ('HOLIDAY','NON-HOLIDAY')),
--
  CONSTRAINT ck_date_weekday_indicator
    CHECK (UPPER(weekday_indicator) 
    IN ('WEEKEND','WEEKDAY'))
);
"""

cursor.execute(sql_dim_date)

<h4>Junk Dimension <sup>dimOrderIndicator</sup></h4>

In [24]:
# Column dtypes pandas inferred from the CSV; used to pick the SQL column types below.
dim_order_indicator.dtypes

ORDER_INDICATOR_KEY     int64
PAYMENT_TYPE           object
ORDER_STATUS           object
dtype: object

In [25]:
# The junk dimension is small enough to display in full: one row per
# (payment_type, order_status) combination.
dim_order_indicator

Unnamed: 0,ORDER_INDICATOR_KEY,PAYMENT_TYPE,ORDER_STATUS
0,1,credit_card,delivered
1,2,credit_card,canceled
2,3,boleto,delivered
3,4,boleto,canceled
4,5,voucher,delivered
5,6,voucher,canceled
6,7,debit_card,delivered
7,8,debit_card,canceled


In [26]:
# DDL for the junk dimension dim_order_indicator.
# - order_indicator_key is the primary key and must be positive.
# - (payment_type, order_status) is the natural key of the junk dimension.
#   It is declared UNIQUE, like the natural keys of the other dimensions, so a
#   duplicated combination is rejected at load time instead of making the
#   fact-table key lookup ambiguous.
# - payment_type is validated case-insensitively against the known payment
#   methods; order_status is intentionally left unrestricted.
sql_dim_order_indicator = """

CREATE TABLE dim_order_indicator (
  order_indicator_key          INTEGER,
  payment_type                 VARCHAR(20)     NOT NULL,
  order_status                 VARCHAR(20)     NOT NULL,
--
  CONSTRAINT pk_order_indicator
    PRIMARY KEY (order_indicator_key),
--
  CONSTRAINT ck_order_indicator_key
    CHECK (order_indicator_key > 0),
--
  CONSTRAINT un_order_indicator_payment_status
    UNIQUE (payment_type, order_status),
--
  CONSTRAINT ck_order_indicator_payment_type
    CHECK (UPPER(payment_type) 
    IN ('CREDIT_CARD', 'BOLETO', 'VOUCHER','DEBIT_CARD'))
);
"""

cursor.execute(sql_dim_order_indicator)

<h3>4. INSERTING THE DATA INTO THE DIMENSION TABLES IN POSTGRESQL</h3>

In [27]:
# Load dim_geolocation first: dim_customer and dim_seller hold foreign keys to
# its zip_code_prefix.
# Rows are plain Python lists; values are matched to the target columns by
# position, so the DataFrame column order must follow the list in the INSERT.
dim_geolocation_list = dim_geolocation.to_numpy().tolist()

# Explicit target columns keep the load independent of the table's physical
# column order. execute_values expands the single %s into multi-row VALUES
# lists, sending one statement per page instead of one per row as
# cursor.executemany does.
sql = (
    "INSERT INTO dim_geolocation "
    "(geolocation_key, zip_code_prefix, city, state, latitude, longitude) "
    "VALUES %s"
)

execute_values(cursor, sql, dim_geolocation_list, page_size=1000)

In [28]:
# Load dim_customer. Its customer_zip_code_prefix is a foreign key to
# dim_geolocation, which therefore has to be populated before this cell runs.
# Values are matched to the target columns by position, so the DataFrame column
# order must follow the list in the INSERT.
dim_customer_list = dim_customer.to_numpy().tolist()

# Explicit target columns keep the load independent of the table's physical
# column order. execute_values expands the single %s into multi-row VALUES
# lists, sending one statement per page instead of one per row as
# cursor.executemany does.
sql = (
    "INSERT INTO dim_customer "
    "(customer_key, customer_id, customer_unique_id, customer_zip_code_prefix) "
    "VALUES %s"
)

execute_values(cursor, sql, dim_customer_list, page_size=1000)

In [29]:
# Load dim_seller. Its seller_zip_code_prefix is a foreign key to
# dim_geolocation, which therefore has to be populated before this cell runs.
# Values are matched to the target columns by position, so the DataFrame column
# order must follow the list in the INSERT.
dim_seller_list = dim_seller.to_numpy().tolist()

# Explicit target columns keep the load independent of the table's physical
# column order. execute_values expands the single %s into multi-row VALUES
# lists, sending one statement per page instead of one per row as
# cursor.executemany does.
sql = (
    "INSERT INTO dim_seller "
    "(seller_key, seller_id, seller_zip_code_prefix) "
    "VALUES %s"
)

execute_values(cursor, sql, dim_seller_list, page_size=1000)

In [30]:
# Load dim_product. Values are matched to the target columns by position, so
# the DataFrame column order must follow the list in the INSERT
# (WEIGHT -> weight_g, LENGTH -> length_cm, HEIGHT -> height_cm,
# WIDTH -> width_cm).
dim_product_list = dim_product.to_numpy().tolist()

# Explicit target columns keep the load independent of the table's physical
# column order. execute_values expands the single %s into multi-row VALUES
# lists, sending one statement per page instead of one per row as
# cursor.executemany does.
sql = (
    "INSERT INTO dim_product "
    "(product_key, product_id, category, sub_category, "
    "weight_g, length_cm, height_cm, width_cm, number_photos) "
    "VALUES %s"
)

execute_values(cursor, sql, dim_product_list, page_size=1000)

In [31]:
# Load dim_date. Values are matched to the target columns by position, so the
# DataFrame column order must follow the list in the INSERT. FULL_DATE is sent
# as a string and stored in the DATE column full_date.
dim_date_list = dim_date.to_numpy().tolist()

# Explicit target columns keep the load independent of the table's physical
# column order. execute_values expands the single %s into multi-row VALUES
# lists, sending one statement per page instead of one per row as
# cursor.executemany does.
sql = (
    "INSERT INTO dim_date "
    "(date_key, full_date, year, month, month_name, week_in_month, day, "
    "day_of_week, trimester, national_holiday_indicator, weekday_indicator) "
    "VALUES %s"
)

execute_values(cursor, sql, dim_date_list, page_size=1000)

In [32]:
# Load the junk dimension dim_order_indicator. Values are matched to the target
# columns by position, so the DataFrame column order must follow the list in
# the INSERT.
dim_order_indicator_list = dim_order_indicator.to_numpy().tolist()

# Explicit target columns keep the load independent of the table's physical
# column order. execute_values expands the single %s into a multi-row VALUES
# list, the same bulk path used for the other dimensions.
sql = (
    "INSERT INTO dim_order_indicator "
    "(order_indicator_key, payment_type, order_status) "
    "VALUES %s"
)

execute_values(cursor, sql, dim_order_indicator_list, page_size=1000)

In [33]:
# This connection does not autocommit: the table DDL and all dimension inserts
# above become durable only here, as a single transaction.
conn.commit()

In [34]:
# Release the cursor and the database session now that the load is committed.
cursor.close()
conn.close()