## Instalasi dan Persiapan Data

In [128]:
%pip install duckdb pandas numpy  kagglehub


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [129]:
import duckdb
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import os

In [130]:
import kagglehub

path = kagglehub.dataset_download("olistbr/brazilian-ecommerce")

print("Path to dataset files:", path)

Path to dataset files: /Users/ditmawa/.cache/kagglehub/datasets/olistbr/brazilian-ecommerce/versions/2


In [131]:
customers_csv = pd.read_csv(os.path.join(path,'olist_customers_dataset.csv'))
geo_csv = pd.read_csv(os.path.join(path,'olist_geolocation_dataset.csv'))
items_csv = pd.read_csv(os.path.join(path,'olist_order_items_dataset.csv'))
payments_csv = pd.read_csv(os.path.join(path,'olist_order_payments_dataset.csv'))
reviews_csv = pd.read_csv(os.path.join(path,'olist_order_reviews_dataset.csv'))
orders_csv = pd.read_csv(os.path.join(path,'olist_orders_dataset.csv'))
products_csv = pd.read_csv(os.path.join(path,'olist_products_dataset.csv'))
sellers_csv = pd.read_csv(os.path.join(path,'olist_sellers_dataset.csv'))
category_csv = pd.read_csv(os.path.join(path,'product_category_name_translation.csv'))

In [132]:
print(f"Customers: {customers_csv.shape}")
print(f"Geo: {geo_csv.shape}")
print(f"Items: {items_csv.shape}")
print(f"Payments: {payments_csv.shape}")
print(f"Reviews: {reviews_csv.shape}")
print(f"Orders: {orders_csv.shape}")
print(f"Products: {products_csv.shape}")
print(f"Sellers: {sellers_csv.shape}")
print(f"Category: {category_csv.shape}")

Customers: (99441, 5)
Geo: (1000163, 5)
Items: (112650, 7)
Payments: (103886, 5)
Reviews: (99224, 7)
Orders: (99441, 8)
Products: (32951, 9)
Sellers: (3095, 4)
Category: (71, 2)


## Koneksi DuckDB dan DDL

Koneksi ke DuckDB

In [133]:
conn = duckdb.connect("brazilian_ecommerce.db")
print("Terhubung ke DuckDB : brazilian_ecommerce.db")

# conn.close()

Terhubung ke DuckDB : brazilian_ecommerce.db


Menyimpan data mentah untuk kemudahan proses

In [134]:
# Drop all tables if they exist
tables = ["raw_customers", "raw_geo", "raw_items", "raw_payments", "raw_reviews", "raw_orders", "raw_products", "raw_sellers", "raw_category", "dim_payments","fact_order_items", "fact_orders", "dim_customers",  "dim_date", "dim_products", "dim_sellers"]
for table in tables:
    conn.execute(f"DROP TABLE IF EXISTS {table};")
print("All tables have been dropped.")

All tables have been dropped.


In [135]:
conn.execute("""
    CREATE OR REPLACE TABLE raw_customers AS SELECT * FROM customers_csv;
    CREATE OR REPLACE TABLE raw_geo AS SELECT * FROM geo_csv;
    CREATE OR REPLACE TABLE raw_items AS SELECT * FROM items_csv;
    CREATE OR REPLACE TABLE raw_payments AS SELECT * FROM payments_csv;
    CREATE OR REPLACE TABLE raw_reviews AS SELECT * FROM reviews_csv;
    CREATE OR REPLACE TABLE raw_orders AS SELECT * FROM orders_csv;
    CREATE OR REPLACE TABLE raw_products AS SELECT * FROM products_csv;
    CREATE OR REPLACE TABLE raw_sellers AS SELECT * FROM sellers_csv;
    CREATE OR REPLACE TABLE raw_category AS SELECT * FROM category_csv;
""")
conn.execute("SHOW TABLES;").fetchall()

[('raw_category',),
 ('raw_customers',),
 ('raw_geo',),
 ('raw_items',),
 ('raw_orders',),
 ('raw_payments',),
 ('raw_products',),
 ('raw_reviews',),
 ('raw_sellers',)]

### DDL Fact Constellation Schema

In [136]:
def create_datawarehouse_schema():
    conn.execute("DROP TABLE IF EXISTS dim_payments")
    conn.execute("DROP TABLE IF EXISTS fact_order_items")
    conn.execute("DROP TABLE IF EXISTS fact_orders")
    
    conn.execute("""
    CREATE OR REPLACE TABLE dim_date AS
    WITH date_range AS (
        SELECT unnest(generate_series('2016-09-01'::DATE, '2020-05-01'::DATE, INTERVAL '1 day')) as date
    )
    SELECT
        (EXTRACT(YEAR FROM date) * 10000 + EXTRACT(MONTH FROM date) * 100 + EXTRACT(DAY FROM date))::INTEGER AS date_id,
        date as date_value,
        EXTRACT(DAY FROM date) AS day,
        EXTRACT(MONTH FROM date) AS month,
        strftime(date, '%B') AS month_name,
        EXTRACT(YEAR FROM date) AS year,
        strftime(date, '%A') AS day_name,
        EXTRACT(DOW FROM date) AS day_of_week,
        EXTRACT(QUARTER FROM date) AS quarter
    FROM date_range;
    """)

    # Add primary key to dim_date table
    conn.execute("ALTER TABLE dim_date ADD PRIMARY KEY (date_id);")

    conn.execute("""
    -- Tabel Dimensi
    CREATE OR REPLACE TABLE dim_customers (
        customer_id VARCHAR(50) PRIMARY KEY,
        customer_unique_id VARCHAR(50) NOT NULL,
        customer_city VARCHAR(100),
        customer_state VARCHAR(50),
        customer_zip_code_prefix VARCHAR(20)
    );

    CREATE OR REPLACE TABLE dim_sellers (
        seller_id VARCHAR(50) PRIMARY KEY,
        seller_zip_code_prefix VARCHAR(20),
        seller_city VARCHAR(100),
        seller_state VARCHAR(50)
    );

    --
    CREATE OR REPLACE TABLE dim_products (
        produk_key INT PRIMARY KEY,
        product_id VARCHAR(50) UNIQUE NOT NULL,
        product_category_name VARCHAR(100),
        product_category_name_english VARCHAR(100),
        product_weight_g DECIMAL(10,2),
        product_length_cm DECIMAL(10,2),
        product_height_cm DECIMAL(10,2),
        product_width_cm DECIMAL(10,2),
        effective_date DATE NOT NULL,
        expiration_date DATE,
        current_flag BOOLEAN DEFAULT TRUE
    );

    -- Tabel Fakta Pemesanan
    CREATE OR REPLACE TABLE fact_orders (
        order_id VARCHAR(50) PRIMARY KEY,
        customer_id VARCHAR(50) NOT NULL,
        date_id INT NOT NULL,
        order_status VARCHAR(50),
        order_purchase_timestamp DATETIME NOT NULL,
        order_approved_at DATETIME,
        order_delivered_customer_date DATETIME,
        order_estimated_delivery_date DATETIME,
        sales_amount DECIMAL(10,2),
        FOREIGN KEY (customer_id) REFERENCES dim_customers(customer_id),
        FOREIGN KEY (date_id) REFERENCES dim_date(date_id)
    );

    -- Tabel Fakta Item Pemesanan
    CREATE OR REPLACE TABLE fact_order_items (
        order_id VARCHAR(50) NOT NULL,
        order_item_id VARCHAR(50) NOT NULL,
        product_id VARCHAR(50) NOT NULL,
        seller_id VARCHAR(50) NOT NULL,
        date_id INT NOT NULL,
        shipping_limit_date DATETIME,
        price DECIMAL(10,2),
        freight_value DECIMAL(10,2),
        PRIMARY KEY (order_id, order_item_id),
        FOREIGN KEY (order_id) REFERENCES fact_orders(order_id),
        FOREIGN KEY (seller_id) REFERENCES dim_sellers(seller_id),
        FOREIGN KEY (product_id) REFERENCES dim_products(product_id),
        FOREIGN KEY (date_id) REFERENCES dim_date(date_id)
    );
    
    -- DuckDB belum mendukung ALTER TABLE ADD FOREIGN KEY
    -- Sehingga dimensi payments harus setelah fact_orders karena foreign key
    CREATE OR REPLACE TABLE dim_payments (
        payment_id INT PRIMARY KEY,
        order_id VARCHAR(50) NOT NULL,
        payment_sequential INT,
        payment_type VARCHAR(50),
        payment_installments INT,
        payment_value DECIMAL(10,2),
        FOREIGN KEY (order_id) REFERENCES fact_orders(order_id)
    );
    """)
    
    print("Schema datawarehouse berhasil dibuat") 


In [137]:
create_datawarehouse_schema()
conn.execute("SHOW TABLES;").fetchall()

Schema datawarehouse berhasil dibuat


[('dim_customers',),
 ('dim_date',),
 ('dim_payments',),
 ('dim_products',),
 ('dim_sellers',),
 ('fact_order_items',),
 ('fact_orders',),
 ('raw_category',),
 ('raw_customers',),
 ('raw_geo',),
 ('raw_items',),
 ('raw_orders',),
 ('raw_payments',),
 ('raw_products',),
 ('raw_reviews',),
 ('raw_sellers',)]

## Proses ETL

### Extract

In [138]:
def extract_source_data():
    """Ekstrak data dari sumber data mentah"""

    customers_df = conn.execute("SELECT * FROM raw_customers").fetchdf()
    geo_df = conn.execute("SELECT * FROM raw_geo").fetchdf()
    items_df = conn.execute("SELECT * FROM raw_items").fetchdf()
    payments_df = conn.execute("SELECT * FROM raw_payments").fetchdf()
    reviews_df = conn.execute("SELECT * FROM raw_reviews").fetchdf()
    orders_df = conn.execute("SELECT * FROM raw_orders").fetchdf()
    products_df = conn.execute("SELECT * FROM raw_products").fetchdf()
    sellers_df = conn.execute("SELECT * FROM raw_sellers").fetchdf()
    category_df = conn.execute("SELECT * FROM raw_category").fetchdf()

    print(f"Data berhasil diekstrak:")
    print(f"Jumlah data customers: {customers_df.shape[0]}")
    print(f"Jumlah data geolocation: {geo_df.shape[0]}")
    print(f"Jumlah data items: {items_df.shape[0]}")
    print(f"Jumlah data payments: {payments_df.shape[0]}")
    print(f"Jumlah data reviews: {reviews_df.shape[0]}")
    print(f"Jumlah data orders: {orders_df.shape[0]}")
    print(f"Jumlah data products: {products_df.shape[0]}")
    print(f"Jumlah data sellers: {sellers_df.shape[0]}")
    print(f"Jumlah data product category: {category_df.shape[0]}")

    return customers_df, geo_df, items_df, payments_df, reviews_df, orders_df, products_df, sellers_df, category_df

In [139]:
customers_df, geo_df, items_df, payments_df, reviews_df, orders_df, products_df, sellers_df, category_df = extract_source_data()

Data berhasil diekstrak:
Jumlah data customers: 99441
Jumlah data geolocation: 1000163
Jumlah data items: 112650
Jumlah data payments: 103886
Jumlah data reviews: 99224
Jumlah data orders: 99441
Jumlah data products: 32951
Jumlah data sellers: 3095
Jumlah data product category: 71


### Transform

In [140]:
# Fungsi cek duplikasi data
def check_duplicates(df, column):
    duplicate_counts = df[column].value_counts()
    unique_counts = len(duplicate_counts)
    print(f"Jumlah data unik pada kolom {column}: {unique_counts}")
    if len(duplicate_counts[duplicate_counts > 1]) == 0:
        print(f"Tidak ada duplikat data pada kolom {column}")
    else:
        print(f"Ada duplikat data pada kolom {column} sebanyak {len(duplicate_counts[duplicate_counts > 1])}")

#### Dimensi Customers

In [141]:
dim_customers = customers_df.copy()
print(dim_customers.columns.tolist())

['customer_id', 'customer_unique_id', 'customer_zip_code_prefix', 'customer_city', 'customer_state']


In [142]:
# Satu customer_unique_id bisa punya lebih dari satu customer_id
""" Cek duplikasi data """

check_duplicates(dim_customers, 'customer_id')

dim_customers = dim_customers.drop_duplicates(subset=['customer_id'])

Jumlah data unik pada kolom customer_id: 99441
Tidak ada duplikat data pada kolom customer_id


In [143]:
""" Cek data kosong """

dim_customers.isnull().sum()

customer_id                 0
customer_unique_id          0
customer_zip_code_prefix    0
customer_city               0
customer_state              0
dtype: int64

In [144]:
""" Kolom yang digunakan """

dim_customers = dim_customers[['customer_id', 'customer_unique_id', 'customer_city', 'customer_state', 'customer_zip_code_prefix']]
print(dim_customers.head())

                        customer_id                customer_unique_id  \
0  06b8999e2fba1a1fbc88172c00ba8bc7  861eff4711a542e4b93843c6dd7febb0   
1  18955e83d337fd6b2def6b18a428ac77  290c77bc529b7ac935b93aa66c333dc3   
2  4e7b3e00288586ebd08712fdd0374a03  060e732b5b29e8181a18229c7b0b2b5e   
3  b2b6027bc5c5109e529d4dc6358b12c3  259dac757896d24d7702b9acbbff3f3c   
4  4f2d8ab171c80ec8364f7c12e35b23ad  345ecd01c38d18a9036ed96c73b8d066   

           customer_city customer_state  customer_zip_code_prefix  
0                 franca             SP                     14409  
1  sao bernardo do campo             SP                      9790  
2              sao paulo             SP                      1151  
3        mogi das cruzes             SP                      8775  
4               campinas             SP                     13056  


#### Dimensi Seller

In [145]:
dim_sellers = sellers_df.copy()
print(dim_sellers.columns.tolist())

['seller_id', 'seller_zip_code_prefix', 'seller_city', 'seller_state']


In [146]:
""" Cek duplikasi data """

check_duplicates(dim_sellers, 'seller_id')

dim_sellers = dim_sellers.drop_duplicates(subset=['seller_id'])

Jumlah data unik pada kolom seller_id: 3095
Tidak ada duplikat data pada kolom seller_id


In [147]:
""" Cek data kosong """

dim_sellers.isnull().sum()

seller_id                 0
seller_zip_code_prefix    0
seller_city               0
seller_state              0
dtype: int64

In [148]:
""" Memilih dan mengurutkan kolom yang digunakan """

dim_sellers = dim_sellers[['seller_id', 'seller_city', 'seller_state', 'seller_zip_code_prefix']]
print(dim_sellers.head())

                          seller_id        seller_city seller_state  \
0  3442f8959a84dea7ee197c632cb2df15           campinas           SP   
1  d1b65fc7debc3361ea86b5f14c68d2e2         mogi guacu           SP   
2  ce3ad9de960102d0677a81f5d0bb7b2d     rio de janeiro           RJ   
3  c0f3eea2e14555b6faeea3dd58c1b1c3          sao paulo           SP   
4  51a04a8a6bdcb23deccc82b0b80742cf  braganca paulista           SP   

   seller_zip_code_prefix  
0                   13023  
1                   13844  
2                   20031  
3                    4195  
4                   12914  


#### Dimensi Products

In [149]:
dim_products = products_df.copy()
print(dim_products.columns.tolist())

['product_id', 'product_category_name', 'product_name_lenght', 'product_description_lenght', 'product_photos_qty', 'product_weight_g', 'product_length_cm', 'product_height_cm', 'product_width_cm']


In [150]:
""" Cek duplikasi data """

check_duplicates(dim_products, 'product_id')

dim_products = dim_products.drop_duplicates(subset=['product_id'])

Jumlah data unik pada kolom product_id: 32951
Tidak ada duplikat data pada kolom product_id


In [151]:
""" Cek data kosong """

dim_products.isnull().sum()

product_id                      0
product_category_name         610
product_name_lenght           610
product_description_lenght    610
product_photos_qty            610
product_weight_g                2
product_length_cm               2
product_height_cm               2
product_width_cm                2
dtype: int64

In [152]:
""" Isi data kosong pada ['product_weight_g', 'product_length_cm', 'product_height_cm', 'product_width_cm'] dengan rata-rata """

for column in ['product_weight_g', 'product_length_cm', 'product_height_cm', 'product_width_cm']:
    mean_value = dim_products[column].mean()
    dim_products[column] = dim_products[column].fillna(mean_value)

In [153]:
"""
Pada dimensi products akan diimplementasikan SCD type 2
Menambahkan product_key untuk primary key
Menambahkan tanggal efektif dengan default tanggal sekarang
Menambahkan tanggal kedaluwarsa dengan default None
Menambahkan current_flag dengan default True
"""

dim_products['product_key'] = range(1, len(dim_products) + 1)
dim_products['effective_date'] = datetime.now().date()  # atau menggunakan datetime(2019, 1, 1) 
dim_products['expiration_date'] = None
dim_products['current_flag'] = True


In [154]:
""" Menambahkan kolom product_category_name_english yang diambil dari category_df """

for i in range(len(category_df)):
    ctg = category_df['product_category_name'][i]
    eng_ctg = category_df['product_category_name_english'][i]
    if (ctg in dim_products['product_category_name'].unique()):
        dim_products.loc[dim_products['product_category_name'] == ctg, 'product_category_name_english'] = eng_ctg


In [155]:
""" Memilih dan mengurutkan kolom yang digunakan """

dim_products = dim_products[['product_key', 
                             'product_id', 
                             'product_category_name', 
                             'product_category_name_english', 
                             'product_weight_g', 
                             'product_length_cm', 
                             'product_height_cm', 
                             'product_width_cm', 
                             'effective_date', 
                             'expiration_date', 
                             'current_flag']]
print(dim_products.head())

   product_key                        product_id  product_category_name  \
0            1  1e9e8ef04dbcff4541ed26657ea517e5             perfumaria   
1            2  3aa071139cb16b67ca9e5dea641aaa2f                  artes   
2            3  96bd76ec8810374ed1b65e291975717f          esporte_lazer   
3            4  cef67bcfe19066a932b7673e239eb23d                  bebes   
4            5  9dc1a7de274444849c219cff195d0b71  utilidades_domesticas   

  product_category_name_english  product_weight_g  product_length_cm  \
0                     perfumery             225.0               16.0   
1                           art            1000.0               30.0   
2                sports_leisure             154.0               18.0   
3                          baby             371.0               26.0   
4                    housewares             625.0               20.0   

   product_height_cm  product_width_cm effective_date expiration_date  \
0               10.0              14.0     

#### Dimensi Payments

In [156]:
dim_payments = payments_df.copy()
print(dim_payments.columns.tolist())

['order_id', 'payment_sequential', 'payment_type', 'payment_installments', 'payment_value']


In [157]:
""" Cek data kosong """

dim_payments.isnull().sum()

order_id                0
payment_sequential      0
payment_type            0
payment_installments    0
payment_value           0
dtype: int64

In [158]:
""" Cek data unik pada tipe pembayaran """
dim_payments['payment_type'].value_counts()

credit_card    76795
boleto         19784
voucher         5775
debit_card      1529
not_defined        3
Name: payment_type, dtype: int64

In [159]:
""" 
Menambahkan payment_id sebagai primary key
Memilih dan mengurutkan kolom yang digunakan
"""

dim_payments['payment_id'] = range(1, len(dim_payments) + 1)
columns = ['payment_id', 'order_id', 'payment_sequential', 'payment_type', 'payment_installments', 'payment_value']
dim_payments = dim_payments[columns]

print(dim_payments.head())

   payment_id                          order_id  payment_sequential  \
0           1  b81ef226f3fe1789b1e8b2acac839d17                   1   
1           2  a9810da82917af2d9aefd1278f1dcfa0                   1   
2           3  25e8ea4e93396b6fa0d3dd708e76c1bd                   1   
3           4  ba78997921bbcdc1373bb41e913ab953                   1   
4           5  42fdf880ba16b47b59251dd489d4441a                   1   

  payment_type  payment_installments  payment_value  
0  credit_card                     8          99.33  
1  credit_card                     1          24.39  
2  credit_card                     1          65.71  
3  credit_card                     8         107.78  
4  credit_card                     2         128.45  


#### Dimensi Date

Dimensi Tanggal sudah dibangun di skema data warehouse

#### Fakta Orders


In [160]:
fact_orders = orders_df.copy()
print(fact_orders.columns.tolist())

['order_id', 'customer_id', 'order_status', 'order_purchase_timestamp', 'order_approved_at', 'order_delivered_carrier_date', 'order_delivered_customer_date', 'order_estimated_delivery_date']


In [161]:
""" Cek data kosong """
fact_orders.isnull().sum()

order_id                            0
customer_id                         0
order_status                        0
order_purchase_timestamp            0
order_approved_at                 160
order_delivered_carrier_date     1783
order_delivered_customer_date    2965
order_estimated_delivery_date       0
dtype: int64

In [162]:
""" Cek data unik pada status pembayaran """

fact_orders['order_status'].value_counts()

delivered      96478
shipped         1107
canceled         625
unavailable      609
invoiced         314
processing       301
created            5
approved           2
Name: order_status, dtype: int64

In [163]:
""" Memilah kolom yang tidak digunakan """

fact_orders = fact_orders.drop('order_delivered_carrier_date', axis=1)


In [164]:
""" 
Menambahkan kolom date_id dengan format YYYYMMDD dari data order_purchase_timestamp
Meletakkan date_id setelah customer_id
"""

fact_orders['date_id'] = pd.to_datetime(fact_orders['order_purchase_timestamp']).dt.strftime('%Y%m%d').astype(int)

cols = list(fact_orders.columns)
customer_id_index = cols.index('customer_id')
cols.insert(customer_id_index + 1, cols.pop(cols.index('date_id')))
fact_orders = fact_orders[cols]

print(fact_orders.head())

                           order_id                       customer_id  \
0  e481f51cbdc54678b7cc49136f2d6af7  9ef432eb6251297304e76186b10a928d   
1  53cdb2fc8bc7dce0b6741e2150273451  b0830fb4747a6c6d20dea0b8c802d7ef   
2  47770eb9100c2d0c44946d9cf07ec65d  41ce2a54c0b03bf3443c3d931a367089   
3  949d5b44dbf5de918fe9c16f97b45f8a  f88197465ea7920adcdbec7375364d82   
4  ad21c59c0840e6cb83a9ceb5573f8159  8ab97904e6daea8866dbdbc4fb7aad2c   

    date_id order_status order_purchase_timestamp    order_approved_at  \
0  20171002    delivered      2017-10-02 10:56:33  2017-10-02 11:07:15   
1  20180724    delivered      2018-07-24 20:41:37  2018-07-26 03:24:27   
2  20180808    delivered      2018-08-08 08:38:49  2018-08-08 08:55:23   
3  20171118    delivered      2017-11-18 19:28:06  2017-11-18 19:45:59   
4  20180213    delivered      2018-02-13 21:18:39  2018-02-13 22:20:29   

  order_delivered_customer_date order_estimated_delivery_date  
0           2017-10-10 21:25:13           2017-10-18

In [165]:
""" 
Menambahkan kolom sales_amount dengan menjumlahkan harga produk
Data diambil dari price di tabel items_df yang dijumlahkan berdasarkan order_id
"""

sales_amount_df = items_df.groupby('order_id')['price'].sum().reset_index()
sales_amount_df = sales_amount_df.rename(columns={'price': 'sales_amount'})
fact_orders = pd.merge(fact_orders, sales_amount_df, on='order_id', how='left')

print(fact_orders.head())

                           order_id                       customer_id  \
0  e481f51cbdc54678b7cc49136f2d6af7  9ef432eb6251297304e76186b10a928d   
1  53cdb2fc8bc7dce0b6741e2150273451  b0830fb4747a6c6d20dea0b8c802d7ef   
2  47770eb9100c2d0c44946d9cf07ec65d  41ce2a54c0b03bf3443c3d931a367089   
3  949d5b44dbf5de918fe9c16f97b45f8a  f88197465ea7920adcdbec7375364d82   
4  ad21c59c0840e6cb83a9ceb5573f8159  8ab97904e6daea8866dbdbc4fb7aad2c   

    date_id order_status order_purchase_timestamp    order_approved_at  \
0  20171002    delivered      2017-10-02 10:56:33  2017-10-02 11:07:15   
1  20180724    delivered      2018-07-24 20:41:37  2018-07-26 03:24:27   
2  20180808    delivered      2018-08-08 08:38:49  2018-08-08 08:55:23   
3  20171118    delivered      2017-11-18 19:28:06  2017-11-18 19:45:59   
4  20180213    delivered      2018-02-13 21:18:39  2018-02-13 22:20:29   

  order_delivered_customer_date order_estimated_delivery_date  sales_amount  
0           2017-10-10 21:25:13       

In [166]:
""" Memastikan kolom yang akan digunakan """

fact_orders = fact_orders[['order_id', 
                           'customer_id', 
                           'date_id', 
                           'order_status', 
                           'order_purchase_timestamp', 
                           'order_approved_at', 
                           'order_delivered_customer_date', 
                           'order_estimated_delivery_date', 
                           'sales_amount']]

print(fact_orders.shape)
print(fact_orders.head())

(99441, 9)
                           order_id                       customer_id  \
0  e481f51cbdc54678b7cc49136f2d6af7  9ef432eb6251297304e76186b10a928d   
1  53cdb2fc8bc7dce0b6741e2150273451  b0830fb4747a6c6d20dea0b8c802d7ef   
2  47770eb9100c2d0c44946d9cf07ec65d  41ce2a54c0b03bf3443c3d931a367089   
3  949d5b44dbf5de918fe9c16f97b45f8a  f88197465ea7920adcdbec7375364d82   
4  ad21c59c0840e6cb83a9ceb5573f8159  8ab97904e6daea8866dbdbc4fb7aad2c   

    date_id order_status order_purchase_timestamp    order_approved_at  \
0  20171002    delivered      2017-10-02 10:56:33  2017-10-02 11:07:15   
1  20180724    delivered      2018-07-24 20:41:37  2018-07-26 03:24:27   
2  20180808    delivered      2018-08-08 08:38:49  2018-08-08 08:55:23   
3  20171118    delivered      2017-11-18 19:28:06  2017-11-18 19:45:59   
4  20180213    delivered      2018-02-13 21:18:39  2018-02-13 22:20:29   

  order_delivered_customer_date order_estimated_delivery_date  sales_amount  
0           2017-10-10 21:2

#### Fakta Order Items

In [167]:
fact_order_items = items_df.copy()
print(fact_order_items.columns.tolist())

['order_id', 'order_item_id', 'product_id', 'seller_id', 'shipping_limit_date', 'price', 'freight_value']


In [168]:
""" Cek data kosong """

fact_order_items.isnull().sum()

order_id               0
order_item_id          0
product_id             0
seller_id              0
shipping_limit_date    0
price                  0
freight_value          0
dtype: int64

In [169]:
""" 
Menambahkan kolom date_id dengan format YYYYMMDD dari data shipping_limit_date
Meletakkan date_id setelah seller_id
"""

fact_order_items['date_id'] = pd.to_datetime(fact_order_items['shipping_limit_date']).dt.strftime('%Y%m%d').astype(int)

cols = list(fact_order_items.columns)
product_id_index = cols.index('seller_id')
cols.insert(product_id_index + 1, cols.pop(cols.index('date_id')))
fact_order_items = fact_order_items[cols]

print(fact_order_items.shape)
print(fact_order_items.head())

(112650, 8)
                           order_id  order_item_id  \
0  00010242fe8c5a6d1ba2dd792cb16214              1   
1  00018f77f2f0320c557190d7a144bdd3              1   
2  000229ec398224ef6ca0657da4fc703e              1   
3  00024acbcdf0a6daa1e931b038114c75              1   
4  00042b26cf59d7ce69dfabb4e55b4fd9              1   

                         product_id                         seller_id  \
0  4244733e06e7ecb4970a6e2683c13e61  48436dade18ac8b2bce089ec2a041202   
1  e5f2d52b802189ee658865ca93d83a8f  dd7ddc04e1b6c2c614352b383efe2d36   
2  c777355d18b72b67abbeef9df44fd0fd  5b51032eddd242adc84c38acab88f23d   
3  7634da152a4610f1595efa32f14722fc  9d7a1d34a5052409006425275ba1c2b4   
4  ac6c3623068f30de03045865e4e10089  df560393f3a51e74553ab94004ba5c87   

    date_id  shipping_limit_date   price  freight_value  
0  20170919  2017-09-19 09:45:35   58.90          13.29  
1  20170503  2017-05-03 11:05:13  239.90          19.93  
2  20180118  2018-01-18 14:48:30  199.00          

### Load

In [171]:
""" Melakukan load dengan insert into """

def load_data_to_warehouse():
    # Memastikan data yang akan dimuat ke dalam data warehouse
    customers_final = dim_customers.copy()
    sellers_final = dim_sellers.copy()
    products_final = dim_products.copy()
    payments_final = dim_payments.copy()
    orders_final = fact_orders.copy()
    order_items_final = fact_order_items.copy()

    conn.execute("INSERT INTO dim_customers SELECT * FROM customers_final")
    conn.execute("INSERT INTO dim_sellers SELECT * FROM sellers_final")
    conn.execute("INSERT INTO dim_products SELECT * FROM products_final")
    conn.execute("INSERT INTO fact_orders SELECT * FROM orders_final")
    conn.execute("INSERT INTO fact_order_items SELECT * FROM order_items_final")
    conn.execute("INSERT INTO dim_payments SELECT * FROM payments_final")

    print("Data berhasil dimuat ke dalam data warehouse")


In [172]:
load_data_to_warehouse()

Data berhasil dimuat ke dalam data warehouse


In [173]:
conn.execute("SELECT * FROM fact_orders LIMIT 5").fetchdf()

Unnamed: 0,order_id,customer_id,date_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_customer_date,order_estimated_delivery_date,sales_amount
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,20171002,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-10 21:25:13,2017-10-18,29.99
1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,20180724,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-08-07 15:27:45,2018-08-13,118.7
2,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,20180808,delivered,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-17 18:06:29,2018-09-04,159.9
3,949d5b44dbf5de918fe9c16f97b45f8a,f88197465ea7920adcdbec7375364d82,20171118,delivered,2017-11-18 19:28:06,2017-11-18 19:45:59,2017-12-02 00:28:42,2017-12-15,45.0
4,ad21c59c0840e6cb83a9ceb5573f8159,8ab97904e6daea8866dbdbc4fb7aad2c,20180213,delivered,2018-02-13 21:18:39,2018-02-13 22:20:29,2018-02-16 18:17:02,2018-02-26,19.9


In [174]:
conn.execute("SELECT * FROM fact_order_items LIMIT 5").fetchdf()

Unnamed: 0,order_id,order_item_id,product_id,seller_id,date_id,shipping_limit_date,price,freight_value
0,00010242fe8c5a6d1ba2dd792cb16214,1,4244733e06e7ecb4970a6e2683c13e61,48436dade18ac8b2bce089ec2a041202,20170919,2017-09-19 09:45:35,58.9,13.29
1,00018f77f2f0320c557190d7a144bdd3,1,e5f2d52b802189ee658865ca93d83a8f,dd7ddc04e1b6c2c614352b383efe2d36,20170503,2017-05-03 11:05:13,239.9,19.93
2,000229ec398224ef6ca0657da4fc703e,1,c777355d18b72b67abbeef9df44fd0fd,5b51032eddd242adc84c38acab88f23d,20180118,2018-01-18 14:48:30,199.0,17.87
3,00024acbcdf0a6daa1e931b038114c75,1,7634da152a4610f1595efa32f14722fc,9d7a1d34a5052409006425275ba1c2b4,20180815,2018-08-15 10:10:18,12.99,12.79
4,00042b26cf59d7ce69dfabb4e55b4fd9,1,ac6c3623068f30de03045865e4e10089,df560393f3a51e74553ab94004ba5c87,20170213,2017-02-13 13:57:51,199.9,18.14
