## Persiapan

In [None]:
%pip install duckdb pandas numpy  kagglehub

Collecting kagglehub
  Downloading kagglehub-0.3.10-py3-none-any.whl.metadata (31 kB)
Downloading kagglehub-0.3.10-py3-none-any.whl (63 kB)
Installing collected packages: kagglehub
Successfully installed kagglehub-0.3.10


In [1]:
import duckdb
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import os

In [2]:
import kagglehub

path = kagglehub.dataset_download("olistbr/brazilian-ecommerce")

print("Path to dataset files:", path)

Path to dataset files: C:\Users\lenovo\.cache\kagglehub\datasets\olistbr\brazilian-ecommerce\versions\2


In [3]:
customers = pd.read_csv(os.path.join(path,'olist_customers_dataset.csv'))
geo = pd.read_csv(os.path.join(path,'olist_geolocation_dataset.csv'))
items = pd.read_csv(os.path.join(path,'olist_order_items_dataset.csv'))
payments = pd.read_csv(os.path.join(path,'olist_order_payments_dataset.csv'))
reviews = pd.read_csv(os.path.join(path,'olist_order_reviews_dataset.csv'))
orders = pd.read_csv(os.path.join(path,'olist_orders_dataset.csv'))
products = pd.read_csv(os.path.join(path,'olist_products_dataset.csv'))
sellers = pd.read_csv(os.path.join(path,'olist_sellers_dataset.csv'))
category = pd.read_csv(os.path.join(path,'product_category_name_translation.csv'))

In [22]:
print(f"Customers: \n{customers.head()}\n")
print(f"Geo Location: \n{geo.head()}\n")
print(f"Items: \n{items.head()}\n")
print(f"Payments: \n{payments.head()}\n")
print(f"Reviews: \n{reviews.head()}\n")
print(f"Orders: \n{orders.head()}\n")
print(f"Products: \n{products.head()}\n")
print(f"Sellers: \n{sellers.head()}\n")
print(f"Product Category: \n{category.head()}\n")

Customers: 
                        customer_id                customer_unique_id  \
0  06b8999e2fba1a1fbc88172c00ba8bc7  861eff4711a542e4b93843c6dd7febb0   
1  18955e83d337fd6b2def6b18a428ac77  290c77bc529b7ac935b93aa66c333dc3   
2  4e7b3e00288586ebd08712fdd0374a03  060e732b5b29e8181a18229c7b0b2b5e   
3  b2b6027bc5c5109e529d4dc6358b12c3  259dac757896d24d7702b9acbbff3f3c   
4  4f2d8ab171c80ec8364f7c12e35b23ad  345ecd01c38d18a9036ed96c73b8d066   

   customer_zip_code_prefix          customer_city customer_state  
0                     14409                 franca             SP  
1                      9790  sao bernardo do campo             SP  
2                      1151              sao paulo             SP  
3                      8775        mogi das cruzes             SP  
4                     13056               campinas             SP  

Geo Location: 
   geolocation_zip_code_prefix  geolocation_lat  geolocation_lng  \
0                         1037       -23.545621       -46

## Koneksi DuckDB dan DDL

Koneksi ke DuckDB

In [4]:
conn = duckdb.connect("brazilian_ecommerce.db")
print("Terhubung ke DuckDB : brazilian_ecommerce.db")

# conn.close()
# print("Koneksi ke DuckDB tertutup")

Terhubung ke DuckDB : brazilian_ecommerce.db


Fact Constellation Schema

**Tabel Dimensi:**

dim_customers → Informasi pelanggan.

dim_products → Informasi produk.

dim_sellers → Informasi penjual.

dim_geo → Informasi lokasi berdasarkan kode pos.

dim_time → Dimensi waktu untuk analisis berbasis periode.

**Tabel Fakta:**

fact_orders → Menganalisis transaksi pesanan.

fact_order_items → Menganalisis detail produk dalam pesanan.

fact_payments → Menganalisis pembayaran pelanggan.

fact_reviews → Menganalisis ulasan pelanggan terhadap produk.

In [None]:
def create_datawarehouse_schema():

    # Membuat tabel dimensi
    conn.execute("""
                 -- Dimensi Geolocation
                 CREATE OR REPLACE TABLE dim_geolocation (
                    geolocation_sk INTEGER PRIMARY KEY,
                    zip_code_prefix VARCHAR NOT NULL,
                    latitude DECIMAL(10,6) NOT NULL,
                    longitude DECIMAL(10,6) NOT NULL,
                    geolocation_city VARCHAR NOT NULL,
                    geolocation_state VARCHAR NOT NULL
                );
                 """)
    
    conn.execute("""
                CREATE OR REPLACE TABLE dim_customers (
                    customer_sk INTEGER PRIMARY KEY,
                    customer_id VARCHAR UNIQUE NOT NULL,
                    customer_unique_id VARCHAR NOT NULL,
                    geolocation_sk INTEGER NOT NULL,
                    FOREIGN KEY (geolocation_sk) REFERENCES dim_geo(geolocation_sk)
                );

                CREATE OR REPLACE TABLE dim_sellers (
                    seller_sk INTEGER PRIMARY KEY,
                    seller_id VARCHAR UNIQUE NOT NULL,
                    geolocation_sk INTEGER NOT NULL,
                    FOREIGN KEY (geolocation_sk) REFERENCES dim_geo(geolocation_sk)
                );

                CREATE OR REPLACE TABLE dim_time (
                    time_sk INTEGER PRIMARY KEY,
                    date DATE UNIQUE NOT NULL,
                    year INTEGER NOT NULL,
                    month INTEGER NOT NULL,
                    day_of_week VARCHAR NOT NULL
                );
                """)

## Proses ETL 

### Extract

In [5]:
def extract_source_data():
    """Ekstrak data dari sumber"""

    customers_df = customers.copy()
    geo_df = geo.copy()
    items_df = items.copy()
    payments_df = payments.copy()
    reviews_df = reviews.copy()
    orders_df = orders.copy()
    products_df = products.copy()
    sellers_df = sellers.copy()
    category_df = category.copy()

    print(f"Data berhasil diekstrak:")
    print(f"Jumlah data customers: {customers_df.shape[0]}")
    print(f"Jumlah data geolocation: {geo_df.shape[0]}")
    print(f"Jumlah data items: {items_df.shape[0]}")
    print(f"Jumlah data payments: {payments_df.shape[0]}")
    print(f"Jumlah data reviews: {reviews_df.shape[0]}")
    print(f"Jumlah data orders: {orders_df.shape[0]}")
    print(f"Jumlah data products: {products_df.shape[0]}")
    print(f"Jumlah data sellers: {sellers_df.shape[0]}")
    print(f"Jumlah data product category: {category_df.shape[0]}")

    return customers_df, geo_df, items_df, payments_df, reviews_df, orders_df, products_df, sellers_df, category_df

In [6]:
customers_df, geo_df, items_df, payments_df, reviews_df, orders_df, products_df, sellers_df, category_df = extract_source_data()

Data berhasil diekstrak:
Jumlah data customers: 99441
Jumlah data geolocation: 1000163
Jumlah data items: 112650
Jumlah data payments: 103886
Jumlah data reviews: 99224
Jumlah data orders: 99441
Jumlah data products: 32951
Jumlah data sellers: 3095
Jumlah data product category: 71


### Transform

In [7]:
def check_duplicates(df, column):
    """Cek duplikat data"""
    duplicate_counts = df[column].value_counts()
    unique_counts = len(duplicate_counts)
    print(f"Jumlah data unik pada kolom {column}: {unique_counts}")
    if len(duplicate_counts[duplicate_counts > 1]) == 0:
        print(f"Tidak ada duplikat data pada kolom {column}")
    else:
        print(f"Ada duplikat data pada kolom {column} sebanyak {len(duplicate_counts[duplicate_counts > 1])}")  

In [10]:
check_duplicates(customers_df, 'customer_unique_id')

Jumlah data unik pada kolom customer_unique_id: 96096
Ada duplikat data pada kolom customer_unique_id sebanyak 2997


#### Dimensi Customers

In [8]:
dim_customers = customers_df.copy()
print(dim_customers.columns.tolist())

['customer_id', 'customer_unique_id', 'customer_zip_code_prefix', 'customer_city', 'customer_state']


In [9]:
# Satu customer_unique_id bisa punya lebih dari satu customer_id
# Unique key: customer_id
# Foreign key: geolocation_sk -> dim_geo

check_duplicates(dim_customers, 'customer_id')

dim_customers = dim_customers.drop_duplicates(subset=['customer_id'])

Jumlah data unik pada kolom customer_id: 99441
Tidak ada duplikat data pada kolom customer_id


In [10]:
dim_customers.isnull().sum()

customer_id                 0
customer_unique_id          0
customer_zip_code_prefix    0
customer_city               0
customer_state              0
dtype: int64

In [11]:
dim_customers = dim_customers[['customer_id', 'customer_unique_id', 'customer_city', 'customer_state', 'customer_zip_code_prefix']]
print(dim_customers.head())

                        customer_id                customer_unique_id  \
0  06b8999e2fba1a1fbc88172c00ba8bc7  861eff4711a542e4b93843c6dd7febb0   
1  18955e83d337fd6b2def6b18a428ac77  290c77bc529b7ac935b93aa66c333dc3   
2  4e7b3e00288586ebd08712fdd0374a03  060e732b5b29e8181a18229c7b0b2b5e   
3  b2b6027bc5c5109e529d4dc6358b12c3  259dac757896d24d7702b9acbbff3f3c   
4  4f2d8ab171c80ec8364f7c12e35b23ad  345ecd01c38d18a9036ed96c73b8d066   

           customer_city customer_state  customer_zip_code_prefix  
0                 franca             SP                     14409  
1  sao bernardo do campo             SP                      9790  
2              sao paulo             SP                      1151  
3        mogi das cruzes             SP                      8775  
4               campinas             SP                     13056  


#### Dimensi Seller

In [16]:
dim_sellers = sellers_df.copy()
print(dim_sellers.columns.tolist())


['seller_id', 'seller_zip_code_prefix', 'seller_city', 'seller_state']


In [15]:
check_duplicates(dim_sellers, 'seller_id')

dim_sellers = dim_sellers.drop_duplicates(subset=['seller_id'])

Jumlah data unik pada kolom seller_id: 3095
Tidak ada duplikat data pada kolom seller_id


In [17]:
dim_sellers.isnull().sum()

seller_id                 0
seller_zip_code_prefix    0
seller_city               0
seller_state              0
dtype: int64

In [18]:
dim_sellers = dim_sellers[['seller_id', 'seller_city', 'seller_state', 'seller_zip_code_prefix']]
print(dim_sellers.head())

                          seller_id        seller_city seller_state  \
0  3442f8959a84dea7ee197c632cb2df15           campinas           SP   
1  d1b65fc7debc3361ea86b5f14c68d2e2         mogi guacu           SP   
2  ce3ad9de960102d0677a81f5d0bb7b2d     rio de janeiro           RJ   
3  c0f3eea2e14555b6faeea3dd58c1b1c3          sao paulo           SP   
4  51a04a8a6bdcb23deccc82b0b80742cf  braganca paulista           SP   

   seller_zip_code_prefix  
0                   13023  
1                   13844  
2                   20031  
3                    4195  
4                   12914  


#### Dimensi Products

In [36]:
dim_products = products_df.copy()
print(dim_products.columns.tolist())
dim_products.isnull().sum()

['product_id', 'product_category_name', 'product_name_lenght', 'product_description_lenght', 'product_photos_qty', 'product_weight_g', 'product_length_cm', 'product_height_cm', 'product_width_cm']


product_id                      0
product_category_name         610
product_name_lenght           610
product_description_lenght    610
product_photos_qty            610
product_weight_g                2
product_length_cm               2
product_height_cm               2
product_width_cm                2
dtype: int64

In [37]:
print(len(dim_products['product_category_name'].unique()))

74


In [38]:
for i in range(len(category_df)):
    ctg = category_df['product_category_name'][i]
    eng_ctg = category_df['product_category_name_english'][i]
    if (ctg in dim_products['product_category_name'].unique()):
        dim_products.loc[dim_products['product_category_name'] == ctg, 'product_category_name_english'] = eng_ctg


In [49]:
dim_products['product_category_name_english'].isnull().sum()

np.int64(623)

#### Dimensi Payments

In [50]:
dim_payments = payments_df.copy()
print(dim_payments.columns.tolist())
dim_payments.isnull().sum()

['order_id', 'payment_sequential', 'payment_type', 'payment_installments', 'payment_value']


order_id                0
payment_sequential      0
payment_type            0
payment_installments    0
payment_value           0
dtype: int64

In [56]:
check_duplicates(dim_payments, 'order_id')
dim_payments['payment_type'].value_counts()

Jumlah data unik pada kolom order_id: 99440
Ada duplikat data pada kolom order_id sebanyak 2961


payment_type
credit_card    76795
boleto         19784
voucher         5775
debit_card      1529
not_defined        3
Name: count, dtype: int64

### Load