## Persiapan

In [None]:
# Install the third-party packages this notebook imports (run once per environment).
%pip install duckdb pandas numpy  kagglehub

Collecting kagglehub
  Downloading kagglehub-0.3.10-py3-none-any.whl.metadata (31 kB)
Downloading kagglehub-0.3.10-py3-none-any.whl (63 kB)
Installing collected packages: kagglehub
Successfully installed kagglehub-0.3.10


In [1]:
import duckdb
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import os

In [6]:
import kagglehub

# Download the Olist Brazilian e-commerce dataset through kagglehub and keep
# the value it returns in `path`, which is printed below.
path = kagglehub.dataset_download("olistbr/brazilian-ecommerce")

print(f"Path to dataset files: {path}")

Path to dataset files: C:\Users\lenovo\.cache\kagglehub\datasets\olistbr\brazilian-ecommerce\versions\2


In [7]:
# Read every source CSV from the dataset folder in one pass. The target names
# on the left are matched positionally with the file names on the right.
(
    customers,
    geo,
    items,
    payments,
    reviews,
    orders,
    products,
    sellers,
    category,
) = (
    pd.read_csv(os.path.join(path, csv_name))
    for csv_name in (
        'olist_customers_dataset.csv',
        'olist_geolocation_dataset.csv',
        'olist_order_items_dataset.csv',
        'olist_order_payments_dataset.csv',
        'olist_order_reviews_dataset.csv',
        'olist_orders_dataset.csv',
        'olist_products_dataset.csv',
        'olist_sellers_dataset.csv',
        'product_category_name_translation.csv',
    )
)

In [22]:
# Show the first rows of every source table. Each entry already ends with a
# newline, so joining with "\n" yields the same text as one print() per table.
print(
    *(
        f"{title}: \n{frame.head()}\n"
        for title, frame in (
            ("Customers", customers),
            ("Geo Location", geo),
            ("Items", items),
            ("Payments", payments),
            ("Reviews", reviews),
            ("Orders", orders),
            ("Products", products),
            ("Sellers", sellers),
            ("Product Category", category),
        )
    ),
    sep="\n",
)

Customers: 
                        customer_id                customer_unique_id  \
0  06b8999e2fba1a1fbc88172c00ba8bc7  861eff4711a542e4b93843c6dd7febb0   
1  18955e83d337fd6b2def6b18a428ac77  290c77bc529b7ac935b93aa66c333dc3   
2  4e7b3e00288586ebd08712fdd0374a03  060e732b5b29e8181a18229c7b0b2b5e   
3  b2b6027bc5c5109e529d4dc6358b12c3  259dac757896d24d7702b9acbbff3f3c   
4  4f2d8ab171c80ec8364f7c12e35b23ad  345ecd01c38d18a9036ed96c73b8d066   

   customer_zip_code_prefix          customer_city customer_state  
0                     14409                 franca             SP  
1                      9790  sao bernardo do campo             SP  
2                      1151              sao paulo             SP  
3                      8775        mogi das cruzes             SP  
4                     13056               campinas             SP  

Geo Location: 
   geolocation_zip_code_prefix  geolocation_lat  geolocation_lng  \
0                         1037       -23.545621       -46

## Koneksi DuckDB dan DDL

Koneksi ke DuckDB

In [2]:
# Connect to the DuckDB database file that backs the warehouse; `conn` is the
# shared connection object used by the cells below.
conn = duckdb.connect("brazilian_ecommerce.db")
print("Terhubung ke DuckDB : brazilian_ecommerce.db")

# Left disabled on purpose: uncomment these two lines to close the connection
# once the notebook run is finished.
# conn.close()
# print("Koneksi ke DuckDB tertutup")

Terhubung ke DuckDB : brazilian_ecommerce.db


Fact Constellation Schema

**Tabel Dimensi:**

dim_customers → Informasi pelanggan.

dim_products → Informasi produk.

dim_sellers → Informasi penjual.

dim_geolocation → Informasi lokasi berdasarkan kode pos.

dim_time → Dimensi waktu untuk analisis berbasis periode.

**Tabel Fakta:**

fact_orders → Menganalisis transaksi pesanan.

fact_order_items → Menganalisis detail produk dalam pesanan.

fact_payments → Menganalisis pembayaran pelanggan.

fact_reviews → Menganalisis ulasan pelanggan terhadap produk.

In [None]:
def create_datawarehouse_schema():
    """Create (or recreate) the warehouse dimension tables in DuckDB.

    Runs the DDL through the module-level ``conn`` connection. The tables
    defined here are ``dim_geolocation``, ``dim_customers``, ``dim_sellers``
    and ``dim_time``. Tables that already exist under these names are
    replaced, so any rows they contain are discarded.

    Returns:
        None
    """
    # dim_customers and dim_sellers hold foreign keys to dim_geolocation.
    # Remove them first so the parent table can be replaced on a re-run;
    # DuckDB is expected to reject replacing a table that is still referenced.
    conn.execute("""
                DROP TABLE IF EXISTS dim_customers;
                DROP TABLE IF EXISTS dim_sellers;
                """)

    # Create the dimension tables, parent table first.
    conn.execute("""
                 -- Dimensi Geolocation
                 CREATE OR REPLACE TABLE dim_geolocation (
                    geolocation_sk INTEGER PRIMARY KEY,
                    zip_code_prefix VARCHAR NOT NULL,
                    latitude DECIMAL(10,6) NOT NULL,
                    longitude DECIMAL(10,6) NOT NULL,
                    geolocation_city VARCHAR NOT NULL,
                    geolocation_state VARCHAR NOT NULL
                );
                 """)

    # The foreign keys must name the table created above (dim_geolocation);
    # referencing a non-existent dim_geo table makes the DDL fail.
    conn.execute("""
                CREATE OR REPLACE TABLE dim_customers (
                    customer_sk INTEGER PRIMARY KEY,
                    customer_id VARCHAR UNIQUE NOT NULL,
                    customer_unique_id VARCHAR NOT NULL,
                    geolocation_sk INTEGER NOT NULL,
                    FOREIGN KEY (geolocation_sk) REFERENCES dim_geolocation(geolocation_sk)
                );

                CREATE OR REPLACE TABLE dim_sellers (
                    seller_sk INTEGER PRIMARY KEY,
                    seller_id VARCHAR UNIQUE NOT NULL,
                    geolocation_sk INTEGER NOT NULL,
                    FOREIGN KEY (geolocation_sk) REFERENCES dim_geolocation(geolocation_sk)
                );

                CREATE OR REPLACE TABLE dim_time (
                    time_sk INTEGER PRIMARY KEY,
                    date DATE UNIQUE NOT NULL,
                    year INTEGER NOT NULL,
                    month INTEGER NOT NULL,
                    day_of_week VARCHAR NOT NULL
                );
                """)

## Proses ETL 

### Extract

In [3]:
def extract_source_data():
    """Copy the nine source DataFrames and report their row counts.

    Reads the module-level frames ``customers``, ``geo``, ``items``,
    ``payments``, ``reviews``, ``orders``, ``products``, ``sellers`` and
    ``category``, copies each one, and prints one row-count line per frame.

    Returns:
        tuple: the nine copies, in the order listed above.
    """
    # (label used in the report, source frame) in return order.
    labelled_sources = (
        ("customers", customers),
        ("geolocation", geo),
        ("items", items),
        ("payments", payments),
        ("reviews", reviews),
        ("orders", orders),
        ("products", products),
        ("sellers", sellers),
        ("product category", category),
    )

    # Copy everything first so the report describes the returned objects.
    extracted = tuple(source.copy() for _, source in labelled_sources)

    print("Data berhasil diekstrak:")
    for (label, _), frame in zip(labelled_sources, extracted):
        print(f"Jumlah data {label}: {frame.shape[0]}")

    return extracted

In [8]:
# Run the extract step and bind each returned copy to its working name.
(
    customers_df,
    geo_df,
    items_df,
    payments_df,
    reviews_df,
    orders_df,
    products_df,
    sellers_df,
    category_df,
) = extract_source_data()

Data berhasil diekstrak:
Jumlah data customers: 99441
Jumlah data geolocation: 1000163
Jumlah data items: 112650
Jumlah data payments: 103886
Jumlah data reviews: 99224
Jumlah data orders: 99441
Jumlah data products: 32951
Jumlah data sellers: 3095
Jumlah data product category: 71


### Transform

In [9]:
def check_duplicates(df, column):
    """Print how many distinct values ``column`` holds and how many of them repeat.

    Args:
        df: DataFrame to inspect.
        column: Label used to select the data via ``df[column]``.

    Returns:
        None. The summary is written to stdout.
    """
    occurrences = df[column].value_counts()
    print(f"Jumlah data unik pada kolom {column}: {len(occurrences)}")

    # Count the distinct values seen more than once (not the surplus rows).
    repeated_values = int((occurrences > 1).sum())
    if repeated_values:
        print(f"Ada duplikat data pada kolom {column} sebanyak {repeated_values}")
    else:
        print(f"Tidak ada duplikat data pada kolom {column}")

In [10]:
# Report how many customer_unique_id values appear on more than one row.
check_duplicates(df=customers_df, column='customer_unique_id')

Jumlah data unik pada kolom customer_unique_id: 96096
Ada duplikat data pada kolom customer_unique_id sebanyak 2997


Dimensi Geolocation

In [167]:
# Work on a copy so the extracted geolocation frame is left untouched, then
# show the available columns and the starting row count.
dim_geo = geo_df.copy()
print(list(geo_df.columns))
print(geo_df.shape[0])

['geolocation_zip_code_prefix', 'geolocation_lat', 'geolocation_lng', 'geolocation_city', 'geolocation_state']
1000163


In [168]:
# Report repeated zip prefixes, then keep only the first row seen per prefix.
check_duplicates(df=dim_geo, column='geolocation_zip_code_prefix')
dim_geo = dim_geo.drop_duplicates(subset='geolocation_zip_code_prefix', keep='first')
print(f"Duplikat data berhasil dihapus. Jumlah data geolocation: {dim_geo.shape[0]}")

Jumlah data unik pada kolom geolocation_zip_code_prefix: 19015
Ada duplikat data pada kolom geolocation_zip_code_prefix sebanyak 17972
Duplikat data berhasil dihapus. Jumlah data geolocation: 19015


In [169]:
# Normalise the text columns: title-case city names, upper-case state codes.
dim_geo = dim_geo.assign(
    geolocation_city=dim_geo['geolocation_city'].str.title(),
    geolocation_state=dim_geo['geolocation_state'].str.upper(),
)

# Build the location lookup: one row per zip prefix, fresh 0..n-1 index and
# shorter column names.
dim_location = (
    dim_geo[['geolocation_zip_code_prefix', 'geolocation_city', 'geolocation_state']]
    .drop_duplicates(subset=['geolocation_zip_code_prefix'])
    .reset_index(drop=True)
    .rename(
        columns={
            'geolocation_zip_code_prefix': 'zip_code_prefix',
            'geolocation_city': 'city_name',
            'geolocation_state': 'state_abbr',
        }
    )
)
dim_location

Unnamed: 0,zip_code_prefix,city_name,state_abbr
0,1037,Sao Paulo,SP
1,1046,Sao Paulo,SP
2,1041,Sao Paulo,SP
3,1035,Sao Paulo,SP
4,1012,São Paulo,SP
...,...,...,...
19010,99955,Vila Langaro,RS
19011,99970,Ciriaco,RS
19012,99910,Floriano Peixoto,RS
19013,99920,Erebango,RS


Dimensi Customers

In [170]:
# Start the customer dimension from a copy of the extracted customers frame
# and list its columns.
dim_customers = customers_df.copy()
print(list(dim_customers.columns))

['customer_id', 'customer_unique_id', 'customer_zip_code_prefix', 'customer_city', 'customer_state']


In [None]:
# One customer_unique_id can be associated with more than one customer_id.
# Unique key: customer_id
# Foreign key: geolocation_sk -> dim_geo

check_duplicates(df=dim_customers, column='customer_id')

# Keep the first row for each customer_id.
dim_customers = dim_customers.drop_duplicates(subset='customer_id', keep='first')

Jumlah data unik pada kolom customer_id: 99441
Tidak ada duplikat data pada kolom customer_id
(99441, 5)


In [172]:
# Attach the zip prefix from dim_location to every customer and build the final
# column set in one chain. Doing it in a single expression avoids assigning to
# a column-sliced frame, which is the pattern behind SettingWithCopyWarning.
dim_customers = (
    dim_customers
    .merge(
        dim_location[['zip_code_prefix']],
        how='left',  # keep every customer, even when its prefix is unknown
        left_on='customer_zip_code_prefix',
        right_on='zip_code_prefix',
        # Fail loudly if dim_location ever holds a repeated zip_code_prefix,
        # which would otherwise silently multiply customer rows.
        validate='many_to_one',
    )
    .assign(
        # customer_key mirrors customer_id.
        customer_key=lambda frame: frame['customer_id'],
        # Prefixes missing from dim_location come back as NaN; store 0 instead
        # so the column can be cast to an integer dtype.
        zip_code_prefix=lambda frame: frame['zip_code_prefix'].fillna(0).astype(int),
    )
    .loc[:, ['customer_key', 'customer_id', 'customer_unique_id', 'zip_code_prefix']]
)
dim_customers

Unnamed: 0,customer_key,customer_id,customer_unique_id,zip_code_prefix
0,06b8999e2fba1a1fbc88172c00ba8bc7,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409
1,18955e83d337fd6b2def6b18a428ac77,18955e83d337fd6b2def6b18a428ac77,290c77bc529b7ac935b93aa66c333dc3,9790
2,4e7b3e00288586ebd08712fdd0374a03,4e7b3e00288586ebd08712fdd0374a03,060e732b5b29e8181a18229c7b0b2b5e,1151
3,b2b6027bc5c5109e529d4dc6358b12c3,b2b6027bc5c5109e529d4dc6358b12c3,259dac757896d24d7702b9acbbff3f3c,8775
4,4f2d8ab171c80ec8364f7c12e35b23ad,4f2d8ab171c80ec8364f7c12e35b23ad,345ecd01c38d18a9036ed96c73b8d066,13056
...,...,...,...,...
99436,17ddf5dd5d51696bb3d7c6291687be6f,17ddf5dd5d51696bb3d7c6291687be6f,1a29b476fee25c95fbafc67c5ac95cf8,3937
99437,e7b71a9017aa05c9a7fd292d714858e8,e7b71a9017aa05c9a7fd292d714858e8,d52a67c98be1cf6a5c84435bd38d095d,6764
99438,5e28dfe12db7fb50a4b2f691faecea5e,5e28dfe12db7fb50a4b2f691faecea5e,e9f50caf99f032f0bf3c55141f019d99,60115
99439,56b18e2166679b8a959d72dd06da27f9,56b18e2166679b8a959d72dd06da27f9,73c2643a0a458b49f58cea58833b192e,92120


In [142]:
# Inspect the sellers source: list its columns and check seller_id for repeats.
print(list(sellers_df.columns))
check_duplicates(df=sellers_df, column='seller_id')

['seller_id', 'seller_zip_code_prefix', 'seller_city', 'seller_state']
Jumlah data unik pada kolom seller_id: 3095
Tidak ada duplikat data pada kolom seller_id


### Load