In [8]:
%%sql
-- Create the dev_curated_ecommerce database; IF NOT EXISTS makes this cell safe to re-run.
CREATE DATABASE IF NOT EXISTS dev_curated_ecommerce;

In [2]:
%%sql
-- Drop the curated customers table so the CREATE TABLE cell that follows can rebuild it.
-- IF EXISTS keeps this cell from failing when the table has not been created yet
-- (for example on a fresh catalog during Restart & Run All).
DROP TABLE IF EXISTS dev_curated_ecommerce.customers

In [9]:
%%sql
-- Customers table in dev_curated_ecommerce, stored as Iceberg and
-- partitioned by the day of registration_date.
CREATE TABLE IF NOT EXISTS dev_curated_ecommerce.customers (
    customer_id            BIGINT    NOT NULL COMMENT 'unique id',
    name                   STRING,
    first_name             STRING,
    last_name              STRING,
    gender                 STRING,
    email                  STRING,
    phone                  STRING,
    country                STRING,
    registration_date      TIMESTAMP NOT NULL,
    acquisition_channel_id INT
)
USING iceberg
PARTITIONED BY (days(registration_date));

In [3]:
from datetime import datetime

# Reuse the existing table's schema so the new rows line up with its columns.
schema = spark.table("climate.weather").schema

# Every observation shares the same coordinates; only the day, temperature,
# sky description, precipitation and wind speed vary between rows.
data = [
    (datetime(2023, 8, day), temp, 40.951908, -74.075272, sky, precip, wind)
    for day, temp, sky, precip, wind in [
        (16, 76.2, "Partially sunny", 0.0, 3.5),
        (17, 82.5, "Sunny", 0.0, 1.2),
        (18, 70.9, "Cloudy", 0.5, 5.2),
    ]
]

df = spark.createDataFrame(data, schema)
# append() adds these rows to the table on every execution of the cell.
df.writeTo("climate.weather").append()

                                                                                

In [4]:
# Print the local DataFrame built from `data` (not a re-read of the table).
df.show()

+-------------------+----+---------+----------+---------------+------+----------+
|           datetime|temp|      lat|      long| cloud_coverage|precip|wind_speed|
+-------------------+----+---------+----------+---------------+------+----------+
|2023-08-16 00:00:00|76.2|40.951908|-74.075272|Partially sunny|   0.0|       3.5|
|2023-08-17 00:00:00|82.5|40.951908|-74.075272|          Sunny|   0.0|       1.2|
|2023-08-18 00:00:00|70.9|40.951908|-74.075272|         Cloudy|   0.5|       5.2|
+-------------------+----+---------+----------+---------------+------+----------+



In [8]:
%%sql
-- Average wind speed per calendar month, restricted to observations whose
-- cloud_coverage contains "sun" (case-insensitive thanks to lower()).
SELECT
    month(datetime),
    avg(wind_speed)
FROM climate.weather
WHERE lower(cloud_coverage) LIKE '%sun%'
GROUP BY month(datetime)

month(datetime),avg(wind_speed)
8,2.35


### Test Postgres
---

In [6]:
# Candidate table names.
tables = [
    "customer_acquisition_channels",
    "customers",
    "inventory",
    "order_items",
    "orders",
    "product_categories",
    "products",
]
# Only `customers` is read for now: get_df's query filters on registration_date.
# NOTE(review): presumably the other tables lack that column; confirm before widening.
tables = [name for name in tables if name == "customers"]

jdbc_url = "jdbc:postgresql://postgres/ecommerce"
# NOTE(review): credentials are hard-coded here; acceptable only for a local demo database.
jdbc_properties = dict(
    user="postgres",
    password="postgres",
    driver="org.postgresql.Driver",
)
def get_df(jdbc_url, jdbc_prop, table):
    """Read rows of ``table`` from PostgreSQL over JDBC, show them, and return them.

    Args:
        jdbc_url: JDBC connection URL, passed as the ``url`` option.
        jdbc_prop: Mapping of JDBC options (``user``, ``password``, ``driver``).
            Missing keys, or ``None``, fall back to the values this function
            used to hard-code, so existing callers keep working.
        table: Name of the table to select from. It is interpolated into the
            SQL text as-is, so it must come from a trusted list.

    Returns:
        The Spark DataFrame produced by the JDBC read.
    """
    print(f"Reading data from PostgreSQL, table: {table}...")
    # Only rows with registration_date after this fixed cutoff are fetched.
    query = f"""
        select * from {table}
        where registration_date > '2020-01-05 02:23:48.000'
    """
    # Caller-supplied settings win over the defaults.
    options = {
        "user": "postgres",
        "password": "postgres",
        "driver": "org.postgresql.Driver",
        **(jdbc_prop or {}),
    }
    # url and query are applied last so the explicit arguments always take effect.
    postgres_df = (
        spark.read.format("jdbc")
        .options(**options)
        .option("url", jdbc_url)
        .option("query", query)
        .load()
    )
    # show() prints the rows itself and returns None, so it must not be wrapped in print().
    postgres_df.show()
    return postgres_df

# Run the JDBC read for every table currently selected in `tables`.
for table in tables:
    get_df(jdbc_url, jdbc_properties, table)

Reading data from PostgreSQL, table: customers...
+-----------+---------+------+-------------+--------------+-------+-------------------+----------------------+
|customer_id|     name|gender|        email|         phone|country|  registration_date|acquisition_channel_id|
+-----------+---------+------+-------------+--------------+-------+-------------------+----------------------+
|  999999991|test user|     M|test@mail.com|+1232131231231|    ABC|2020-01-05 02:23:49|                     3|
+-----------+---------+------+-------------+--------------+-------+-------------------+----------------------+

None


In [4]:
# Write data to MinIO in Iceberg format
# NOTE(review): `postgres_df` is not assigned at notebook scope in any cell visible here
# (it only exists as a local inside get_df); confirm where it comes from, otherwise
# this cell raises NameError after Restart & Run All.
iceberg_table = "demo.ecommerce.customers"

print(f"Writing data to Iceberg table: {iceberg_table}")
# createOrReplace() replaces the table if it already exists instead of appending to it.
postgres_df.writeTo(iceberg_table).createOrReplace()

print("Data migration completed successfully!")

Writing data to Iceberg table: demo.ecommerce.customers


                                                                                

Data migration completed successfully!


In [3]:
%%sql
-- Show every row of dev_raw_ecommerce.customers.
SELECT *
FROM dev_raw_ecommerce.customers

                                                                                

customer_id,name,gender,email,phone,country,registration_date,acquisition_channel_id
6,Muhammad Abdullah,M,Muhammad.Abdullah@mail.com,966123456789,SA,2020-01-03 23:00:14,12
8,Wang Yichen,M,Wang.Yichen@mail.com,86123456789,CN,2020-01-03 23:50:55,2
9,Sanjaya Putra,M,Sanjaya.Putra@mail.com,628123456789,ID,2020-01-04 18:04:12,1
1,John Doe,M,John.Doe@mail.com,11239879877,US,2020-01-01 07:16:37,11
3,Elon Musk,M,Elon.Musk@mail.com,11239879872,US,2020-01-01 09:46:43,10
5,Eleanor Rigby,M,Eleanor.Rigby@mail.com,441234567891,GB,2020-01-01 18:34:50,5
7,Eiichiro Oda,M,Eiichiro.Oda@mail.com,81123456789,JP,2020-01-01 23:04:31,4
2,Jane Doe,F,Jane.Doe@mail.com,11239879871,US,2020-01-02 00:21:00,7
4,Bill Gates,M,Bill.Gates@mail.com,11239879873,US,2020-01-02 16:26:29,1
10,Daran Tasha,F,Daran.Tasha@mail.com,7112323456789,RU,2020-01-05 02:23:48,11


In [8]:
%%sql
-- Show every row of dev_curated_ecommerce.customers.
SELECT *
FROM dev_curated_ecommerce.customers AS c
-- Manual reset, left disabled; uncomment to empty the table:
-- delete from dev_curated_ecommerce.customers
-- where 1=1

customer_id,name,gender,email,phone,country,registration_date,acquisition_channel_id,first_name,last_name
6,Muhammad Abdullah,M,Muhammad.Abdullah@mail.com,966123456789,SA,2020-01-03 23:00:14,12,Muhammad,Abdullah
8,Wang Yichen,M,Wang.Yichen@mail.com,86123456789,CN,2020-01-03 23:50:55,2,Wang,Yichen
9,Sanjaya Putra,M,Sanjaya.Putra@mail.com,628123456789,ID,2020-01-04 18:04:12,1,Sanjaya,Putra
1,John Doe,M,John.Doe@mail.com,11239879877,US,2020-01-01 07:16:37,11,John,Doe
3,Elon Musk,M,Elon.Musk@mail.com,11239879872,US,2020-01-01 09:46:43,10,Elon,Musk
5,Eleanor Rigby,M,Eleanor.Rigby@mail.com,441234567891,GB,2020-01-01 18:34:50,5,Eleanor,Rigby
7,Eiichiro Oda,M,Eiichiro.Oda@mail.com,81123456789,JP,2020-01-01 23:04:31,4,Eiichiro,Oda
2,Jane Doe,F,Jane.Doe@mail.com,11239879871,US,2020-01-02 00:21:00,7,Jane,Doe
4,Bill Gates,M,Bill.Gates@mail.com,11239879873,US,2020-01-02 16:26:29,1,Bill,Gates
10,Daran Tasha,F,Daran.Tasha@mail.com,7112323456789,RU,2020-01-05 02:23:48,11,Daran,Tasha


In [9]:
%%sql
-- Show every row of dev_star_ecommerce.dim_customers.
SELECT *
FROM dev_star_ecommerce.dim_customers

customer_id,name,gender,email,phone,country,registration_date,acquisition_channel_id,first_name,last_name,channel_id,category,channel_name,description,acq_channel_created_at
6,Muhammad Abdullah,M,Muhammad.Abdullah@mail.com,966123456789,SA,2020-01-03 23:00:14,12,Muhammad,Abdullah,12,Traditional Ad,Print Media,,2019-01-01 07:16:37
8,Wang Yichen,M,Wang.Yichen@mail.com,86123456789,CN,2020-01-03 23:50:55,2,Wang,Yichen,2,Search,Paid Search,"pay-per-click, is advertising on search engines. Plus, with platforms like Google Ads, we can place ads directly in search results and even on partner websites.",2019-01-01 07:16:37
9,Sanjaya Putra,M,Sanjaya.Putra@mail.com,628123456789,ID,2020-01-04 18:04:12,1,Sanjaya,Putra,1,Search,Organic Search,Organic search is all about showing up on search engine results pages. Leverage using SEO,2019-01-01 07:16:37
1,John Doe,M,John.Doe@mail.com,11239879877,US,2020-01-01 07:16:37,11,John,Doe,11,Traditional Ad,Radio,,2019-01-01 07:16:37
3,Elon Musk,M,Elon.Musk@mail.com,11239879872,US,2020-01-01 09:46:43,10,Elon,Musk,10,Traditional Ad,TV,,2019-01-01 07:16:37
5,Eleanor Rigby,M,Eleanor.Rigby@mail.com,441234567891,GB,2020-01-01 18:34:50,5,Eleanor,Rigby,5,Massaging,Email,"Email marketing, Whether it’s a birthday wish or a valuable promo, email is a great way to connect with our audience. It’s a direct line to their inbox, bypassing the noise of social media and search engines.",2019-01-01 07:16:37
7,Eiichiro Oda,M,Eiichiro.Oda@mail.com,81123456789,JP,2020-01-01 23:04:31,4,Eiichiro,Oda,4,Social Media,Paid Social Media,"For example, sponsored posts on Facebook, X, or Instagram to get content seen by more people (targeted audiences).",2019-01-01 07:16:37
2,Jane Doe,F,Jane.Doe@mail.com,11239879871,US,2020-01-02 00:21:00,7,Jane,Doe,7,Customer,Refferals,Referral marketing leverages the trust of personal recommendations to drive new customer acquisition.,2019-01-01 07:16:37
4,Bill Gates,M,Bill.Gates@mail.com,11239879873,US,2020-01-02 16:26:29,1,Bill,Gates,1,Search,Organic Search,Organic search is all about showing up on search engine results pages. Leverage using SEO,2019-01-01 07:16:37
10,Daran Tasha,F,Daran.Tasha@mail.com,7112323456789,RU,2020-01-05 02:23:48,11,Daran,Tasha,11,Traditional Ad,Radio,,2019-01-01 07:16:37


In [6]:
%%sql
-- Add one customer row to dev_raw_ecommerce.customers.
-- The literal is cast straight to TIMESTAMP; formatting it with date_format first
-- would only reproduce the same 'yyyy-MM-dd HH:mm:ss' text before the cast.
INSERT INTO dev_raw_ecommerce.customers
    (customer_id, name, gender, email, phone, country, registration_date, acquisition_channel_id)
VALUES
    (11, 'Michael Su', 'F', 'ms@mail.com', '123321123321', 'CN', CAST('2020-01-15 03:23:48' AS TIMESTAMP), 2)

In [7]:
# Left join keeps every curated customer, with channel columns attached where
# acquisition_channel_id matches a channel_id. `select *` returns both key columns.
query = (
    "select * from dev_curated_ecommerce.customers c\n"
    "left join dev_raw_ecommerce.customer_acquisition_channels ac\n"
    "    on c.acquisition_channel_id = ac.channel_id"
)
# Note: this rebinds `df`, which an earlier cell used for the weather rows.
df = spark.sql(query)
df.show()

+-----------+-----------------+------+--------------------+--------------+-------+-------------------+----------------------+----------+---------+----------+--------------+-----------------+--------------------+-------------------+
|customer_id|             name|gender|               email|         phone|country|  registration_date|acquisition_channel_id|first_name|last_name|channel_id|      category|     channel_name|         description|         created_at|
+-----------+-----------------+------+--------------------+--------------+-------+-------------------+----------------------+----------+---------+----------+--------------+-----------------+--------------------+-------------------+
|          6|Muhammad Abdullah|     M|Muhammad.Abdullah...| +966123456789|    SA |2020-01-03 23:00:14|                    12|  Muhammad| Abdullah|        12|Traditional Ad|      Print Media|                NULL|2019-01-01 07:16:37|
|          8|      Wang Yichen|     M|Wang.Yichen@mail.com|  +8612345678

In [12]:
# Latest registration_date in the Postgres customers table strictly before `last_date`.
# NOTE(review): `last_date` is not defined in any cell visible here; confirm it is
# assigned before this cell runs, otherwise the f-string raises NameError.
query = f'''select max(registration_date) from customers where registration_date < '{last_date}' '''
# Reuse jdbc_url / jdbc_properties from the "Test Postgres" cell instead of repeating
# the connection settings inline; the values are the same as the previous literals.
df = (
    spark.read.format("jdbc")
    .options(**jdbc_properties)
    .option("url", jdbc_url)
    .option("query", query)
    .load()
)
df.show()

+-------------------+
|                max|
+-------------------+
|2020-01-05 02:23:48|
+-------------------+



In [9]:
# %pip list

In [6]:
%%sql
-- CREATE DATABASE IF NOT EXISTS test_raw_ecommerce;
-- CREATE TABLE IF NOT EXISTS test_raw_ecommerce.customers (
--     customer_id                 bigint NOT NULL COMMENT 'unique id',
--     name                        string,
--     first_name                  string,
--     last_name                   string,
--     gender                      string,
--     email                       string,
--     phone                       string,
--     country                     string,
--     registration_date           timestamp NOT NULL,
--     acquisition_channel_id      int
-- )
-- USING iceberg
-- PARTITIONED BY (days(registration_date));

-- CREATE DATABASE IF NOT EXISTS test_curated_ecommerce;
-- CREATE TABLE IF NOT EXISTS test_curated_ecommerce.customers (
--     customer_id                 bigint NOT NULL COMMENT 'unique id',
--     name                        string,
--     first_name                  string,
--     last_name                   string,
--     gender                      string,
--     email                       string,
--     phone                       string,
--     country                     string,
--     registration_date           timestamp NOT NULL,
--     acquisition_channel_id      int
-- )
-- USING iceberg
-- PARTITIONED BY (days(registration_date));