In [2]:
import numpy as np
import pandas as pd
import duckdb
from pandas_gbq import to_gbq
from pandas_gbq import read_gbq

In [3]:
# Query BigQuery: pull every table of the target dataset into its own DataFrame.
project_id = "projectm2-aiess"
SOURCE_DATASET = "olist_brazilian_ecommerce_target"


def read_target_table(table_name):
    """Fetch one whole table from SOURCE_DATASET.

    Parameters
    ----------
    table_name : str
        Unqualified table name, e.g. "DIM_ORDERS".

    Returns
    -------
    The object returned by ``read_gbq`` for ``SELECT *`` on that table,
    billed to ``project_id``.
    """
    return read_gbq(
        f"SELECT * FROM {SOURCE_DATASET}.{table_name}",
        project_id=project_id,
    )


# One read per table, in the same order as before; each call is a separate download.
df_customers = read_target_table("DIM_CUSTOMERS")
df_payments = read_target_table("FCT_PAYMENTS")
df_reviews = read_target_table("FCT_REVIEWS")
df_geolocation = read_target_table("DIM_GEOLOCATION")
df_items = read_target_table("FCT_ORDER_ITEMS")
df_date = read_target_table("DIM_DATE")
df_orders = read_target_table("DIM_ORDERS")


Downloading: 100%|[32m██████████[0m|
Downloading: 100%|[32m██████████[0m|
Downloading: 100%|[32m██████████[0m|
Downloading: 100%|[32m██████████[0m|
Downloading: 100%|[32m██████████[0m|
Downloading: 100%|[32m██████████[0m|
Downloading: 100%|[32m██████████[0m|


<div class="alert alert-block alert-info">
<b>Feature engineering</b>
</div>

In [3]:
# Display the column names available on the orders frame.
df_orders.columns

Index(['pk_order_sid', 'fk_customer_sid', 'fk_order_purchased_date_sid',
       'fk_order_approved_at_date_sid', 'fk_order_delivered_carrier_date_sid',
       'fk_order_delivered_customer_date_sid',
       'fk_order_estimated_delivery_date_sid', 'order_status', 'total_payment',
       'order_amt', 'freight_amt', 'total_order_amt_wf_freight', 'balance_amt',
       'payment_status', 'order_purchase_timestamp', 'order_approved_at',
       'order_delivered_carrier_date', 'order_delivered_customer_date',
       'order_estimated_delivery_date', 'load_date'],
      dtype='object')

In [4]:
# Derive delivery durations, all measured from the purchase timestamp.
purchased_at = df_orders['order_purchase_timestamp']
actual_duration = df_orders['order_delivered_customer_date'] - purchased_at
estimated_duration = df_orders['order_estimated_delivery_date'] - purchased_at

# assign() returns a new frame, so df_orders itself is left untouched.
# NOTE(review): the 'estimated delivery_time*' names contain a space, unlike
# their 'actual_delivery_time*' siblings; kept verbatim because these names
# end up in the uploaded table.
DS_orders_delivery = df_orders.assign(**{
    'actual_delivery_time': actual_duration,
    'estimated delivery_time': estimated_duration,
    # Same durations expressed as a plain number of minutes.
    'actual_delivery_time_minutes': actual_duration.dt.total_seconds() / 60,
    'estimated delivery_time_minutes': estimated_duration.dt.total_seconds() / 60,
})
DS_orders_delivery.columns

Index(['pk_order_sid', 'fk_customer_sid', 'fk_order_purchased_date_sid',
       'fk_order_approved_at_date_sid', 'fk_order_delivered_carrier_date_sid',
       'fk_order_delivered_customer_date_sid',
       'fk_order_estimated_delivery_date_sid', 'order_status', 'total_payment',
       'order_amt', 'freight_amt', 'total_order_amt_wf_freight', 'balance_amt',
       'payment_status', 'order_purchase_timestamp', 'order_approved_at',
       'order_delivered_carrier_date', 'order_delivered_customer_date',
       'order_estimated_delivery_date', 'load_date', 'actual_delivery_time',
       'estimated delivery_time', 'actual_delivery_time_minutes',
       'estimated delivery_time_minutes'],
      dtype='object')

In [5]:
# Publish the delivery-time features to the DS dataset.
# if_exists='replace' means an existing table of the same name is overwritten.
delivery_upload_options = {
    'destination_table': 'olist_brazilian_ecommerce_DS.DS_orders_delivery',
    'project_id': 'projectm2-aiess',
    'if_exists': 'replace',
}
to_gbq(DS_orders_delivery, **delivery_upload_options)

100%|██████████| 1/1 [00:00<00:00, 4029.11it/s]


In [4]:
# Display the column names of the orders frame (df_orders itself, not the derived delivery frame).
df_orders.columns

Index(['pk_order_sid', 'fk_customer_sid', 'fk_order_purchased_date_sid',
       'fk_order_approved_at_date_sid', 'fk_order_delivered_carrier_date_sid',
       'fk_order_delivered_customer_date_sid',
       'fk_order_estimated_delivery_date_sid', 'order_status', 'total_payment',
       'order_amt', 'freight_amt', 'total_order_amt_wf_freight', 'balance_amt',
       'payment_status', 'order_purchase_timestamp', 'order_approved_at',
       'order_delivered_carrier_date', 'order_delivered_customer_date',
       'order_estimated_delivery_date', 'load_date'],
      dtype='object')

In [5]:
# How many order rows have no total_payment value at all?
payment_is_missing = df_orders['total_payment'].isna()
null_payment_count = payment_is_missing.sum()
print(f"Number of orders with NULL total_payment: {null_payment_count}")

Number of orders with NULL total_payment: 1


In [6]:
# Count the DISTINCT customers attached to each order.
# nunique() is used rather than count(): count() tallies rows, so an order row
# repeated for the same customer would be misreported as "more than one
# customer", which is not what 'unique_customers_per_order' promises.
order_customer_counts = (
    df_orders
    .groupby('pk_order_sid')['fk_customer_sid']
    .nunique()
    .reset_index(name='unique_customers_per_order')
)

# An order is expected to belong to exactly one customer; anything above 1
# would indicate a data-quality problem.
multiple_customers = order_customer_counts[order_customer_counts['unique_customers_per_order'] > 1]

print(f"Total number of orders: {len(df_orders)}")
print(f"Orders with more than one customer: {len(multiple_customers)}")


Total number of orders: 99441
Orders with more than one customer: 0


In [7]:
# Display the column names available on the customers frame.
df_customers.columns

Index(['pk_customer_sid', 'customer_unique_id', 'customer_zip_code_prefix',
       'customer_city', 'customer_state', 'load_date'],
      dtype='object')

In [8]:
# Display the column names available on the order-items frame.
df_items.columns

Index(['fk_order_sid', 'pk_order_id', 'pk_order_item_id',
       'fk_shipping_limit_date_sid', 'product_id', 'seller_id',
       'shipping_limit_date', 'price', 'freight_value',
       'product_category_name', 'product_category_name_english',
       'product_name_length', 'product_description_length',
       'product_photos_qty', 'product_weight_g', 'product_length_cm',
       'product_height_cm', 'product_width_cm', 'seller_zip_code_prefix',
       'seller_city', 'seller_state', 'load_date'],
      dtype='object')

In [9]:
# Attach the customer attributes (city, state, ...) to every order by matching
# the order's customer key to the customer table's primary key.
# Left join: all order rows are kept, even those without a matching customer.
orders_customers = df_orders.merge(
    df_customers,
    left_on='fk_customer_sid',
    right_on='pk_customer_sid',
    how='left',
)


In [23]:
# Number of item rows recorded under each order key.
items_by_order = df_items.groupby('fk_order_sid')['pk_order_item_id']
item_counts = items_by_order.count().reset_index(name='item_count')

# Orders made up of two or more items.
has_several_items = item_counts['item_count'] > 1
multiple_items_orders = item_counts.loc[has_several_items]

print(f"Total number of orders: {len(item_counts)}")
print(f"Number of orders with multiple items: {len(multiple_items_orders)}")

Total number of orders: 98666
Number of orders with multiple items: 9803


Some orders contain multiple items, so the items must be aggregated to one row per order before merging; otherwise the merge would duplicate order rows.

In [49]:
# Distinct English category names found among the items of each order.
category_per_order_check = (
    df_items
    .groupby('fk_order_sid')['product_category_name_english']
    .nunique()
)

# Keep only the orders whose items span two or more categories
# (zero such orders would mean one category per order).
spans_several_categories = category_per_order_check.gt(1)
orders_with_multiple_product_cat = category_per_order_check.loc[spans_several_categories]

print(f"orders with >1 product category: {len(orders_with_multiple_product_cat)}")

orders with >1 product category: 726


Some orders contain items from more than one product_category_name_english, so we represent each order by its mode (the most frequent category among its items).

In [None]:
# Collapse the items to one row per order so that merging onto the orders
# cannot duplicate order rows.
def _most_frequent(values):
    """Return the most frequent value in ``values``, or None if there is none.

    ``Series.mode()`` is evaluated once per group (the previous lambda called
    it twice). It returns its result in sorted order, so a tie resolves to the
    first value in that order.
    """
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else None


item_agg = (
    df_items
    .groupby('fk_order_sid')
    .agg({
        'price': 'sum',  # total item price of the order
        # An order can mix categories; represent it by the most common one.
        'product_category_name_english': _most_frequent,
    })
    .reset_index()
)
item_agg.head()

Unnamed: 0,fk_order_sid,price,product_category_name_english
0,00010242fe8c5a6d1ba2dd792cb16214,58.9,cool_stuff
1,00018f77f2f0320c557190d7a144bdd3,239.9,pet_shop
2,000229ec398224ef6ca0657da4fc703e,199.0,furniture_decor
3,00024acbcdf0a6daa1e931b038114c75,12.99,perfumery
4,00042b26cf59d7ce69dfabb4e55b4fd9,199.9,garden_tools


In [42]:
# Left-join the per-order item aggregates onto the orders; orders with no
# matching item rows are kept, with missing values in the aggregate columns.
orders_full = df_orders.merge(
    item_agg,
    how='left',
    left_on='pk_order_sid',
    right_on='fk_order_sid',
)

In [43]:
# 'profit' = amount paid for the order minus the summed item price.
# NOTE(review): this is payment minus item price, not a cost-based margin;
# confirm whether total_payment includes freight before reading it as profit.
orders_full['profit'] = orders_full['total_payment'].sub(orders_full['price'])

In [44]:
# Display the merged frame's column names to confirm the aggregate and 'profit' columns are present.
orders_full.columns

Index(['pk_order_sid', 'fk_customer_sid', 'fk_order_purchased_date_sid',
       'fk_order_approved_at_date_sid', 'fk_order_delivered_carrier_date_sid',
       'fk_order_delivered_customer_date_sid',
       'fk_order_estimated_delivery_date_sid', 'order_status', 'total_payment',
       'order_amt', 'freight_amt', 'total_order_amt_wf_freight', 'balance_amt',
       'payment_status', 'order_purchase_timestamp', 'order_approved_at',
       'order_delivered_carrier_date', 'order_delivered_customer_date',
       'order_estimated_delivery_date', 'load_date', 'fk_order_sid', 'price',
       'product_category_name_english', 'profit'],
      dtype='object')

In [45]:
# Publish the orders-with-profit frame to the DS dataset.
# if_exists='replace' means an existing table of the same name is overwritten.
profits_upload_options = {
    'destination_table': 'olist_brazilian_ecommerce_DS.DS_orders_full_profits',
    'project_id': 'projectm2-aiess',
    'if_exists': 'replace',
}
to_gbq(orders_full, **profits_upload_options)

100%|██████████| 1/1 [00:00<00:00, 9776.93it/s]


In [26]:
# Keep only the geolocation rows whose coordinates fall inside the rectangle
# lat in [-34, 5], lng in [-74, -34]; between() includes both endpoints.
# NOTE(review): presumably a rough bounding box for Brazil's landmass, used to
# drop mis-geocoded points; confirm the limits.
lat_in_bounds = df_geolocation['geolocation_lat'].between(-34, 5)
lng_in_bounds = df_geolocation['geolocation_lng'].between(-74, -34)

# Boolean-mask selection yields a new frame; df_geolocation is not modified.
DS_land_geolocation = df_geolocation[lat_in_bounds & lng_in_bounds]



In [27]:
# Publish the filtered geolocation rows to the DS dataset.
# if_exists='replace' means an existing table of the same name is overwritten.
geolocation_upload_options = {
    'destination_table': 'olist_brazilian_ecommerce_DS.DS_land_geolocation',
    'project_id': 'projectm2-aiess',
    'if_exists': 'replace',
}
to_gbq(DS_land_geolocation, **geolocation_upload_options)

100%|██████████| 1/1 [00:00<00:00, 7738.57it/s]
