## Read data

In [3]:
import pandas as pd
# Load the customer-transaction mart from its CSV export (path is relative
# to the notebook's working directory).
mart_customer_transactions = pd.read_csv(
    '../data/brazilian_e-commerce/mart_customer_transactions.csv'
)

# Preview the first rows to check the columns that were read.
mart_customer_transactions.head()

Unnamed: 0,order_id,order_date,customer_id_unique,seller_id,total_item_value,total_freight_value,total_order_value
0,00010242fe8c5a6d1ba2dd792cb16214,2017-09-13,871766c5855e863f6eccc05f988b23cb,48436dade18ac8b2bce089ec2a041202,58.9,13.29,72.19
1,00018f77f2f0320c557190d7a144bdd3,2017-04-26,eb28e67c4c0b83846050ddfb8a35d051,dd7ddc04e1b6c2c614352b383efe2d36,239.9,19.93,259.83
2,000229ec398224ef6ca0657da4fc703e,2018-01-14,3818d81c6709e39d06b2738a8d3a2474,5b51032eddd242adc84c38acab88f23d,199.0,17.87,216.87
3,00024acbcdf0a6daa1e931b038114c75,2018-08-08,af861d436cfc08b2c2ddefd0ba074622,9d7a1d34a5052409006425275ba1c2b4,12.99,12.79,25.78
4,00042b26cf59d7ce69dfabb4e55b4fd9,2017-02-04,64b576fb70d441e8f1b2d7d446e483c5,df560393f3a51e74553ab94004ba5c87,199.9,18.14,218.04


## Data inspection: find data patterns

### customers per seller

In [4]:
# For every seller, tally how many distinct customers appear in its rows.
# Result: one row per seller_id, with the tally in `customer_count`.
customers_per_seller = (
    mart_customer_transactions
    .groupby('seller_id')['customer_id_unique']
    .nunique()
    .rename('customer_count')
    .reset_index()
)
customers_per_seller

Unnamed: 0,seller_id,customer_count
0,0015a82c2db000af6aaaf3ae2ecb0532,3
1,001cca7ae9ae17fb1caed9dfb1094831,200
2,002100f778ceb8431b7a1020ff7ab48f,51
3,003554e2dce176b5555353e4f3555ac8,1
4,004c9cd9d87a3c30c522c48c4fc07416,157
...,...,...
3051,ffc470761de7d0232558ba5e786e57b7,28
3052,ffdd9f82b9a447f6f8d4b91554cc7dd3,18
3053,ffeee66ac5d5a62fe688b9d26f83f534,14
3054,fffd5413c0700ac820c7069d66d98c89,59


In [5]:
# Order sellers from the largest to the smallest number of distinct customers.
customers_per_seller = customers_per_seller.sort_values('customer_count', ascending=False)
customers_per_seller

Unnamed: 0,seller_id,customer_count
1220,6560211a19b47992c3666cc44a7e94c0,1817
873,4a3ca9315b744ce9f8e9374361493884,1790
2452,cc419e0650a3c5ba77189a1882b7556a,1648
366,1f50f920176fa81dab994f9023523100,1387
1800,955fee9216a65b617aa5c0531780ce60,1281
...,...,...
802,436bf27f2f18474fc6047702e9f8a866,1
804,43753b27d77860f1654aa72e251a7878,1
2214,bac44fa8e13424950488659b5f765c41,1
2215,bac692d6c0ed08467878018e6cc9c26c,1


In [6]:
# One row per seller, so the row count is the number of sellers.
total_sellers = customers_per_seller.shape[0]

# Sellers whose distinct-customer tally is exactly 1.
one_customer_sellers = customers_per_seller['customer_count'].eq(1).sum()

# Share of single-customer sellers, in percent.
percentage_one_customer = one_customer_sellers / total_sellers * 100

print(f"{percentage_one_customer:.2f}% of sellers have only one customer")


18.46% of sellers have only one customer


### orders per customer

In [7]:
# For every customer, tally the distinct order ids. Distinct values are
# counted (rather than rows) so that repeated rows of the same order are
# not counted twice.
orders_per_customer = (
    mart_customer_transactions
    .groupby('customer_id_unique')['order_id']
    .agg('nunique')
    .rename('customer_order_count')
    .reset_index()
)
orders_per_customer

Unnamed: 0,customer_id_unique,customer_order_count
0,0000366f3b9a7992bf8c76cfdf3221e2,1
1,0000b849f77a49e4a4ce2b2a4ca5be3f,1
2,0000f46a3911fa3c0805444483337064,1
3,0000f6ccb0745a6a4b88665a16c9f078,1
4,0004aac84e0df4da2b147fca70cf8255,1
...,...,...
94984,fffcf5a5ff07b0908bd4e2dbc735a684,1
94985,fffea47cd6d3cc0a88bd621562a9d061,1
94986,ffff371b4d645b6ecea244b27531430a,1
94987,ffff5962728ec6157033ef9805bacc48,1


In [8]:
# Put the customers with the most orders first, then renumber the rows
# 0..n-1 so the index reflects the new ranking.
orders_per_customer = (
    orders_per_customer
    .sort_values('customer_order_count', ascending=False)
    .reset_index(drop=True)
)
orders_per_customer

Unnamed: 0,customer_id_unique,customer_order_count
0,8d50f5eadf50201ccdcedfb9e2ac8455,16
1,3e43e6105506432c953e165fb2acf44c,9
2,ca77025e7201e3b30c44b472ff346268,7
3,1b6c7548a2a1f9037c1fd3ddfed95f33,7
4,6469f99c1f9dfae7733b25662e7f1782,7
...,...,...
94984,5650c85bcdef051d8bc16e0771747686,1
94985,5650bf668868651d51740cd49908814e,1
94986,56509f7bc43892a0da09ee14f50a6f84,1
94987,564feb90710b4f84399f931fa97ea704,1


In [9]:
# One row per customer, so the row count is the number of customers.
total_customers = orders_per_customer.shape[0]

# Customers whose distinct-order tally is exactly 1.
one_order_customers = orders_per_customer['customer_order_count'].eq(1).sum()

# Share of single-order customers, in percent.
percentage_one_order = one_order_customers / total_customers * 100

print(f"{percentage_one_order:.2f}% of customers have only one order")

96.96% of customers have only one order


### orders per seller

In [12]:
# For every seller, tally the distinct order ids found in its rows.
# Result: one row per seller_id, with the tally in `order_count`.
orders_per_seller = (
    mart_customer_transactions
    .groupby('seller_id')['order_id']
    .nunique()
    .rename('order_count')
    .reset_index()
)
orders_per_seller

Unnamed: 0,seller_id,order_count
0,0015a82c2db000af6aaaf3ae2ecb0532,3
1,001cca7ae9ae17fb1caed9dfb1094831,200
2,002100f778ceb8431b7a1020ff7ab48f,51
3,003554e2dce176b5555353e4f3555ac8,1
4,004c9cd9d87a3c30c522c48c4fc07416,158
...,...,...
3051,ffc470761de7d0232558ba5e786e57b7,28
3052,ffdd9f82b9a447f6f8d4b91554cc7dd3,18
3053,ffeee66ac5d5a62fe688b9d26f83f534,14
3054,fffd5413c0700ac820c7069d66d98c89,59


In [13]:
# Order sellers from the most to the fewest distinct orders.
orders_per_seller = orders_per_seller.sort_values('order_count', ascending=False)
orders_per_seller

Unnamed: 0,seller_id,order_count
1220,6560211a19b47992c3666cc44a7e94c0,1847
873,4a3ca9315b744ce9f8e9374361493884,1804
2452,cc419e0650a3c5ba77189a1882b7556a,1697
366,1f50f920176fa81dab994f9023523100,1403
2613,da8622b14eb17ae2831f4ac5b9dab84a,1314
...,...,...
2476,ce3ad9de960102d0677a81f5d0bb7b2d,1
2477,ce4755084bc097113867e6454f8f5e52,1
1130,5d043cd5512d4bd2f88e5ccdd5736c38,1
2483,ceb7b4fb9401cd378de7886317ad1b47,1


In [14]:
# One row per seller, so the row count is the number of sellers.
total_sellers = orders_per_seller.shape[0]

# Sellers whose distinct-order tally is exactly 1.
one_order_sellers = orders_per_seller['order_count'].eq(1).sum()

# Share of single-order sellers, in percent.
# Note: this reuses the name `percentage_one_order`, which an earlier cell
# assigned for the customer-level share.
percentage_one_order = one_order_sellers / total_sellers * 100

print(f"{percentage_one_order:.2f}% of sellers have only one order")

18.42% of sellers have only one order


In [15]:
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy import event
from sqlalchemy import text

In [16]:
from dotenv import dotenv_values

# Read the key/value pairs from the .env file into a dict-like mapping.
config = dotenv_values()

# Validate every required setting up front. Without this check a missing
# .env only reports the first absent key, and a key declared without a value
# comes back as None and would end up as the text "None" in the database URL.
required_keys = (
    'POSTGRES_USER',
    'POSTGRES_HOST',
    'POSTGRES_PORT',
    'POSTGRES_DB',
    'POSTGRES_SCHEMA',
    'POSTGRES_PASS',
)
missing_keys = [key for key in required_keys if config.get(key) is None]
if missing_keys:
    raise KeyError(f"Missing settings in .env: {', '.join(missing_keys)}")

# Connection settings used to build the database URL.
pg_user = config['POSTGRES_USER']
pg_host = config['POSTGRES_HOST']
pg_port = config['POSTGRES_PORT']
pg_db = config['POSTGRES_DB']
pg_schema = config['POSTGRES_SCHEMA']
pg_pass = config['POSTGRES_PASS']

In [17]:
# Build the PostgreSQL connection URL from the settings loaded from .env
# (same URL as in version 1).
url = f'postgresql://{pg_user}:{pg_pass}@{pg_host}:{pg_port}/{pg_db}'

engine = create_engine(url, echo=False)

my_schema = 'team_jjat'


@event.listens_for(engine, 'connect', insert=True)
def _set_search_path(dbapi_connection, connection_record):
    """Set the search path on every new DBAPI connection of ``engine``.

    A plain ``SET search_path`` only affects the single connection it is
    executed on. Running it from a ``connect`` listener applies it to each
    connection the engine opens, so later unqualified table names resolve
    against ``my_schema`` no matter which connection serves the query.

    Args:
        dbapi_connection: The raw DBAPI connection that was just opened.
        connection_record: The pool record for that connection (unused).
    """
    # Run the SET outside a transaction, then restore the previous mode.
    previous_autocommit = dbapi_connection.autocommit
    dbapi_connection.autocommit = True
    cursor = dbapi_connection.cursor()
    cursor.execute(f'SET SESSION search_path TO {my_schema}')
    cursor.close()
    dbapi_connection.autocommit = previous_autocommit


# Kept from the original cell: opens a connection immediately, so connection
# problems surface here, and leaves `result` defined as before.
with engine.begin() as conn:
    result = conn.execute(text(f'SET search_path TO {my_schema};'))

In [18]:
# Pull the whole prep_order_items table into a DataFrame. The table name is
# unqualified, so it is resolved through the search path set on the engine.
prep_order_items = pd.read_sql(
    'SELECT * FROM prep_order_items;',
    con=engine,
)

# Preview the first rows to check the columns that came back.
prep_order_items.head()

Unnamed: 0,order_id,product_id,order_item_id,seller_id,shipping_limit_date,price,freight_value,count,product_category
0,00010242fe8c5a6d1ba2dd792cb16214,4244733e06e7ecb4970a6e2683c13e61,1,48436dade18ac8b2bce089ec2a041202,2017-09-19 09:45:35,58.9,13.29,1,miscellaneous
1,00018f77f2f0320c557190d7a144bdd3,e5f2d52b802189ee658865ca93d83a8f,1,dd7ddc04e1b6c2c614352b383efe2d36,2017-05-03 11:05:13,239.9,19.93,1,pet_supplies
2,000229ec398224ef6ca0657da4fc703e,c777355d18b72b67abbeef9df44fd0fd,1,5b51032eddd242adc84c38acab88f23d,2018-01-18 14:48:30,199.0,17.87,1,furniture
3,00024acbcdf0a6daa1e931b038114c75,7634da152a4610f1595efa32f14722fc,1,9d7a1d34a5052409006425275ba1c2b4,2018-08-15 10:10:18,12.99,12.79,1,beauty
4,00042b26cf59d7ce69dfabb4e55b4fd9,ac6c3623068f30de03045865e4e10089,1,df560393f3a51e74553ab94004ba5c87,2017-02-13 13:57:51,199.9,18.14,1,garden_outdoor


## products per order

In [19]:
# For every order, tally the distinct product ids among its items.
# Result: one row per order_id, with the tally in `product_count`.
products_per_order = (
    prep_order_items
    .groupby('order_id')['product_id']
    .agg('nunique')
    .rename('product_count')
    .reset_index()
)
products_per_order

Unnamed: 0,order_id,product_count
0,00010242fe8c5a6d1ba2dd792cb16214,1
1,00018f77f2f0320c557190d7a144bdd3,1
2,000229ec398224ef6ca0657da4fc703e,1
3,00024acbcdf0a6daa1e931b038114c75,1
4,00042b26cf59d7ce69dfabb4e55b4fd9,1
...,...,...
98661,fffc94f6ce00a00581880bf54a75a037,1
98662,fffcd46ef2263f404302a634eb57f7eb,1
98663,fffce4705a9662cd70adb13d4a31832d,1
98664,fffe18544ffabc95dfada21779c9644f,1


In [20]:
# Order the orders from the most to the fewest distinct products.
products_per_order = products_per_order.sort_values('product_count', ascending=False)
products_per_order

Unnamed: 0,order_id,product_count
77909,ca3625898fbd48669d50701aba51cd5f,8
48309,7d8f5bfd5aff648220374a2df62e84d5,7
46116,77df84f9195be22a4e9cb72ca9e8b4c2,7
66653,ad850e69fce9a512ada84086651a2e7d,7
36443,5efc0b7fe9df7f0c567404abaa4d25fc,6
...,...,...
33440,56d0a27ec26d1644336aff1ec939c089,1
33439,56d01e37808556640e75a40c8202b0a8,1
33438,56ce64b53ec3bb5b0525f21506619f8e,1
33437,56ce3873eb6d633a47581ec221f50a65,1


In [24]:
# One row per order, so the row count is the number of orders.
total_orders = products_per_order.shape[0]

# Orders whose distinct-product tally is exactly 1.
one_product_order = products_per_order['product_count'].eq(1).sum()

# Share of single-product orders, in percent.
percentage_one_product = one_product_order / total_orders * 100

print(f"{percentage_one_product:.2f}% of orders have only one product")

96.72% of orders have only one product


## products per seller

In [25]:
# For every seller, tally the distinct product ids among its order items.
# Result: one row per seller_id, with the tally in `product_count`.
products_per_seller = (
    prep_order_items
    .groupby('seller_id')['product_id']
    .nunique()
    .rename('product_count')
    .reset_index()
)
products_per_seller

Unnamed: 0,seller_id,product_count
0,0015a82c2db000af6aaaf3ae2ecb0532,1
1,001cca7ae9ae17fb1caed9dfb1094831,11
2,001e6ad469a905060d959994f1b41e4f,1
3,002100f778ceb8431b7a1020ff7ab48f,24
4,003554e2dce176b5555353e4f3555ac8,1
...,...,...
3090,ffcfefa19b08742c5d315f2791395ee5,1
3091,ffdd9f82b9a447f6f8d4b91554cc7dd3,12
3092,ffeee66ac5d5a62fe688b9d26f83f534,3
3093,fffd5413c0700ac820c7069d66d98c89,29


In [26]:
# Order sellers from the most to the fewest distinct products.
products_per_seller = products_per_seller.sort_values('product_count', ascending=False)
products_per_seller

Unnamed: 0,seller_id,product_count
881,4a3ca9315b744ce9f8e9374361493884,399
2485,cca3071e3e9bb7d12640c9fbe2301306,322
2617,d91fb3b7d041e83b64a00a3edfb37e4f,315
3024,fa1c13f2614d7b5c4749cbc52fecda94,289
1391,7142540dd4c91e2237acb7e911c4eba2,266
...,...,...
1704,8b181ee5518df84f18f4e1a43fe07923,1
1703,8b11118053756a9a2932f447e11fc1b5,1
1691,8a1ff5c35f6595a73fef4c7b96e4908a,1
1686,89d9a386b0b6e5fc8403071b03f7206a,1


In [27]:
# One row per seller, so the row count is the number of sellers
# (sellers present in prep_order_items).
total_sellers = products_per_seller.shape[0]

# Sellers whose distinct-product tally is exactly 1.
one_product_sellers = products_per_seller['product_count'].eq(1).sum()

# Share of single-product sellers, in percent.
# Note: this reuses the name `percentage_one_product`, which an earlier cell
# assigned for the order-level share.
percentage_one_product = one_product_sellers / total_sellers * 100

print(f"{percentage_one_product:.2f}% of sellers have only one product")

24.10% of sellers have only one product
