In [19]:
import pandas as pd
import numpy as np

import os
import glob

import re

pd.options.display.max_columns = None

## **Data Understanding**

This is a Brazilian ecommerce public dataset of orders made at Olist Store. The dataset has information of 100k orders from 2016 to 2018 made at multiple marketplaces in Brazil. Its features allows viewing an order from multiple dimensions: from order status, price, payment and freight performance to customer location, product attributes and finally reviews written by customers. We also released a geolocation dataset that relates Brazilian zip codes to lat/lng coordinates.

This is real commercial data, it has been anonymised, and references to the companies and partners in the review text have been replaced with the names of Game of Thrones great houses.

### Data Schema

### Feature Description

## **Load Data**

In [20]:
mydir = r"D:\PURWADHIKA\FINAL PROJECT\ecommerce-analysis\data\raw"

file_list = glob.glob(mydir + "\*.csv")
file_list

['D:\\PURWADHIKA\\FINAL PROJECT\\ecommerce-analysis\\data\\raw\\olist_customers_dataset.csv',
 'D:\\PURWADHIKA\\FINAL PROJECT\\ecommerce-analysis\\data\\raw\\olist_geolocation_dataset.csv',
 'D:\\PURWADHIKA\\FINAL PROJECT\\ecommerce-analysis\\data\\raw\\olist_orders_dataset.csv',
 'D:\\PURWADHIKA\\FINAL PROJECT\\ecommerce-analysis\\data\\raw\\olist_order_items_dataset.csv',
 'D:\\PURWADHIKA\\FINAL PROJECT\\ecommerce-analysis\\data\\raw\\olist_order_payments_dataset.csv',
 'D:\\PURWADHIKA\\FINAL PROJECT\\ecommerce-analysis\\data\\raw\\olist_order_reviews_dataset.csv',
 'D:\\PURWADHIKA\\FINAL PROJECT\\ecommerce-analysis\\data\\raw\\olist_products_dataset.csv',
 'D:\\PURWADHIKA\\FINAL PROJECT\\ecommerce-analysis\\data\\raw\\olist_sellers_dataset.csv',
 'D:\\PURWADHIKA\\FINAL PROJECT\\ecommerce-analysis\\data\\raw\\product_category_name_translation.csv']

In [21]:
def efn(text):
    file = text.split('\\')[-1].replace('.csv','')
    return file

def white_string_to_nan(data):
    return data.replace(r'^\s*$', np.nan, regex=True)

In [22]:
dfs = {f'{efn(file)}_df': white_string_to_nan(pd.read_csv(file, engine="pyarrow")) for file in file_list}

In [23]:
for n in dfs.keys():
    print(n)

olist_customers_dataset_df
olist_geolocation_dataset_df
olist_orders_dataset_df
olist_order_items_dataset_df
olist_order_payments_dataset_df
olist_order_reviews_dataset_df
olist_products_dataset_df
olist_sellers_dataset_df
product_category_name_translation_df


In [24]:
dataInfo = []

for n in dfs.keys():
    dataInfo.append({
        'dataset_name' : n,
        'shape' : dfs[n].shape,
        'columns' : list(dfs[n].columns),
        'duplicate' : dfs[n].duplicated().sum(),
        'null' : dfs[n].isna().sum().sum(),
        'null_columns' : dfs[n].columns[dfs[n].isna().any()].tolist()
    })

pd.DataFrame(dataInfo)


Unnamed: 0,dataset_name,shape,columns,duplicate,null,null_columns
0,olist_customers_dataset_df,"(99441, 5)","[customer_id, customer_unique_id, customer_zip...",0,0,[]
1,olist_geolocation_dataset_df,"(1000163, 5)","[geolocation_zip_code_prefix, geolocation_lat,...",261831,0,[]
2,olist_orders_dataset_df,"(99441, 8)","[order_id, customer_id, order_status, order_pu...",0,4908,"[order_approved_at, order_delivered_carrier_da..."
3,olist_order_items_dataset_df,"(112650, 7)","[order_id, order_item_id, product_id, seller_i...",0,0,[]
4,olist_order_payments_dataset_df,"(103886, 5)","[order_id, payment_sequential, payment_type, p...",0,0,[]
5,olist_order_reviews_dataset_df,"(99224, 7)","[review_id, order_id, review_score, review_com...",0,145932,"[review_comment_title, review_comment_message]"
6,olist_products_dataset_df,"(32951, 9)","[product_id, product_category_name, product_na...",0,2448,"[product_category_name, product_name_lenght, p..."
7,olist_sellers_dataset_df,"(3095, 4)","[seller_id, seller_zip_code_prefix, seller_cit...",0,0,[]
8,product_category_name_translation_df,"(71, 2)","[product_category_name, product_category_name_...",0,0,[]


In [38]:
# skimming function

def skim(data, keys):
    summary = pd.DataFrame({
                    'dataset': keys,
                    'column': data.columns.values,
                    'type': data.dtypes.values,
                    'n_unique': data.nunique().values,
                    'n_missing': data.isna().sum().reset_index()[0],
                    'pct_missing': round(data.isna().sum().reset_index()[0]/len(data)*100, 2),
                    'min': data.dropna().min().values,
                    'max': data.dropna().max().values
                })
    return summary

In [39]:
for n in dfs.keys():
    data = skim(dfs[n], n)

data

Unnamed: 0,dataset,column,type,n_unique,n_missing,pct_missing,min,max
0,product_category_name_translation_df,product_category_name,object,71,0,0.0,agro_industria_e_comercio,utilidades_domesticas
1,product_category_name_translation_df,product_category_name_english,object,71,0,0.0,agro_industry_and_commerce,watches_gifts


**olist_customers_dataset_df**

**olist_geolocation_dataset_df**

In [37]:
dfs['olist_geolocation_dataset_df'][dfs['olist_geolocation_dataset_df'].duplicated()]

Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state
15,1046,-23.546081,-46.644820,sao paulo,SP
44,1046,-23.546081,-46.644820,sao paulo,SP
65,1046,-23.546081,-46.644820,sao paulo,SP
66,1009,-23.546935,-46.636588,sao paulo,SP
67,1046,-23.546081,-46.644820,sao paulo,SP
...,...,...,...,...,...
1000153,99970,-28.343273,-51.873734,ciriaco,RS
1000154,99950,-28.070493,-52.011342,tapejara,RS
1000159,99900,-27.877125,-52.224882,getulio vargas,RS
1000160,99950,-28.071855,-52.014716,tapejara,RS


**olist_orders_dataset_df**

**olist_order_items_dataset_df**

In [26]:
dfs['olist_order_items_dataset_df']

Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value
0,00010242fe8c5a6d1ba2dd792cb16214,1,4244733e06e7ecb4970a6e2683c13e61,48436dade18ac8b2bce089ec2a041202,2017-09-19 09:45:35,58.90,13.29
1,00018f77f2f0320c557190d7a144bdd3,1,e5f2d52b802189ee658865ca93d83a8f,dd7ddc04e1b6c2c614352b383efe2d36,2017-05-03 11:05:13,239.90,19.93
2,000229ec398224ef6ca0657da4fc703e,1,c777355d18b72b67abbeef9df44fd0fd,5b51032eddd242adc84c38acab88f23d,2018-01-18 14:48:30,199.00,17.87
3,00024acbcdf0a6daa1e931b038114c75,1,7634da152a4610f1595efa32f14722fc,9d7a1d34a5052409006425275ba1c2b4,2018-08-15 10:10:18,12.99,12.79
4,00042b26cf59d7ce69dfabb4e55b4fd9,1,ac6c3623068f30de03045865e4e10089,df560393f3a51e74553ab94004ba5c87,2017-02-13 13:57:51,199.90,18.14
...,...,...,...,...,...,...,...
112645,fffc94f6ce00a00581880bf54a75a037,1,4aa6014eceb682077f9dc4bffebc05b0,b8bc237ba3788b23da09c0f1f3a3288c,2018-05-02 04:11:01,299.99,43.41
112646,fffcd46ef2263f404302a634eb57f7eb,1,32e07fd915822b0765e448c4dd74c828,f3c38ab652836d21de61fb8314b69182,2018-07-20 04:31:48,350.00,36.53
112647,fffce4705a9662cd70adb13d4a31832d,1,72a30483855e2eafc67aee5dc2560482,c3cfdc648177fdbbbb35635a37472c53,2017-10-30 17:14:25,99.90,16.95
112648,fffe18544ffabc95dfada21779c9644f,1,9c422a519119dcad7575db5af1ba540e,2b3e4a2a3ea8e01938cabda2a3e5cc79,2017-08-21 00:04:32,55.99,8.72


In [27]:
skim(dfs['olist_order_items_dataset_df'])

Unnamed: 0,column,type,n_unique,n_missing,pct_missing,min,max
0,order_id,object,98666,0,0.0,00010242fe8c5a6d1ba2dd792cb16214,fffe41c64501cc87c801fd61db3f6244
1,order_item_id,int64,21,0,0.0,1,21
2,product_id,object,32951,0,0.0,00066f42aeeb9f3007548bb9d3f33c38,fffe9eeff12fcbd74a2f2b007dde0c58
3,seller_id,object,3095,0,0.0,0015a82c2db000af6aaaf3ae2ecb0532,ffff564a4f9085cd26170f4732393726
4,shipping_limit_date,datetime64[ns],93318,0,0.0,2016-09-19 00:15:34,2020-04-09 22:35:08
5,price,float64,5968,0,0.0,0.85,6735.0
6,freight_value,float64,6999,0,0.0,0.0,409.68


**olist_order_payments_dataset_df**

In [28]:
payment = dfs['olist_order_payments_dataset_df']
payment.head()

Unnamed: 0,order_id,payment_sequential,payment_type,payment_installments,payment_value
0,b81ef226f3fe1789b1e8b2acac839d17,1,credit_card,8,99.33
1,a9810da82917af2d9aefd1278f1dcfa0,1,credit_card,1,24.39
2,25e8ea4e93396b6fa0d3dd708e76c1bd,1,credit_card,1,65.71
3,ba78997921bbcdc1373bb41e913ab953,1,credit_card,8,107.78
4,42fdf880ba16b47b59251dd489d4441a,1,credit_card,2,128.45


In [29]:
skim(payment)

Unnamed: 0,column,type,n_unique,n_missing,pct_missing,min,max
0,order_id,object,99440,0,0.0,00010242fe8c5a6d1ba2dd792cb16214,fffe41c64501cc87c801fd61db3f6244
1,payment_sequential,int64,29,0,0.0,1,29
2,payment_type,object,5,0,0.0,boleto,voucher
3,payment_installments,int64,24,0,0.0,0,24
4,payment_value,float64,29077,0,0.0,0.0,13664.08


**olist_order_reviews_dataset_df**

**olist_products_dataset_df**

In [30]:
dfs['olist_products_dataset_df']

Unnamed: 0,product_id,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm
0,1e9e8ef04dbcff4541ed26657ea517e5,perfumaria,40.0,287.0,1.0,225.0,16.0,10.0,14.0
1,3aa071139cb16b67ca9e5dea641aaa2f,artes,44.0,276.0,1.0,1000.0,30.0,18.0,20.0
2,96bd76ec8810374ed1b65e291975717f,esporte_lazer,46.0,250.0,1.0,154.0,18.0,9.0,15.0
3,cef67bcfe19066a932b7673e239eb23d,bebes,27.0,261.0,1.0,371.0,26.0,4.0,26.0
4,9dc1a7de274444849c219cff195d0b71,utilidades_domesticas,37.0,402.0,4.0,625.0,20.0,17.0,13.0
...,...,...,...,...,...,...,...,...,...
32946,a0b7d5a992ccda646f2d34e418fff5a0,moveis_decoracao,45.0,67.0,2.0,12300.0,40.0,40.0,40.0
32947,bf4538d88321d0fd4412a93c974510e6,construcao_ferramentas_iluminacao,41.0,971.0,1.0,1700.0,16.0,19.0,16.0
32948,9a7c6041fa9592d9d9ef6cfe62a71f8c,cama_mesa_banho,50.0,799.0,1.0,1400.0,27.0,7.0,27.0
32949,83808703fc0706a22e264b9d75f04a2e,informatica_acessorios,60.0,156.0,2.0,700.0,31.0,13.0,20.0


**olist_sellers_dataset_df**

**product_category_name_translation_df**