In [2]:
import pandas as pd
import numpy as np

import os
import glob

import re

# Show every column when a DataFrame is displayed (no truncation of wide tables).
pd.set_option("display.max_columns", None)

## **Data Understanding**

Ini adalah dataset publik e-commerce Brasil dari pesanan yang dibuat di Olist Store. Dataset ini memiliki informasi dari 100 ribu pesanan dari tahun 2016 hingga 2018 yang dibuat di beberapa pasar di Brasil. Fitur-fiturnya memungkinkan untuk melihat pesanan dari berbagai dimensi: mulai dari status pesanan, harga, pembayaran, dan kinerja pengiriman hingga lokasi pelanggan, atribut produk, dan akhirnya ulasan yang ditulis oleh pelanggan. Selain itu ada juga dataset geolokasi yang menghubungkan kode pos Brasil dengan koordinat lintang/bujur.

### **Data Schema**

Data dibagi dalam beberapa dataset. Skema data bisa dilihat pada gambar berikut:
<p> <p>
<figure>
  <center><img src="https://raw.githubusercontent.com/aridiawan/ecommerce-analysis/main/data/raw/ERD.jpg" alt="Scheme Data Table" width="900" height="647"></center>
  <figcaption align = "center">Gambar 1. <em>Scheme Data</em> </figcaption>
</figure>
<p> <p>

### **Feature Description**

Berikut adalah fitur-fitur yang tersedia dalam dataset:
<p> <p>
<figure>
  <center><img src="https://raw.githubusercontent.com/aridiawan/ecommerce-analysis/main/data/raw/Olist-Dataset-Description.jpg" alt="Feature Description" width="900" height="1056"></center>
  <figcaption align = "center">Gambar 2. <em>Deskripsi Feature</em> </figcaption>
</figure>
<p> <p>

## Load Data

In [3]:
# Folder that holds the raw CSV files. This is a machine-specific absolute
# path; adjust it when running the notebook on another machine.
mydir = r"D:\PURWADHIKA\FINAL PROJECT\ecommerce-analysis\data\raw"

# Build the search pattern with os.path.join instead of concatenating "\*.csv":
# "\*" is not a valid escape sequence in a regular string literal, and newer
# Python versions emit a warning for it.
file_list = glob.glob(os.path.join(mydir, "*.csv"))
file_list

['D:\\PURWADHIKA\\FINAL PROJECT\\ecommerce-analysis\\data\\raw\\olist_customers_dataset.csv',
 'D:\\PURWADHIKA\\FINAL PROJECT\\ecommerce-analysis\\data\\raw\\olist_geolocation_dataset.csv',
 'D:\\PURWADHIKA\\FINAL PROJECT\\ecommerce-analysis\\data\\raw\\olist_orders_dataset.csv',
 'D:\\PURWADHIKA\\FINAL PROJECT\\ecommerce-analysis\\data\\raw\\olist_order_items_dataset.csv',
 'D:\\PURWADHIKA\\FINAL PROJECT\\ecommerce-analysis\\data\\raw\\olist_order_payments_dataset.csv',
 'D:\\PURWADHIKA\\FINAL PROJECT\\ecommerce-analysis\\data\\raw\\olist_order_reviews_dataset.csv',
 'D:\\PURWADHIKA\\FINAL PROJECT\\ecommerce-analysis\\data\\raw\\olist_products_dataset.csv',
 'D:\\PURWADHIKA\\FINAL PROJECT\\ecommerce-analysis\\data\\raw\\olist_sellers_dataset.csv',
 'D:\\PURWADHIKA\\FINAL PROJECT\\ecommerce-analysis\\data\\raw\\product_category_name_translation.csv']

In [4]:
def efn(text):
    """Extract the bare file name from a CSV path.

    Parameters
    ----------
    text : str
        File path; directories may be separated by backslashes, forward
        slashes, or a mix of both.

    Returns
    -------
    str
        The last path component with a trailing ``.csv`` removed. A name
        that does not end in ``.csv`` is returned unchanged.
    """
    # Split on both separator styles so the result does not depend on the
    # operating system that produced the path.
    file = re.split(r'[\\/]', text)[-1]
    # Remove the extension only when it is the suffix; a blanket replace would
    # also delete ".csv" appearing in the middle of a name.
    if file.endswith('.csv'):
        file = file[:-len('.csv')]
    return file

def white_string_to_nan(data):
    """Return ``data`` with empty or whitespace-only strings replaced by NaN.

    The input object is not modified; the result of ``replace`` is returned.
    """
    blank_pattern = r'^\s*$'
    return data.replace(to_replace=blank_pattern, value=np.nan, regex=True)

In [5]:
# Read every CSV in file_list into a DataFrame, blank out whitespace-only
# strings, and key each frame by "<file name>_df".
dfs = {
    f'{efn(csv_path)}_df': white_string_to_nan(pd.read_csv(csv_path, engine="pyarrow"))
    for csv_path in file_list
}

In [6]:
# List the key of every loaded table, one per line.
for n in dfs:
    print(n)

olist_customers_dataset_df
olist_geolocation_dataset_df
olist_orders_dataset_df
olist_order_items_dataset_df
olist_order_payments_dataset_df
olist_order_reviews_dataset_df
olist_products_dataset_df
olist_sellers_dataset_df
product_category_name_translation_df


In [7]:
# One overview record per loaded table: size, column names, number of fully
# duplicated rows, total missing cells, and which columns contain missing values.
dataInfo = []

for n, table in dfs.items():
    dataInfo.append(dict([
        ('dataset_name', n),
        ('shape', table.shape),
        ('columns', list(table.columns)),
        ('duplicate', table.duplicated().sum()),
        ('null', table.isna().sum().sum()),
        ('null_columns', table.columns[table.isna().any()].tolist()),
    ]))

pd.DataFrame(dataInfo)


Unnamed: 0,dataset_name,shape,columns,duplicate,null,null_columns
0,olist_customers_dataset_df,"(99441, 5)","[customer_id, customer_unique_id, customer_zip...",0,0,[]
1,olist_geolocation_dataset_df,"(1000163, 5)","[geolocation_zip_code_prefix, geolocation_lat,...",261831,0,[]
2,olist_orders_dataset_df,"(99441, 8)","[order_id, customer_id, order_status, order_pu...",0,4908,"[order_approved_at, order_delivered_carrier_da..."
3,olist_order_items_dataset_df,"(112650, 7)","[order_id, order_item_id, product_id, seller_i...",0,0,[]
4,olist_order_payments_dataset_df,"(103886, 5)","[order_id, payment_sequential, payment_type, p...",0,0,[]
5,olist_order_reviews_dataset_df,"(99224, 7)","[review_id, order_id, review_score, review_com...",0,145932,"[review_comment_title, review_comment_message]"
6,olist_products_dataset_df,"(32951, 9)","[product_id, product_category_name, product_na...",0,2448,"[product_category_name, product_name_lenght, p..."
7,olist_sellers_dataset_df,"(3095, 4)","[seller_id, seller_zip_code_prefix, seller_cit...",0,0,[]
8,product_category_name_translation_df,"(71, 2)","[product_category_name, product_category_name_...",0,0,[]


In [18]:
# skimming function

def skim(data, name):
    """Build a per-column overview of a DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to summarise.
    name : str
        Label stored under the ``'dataset'`` key.

    Returns
    -------
    dict
        ``'dataset'`` (the label), ``'column'``, ``'type'``, ``'n_unique'``
        (arrays with one entry per column), ``'n_missing'`` and
        ``'pct_missing'`` (Series with one entry per column, positionally
        indexed), and ``'min'`` / ``'max'`` (object arrays with one entry
        per column).
    """
    # Missing-value count per column, re-indexed by position (0..k-1).
    n_missing = data.isna().sum().reset_index()[0]

    # Drop nulls column by column before taking min/max. Dropping whole rows
    # (DataFrame.dropna) would discard valid values of one column just because
    # another column is null in the same row, and would leave nothing at all
    # when every row has at least one null.
    non_null_columns = [col.dropna() for _, col in data.items()]

    summary = {
                'dataset' : name,
                'column': data.columns.values,
                'type': data.dtypes.values,
                'n_unique': data.nunique().values,
                'n_missing': n_missing,
                'pct_missing': round(n_missing/len(data)*100, 2),
                'min': np.array([col.min() for col in non_null_columns], dtype=object),
                'max': np.array([col.max() for col in non_null_columns], dtype=object)
                }
    return summary

In [22]:
# Collect the skim summary of every loaded table.
summ = []

for n in dfs.keys():
    summm = skim(dfs[n], n)
    # Store each result; without this append the list stayed empty and only
    # the summary of the last table was kept.
    summ.append(summm)

# Display the summary of the last table processed.
summm

{'dataset': 'product_category_name_translation_df',
 'column': array(['product_category_name', 'product_category_name_english'],
       dtype=object),
 'type': array([dtype('O'), dtype('O')], dtype=object),
 'n_unique': array([71, 71], dtype=int64),
 'n_missing': 0    0
 1    0
 Name: 0, dtype: int64,
 'pct_missing': 0    0.0
 1    0.0
 Name: 0, dtype: float64,
 'min': array(['agro_industria_e_comercio', 'agro_industry_and_commerce'],
       dtype=object),
 'max': array(['utilidades_domesticas', 'watches_gifts'], dtype=object)}

**olist_customers_dataset_df**

**olist_geolocation_dataset_df**

**olist_orders_dataset_df**

**olist_order_items_dataset_df**

**olist_order_payments_dataset_df**

**olist_order_reviews_dataset_df**

**olist_products_dataset_df**

**olist_sellers_dataset_df**

**product_category_name_translation_df**