# Pré-processamento do Dataset da Olist

Origem do dataset: 
kaggle/olistbr-brazilian-ecommerce

Segundo a descrição do kaggle, este dataset contém:

- dados coletados entre 2016 e 2018;
- ~100 mil pedidos de múltiplos marketplaces no Brasil.

## Para baixar os dados
Acesse o link do Kaggle, faça o download dos datasets abaixo e coloque-os em uma pasta chamada `data`:

- olist_customers_dataset.csv
- olist_orders_dataset.csv
- olist_order_items_dataset.csv
- olist_geolocation_dataset.csv
- olist_products_dataset.csv
- olist_order_reviews_dataset.csv

In [1]:
import pandas as pd
from tqdm import tqdm

In [2]:
# Load the six Olist CSV exports from the local "data" folder. The paths are
# listed in the same order as the names they are assigned to.
(
    cliente_df,
    pedido_df,
    pedido_itens_df,
    produto_df,
    review_df,
    geo_df,
) = map(
    pd.read_csv,
    (
        "data/olist_customers_dataset.csv",
        "data/olist_orders_dataset.csv",
        "data/olist_order_items_dataset.csv",
        "data/olist_products_dataset.csv",
        "data/olist_order_reviews_dataset.csv",
        "data/olist_geolocation_dataset.csv",
    ),
)

In [3]:
# Remove the "geolocation_" marker from every column label of the geo table.
geo_df.columns = list(
    map(lambda label: label.replace("geolocation_", ""), geo_df.columns)
)

In [4]:
# First two rows of every loaded table, keyed by a short label, for a quick
# visual check of each schema.
datasets_preview = {
    label: frame.head(n=2)
    for label, frame in (
        ("cliente", cliente_df),
        ("pedido", pedido_df),
        ("pedido_itens", pedido_itens_df),
        ("produto", produto_df),
        ("geo", geo_df),
        ("review", review_df),
    )
}

In [5]:
# Print a tab-indented heading for each table, then render its preview.
for label in datasets_preview:
    print(f"\t\t\t\t**{label}**")
    display(datasets_preview[label])

				**cliente**


Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state
0,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP
1,18955e83d337fd6b2def6b18a428ac77,290c77bc529b7ac935b93aa66c333dc3,9790,sao bernardo do campo,SP


				**pedido**


Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18 00:00:00
1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13 00:00:00


				**pedido_itens**


Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value
0,00010242fe8c5a6d1ba2dd792cb16214,1,4244733e06e7ecb4970a6e2683c13e61,48436dade18ac8b2bce089ec2a041202,2017-09-19 09:45:35,58.9,13.29
1,00018f77f2f0320c557190d7a144bdd3,1,e5f2d52b802189ee658865ca93d83a8f,dd7ddc04e1b6c2c614352b383efe2d36,2017-05-03 11:05:13,239.9,19.93


				**produto**


Unnamed: 0,product_id,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm
0,1e9e8ef04dbcff4541ed26657ea517e5,perfumaria,40.0,287.0,1.0,225.0,16.0,10.0,14.0
1,3aa071139cb16b67ca9e5dea641aaa2f,artes,44.0,276.0,1.0,1000.0,30.0,18.0,20.0


				**geo**


Unnamed: 0,zip_code_prefix,lat,lng,city,state
0,1037,-23.545621,-46.639292,sao paulo,SP
1,1046,-23.546081,-46.64482,sao paulo,SP


				**review**


Unnamed: 0,review_id,order_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp
0,7bc2406110b926393aa56f80a40eba40,73fc7af87114b39712e6da79b0a377eb,4,,,2018-01-18 00:00:00,2018-01-18 21:46:59
1,80e641a11e56f04c1ad469d5645fdfde,a548910a1c6147796b98fdf73dbeba33,5,,,2018-03-10 00:00:00,2018-03-11 03:05:13


## Preparando o dataset de pedidos


**Passo 1**

- incluir as colunas customer_zip_code_prefix, customer_city e customer_state a partir do dataframe cliente_df
- incluir a coluna product_category_name a partir do dataframe produto_df
- remover colunas customer_id


In [6]:
# Build the working table by chaining merges (pandas' default inner join):
#   customers -> orders (customer_id) -> order items (order_id)
#   -> reviews (order_id) -> products (product_id).
# Because the joins are inner, rows without a match on the other side
# (for example an order with no review) are left out.
#
# The first merge pairs cliente_df with a copy of itself (minus
# customer_unique_id); it duplicates the zip/city/state columns with "_x" and
# "_y" suffixes, which the following cells clean up.
#
# drop(columns=...) replaces the positional form drop(label, 1): passing the
# axis positionally was deprecated in pandas 1.3 and is rejected from 2.0 on.
df = (
    cliente_df
    .merge(cliente_df.drop(columns="customer_unique_id"), on="customer_id")
    .merge(pedido_df, on="customer_id")
    .merge(pedido_itens_df, on="order_id")
    .merge(review_df, on="order_id")
    .merge(produto_df, on="product_id")
)

In [7]:
# Discard the "_x" copies of the customer location columns created by the
# self-merge; the "_y" copies of the same columns are kept.
df = df.drop(
    columns=[
        "customer_zip_code_prefix_x",
        "customer_city_x",
        "customer_state_x",
    ]
)

In [8]:
# Remove the "_y" merge suffix so the surviving customer columns get their
# plain names back. Only a trailing "_y" is stripped: a blanket
# str.replace("_y", "") would also rewrite any column that merely contains
# "_y" somewhere inside its name.
df.columns = [
    name[: -len("_y")] if name.endswith("_y") else name
    for name in df.columns
]

**Passo 2**

incluir colunas de geolocalização (latitude, longitude)

In [9]:
# Pair each row's latitude and longitude into a (lat, lng) tuple.
# zip() walks the two columns directly instead of invoking a Python lambda
# once per row through DataFrame.apply(axis=1); it produces the same tuples
# with far less overhead on a large table.
geo_df["lat_long"] = list(zip(geo_df["lat"], geo_df["lng"]))

In [10]:
# Map each zip-code prefix to its (lat, lng) tuple. A prefix that appears on
# several rows ends up with the coordinates of its last row.
lat_long_dict = dict(zip(geo_df["zip_code_prefix"], geo_df["lat_long"]))

In [11]:
# For each party type (only "customer" here), look up the coordinates of every
# row's zip-code prefix and store them as <type>_lat / <type>_long columns.
# Prefixes absent from lat_long_dict fall back to (0, 0).
for user_type in ["customer"]:
    prefix_column = f"{user_type}_zip_code_prefix"
    coordinates = [
        lat_long_dict.get(prefix, (0, 0))
        for prefix in tqdm(df[prefix_column].tolist())
    ]
    # Transpose the list of pairs into one tuple of lats and one of longs.
    latitudes, longitudes = list(zip(*coordinates))
    df[f"{user_type}_lat"] = latitudes
    df[f"{user_type}_long"] = longitudes

100%|██████████| 113322/113322 [00:00<00:00, 513342.52it/s]


In [12]:
# Show the column labels available after the merges and the coordinate lookup.
df.columns

Index(['customer_id', 'customer_unique_id', 'customer_zip_code_prefix',
       'customer_city', 'customer_state', 'order_id', 'order_status',
       'order_purchase_timestamp', 'order_approved_at',
       'order_delivered_carrier_date', 'order_delivered_customer_date',
       'order_estimated_delivery_date', 'order_item_id', 'product_id',
       'seller_id', 'shipping_limit_date', 'price', 'freight_value',
       'review_id', 'review_score', 'review_comment_title',
       'review_comment_message', 'review_creation_date',
       'review_answer_timestamp', 'product_category_name',
       'product_name_lenght', 'product_description_lenght',
       'product_photos_qty', 'product_weight_g', 'product_length_cm',
       'product_height_cm', 'product_width_cm', 'customer_lat',
       'customer_long'],
      dtype='object')

In [13]:
# Trim the table to the columns kept for the final dataset; every label
# listed below is removed.
df = df.drop(
    columns=[
        # customer fields
        "customer_unique_id",
        "customer_zip_code_prefix",
        # order fields
        "order_status",
        "order_approved_at",
        "order_delivered_carrier_date",
        "order_delivered_customer_date",
        "order_estimated_delivery_date",
        # order item fields
        "seller_id",
        "shipping_limit_date",
        "freight_value",
        # product fields
        "product_name_lenght",
        "product_description_lenght",
        "product_photos_qty",
        "product_weight_g",
        "product_length_cm",
        "product_height_cm",
        "product_width_cm",
        # review fields
        "review_comment_title",
        "review_comment_message",
        "review_creation_date",
        "review_answer_timestamp",
    ]
)

In [14]:
# Show the column labels that remain after the drop.
df.columns

Index(['customer_id', 'customer_city', 'customer_state', 'order_id',
       'order_purchase_timestamp', 'order_item_id', 'product_id', 'price',
       'review_id', 'review_score', 'product_category_name', 'customer_lat',
       'customer_long'],
      dtype='object')

In [15]:
# Print dtypes and non-null counts per column to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 113322 entries, 0 to 113321
Data columns (total 13 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   customer_id               113322 non-null  object 
 1   customer_city             113322 non-null  object 
 2   customer_state            113322 non-null  object 
 3   order_id                  113322 non-null  object 
 4   order_purchase_timestamp  113322 non-null  object 
 5   order_item_id             113322 non-null  int64  
 6   product_id                113322 non-null  object 
 7   price                     113322 non-null  float64
 8   review_id                 113322 non-null  object 
 9   review_score              113322 non-null  int64  
 10  product_category_name     111710 non-null  object 
 11  customer_lat              113322 non-null  float64
 12  customer_long             113322 non-null  float64
dtypes: float64(3), int64(2), object(8)
memory us

**Passo 3**

Criando uma nova coluna com o número de produtos comprados pelo mesmo cliente em cada categoria

In [16]:
# Count the product_id entries for every (product_category_name, customer_id)
# pair; the result is indexed by those two keys and has a single column named
# product_id holding the count.
quantidade = (
    df.groupby(["product_category_name", "customer_id"])
    .agg(product_id=("product_id", "count"))
)

In [17]:
# Relabel the single count column as "quantidade_item".
quantidade.columns=['quantidade_item']

In [18]:
# Turn the index levels of quantidade back into ordinary columns so they can
# be used when merging.
quantidade = quantidade.reset_index()

In [19]:
# Attach quantidade_item to every row of df.
#
# quantidade holds one row per (product_category_name, customer_id) pair, but
# the join key is customer_id alone, so a customer with items in several
# categories has each item paired with every one of those categories. Keeping
# only the pairs whose two category columns agree leaves each item with the
# count that belongs to its own category instead of a cross product.
#
# Both product_category_name_x (from df) and product_category_name_y (from
# quantidade) are still present afterwards. Rows whose category is missing
# never satisfy the equality and are therefore dropped.
df = (
    df
    .merge(quantidade, on="customer_id")
    .loc[
        lambda merged: merged["product_category_name_x"]
        == merged["product_category_name_y"]
    ]
)

In [20]:
# Show the column labels after merging in quantidade.
df.columns

Index(['customer_id', 'customer_city', 'customer_state', 'order_id',
       'order_purchase_timestamp', 'order_item_id', 'product_id', 'price',
       'review_id', 'review_score', 'product_category_name_x', 'customer_lat',
       'customer_long', 'product_category_name_y', 'quantidade_item'],
      dtype='object')

In [21]:
# Drop the category column that came from df's side of the merge and keep the
# one that came from quantidade, giving it back its unsuffixed name.
# Only a trailing "_y" is stripped: a blanket str.replace("_y", "") would also
# rewrite any column that merely contains "_y" somewhere inside its name.
df = df.drop(columns=["product_category_name_x"])
df.columns = [
    name[: -len("_y")] if name.endswith("_y") else name
    for name in df.columns
]

In [22]:
# Write the processed table to olist_processado.csv in the working directory,
# leaving out the DataFrame index.
df.to_csv("olist_processado.csv", index=False)