# Data Cleaning

In [2]:
# import relevant libraries
import pandas as pd

In [3]:
# import all raw data into datasets

df_customers = pd.read_csv("data/raw/olist_customers_dataset.csv")
df_geolocation = pd.read_csv("data/raw/olist_geolocation_dataset.csv")
df_order_items = pd.read_csv("data/raw/olist_order_items_dataset.csv")
df_order_payments = pd.read_csv("data/raw/olist_order_payments_dataset.csv")
df_order_reviews = pd.read_csv("data/raw/olist_order_reviews_dataset.csv")
df_orders = pd.read_csv("data/raw/olist_orders_dataset.csv")
df_products = pd.read_csv("data/raw/olist_products_dataset.csv")
df_sellers = pd.read_csv("data/raw/olist_sellers_dataset.csv")
df_customers

Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state
0,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP
1,18955e83d337fd6b2def6b18a428ac77,290c77bc529b7ac935b93aa66c333dc3,9790,sao bernardo do campo,SP
2,4e7b3e00288586ebd08712fdd0374a03,060e732b5b29e8181a18229c7b0b2b5e,1151,sao paulo,SP
3,b2b6027bc5c5109e529d4dc6358b12c3,259dac757896d24d7702b9acbbff3f3c,8775,mogi das cruzes,SP
4,4f2d8ab171c80ec8364f7c12e35b23ad,345ecd01c38d18a9036ed96c73b8d066,13056,campinas,SP
...,...,...,...,...,...
99436,17ddf5dd5d51696bb3d7c6291687be6f,1a29b476fee25c95fbafc67c5ac95cf8,3937,sao paulo,SP
99437,e7b71a9017aa05c9a7fd292d714858e8,d52a67c98be1cf6a5c84435bd38d095d,6764,taboao da serra,SP
99438,5e28dfe12db7fb50a4b2f691faecea5e,e9f50caf99f032f0bf3c55141f019d99,60115,fortaleza,CE
99439,56b18e2166679b8a959d72dd06da27f9,73c2643a0a458b49f58cea58833b192e,92120,canoas,RS


## Customers


In [7]:
# having a look at the data
df_customers.head()

Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state
0,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP
1,18955e83d337fd6b2def6b18a428ac77,290c77bc529b7ac935b93aa66c333dc3,9790,sao bernardo do campo,SP
2,4e7b3e00288586ebd08712fdd0374a03,060e732b5b29e8181a18229c7b0b2b5e,1151,sao paulo,SP
3,b2b6027bc5c5109e529d4dc6358b12c3,259dac757896d24d7702b9acbbff3f3c,8775,mogi das cruzes,SP
4,4f2d8ab171c80ec8364f7c12e35b23ad,345ecd01c38d18a9036ed96c73b8d066,13056,campinas,SP


In [9]:
# checking data types
df_customers.dtypes

customer_id                 object
customer_unique_id          object
customer_zip_code_prefix     int64
customer_city               object
customer_state              object
dtype: object

In [22]:
# looking to see which is a primary key
df_customers["customer_unique_id"].nunique()

96096

In [23]:
# looking to see which is a primary key
df_customers["customer_id"].nunique()

99441

It appears that the primary key is the customer_id and some customers appear in the data twice

In [44]:
# dropping duplicate rows
df_customers.dropna(inplace=True)
df_customers.shape

(99441, 5)

No duplicate rows found

In [15]:
# looking for null values
df_customers.isnull().sum()

customer_id                 0
customer_unique_id          0
customer_zip_code_prefix    0
customer_city               0
customer_state              0
dtype: int64

In [30]:
# Looking at number of distinct values per column
print(f"There are {df_customers['customer_state'].nunique()} different states.")
print(f"There are {df_customers['customer_city'].nunique()} different cities.")
print(f"There are {df_customers['customer_zip_code_prefix'].nunique()} different ZIP code prefixes.")

There are 27 different states.
There are 4119 different cities.
There are 14994 different ZIP code prefixes.


The data is clean and can be sent directly to be enriched

In [31]:
# exporting to the clean folder
df_customers.to_csv("data/cleaned/clean_customers.csv", index=False)

## Geolocation


In [32]:
# quick glance at the data
df_geolocation.head()

Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state
0,1037,-23.545621,-46.639292,sao paulo,SP
1,1046,-23.546081,-46.64482,sao paulo,SP
2,1046,-23.546129,-46.642951,sao paulo,SP
3,1041,-23.544392,-46.639499,sao paulo,SP
4,1035,-23.541578,-46.641607,sao paulo,SP


In [35]:
# checking data types
df_geolocation.dtypes

geolocation_zip_code_prefix      int64
geolocation_lat                float64
geolocation_lng                float64
geolocation_city                object
geolocation_state               object
dtype: object

In [38]:
# checking the shape
df_geolocation.shape

(1000163, 5)

In [37]:
# checking for a primary key
df_geolocation["geolocation_zip_code_prefix"].nunique()

19015

This isn't a dataset of zip codes with their relevant rough geolcations, but rather a massive dump if lat/lon geolocations that have their relevant zip/city/state. This dataset is probably the least useful as this level of granularity is probably overkill

In [45]:
# dropping duplicate rows
df_geolocation.dropna(inplace=True)
df_geolocation.shape

(1000163, 5)

No duplicate rows found

In [40]:
# looking for null values
df_geolocation.isnull().sum()

geolocation_zip_code_prefix    0
geolocation_lat                0
geolocation_lng                0
geolocation_city               0
geolocation_state              0
dtype: int64

In [41]:
# Checking to see if the columns match the customer columns or exceed it
print(f"There are {df_geolocation['geolocation_state'].nunique()} different states.")
print(f"There are {df_geolocation['geolocation_city'].nunique()} different cities.")
print(f"There are {df_geolocation['geolocation_zip_code_prefix'].nunique()} different ZIP code prefixes.")

There are 27 different states.
There are 8011 different cities.
There are 19015 different ZIP code prefixes.


This could be that there are sellers who are at locations that there are no customers, or that this dump contains all locations within a range whether there is a seller or customer there. Or both. But it suggests there are likely no missing locations if a join with customers is necessary

In [54]:
import folium
from IPython.display import display

# Define the minimum and maximum latitude and longitude coordinates
min_lat, max_lat = df_geolocation["geolocation_lat"].min(), df_geolocation["geolocation_lat"].max()
min_lon, max_lon = df_geolocation["geolocation_lng"].min(), df_geolocation["geolocation_lng"].max()

# Calculate the center of the rectangle
center_lat = (min_lat + max_lat) / 2
center_lon = (min_lon + max_lon) / 2

# Create a map centered around the center of the rectangle
mymap = folium.Map(location=[center_lat, center_lon], zoom_start=10)

# Add a rectangle to the map
folium.Rectangle(
    bounds=[[min_lat, min_lon], [max_lat, max_lon]],
    fill=True,
    fill_color='blue',
    fill_opacity=0.2,
    color='blue',
    opacity=0.4,
    weight=2
).add_to(mymap)

# Display map
display(mymap)

This shows the maximum area covered in the geo data. Note the entirety of Brazil and Africa but NOT USA, Australia, Asia or Europe. Especial note to states like California and texas not being part of the data, gives a clue as to the customer/sellers base

The data is clean and can be sent directly to be enriched

In [46]:
# exporting to the clean folder
df_geolocation.to_csv("data/cleaned/clean_geolocation.csv", index=False)

## Order Items

In [47]:
# quick glance at the data
df_order_items.head()

Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value
0,00010242fe8c5a6d1ba2dd792cb16214,1,4244733e06e7ecb4970a6e2683c13e61,48436dade18ac8b2bce089ec2a041202,2017-09-19 09:45:35,58.9,13.29
1,00018f77f2f0320c557190d7a144bdd3,1,e5f2d52b802189ee658865ca93d83a8f,dd7ddc04e1b6c2c614352b383efe2d36,2017-05-03 11:05:13,239.9,19.93
2,000229ec398224ef6ca0657da4fc703e,1,c777355d18b72b67abbeef9df44fd0fd,5b51032eddd242adc84c38acab88f23d,2018-01-18 14:48:30,199.0,17.87
3,00024acbcdf0a6daa1e931b038114c75,1,7634da152a4610f1595efa32f14722fc,9d7a1d34a5052409006425275ba1c2b4,2018-08-15 10:10:18,12.99,12.79
4,00042b26cf59d7ce69dfabb4e55b4fd9,1,ac6c3623068f30de03045865e4e10089,df560393f3a51e74553ab94004ba5c87,2017-02-13 13:57:51,199.9,18.14


In [49]:
# checking data types
df_order_items.dtypes

order_id                object
order_item_id            int64
product_id              object
seller_id               object
shipping_limit_date     object
price                  float64
freight_value          float64
dtype: object

In [50]:
#  converting the shipping_limit_date to a datetime format
df_order_items["shipping_limit_date"] = pd.to_datetime(df_order_items["shipping_limit_date"])
df_order_items.dtypes

order_id                       object
order_item_id                   int64
product_id                     object
seller_id                      object
shipping_limit_date    datetime64[ns]
price                         float64
freight_value                 float64
dtype: object

In [56]:
df_order_items.describe()

Unnamed: 0,order_item_id,shipping_limit_date,price,freight_value
count,112650.0,112650,112650.0,112650.0
mean,1.197834,2018-01-07 15:36:52.192685312,120.653739,19.99032
min,1.0,2016-09-19 00:15:34,0.85,0.0
25%,1.0,2017-09-20 20:57:27.500000,39.9,13.08
50%,1.0,2018-01-26 13:59:35,74.99,16.26
75%,1.0,2018-05-10 14:34:00.750000128,134.9,21.15
max,21.0,2020-04-09 22:35:08,6735.0,409.68
std,0.705124,,183.633928,15.806405


Average price of 120, with min of 0.85 and max of 6,735. Doesn't raise any red flags immediately.

Average freight value of 20, with min of 0 and max of 409. Doesn't raise any red flags neither. 

Date ranges from Sep 2016 to April 2020 for 3.5 years of data

In [57]:
# checking the shape
df_order_items.shape

(112650, 7)

In [60]:
# checking for a primary key
print(f"{df_order_items['order_id'].nunique()} unique values for order_id.")
print(f"{df_order_items['order_item_id'].nunique()} unique values for order_item_id.")
print(f"{df_order_items['product_id'].nunique()} unique values for product_id.")
print(f"{df_order_items['seller_id'].nunique()} unique values for seller_id.")

98666 unique values for order_id.
21 unique values for order_item_id.
32951 unique values for product_id.
3095 unique values for seller_id.


None of these are a primary key, suggesting this is the list of products per order. So the primary key would be the combination of the order_id and the order_item_id (each individual product within the order). It wouldn't be order_id and product_id as there are likely orders that contain the same product twice and since there is no quantity value here the way to represent that would be to just list the product id and order id for each ordered.

In [67]:
# given this, rather than dropping the duplicate rows its best to check if any exist:
df_order_items[df_order_items.duplicated(keep=False)]
df_order_items.shape

(112650, 7)

Better to check before removing sales data, but it seems to be as I suspected

In [77]:
# checking if order_id and payment_sequential as a combination are a primary key
is_unique = df_order_items.set_index(['order_id', 'order_item_id']).index.is_unique
is_unique

True

In [69]:
# looking for null values
df_order_items.isnull().sum()

order_id               0
order_item_id          0
product_id             0
seller_id              0
shipping_limit_date    0
price                  0
freight_value          0
dtype: int64

The data is clean and can be sent directly to be enriched

In [70]:
# exporting to the clean folder
df_order_items.to_csv("data/cleaned/clean_order_items.csv", index=False)

## Order Payments

In [71]:
## Order Payments

In [72]:
# quick glance at the data
df_order_payments.head()

Unnamed: 0,order_id,payment_sequential,payment_type,payment_installments,payment_value
0,b81ef226f3fe1789b1e8b2acac839d17,1,credit_card,8,99.33
1,a9810da82917af2d9aefd1278f1dcfa0,1,credit_card,1,24.39
2,25e8ea4e93396b6fa0d3dd708e76c1bd,1,credit_card,1,65.71
3,ba78997921bbcdc1373bb41e913ab953,1,credit_card,8,107.78
4,42fdf880ba16b47b59251dd489d4441a,1,credit_card,2,128.45


In [73]:
# checking data types
df_order_payments.dtypes

order_id                 object
payment_sequential        int64
payment_type             object
payment_installments      int64
payment_value           float64
dtype: object

In [74]:
# checking the shape
df_order_payments.shape

(103886, 5)

In [75]:
# checking for a primary key
df_order_payments["order_id"].nunique()

99440

order_id is not a primary key, suggesting an order can have multiple payments, so the primary key is likely a combination of order_id and payment_sequential (the payment number)

In [76]:
# checking if order_id and payment_sequential as a combination are a primary key
is_unique = df_order_payments.set_index(['order_id', 'payment_sequential']).index.is_unique
is_unique

True

In [78]:
# dropping duplicate rows
df_order_payments.dropna(inplace=True)
df_order_payments.shape

(103886, 5)

In [79]:
df_order_payments.describe()

Unnamed: 0,payment_sequential,payment_installments,payment_value
count,103886.0,103886.0,103886.0
mean,1.092679,2.853349,154.10038
std,0.706584,2.687051,217.494064
min,1.0,0.0,0.0
25%,1.0,1.0,56.79
50%,1.0,1.0,100.0
75%,1.0,4.0,171.8375
max,29.0,24.0,13664.08


Max number of payments is 29, with min being 1, mean being very slightlt more than 1, showing most orders are paid for in a single payment; seems reasonable

Min payment of 0 with max of 13,664 seems plausible, mean of 154

In [80]:
# looking for null values
df_order_payments.isnull().sum()

order_id                0
payment_sequential      0
payment_type            0
payment_installments    0
payment_value           0
dtype: int64

In [84]:
# what are the differnt payment methods?
df_order_payments["payment_type"].value_counts()

payment_type
credit_card    76795
boleto         19784
voucher         5775
debit_card      1529
not_defined        3
Name: count, dtype: int64

Boleto means ‘ticket’ and is a popular Brazilian cash payment method. A ‘boleto’ is a printed or virtual voucher with a barcode, payment details, and customer information. Boletos can be paid at thousands of locations across Brazil, from ATMs to digital banks to post offices to in-person branch banks and convenience stores. Boleto can be used for anything from paying rent to eCommerce purchases.

The data is clean and can be sent directly to be enriched

In [85]:
# exporting to the clean folder
df_order_payments.to_csv("data/cleaned/clean_order_payments.csv", index=False)

## Order Reviews

In [86]:
# quick glance at the data
df_order_reviews.head()

Unnamed: 0,review_id,order_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp
0,7bc2406110b926393aa56f80a40eba40,73fc7af87114b39712e6da79b0a377eb,4,,,2018-01-18 00:00:00,2018-01-18 21:46:59
1,80e641a11e56f04c1ad469d5645fdfde,a548910a1c6147796b98fdf73dbeba33,5,,,2018-03-10 00:00:00,2018-03-11 03:05:13
2,228ce5500dc1d8e020d8d1322874b6f0,f9e4b658b201a9f2ecdecbb34bed034b,5,,,2018-02-17 00:00:00,2018-02-18 14:36:24
3,e64fb393e7b32834bb789ff8bb30750e,658677c97b385a9be170737859d3511b,5,,Recebi bem antes do prazo estipulado.,2017-04-21 00:00:00,2017-04-21 22:02:06
4,f7c4243c7fe1938f181bec41a392bdeb,8e6bfb81e283fa7e4f11123a3fb894f1,5,,Parabéns lojas lannister adorei comprar pela I...,2018-03-01 00:00:00,2018-03-02 10:26:53


In [87]:
# checking data types
df_order_reviews.dtypes

review_id                  object
order_id                   object
review_score                int64
review_comment_title       object
review_comment_message     object
review_creation_date       object
review_answer_timestamp    object
dtype: object

In [88]:
#  converting the review_creation_date and review_answer_timestamp to a datetime format
df_order_reviews["review_creation_date"] = pd.to_datetime(df_order_reviews["review_creation_date"])
df_order_reviews["review_answer_timestamp"] = pd.to_datetime(df_order_reviews["review_answer_timestamp"])
df_order_reviews.dtypes

review_id                          object
order_id                           object
review_score                        int64
review_comment_title               object
review_comment_message             object
review_creation_date       datetime64[ns]
review_answer_timestamp    datetime64[ns]
dtype: object

In [89]:
df_order_reviews.describe()

Unnamed: 0,review_score,review_creation_date,review_answer_timestamp
count,99224.0,99224,99224
mean,4.086421,2018-01-12 20:49:23.948238336,2018-01-16 00:23:56.977938688
min,1.0,2016-10-02 00:00:00,2016-10-07 18:32:28
25%,4.0,2017-09-23 00:00:00,2017-09-27 01:53:27.249999872
50%,5.0,2018-02-02 00:00:00,2018-02-04 22:41:47.500000
75%,5.0,2018-05-16 00:00:00,2018-05-20 12:11:21.500000
max,5.0,2018-08-31 00:00:00,2018-10-29 12:27:35
std,1.347579,,


review scores are from 1-5 with an average just over 4

range of dates is from 2016 october to 2018 october, so covering a large portion of the sales' data range

In [90]:
# checking the shape
df_order_reviews.shape

(99224, 7)

In [91]:
# checking for a primary key
print(f"{df_order_reviews['review_id'].nunique()} unique values for review_id.")
print(f"{df_order_reviews['order_id'].nunique()} unique values for order_id.")

98410 unique values for review_id.
98673 unique values for order_id.


Neither are primary keys, so this data has multiples of the same review and multiples of the same order, probably suggesting the orders and their replies

In [97]:
# check if any duplicate columns exist:
df_order_reviews[df_order_reviews.duplicated(keep=False)]
df_order_reviews.shape

(99224, 7)

In [96]:
# checking if order_id and review_id as a combination are a primary key
is_unique = df_order_reviews.set_index(['order_id', 'review_id']).index.is_unique
is_unique

True

The combination of the order_id and the review id is a primary key

In [102]:
df_order_reviews[df_order_reviews.duplicated('order_id', keep=False)].sort_values(by="order_id").head(30)

Unnamed: 0,review_id,order_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp
25612,89a02c45c340aeeb1354a24e7d4b2c1e,0035246a40f520710769010f752e7507,5,,,2017-08-29,2017-08-30 01:59:12
22423,2a74b0559eb58fc1ff842ecc999594cb,0035246a40f520710769010f752e7507,5,,Estou acostumada a comprar produtos pelo barat...,2017-08-25,2017-08-29 21:45:57
22779,ab30810c29da5da8045216f0f62652a2,013056cfe49763c6f66bda03396c5ee3,5,,,2018-02-22,2018-02-23 12:12:30
68633,73413b847f63e02bc752b364f6d05ee9,013056cfe49763c6f66bda03396c5ee3,4,,,2018-03-04,2018-03-05 17:02:00
854,830636803620cdf8b6ffaf1b2f6e92b2,0176a6846bcb3b0d3aa3116a9a768597,5,,,2017-12-30,2018-01-02 10:54:06
83224,d8e8c42271c8fb67b9dad95d98c8ff80,0176a6846bcb3b0d3aa3116a9a768597,5,,,2017-12-30,2018-01-02 10:54:47
17582,017f0e1ea6386de662cbeba299c59ad1,02355020fd0a40a0d56df9f6ff060413,1,,ja reclamei varias vezes e ate hoje não sei on...,2018-03-29,2018-03-30 03:16:19
89888,0c8e7347f1cdd2aede37371543e3d163,02355020fd0a40a0d56df9f6ff060413,3,,UM DOS PRODUTOS (ENTREGA02) COMPRADOS NESTE PE...,2018-03-21,2018-03-22 01:32:08
55137,61fe4e7d1ae801bbe169eb67b86c6eda,029863af4b968de1e5d6a82782e662f5,4,,,2017-07-19,2017-07-20 12:06:11
37911,04d945e95c788a3aa1ffbee42105637b,029863af4b968de1e5d6a82782e662f5,5,,,2017-07-14,2017-07-17 13:58:06


Lets translate these reviews to try to understand what these instances of multiple reviews on an order_id represent

In [130]:
import os

# bigquery account credentials
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'credentials/integral-cell-418310-f922d5b5ba7d.json'

In [143]:
from google.cloud import translate_v2 as translate
import numpy as np

translate_client = translate.Client()
target = "en"

# Define translation function
def translate_text(text):
    if pd.isnull(text):  # Check if the value is NaN
        return ""  # Replace NaN with an empty string
    translation = translate_client.translate(text, target_language=target)
    return translation['translatedText']

# Apply translation function to each element in the Series
df_order_reviews["review_comment_title_en"] = df_order_reviews["review_comment_title"].apply(translate_text)
df_order_reviews["review_comment_message_en"] = df_order_reviews["review_comment_message"].apply(translate_text)

# Check duplicated order_ids
df_duplicates = df_order_reviews[df_order_reviews.duplicated('order_id', keep=False)].sort_values(by="order_id").head(30)


In [144]:
df_order_reviews.head()

Unnamed: 0,review_id,order_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp,review_comment_title_en,review_comment_message_en
0,7bc2406110b926393aa56f80a40eba40,73fc7af87114b39712e6da79b0a377eb,4,,,2018-01-18,2018-01-18 21:46:59,,
1,80e641a11e56f04c1ad469d5645fdfde,a548910a1c6147796b98fdf73dbeba33,5,,,2018-03-10,2018-03-11 03:05:13,,
2,228ce5500dc1d8e020d8d1322874b6f0,f9e4b658b201a9f2ecdecbb34bed034b,5,,,2018-02-17,2018-02-18 14:36:24,,
3,e64fb393e7b32834bb789ff8bb30750e,658677c97b385a9be170737859d3511b,5,,Recebi bem antes do prazo estipulado.,2017-04-21,2017-04-21 22:02:06,,I received it well before the stipulated deadl...
4,f7c4243c7fe1938f181bec41a392bdeb,8e6bfb81e283fa7e4f11123a3fb894f1,5,,Parabéns lojas lannister adorei comprar pela I...,2018-03-01,2018-03-02 10:26:53,,Congratulations lannister stores I loved shopp...


In [None]:
# looking for null values
df_order_reviews.isnull().sum()

In [146]:
df_order_reviews.to_csv("data/cleaned/clean_reviews_translated.csv", index=False)

# Orders

In [4]:
# quick glance at the data
df_orders.head()

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18 00:00:00
1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13 00:00:00
2,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04 00:00:00
3,949d5b44dbf5de918fe9c16f97b45f8a,f88197465ea7920adcdbec7375364d82,delivered,2017-11-18 19:28:06,2017-11-18 19:45:59,2017-11-22 13:39:59,2017-12-02 00:28:42,2017-12-15 00:00:00
4,ad21c59c0840e6cb83a9ceb5573f8159,8ab97904e6daea8866dbdbc4fb7aad2c,delivered,2018-02-13 21:18:39,2018-02-13 22:20:29,2018-02-14 19:46:34,2018-02-16 18:17:02,2018-02-26 00:00:00


# checking data types
df_order_items.dtypes

In [8]:
# checking data types
df_orders.dtypes

order_id                         object
customer_id                      object
order_status                     object
order_purchase_timestamp         object
order_approved_at                object
order_delivered_carrier_date     object
order_delivered_customer_date    object
order_estimated_delivery_date    object
dtype: object

In [9]:
#  converting the shipping_limit_date to a datetime format
df_orders["order_purchase_timestamp"] = pd.to_datetime(df_orders["order_purchase_timestamp"])
df_orders["order_approved_at"] = pd.to_datetime(df_orders["order_approved_at"])
df_orders["order_delivered_carrier_date"] = pd.to_datetime(df_orders["order_delivered_carrier_date"])
df_orders["order_delivered_customer_date"] = pd.to_datetime(df_orders["order_delivered_customer_date"])
df_orders["order_estimated_delivery_date"] = pd.to_datetime(df_orders["order_estimated_delivery_date"])
df_orders.dtypes

order_id                                 object
customer_id                              object
order_status                             object
order_purchase_timestamp         datetime64[ns]
order_approved_at                datetime64[ns]
order_delivered_carrier_date     datetime64[ns]
order_delivered_customer_date    datetime64[ns]
order_estimated_delivery_date    datetime64[ns]
dtype: object

In [10]:
df_orders.describe()

Unnamed: 0,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
count,99441,99281,97658,96476,99441
mean,2017-12-31 08:43:12.776581120,2017-12-31 18:35:24.098800128,2018-01-04 21:49:48.138278656,2018-01-14 12:09:19.035542272,2018-01-24 03:08:37.730111232
min,2016-09-04 21:15:19,2016-09-15 12:16:38,2016-10-08 10:34:01,2016-10-11 13:46:32,2016-09-30 00:00:00
25%,2017-09-12 14:46:19,2017-09-12 23:24:16,2017-09-15 22:28:50.249999872,2017-09-25 22:07:22.249999872,2017-10-03 00:00:00
50%,2018-01-18 23:04:36,2018-01-19 11:36:13,2018-01-24 16:10:58,2018-02-02 19:28:10.500000,2018-02-15 00:00:00
75%,2018-05-04 15:42:16,2018-05-04 20:35:10,2018-05-08 13:37:45,2018-05-15 22:48:52.249999872,2018-05-25 00:00:00
max,2018-10-17 17:30:18,2018-09-03 17:40:06,2018-09-11 19:48:28,2018-10-17 13:22:46,2018-11-12 00:00:00


In [11]:
# checking the shape
df_orders.shape

(99441, 8)

In [13]:
# order_id is the primary key 
print(f"{df_orders['order_id'].nunique()} unique values for order_id.")


99441 unique values for order_id.


In [15]:
# given this, rather than dropping the duplicate rows its best to check if any exist:
df_orders[df_orders.duplicated(keep=False)].shape

(0, 8)

In [16]:
# looking for null values
df_orders.isnull().sum()

order_id                            0
customer_id                         0
order_status                        0
order_purchase_timestamp            0
order_approved_at                 160
order_delivered_carrier_date     1783
order_delivered_customer_date    2965
order_estimated_delivery_date       0
dtype: int64

In [17]:
# exporting to the clean folder
df_orders.to_csv("data/cleaned/clean_orders.csv", index=False)