# Data exploration and cleaning

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Data loading

In [2]:
# Read the orders CSV into a DataFrame; the path is relative to the notebook's working directory.
orders = pd.read_csv(filepath_or_buffer='../00.Data/orders_cripted.csv')

In [3]:
# Preview the first five rows to get a feel for the columns and their values.
orders.head(n=5)

Unnamed: 0,Name,Financial Status,Paid at,Fulfillment Status,Fulfilled at,Accepts Marketing,Currency,Subtotal,Shipping,Taxes,...,Tax 1 Value,Tax 2 Name,Tax 2 Value,Tax 3 Name,Tax 3 Value,Tax 4 Name,Tax 4 Value,Tax 5 Name,Tax 5 Value,Receipt Number
0,#1244,paid,2019-04-15 09:51:49 +0200,unfulfilled,,yes,EUR,45.0,4.9,0.0,...,,,,,,,,,,
1,#1243,paid,2019-04-11 23:35:23 +0200,fulfilled,2019-04-11 23:43:20 +0200,yes,EUR,40.1,4.9,0.0,...,,,,,,,,,,
2,#1243,,,,,,,,,,...,,,,,,,,,,
3,#1242,refunded,2019-04-11 23:21:35 +0200,unfulfilled,,yes,EUR,40.1,4.9,0.0,...,,,,,,,,,,
4,#1242,,,,,,,,,,...,,,,,,,,,,


It seems that we have a lot of NaN values. I'm also not sure what the 'Tax N Name' and 'Tax N Value' columns are; they seem to be empty. Let's count the NaN values per column and compare them to the total amount of data we have.

In [7]:
# (rows, columns) of the frame: the baseline against which the per-column NaN counts are compared.
orders.shape

(2415, 49)

In [8]:
# Number of missing values in each column (isnull is the pandas alias of isna).
orders.isnull().sum()

Name                              0
Financial Status               2178
Paid at                        2258
Fulfillment Status             2178
Fulfilled at                   2208
Accepts Marketing              2178
Currency                       2178
Subtotal                       2178
Shipping                       2178
Taxes                          2178
Total                          2178
Discount Code                  2378
Discount Amount                2178
Shipping Method                2188
Created at                        0
Lineitem quantity                 0
Lineitem name                     0
Lineitem price                    0
Lineitem compare at price      2392
Lineitem sku                     79
Lineitem requires shipping        0
Lineitem taxable                  0
Lineitem fulfillment status       0
Billing City                   2186
Billing Zip                    2186
Billing Country                2185
Notes                          2362
Note Attributes             

Ok, so we have plenty of NaNs. We'll drop all the 'Tax' related columns except 'Tax 1' ('Tax 2' to 'Tax 5'), as there is no data in them.

In [9]:
# Remove the 'Tax 2' .. 'Tax 5' name/value columns; 'Tax 1 Name' / 'Tax 1 Value' are kept.
# The frame is rebound instead of mutated with inplace=True, and errors='ignore' skips
# labels that are already gone, so re-running this cell no longer raises a KeyError.
tax_columns_to_drop = [
    'Tax 2 Name', 'Tax 2 Value',
    'Tax 3 Name', 'Tax 3 Value',
    'Tax 4 Name', 'Tax 4 Value',
    'Tax 5 Name', 'Tax 5 Value',
]
orders = orders.drop(columns=tax_columns_to_drop, errors='ignore')

Let's check the rows where 'Financial Status' is NaN, as it seems to be a critical value for validating a sale.

In [22]:
# Allow up to 500 columns in the rendered output so wide frames are not elided.
pd.options.display.max_columns = 500
# Show only the rows whose 'Financial Status' is missing.
orders.loc[orders['Financial Status'].isna()]

Unnamed: 0,Name,Financial Status,Paid at,Fulfillment Status,Fulfilled at,Accepts Marketing,Currency,Subtotal,Shipping,Taxes,Total,Discount Code,Discount Amount,Shipping Method,Created at,Lineitem quantity,Lineitem name,Lineitem price,Lineitem compare at price,Lineitem sku,Lineitem requires shipping,Lineitem taxable,Lineitem fulfillment status,Billing City,Billing Zip,Billing Country,Notes,Note Attributes,Cancelled at,Payment Method,Payment Reference,Refunded Amount,Vendor,Id,Tags,Risk Level,Source,Lineitem discount,Tax 1 Name,Tax 1 Value,Receipt Number
2,#1243,,,,,,,,,,,,,,2019-04-11 23:35:23 +0200,1,TEM - QA - 17 - 2,32.50,,,True,False,fulfilled,,,,,,,,,,Mercat a Casa,,,,,0,,,
4,#1242,,,,,,,,,,,,,,2019-04-11 23:21:34 +0200,1,TEM - QA - 17 - 2,32.50,,,True,False,pending,,,,,,,,,,Mercat a Casa,,,,,0,,,
7,#1240,,,,,,,,,,,,,,2019-04-10 18:04:09 +0200,1,Desayuno salado - 2,15.00,,,True,False,fulfilled,,,,,,,,,,Mercat a Casa,,,,,0,,,
8,#1240,,,,,,,,,,,,,,2019-04-10 18:04:09 +0200,1,Cesta de temporada (1/2 pensión) - 2,59.00,,,True,False,fulfilled,,,,,,,,,,Mercat a Casa,,,,,0,,,
10,#1239,,,,,,,,,,,,,,2019-04-09 20:44:26 +0200,1,Pechuga de pollo fileteada (350 grs.),3.15,,12.0,True,False,fulfilled,,,,,,,,,,El Pagés,,,,,0,,,
11,#1239,,,,,,,,,,,,,,2019-04-09 20:44:26 +0200,1,Brie (150 grs.),2.60,,414.0,True,False,fulfilled,,,,,,,,,,Xarcuteria Alonso Andrés,,,,,0,,,
12,#1239,,,,,,,,,,,,,,2019-04-09 20:44:26 +0200,1,Cesta familiar (1/2 pensión) - 3,79.00,,,True,False,fulfilled,,,,,,,,,,Mercat a Casa,,,,,0,,,
14,#1238,,,,,,,,,,,,,,2019-04-09 17:00:02 +0200,1,Cesta de temporada (1/2 pensión) - 2,59.00,,,True,False,fulfilled,,,,,,,,,,Mercat a Casa,,,,,0,,,
18,#1235,,,,,,,,,,,,,,2019-04-06 20:26:56 +0200,1,Cesta vegetariana (1/2 pensión) - 2,49.95,,,True,False,fulfilled,,,,,,,,,,Mercat a Casa,,,,,0,,,
19,#1235,,,,,,,,,,,,,,2019-04-06 20:26:56 +0200,1,Cesta familiar (1/2 pensión) - 3,79.00,,,True,False,fulfilled,,,,,,,,,,Mercat a Casa,,,,,0,,,


In [21]:
# Restrict to rows without a 'Financial Status', then count the missing values per column
# to see which fields are empty on exactly those rows.
orders.loc[orders['Financial Status'].isnull()].isnull().sum()

Name                              0
Financial Status               2178
Paid at                        2178
Fulfillment Status             2178
Fulfilled at                   2178
Accepts Marketing              2178
Currency                       2178
Subtotal                       2178
Shipping                       2178
Taxes                          2178
Total                          2178
Discount Code                  2178
Discount Amount                2178
Shipping Method                2178
Created at                        0
Lineitem quantity                 0
Lineitem name                     0
Lineitem price                    0
Lineitem compare at price      2160
Lineitem sku                     34
Lineitem requires shipping        0
Lineitem taxable                  0
Lineitem fulfillment status       0
Billing City                   2178
Billing Zip                    2178
Billing Country                2178
Notes                          2178
Note Attributes             

In [15]:
# List the column labels currently present in the frame.
orders.columns

Index(['Name', 'Financial Status', 'Paid at', 'Fulfillment Status',
       'Fulfilled at', 'Accepts Marketing', 'Currency', 'Subtotal', 'Shipping',
       'Taxes', 'Total', 'Discount Code', 'Discount Amount', 'Shipping Method',
       'Created at', 'Lineitem quantity', 'Lineitem name', 'Lineitem price',
       'Lineitem compare at price', 'Lineitem sku',
       'Lineitem requires shipping', 'Lineitem taxable',
       'Lineitem fulfillment status', 'Billing City', 'Billing Zip',
       'Billing Country', 'Notes', 'Note Attributes', 'Cancelled at',
       'Payment Method', 'Payment Reference', 'Refunded Amount', 'Vendor',
       'Id', 'Tags', 'Risk Level', 'Source', 'Lineitem discount', 'Tax 1 Name',
       'Tax 1 Value', 'Receipt Number'],
      dtype='object')

Is it possible that some columns refer to the details of an order as a whole, so that NaN simply does not apply on the other rows, and the dataframe should be split?