In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import zipfile

# zf = zipfile.ZipFile('C:/Users/Analytics Vidhya/Desktop/test.zip') # having First.csv zipped file.
# df = pd.read_csv(zf.open('First.csv'))

### 1. Read Data

The e-commerce dataset is a CSV file stored inside a zip archive, so we open the file from the archive before reading it with pandas.

In [3]:
# Read data.csv straight out of the zip archive. The context managers close
# both the archive and the member stream once pandas has finished reading,
# instead of leaving the file handles open for the life of the kernel.
with zipfile.ZipFile('../data/raw/ecommerce-data.zip') as zf:
    with zf.open('data.csv') as csv_file:
        order = pd.read_csv(csv_file, encoding='ISO-8859-1', parse_dates=["InvoiceDate"])
order.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


1. InvoiceNo: Invoice number. If this code starts with letter 'c', it indicates a cancellation.
2. StockCode: Product (item) code. Nominal, a 5-digit integral number uniquely assigned to each distinct product. 
3. Description: Product (item) name. Nominal. 
4. Quantity: The quantities of each product (item) per transaction. Numeric. 
5. InvoiceDate: Invoice Date and time. Numeric, the day and time when each transaction was generated. 
6. UnitPrice: Unit price. Numeric, Product price per unit in sterling. 
7. CustomerID: Customer number. Nominal, a 5-digit integral number uniquely assigned to each customer. 
8. Country: Country name. Nominal, the name of the country where each customer resides.

In [None]:
# (rows, columns) of the loaded order frame.
order.shape

In [None]:
# Use the invoice timestamp as the row index.
# Rebinding (instead of inplace=True) plus the column check keeps this cell
# safe to re-run: once InvoiceDate is already the index, it is a no-op
# rather than a KeyError.
if 'InvoiceDate' in order.columns:
    order = order.set_index(['InvoiceDate'])
order.head()

In [None]:
# Show the index object of the order frame.
order.index

### 2. Handle Missing Data

In [None]:
# Count missing values per column (isna is the same method as isnull).
order.isna().sum()

Fill the missing values with sentinels: "unknown" for the Description column and -99999 for the CustomerID column.

In [None]:
# Replace missing values with explicit sentinels: a placeholder text for
# Description and an out-of-range id for CustomerID.
# (The stray `order.fillna({})` call was dropped: it filled nothing and its
# result was discarded.)
order = order.fillna({
    'Description': 'unknown',
    'CustomerID' : -99999
})
order.head()

In [None]:
# Re-count missing values per column after the fill.
order.isna().sum()

### 3. Check the Datatype

In [None]:
# Print the column dtypes, non-null counts and memory usage summary.
order.info()

In [None]:
# Print the type of the first value in each column.
# `order` is indexed by InvoiceDate at this point, so `series[0]` would depend
# on pandas' deprecated "integer key means position" fallback; `.iloc[0]`
# asks for the first row explicitly.
for column in ("InvoiceNo", "StockCode", "Description", "Quantity",
               "UnitPrice", "CustomerID", "Country"):
    # Left-pad the name to 13 characters so the colons line up.
    print(f"{column:<13}:", type(order[column].iloc[0]))

Note:
1. InvoiceNo and StockCode are stored as strings because some of their values contain letters: an InvoiceNo that starts with 'C' marks a cancellation by the customer, and some StockCodes carry a letter suffix (e.g. 85123A).
2. CustomerID is stored as a float; it should be an integer.

In [None]:
# astype returns a new object, so the result must be assigned back for the
# cast to take effect (the bare call left `order` unchanged).
# NOTE(review): the note preceding this cell talks about CustomerID, not
# Quantity. Confirm which column was meant to be cast.
order = order.astype({"Quantity": "int"})
# .iloc[0] reads the first row by position; the frame is indexed by date.
print("Quantity     :", type(order["Quantity"].iloc[0]))

### 4. Handle Wrong Format

Let's check wrong format in each column.

In [None]:
# Rows whose quantity is zero or negative.
df = order.loc[order['Quantity'].le(0)]
print(df.shape)
df.head()

In [None]:
# Distinct descriptions among the rows selected into `df`.
df['Description'].unique()

In [None]:
# Rows whose stock code is exactly 'D'.
df = order.loc[order['StockCode'].eq('D')]
print(df.shape)
df.head()

In [None]:
# Rows with a negative unit price.
df = order.loc[order['UnitPrice'].lt(0)]
print(df.shape)
df

A unit price of zero is possible, so we keep those rows.

We don't need the Discount and "Adjust bad debt" rows, so we will remove them from the order data frame.

In [None]:
# Rows sold at a unit price of zero with a positive quantity.
free_and_sold = order['UnitPrice'].eq(0) & order['Quantity'].gt(0)
df = order.loc[free_and_sold]
print(df.shape)
df

In [None]:
# Keep rows where both the quantity and the unit price are strictly positive.
positive_qty_and_price = order['Quantity'].gt(0) & order['UnitPrice'].gt(0)
real_order = order.loc[positive_qty_and_price]
real_order.shape

### 4. Check Duplicate Data

In [None]:
# DataFrame.duplicated() compares column values only. InvoiceDate lives in the
# index at this point, so it is brought back as a column for the comparison;
# otherwise rows that differ only in their timestamp count as duplicates.
# to_numpy() drops the reset index so the mask applies to `order` by position.
is_duplicate = order.reset_index().duplicated().to_numpy()
duplicate_order = order[is_duplicate]
duplicate_order

There is no duplicate data in the dataframe. 

### 5. Separate Dataframe Between Real & Cancel Order

#### 5.1 Real Order

A real order can be identified from its invoice number or its quantity.
An order is a cancellation if its invoice number starts with 'c' or its quantity is less than zero.

In [None]:
# A real order needs a positive quantity AND a positive unit price.
# Both conditions are combined into one mask: filtering `order` twice in a row
# made the second assignment overwrite the first, silently dropping the
# Quantity filter.
is_real = (order['Quantity'] > 0) & (order['UnitPrice'] > 0)
real_order = order[is_real]
real_order.head()

In [None]:
# (rows, columns) of the real-order subset.
real_order.shape

In [None]:
# Share of all rows that are real orders, printed as a percentage.
num_real = len(real_order)
print('Real Order persentage %f%%' % (num_real / len(order) * 100))

#### 5.2 Cancel Order

In [None]:
# Rows with a negative quantity are treated as cancellations.
cancel_order = order.loc[order['Quantity'].lt(0)]
cancel_order.head()

In [None]:
# (rows, columns) of the cancellation subset.
cancel_order.shape

In [None]:
# Share of all rows that are cancellations, printed as a percentage.
num_cancel = len(cancel_order)
print('cancelation persentage %f%%' % (num_cancel / len(order) * 100))

#### 5.3 Bad Debt

In [None]:
# Rows with a negative unit price are treated as bad-debt adjustments.
bad_debt = order.loc[order['UnitPrice'].lt(0)]
bad_debt

In [None]:
# Share of all rows that are bad-debt adjustments, printed as a percentage.
num_bad = len(bad_debt)
print('Bad Debt persentage %f%%' % (num_bad / len(order) * 100))

### 6. Data Visualization