# 0.0 Imports

In [13]:
import pandas as pd
import numpy as np
import seaborn as sns
import inflection

from sklearn.model_selection import train_test_split

# 0.1 Helper Functions

# 0.2 Loading Data

In [9]:
df_raw = pd.read_csv('ecommerce.csv', encoding='latin1')
df_raw = df_raw.drop(columns=['Unnamed: 8'], axis=1)
df_raw.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,29-Nov-16,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,29-Nov-16,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,29-Nov-16,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,29-Nov-16,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,29-Nov-16,3.39,17850.0,United Kingdom


# 1.0 Data Description

In [11]:
df1 = df_raw.copy()

## 1.1 Rename Columns

In [14]:
cols = ['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate', 'UnitPrice', 'CustomerID', 'Country']

snakecase = lambda x: inflection.underscore(x)

cols_new = list(map(snakecase, cols))

df1.columns = cols_new

## 1.2 Data Dimension

In [15]:
df1.shape

(541909, 8)

## 1.3 Data Types

In [16]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   invoice_no    541909 non-null  object 
 1   stock_code    541909 non-null  object 
 2   description   540455 non-null  object 
 3   quantity      541909 non-null  int64  
 4   invoice_date  541909 non-null  object 
 5   unit_price    541909 non-null  float64
 6   customer_id   406829 non-null  float64
 7   country       541909 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 33.1+ MB


## 1.4 Check NA

In [17]:
df1.isna().sum()

invoice_no           0
stock_code           0
description       1454
quantity             0
invoice_date         0
unit_price           0
customer_id     135080
country              0
dtype: int64

## 1.5 Replace NA

In [21]:
df1 = df1.dropna(subset=['customer_id', 'description'])
df1.isna().sum()

invoice_no      0
stock_code      0
description     0
quantity        0
invoice_date    0
unit_price      0
customer_id     0
country         0
dtype: int64

## 1.6 Change Types

In [26]:
df1['invoice_date'] = pd.to_datetime( df1['invoice_date'], format='%d-%b-%y')

df1['customer_id'] = df1['customer_id'].astype(int)

In [24]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 406829 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   invoice_no    406829 non-null  object        
 1   stock_code    406829 non-null  object        
 2   description   406829 non-null  object        
 3   quantity      406829 non-null  int64         
 4   invoice_date  406829 non-null  datetime64[ns]
 5   unit_price    406829 non-null  float64       
 6   customer_id   406829 non-null  int32         
 7   country       406829 non-null  object        
dtypes: datetime64[ns](1), float64(1), int32(1), int64(1), object(4)
memory usage: 26.4+ MB


## 1.7 Descriptive Statistical 

### 1.7.1 Numerical Attributes

In [23]:
df1.describe()

Unnamed: 0,quantity,invoice_date,unit_price,customer_id
count,406829.0,406829,406829.0,406829.0
mean,12.061303,2017-07-08 03:16:19.124890624,3.460471,15287.69057
min,-80995.0,2016-11-29 00:00:00,0.0,12346.0
25%,2.0,2017-04-04 00:00:00,1.25,13953.0
50%,5.0,2017-07-29 00:00:00,1.95,15152.0
75%,12.0,2017-10-18 00:00:00,3.75,16791.0
max,80995.0,2017-12-07 00:00:00,38970.0,18287.0
std,248.69337,,69.315162,1713.600303


### 1.7.2 Categrical Attributes

# 2.0 Feature Engeneering

# 3.0 Exploratory Data Analysis

# 4.0 Data Preparation

# 5.0 Machine Learning Models