## Imports

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

import warnings
# Silence every warning for the rest of the session: no category, module or
# message filter is given, so the "ignore" action applies to all of them.
# NOTE(review): this also hides deprecation and dtype warnings raised by the
# libraries imported above — consider narrowing the filter.
warnings.filterwarnings("ignore")

## Load data set

In [2]:
# Read the raw CSV into a DataFrame, decoding the file with the
# 'unicode_escape' codec.
data = pd.read_csv(
    filepath_or_buffer="../data/data.csv",
    encoding='unicode_escape',
)

## Explore data

In [3]:
# List the column names exactly as they were read from the CSV.
data.columns

Index(['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'UnitPrice', 'CustomerID', 'Country'],
      dtype='object')

## Renaming the columns

In [4]:
# Map the original CSV headers to the descriptive names used in the rest of
# the notebook. 'Country' already has its final name, so it needs no entry.
# The result is rebound to `data` instead of using inplace=True: the cell stays
# idempotent on re-run (labels missing from the frame are ignored by rename)
# and no frame is mutated behind an earlier cell's displayed output.
data = data.rename(
    columns={
        'InvoiceNo': 'Invoice_Num',
        'StockCode': 'Item_Code',
        'Description': 'Item_Description',
        'Quantity': 'Amount_Purchased',
        'InvoiceDate': 'Invoice_Date',
        'UnitPrice': 'Price_Per_Unit',
        'CustomerID': 'Customer_ID',
    }
)

In [5]:
# Check the column names again to confirm the rename took effect.
data.columns

Index(['Invoice_Num', 'Item_Code', 'Item_Description', 'Amount_Purchased',
       'Invoice_Date', 'Price_Per_Unit', 'Customer_ID', 'Country'],
      dtype='object')

## First 10 and last 10 rows of the data set

In [6]:
# Show the first 10 rows of the frame.
data.head(10)

Unnamed: 0,Invoice_Num,Item_Code,Item_Description,Amount_Purchased,Invoice_Date,Price_Per_Unit,Customer_ID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
5,536365,22752,SET 7 BABUSHKA NESTING BOXES,2,12/1/2010 8:26,7.65,17850.0,United Kingdom
6,536365,21730,GLASS STAR FROSTED T-LIGHT HOLDER,6,12/1/2010 8:26,4.25,17850.0,United Kingdom
7,536366,22633,HAND WARMER UNION JACK,6,12/1/2010 8:28,1.85,17850.0,United Kingdom
8,536366,22632,HAND WARMER RED POLKA DOT,6,12/1/2010 8:28,1.85,17850.0,United Kingdom
9,536367,84879,ASSORTED COLOUR BIRD ORNAMENT,32,12/1/2010 8:34,1.69,13047.0,United Kingdom


In [7]:
# Show the last 10 rows of the frame.
data.tail(10)

Unnamed: 0,Invoice_Num,Item_Code,Item_Description,Amount_Purchased,Invoice_Date,Price_Per_Unit,Customer_ID,Country
541899,581587,22726,ALARM CLOCK BAKELIKE GREEN,4,12/9/2011 12:50,3.75,12680.0,France
541900,581587,22730,ALARM CLOCK BAKELIKE IVORY,4,12/9/2011 12:50,3.75,12680.0,France
541901,581587,22367,CHILDRENS APRON SPACEBOY DESIGN,8,12/9/2011 12:50,1.95,12680.0,France
541902,581587,22629,SPACEBOY LUNCH BOX,12,12/9/2011 12:50,1.95,12680.0,France
541903,581587,23256,CHILDRENS CUTLERY SPACEBOY,4,12/9/2011 12:50,4.15,12680.0,France
541904,581587,22613,PACK OF 20 SPACEBOY NAPKINS,12,12/9/2011 12:50,0.85,12680.0,France
541905,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,12/9/2011 12:50,2.1,12680.0,France
541906,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,12/9/2011 12:50,4.15,12680.0,France
541907,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,12/9/2011 12:50,4.15,12680.0,France
541908,581587,22138,BAKING SET 9 PIECE RETROSPOT,3,12/9/2011 12:50,4.95,12680.0,France


## Shape of dataframe

In [8]:
# (number of rows, number of columns) of the frame.
data.shape

(541909, 8)

## Item_Description and Customer_ID have missing values

In [9]:
# Print per-column non-null counts and dtypes, plus the frame's memory usage;
# columns whose non-null count is below the row count contain missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
Invoice_Num         541909 non-null object
Item_Code           541909 non-null object
Item_Description    540455 non-null object
Amount_Purchased    541909 non-null int64
Invoice_Date        541909 non-null object
Price_Per_Unit      541909 non-null float64
Customer_ID         406829 non-null float64
Country             541909 non-null object
dtypes: float64(2), int64(1), object(5)
memory usage: 33.1+ MB


In [13]:
# Among the rows whose Customer_ID is missing, count the non-null values in
# each column. The boolean mask returned by .isna() is used directly as the
# row filter; comparing it with `== True` is redundant.
data[data['Customer_ID'].isna()].count()

Invoice_Num         135080
Item_Code           135080
Item_Description    133626
Amount_Purchased    135080
Invoice_Date        135080
Price_Per_Unit      135080
Customer_ID              0
Country             135080
dtype: int64

## Top 10 Customers by Number of Orders

In [None]:
# An order is one invoice, and a single invoice spans several rows (one per
# item line), so counting rows per customer would measure purchased item lines
# rather than orders. Count the distinct Invoice_Num values per customer instead.
# Rows with a missing Customer_ID are excluded (groupby drops NaN keys by default).
# NOTE(review): every distinct invoice number is counted as an order — confirm
# whether some invoice numbers (e.g. cancellations or adjustments) should be
# filtered out first.
data.groupby('Customer_ID')['Invoice_Num'].nunique().nlargest(10)

## Basic statistics

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns; with default arguments describe() leaves out the object columns.
data.describe()