# Descriptive Statistics and Grouping

# Import Packages

In [1]:
import pandas as pd
import numpy as np

In [2]:
import os
os.getcwd()

'C:\\Users\\LENOVO\\Python\\Beginner'

# Import data from CSV to Dataframe

In [3]:
df = pd.read_csv('C:/Users/LENOVO/Python/Online Retail Data.csv', header=0)
df

Unnamed: 0,order_id,product_code,product_name,quantity,order_date,price,customer_id
0,493410,TEST001,This is a test product.,5,2010-01-04 09:24:00,4.50,12346.0
1,C493411,21539,RETRO SPOTS BUTTER DISH,-1,2010-01-04 09:43:00,4.25,14590.0
2,493412,TEST001,This is a test product.,5,2010-01-04 09:53:00,4.50,12346.0
3,493413,21724,PANDA AND BUNNIES STICKER SHEET,1,2010-01-04 09:54:00,0.85,
4,493413,84578,ELEPHANT TOY WITH BLUE T-SHIRT,1,2010-01-04 09:54:00,3.75,
...,...,...,...,...,...,...,...
461768,539991,21618,4 WILDFLOWER BOTANICAL CANDLES,1,2010-12-23 16:49:00,1.25,
461769,539991,72741,GRAND CHOCOLATECANDLE,4,2010-12-23 16:49:00,1.45,
461770,539992,21470,FLOWER VINE RAFFIA FOOD COVER,1,2010-12-23 17:41:00,3.75,
461771,539992,22258,FELT FARM ANIMAL RABBIT,1,2010-12-23 17:41:00,1.25,


# Data Cleansing

In [4]:
df_clean = df.copy()

delete all lines without product_name


In [5]:
df_clean = df_clean[~df_clean['product_name'].isna()]

make all products


In [6]:
df_clean['product_name'] = df_clean['product_name'].str.lower()

Delete all rows with product_code or product_name test


In [7]:
df_clean = df_clean[(~df_clean['product_code'].str.lower().str.contains('test')) |
                    (~df_clean['product_name'].str.contains('test '))]

create order_status column with value 'cancelled' if order_id starts with letter 'c' and 'delivered' if order_id does not start with letter 'c'

In [8]:
df_clean['order_status'] = np.where(df_clean['order_id'].str[:1]=='C', 'cancelled', 'delivered')

change the negative quantity value to be positive because the negative value only indicates the order cancelled


In [9]:
df_clean['quantity'] = df_clean['quantity'].abs()

delete lines with negative price price


In [10]:
df_clean = df_clean[df_clean['price']>0]

make an amount value, which is the multiplication between quantity and price


In [11]:
df_clean['amount'] = df_clean['quantity'] * df_clean['price']

replace product_name from product_code which has several product_names with one of its product_names that most often appears


In [12]:
most_freq_product_name = df_clean.groupby(['product_code','product_name'], as_index=False).agg(order_cnt=('order_id','nunique')).sort_values(['product_code','order_cnt'], ascending=[True,False])
most_freq_product_name['rank'] = most_freq_product_name.groupby('product_code')['order_cnt'].rank(method='first', ascending=False)
most_freq_product_name = most_freq_product_name[most_freq_product_name['rank']==1].drop(columns=['order_cnt','rank'])

In [13]:
df_clean = df_clean.merge(most_freq_product_name.rename(columns={'product_name':'most_freq_product_name'}), how='left', on='product_code')
df_clean['product_name'] = df_clean['most_freq_product_name']
df_clean = df_clean.drop(columns='most_freq_product_name')

Converting customer_id to string

In [14]:
df_clean['customer_id'] = df_clean['customer_id'].astype(str)
df_clean = df_clean.reset_index(drop=True)
df_clean

Unnamed: 0,order_id,product_code,product_name,quantity,order_date,price,customer_id,order_status,amount
0,C493411,21539,red retrospot butter dish,1,2010-01-04 09:43:00,4.25,14590.0,cancelled,4.25
1,493413,21724,panda and bunnies sticker sheet,1,2010-01-04 09:54:00,0.85,,delivered,0.85
2,493413,84578,elephant toy with blue t-shirt,1,2010-01-04 09:54:00,3.75,,delivered,3.75
3,493413,21723,alphabet hearts sticker sheet,1,2010-01-04 09:54:00,0.85,,delivered,0.85
4,493414,21844,red retrospot mug,36,2010-01-04 10:28:00,2.55,14590.0,delivered,91.80
...,...,...,...,...,...,...,...,...,...
458241,539991,21618,4 wildflower botanical candles,1,2010-12-23 16:49:00,1.25,,delivered,1.25
458242,539991,72741,grand chocolatecandle,4,2010-12-23 16:49:00,1.45,,delivered,5.80
458243,539992,21470,flower vine raffia food cover,1,2010-12-23 17:41:00,3.75,,delivered,3.75
458244,539992,22258,felt farm animal rabbit,1,2010-12-23 17:41:00,1.25,,delivered,1.25


# Things to do frequently in EDA

## Knowing many rows/records and columns/variables/features

In [15]:
df_clean.shape

(458246, 9)

## Know the data type of each column

In [16]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458246 entries, 0 to 458245
Data columns (total 9 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   order_id      458246 non-null  object 
 1   product_code  458246 non-null  object 
 2   product_name  458246 non-null  object 
 3   quantity      458246 non-null  int64  
 4   order_date    458246 non-null  object 
 5   price         458246 non-null  float64
 6   customer_id   458246 non-null  object 
 7   order_status  458246 non-null  object 
 8   amount        458246 non-null  float64
dtypes: float64(2), int64(1), object(6)
memory usage: 31.5+ MB


## Know the unique value of the categorical column/variable

In [17]:
# Without the frequency information of each unique value
df_clean['order_status'].unique()

array(['cancelled', 'delivered'], dtype=object)

In [18]:
# with the frequency information of each unique value
df_clean['order_status'].value_counts()

order_status
delivered    449996
cancelled      8250
Name: count, dtype: int64

# Calculating Descriptive Statistics from Dataframe

## Knowing the statistics of the numerical data distribution in the whole column in the dataframe as well as quantitatively

In [19]:
df_clean.describe()

Unnamed: 0,quantity,price,amount
count,458246.0,458246.0,458246.0
mean,10.004764,4.903037,19.512293
std,61.180755,99.536887,118.566258
min,1.0,0.001,0.001
25%,1.0,1.25,3.75
50%,3.0,2.1,9.3
75%,10.0,4.21,17.0
max,10000.0,25111.09,25111.09


## Know certain statistics individually

In [20]:
# Frequency (Frequency) - Not Unique
df_clean['order_id'].count()

np.int64(458246)

In [21]:
# Frequency (Frequency) - Unique
df_clean['order_id'].nunique()

22244

In [22]:
# summons (sum)
df_clean['quantity'].sum()

np.int64(4584643)

In [23]:
# summons (sum) subset of data
df_clean.loc[df_clean['product_name'].str.contains('tea'), 'quantity'].sum()

np.int64(166722)

In [24]:
# mean
df_clean['quantity'].mean()

np.float64(10.004763816814549)

In [25]:
# mean subset of data
df_clean.loc[df_clean['product_name'].str.contains('tea'), 'quantity'].mean()

np.float64(9.657205746061168)

In [26]:
# median
df_clean['quantity'].median()

3.0

In [27]:
# median subset from data
df_clean.loc[df_clean['product_name'].str.contains('tea'), 'quantity'].median()

3.0

In [28]:
# mode (mode)
df_clean['quantity'].mode()[0]

np.int64(1)

In [29]:
# mode (mode) subset of data
df_clean.loc[df_clean['product_name'].str.contains('tea'), 'quantity'].mode()[0]

np.int64(1)

In [30]:
# range (range)
df_clean['quantity'].max() - df_clean['quantity'].min()

9999

In [31]:
# Range (range) subset of data
df_clean.loc[df_clean['product_name'].str.contains('tea'), 'quantity'].max() - df_clean.loc[df_clean['product_name'].str.contains('tea'), 'quantity'].min()

4607

In [32]:
# Standard Deviation (Standard Deviation)
df_clean['quantity'].std()

61.18075511600227

In [33]:
# Standard Deviation (Standard Deviation) Subset of data
df_clean.loc[df_clean['product_name'].str.contains('tea'), 'quantity'].std()

44.8225496763168

In [34]:
# variance (variance)
df_clean['quantity'].var()

3743.084796564238

In [35]:
df_clean['quantity'].std()**2

3743.0847965642374

In [36]:
# variance (variance) subset of data
df_clean.loc[df_clean['product_name'].str.contains('tea'), 'quantity'].var()

2009.0609594858872

In [37]:
# lower quartile (lower quartile)
df_clean['quantity'].quantile(.25)

np.float64(1.0)

In [38]:
# lower quartile (lower quartile) subset of data
df_clean.loc[df_clean['product_name'].str.contains('tea'), 'quantity'].quantile(.25)

np.float64(1.0)

In [39]:
# Upper quartile (upper quartile)
df_clean['quantity'].quantile(.75)

np.float64(10.0)

In [40]:
# Upper quartile (upper quartile) subset of data
df_clean.loc[df_clean['product_name'].str.contains('tea'), 'quantity'].quantile(.75)

np.float64(8.0)

In [41]:
# Interquartile range (Interquartile Range)
df_clean['quantity'].quantile(.75) - df_clean['quantity'].quantile(.25)

np.float64(9.0)

In [42]:
# Interquartile range (interquartile range) subset of data
df_clean.loc[df_clean['product_name'].str.contains('tea'), 'quantity'].quantile(.75) - df_clean.loc[df_clean['product_name'].str.contains('tea'), 'quantity'].quantile(.25)

np.float64(7.0)

In [43]:
def iqr(x):
    q1 = x.quantile(.25)
    q3 = x.quantile(.75)
    return q3-q1

In [44]:
iqr(df_clean.loc[df_clean['product_name'].str.contains('tea'), 'quantity'])

np.float64(7.0)

# Calculating Descriptive Statistics with Grouping

## Aggregate from 1 column only

In [45]:
# with 1 column as grouping
df_clean.loc[df_clean['product_name'].str.contains('tea')].groupby('product_name')['quantity'].sum().sort_values(ascending=False)

product_name
60 teatime fairy cake cases           27584
antique silver tea glass etched       17190
moroccan tea glass                     7218
antique silver tea glass engraved      6941
potting shed tea mug                   6222
                                      ...  
english rose tea set in gift box          5
light topaz teal/aqua col necklace        4
ceramic cake teapot with cherry           3
dotcomgiftshop tea towel                  2
teatime round cake tins                   1
Name: quantity, Length: 87, dtype: int64

In [46]:
df_clean.loc[df_clean['product_name'].str.contains('tea')].groupby('product_name', as_index=False)['quantity'].sum().sort_values('quantity', ascending=False)

Unnamed: 0,product_name,quantity
0,60 teatime fairy cake cases,27584
2,antique silver tea glass etched,17190
30,moroccan tea glass,7218
1,antique silver tea glass engraved,6941
35,potting shed tea mug,6222
...,...,...
17,english rose tea set in gift box,5
27,light topaz teal/aqua col necklace,4
10,ceramic cake teapot with cherry,3
13,dotcomgiftshop tea towel,2


In [47]:
# aggregate with more than 1 column as grouping
df_clean.loc[df_clean['product_name'].str.contains('tea')].groupby(['product_name','order_status'])['quantity'].sum().sort_values(ascending=False)

product_name                       order_status
60 teatime fairy cake cases        delivered       27432
antique silver tea glass etched    delivered       17083
moroccan tea glass                 delivered        7129
antique silver tea glass engraved  delivered        6935
potting shed tea mug               delivered        6113
                                                   ...  
french chateau oval platter        cancelled           1
tea time mug in gift box           cancelled           1
tea time breakfast basket          cancelled           1
teatime round cake tins            cancelled           1
white tea,coffee,sugar jars        cancelled           1
Name: quantity, Length: 139, dtype: int64

## Aggregate from several columns

In [48]:
# with 1 column as grouping
df_clean.loc[df_clean['product_name'].str.contains('tea')].groupby('product_name', as_index=False).agg(total_quantity=('quantity','sum'),total_amount=('amount','sum')).sort_values('total_quantity', ascending=False)

Unnamed: 0,product_name,total_quantity,total_amount
0,60 teatime fairy cake cases,27584,13885.20
2,antique silver tea glass etched,17190,23522.85
30,moroccan tea glass,7218,5744.56
1,antique silver tea glass engraved,6941,8272.97
35,potting shed tea mug,6222,7456.26
...,...,...,...
17,english rose tea set in gift box,5,23.25
27,light topaz teal/aqua col necklace,4,20.36
10,ceramic cake teapot with cherry,3,13.45
13,dotcomgiftshop tea towel,2,6.72


In [49]:
# with more than 1 column as grouping
df_clean.loc[df_clean['product_name'].str.contains('tea')].groupby(['product_name','order_status'], as_index=False).agg(total_quantity=('quantity','sum'),total_amount=('amount','sum')).sort_values('total_quantity', ascending=False)

Unnamed: 0,product_name,order_status,total_quantity,total_amount
1,60 teatime fairy cake cases,delivered,27432,13806.15
5,antique silver tea glass etched,delivered,17083,23402.78
47,moroccan tea glass,delivered,7129,5669.51
3,antique silver tea glass engraved,delivered,6935,8265.47
55,potting shed tea mug,delivered,6113,7321.53
...,...,...,...,...
30,french chateau oval platter,cancelled,1,7.95
105,tea time mug in gift box,cancelled,1,2.95
97,tea time breakfast basket,cancelled,1,2.10
129,teatime round cake tins,cancelled,1,9.95
