In [1]:
import os
from pathlib import Path

import pandas as pd
import plotly.express as pe

### Product Type: Indicates category of product sold
### Net Quantity : How many pieces/units of this type were sold
### Gross Sales : Total money received for all units sold of this product
### Discounts : How much money was lost due to discounts
### Returns : How much money was lost due to product being returned
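### Total Net Sales : Gross Sales + Discounts + Returns (discounts and returns are stored as negative amounts, i.e. gross sales minus the money lost to discounts and returns)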


In [2]:
# Directory that holds the datasets. It defaults to the original author's
# folder so existing runs behave exactly as before; set the RETAIL_DATA_DIR
# environment variable to point the notebook at another location without
# editing this cell.
DATA_DIR = Path(
    os.environ.get("RETAIL_DATA_DIR", "/home/harshit/Desktop/PythonDA/datasets")
)

# Kept as a plain string so that `path` has the same type as before.
path = str(DATA_DIR / "business.retailsales.csv")

# Load the retail-sales CSV into a DataFrame.
df = pd.read_csv(path)

# Bare last expression: renders the frame (pandas truncates long frames).
df

Unnamed: 0,Product Type,Net Quantity,Gross Sales,Discounts,Returns,Total Net Sales
0,Art & Sculpture,34,14935.0,-594.00,-1609.00,12732.00
1,Basket,13,3744.0,-316.80,0.00,3427.20
2,Basket,12,3825.0,-201.60,-288.00,3335.40
3,Basket,17,3035.0,-63.25,0.00,2971.75
4,Art & Sculpture,47,2696.8,-44.16,0.00,2652.64
...,...,...,...,...,...,...
1770,Kitchen,0,28.0,-2.81,-25.19,0.00
1771,Jewelry,0,28.0,0.00,-28.00,0.00
1772,Basket,0,116.0,-23.20,-92.80,0.00
1773,Kitchen,0,16.5,0.00,-16.50,0.00


### Initial exploration of data

In [4]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

(1775, 6)

### check how many missing values per column

In [5]:
# isna() flags every missing cell; summing the flags gives one count per column.
df.isna().sum()

Product Type       8
Net Quantity       0
Gross Sales        0
Discounts          0
Returns            0
Total Net Sales    0
dtype: int64

In [6]:
# Prints the index range, per-column non-null counts and dtypes, and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1775 entries, 0 to 1774
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Product Type     1767 non-null   object 
 1   Net Quantity     1775 non-null   int64  
 2   Gross Sales      1775 non-null   float64
 3   Discounts        1775 non-null   float64
 4   Returns          1775 non-null   float64
 5   Total Net Sales  1775 non-null   float64
dtypes: float64(4), int64(1), object(1)
memory usage: 83.3+ KB


In [7]:
# Number of distinct values in each column (missing values are not counted by default).
df.nunique()

Product Type        18
Net Quantity        42
Gross Sales        388
Discounts          295
Returns             62
Total Net Sales    729
dtype: int64

In [8]:
# Column labels of the frame.
df.columns

Index(['Product Type', 'Net Quantity', 'Gross Sales', 'Discounts', 'Returns',
       'Total Net Sales'],
      dtype='object')

In [9]:
# Columns holding category labels.
categorical_columns = ["Product Type"]

# Columns holding numeric measurements.
real_value_columns = [
    "Net Quantity",
    "Gross Sales",
    "Discounts",
    "Returns",
    "Total Net Sales",
]

### show statistical description of columns

In [11]:
# Count / unique / top / freq summary for the categorical columns only.
df.loc[:, categorical_columns].describe()

Unnamed: 0,Product Type
count,1767
unique,18
top,Basket
freq,551


In [10]:
# Summary statistics; with default arguments describe() reports only the
# numeric columns of this frame.
df.describe()

Unnamed: 0,Net Quantity,Gross Sales,Discounts,Returns,Total Net Sales
count,1775.0,1775.0,1775.0,1775.0,1775.0
mean,3.712676,199.671746,-6.317623,-5.385437,187.968687
std,6.243078,464.880638,20.903517,46.654269,414.547039
min,-1.0,0.0,-594.0,-1609.0,-106.25
25%,1.0,48.0,-6.0,0.0,44.8
50%,2.0,100.0,0.0,0.0,96.0
75%,4.0,185.5,0.0,0.0,184.75
max,96.0,14935.0,0.0,0.0,12732.0


### Report

#### How many records (rows) do we have for each product type?

In [14]:
# Number of rows in each "Product Type" group, as a Series indexed by product type.
ans = df.groupby("Product Type").size()
display(ans)

Product Type
Accessories          39
Art & Sculpture     337
Basket              551
Christmas            73
Easter                1
Fair Trade Gifts     28
Furniture            16
Gift Baskets          1
Home Decor          131
Jewelry             210
Kids                 63
Kitchen             161
Music                29
One-of-a-Kind        12
Recycled Art         23
Skin Care            11
Soapstone            67
Textiles             14
dtype: int64

In [22]:
# Order the per-type counts ascending so the bars read from smallest to largest.
ans = ans.sort_values()

# The counts are already aggregated, so a bar chart is the appropriate mark;
# a histogram would re-aggregate the values and label the axis "sum of y".
# Turning the Series into a tidy two-column frame lets plotly pick up real
# column names for the axis and legend labels.
counts_df = ans.rename("Count").reset_index()

pe.bar(
    data_frame=counts_df,
    x="Product Type",
    y="Count",
    color="Product Type",
    title="Number of records per product type",
)

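### What are the min, max & average amounts of

- Gross Sales
- Net Sales

FOR EACH PRODUCT TYPE?

Expected layout of the answer:

| Product Type    | Gross Sales min | Gross Sales max | Gross Sales average | Net Sales min | Net Sales max | Net Sales average |
|-----------------|-----------------|-----------------|---------------------|---------------|---------------|-------------------|
| Jewelry         |                 |                 |                     |               |               |                   |
| Art & Sculpture |                 |                 |                     |               |               |                   |
| Kitchen         |                 |                 |                     |               |               |                   |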

In [24]:
# Grouping key, the two sales columns to summarise, and the statistics to compute.
key = ["Product Type"]
cols = ["Gross Sales", "Total Net Sales"]
ops = ["min", "max", "mean"]

# One row per product type; columns come back as a two-level (column, statistic) index.
ans_df = df.groupby(key)[cols].agg(ops)

# Flatten the two-level column index into single "<column>_<statistic>" labels.
names = [f"{column}_{statistic}" for column, statistic in ans_df.columns]
ans_df.columns = names

ans_df

Unnamed: 0_level_0,Gross Sales_min,Gross Sales_max,Gross Sales_mean,Total Net Sales_min,Total Net Sales_max,Total Net Sales_mean
Product Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Accessories,22.0,472.0,99.805128,22.0,458.0,97.061026
Art & Sculpture,13.2,14935.0,268.00178,13.2,12732.0,250.685015
Basket,24.0,3825.0,261.008167,0.0,3427.2,244.630472
Christmas,18.0,2160.0,212.0,0.0,2114.41,198.093288
Easter,38.0,38.0,38.0,34.2,34.2,34.2
Fair Trade Gifts,12.0,432.0,80.642857,12.0,412.4,78.738214
Furniture,38.0,350.0,127.125,33.75,280.0,116.56
Gift Baskets,19.5,19.5,19.5,19.5,19.5,19.5
Home Decor,18.0,1748.0,206.981298,18.0,1701.15,196.18313
Jewelry,12.0,1584.0,147.847619,0.0,1516.83,140.823571


In [26]:
# One bar chart per aggregated column (e.g. "Gross Sales_min"), with the
# product types from the frame's index on the x axis.
for col in ans_df.columns:
    fig = pe.bar(
        data_frame=ans_df,
        x=ans_df.index,
        y=col,
        # A title makes each of the otherwise similar figures identifiable
        # when the notebook is skimmed.
        title=f"{col} by product type",
        labels={col: col},
    )

    display(fig)

### What is the total number of units sold for each product type?

In [29]:
key = ["Product Type"]
col = ["Net Quantity"]

# Total net units per product type. Selecting with a list keeps the result a
# one-column DataFrame rather than a Series.
result_df = df.groupby(key)[col].sum()

display(result_df)

# Take the x axis from result_df itself. The earlier version used
# ans_df.index, which silently depended on another cell having run and on
# both frames sharing the same index order.
fig = pe.bar(
    data_frame=result_df,
    x=result_df.index,
    y="Net Quantity",
    title="Total units sold per product type",
)

display(fig)

Unnamed: 0_level_0,Net Quantity
Product Type,Unnamed: 1_level_1
Accessories,84
Art & Sculpture,1427
Basket,1461
Christmas,575
Easter,1
Fair Trade Gifts,110
Furniture,27
Gift Baskets,1
Home Decor,404
Jewelry,991
