In [170]:
import pandas as pd
import numpy as np
import glob

In [171]:
## PANDAS VERSION BEING USED

pd.__version__

'1.4.3'

## This notebook shows steps involved in EDA as well as general Pandas operations 

## SECTION A : load data and benchmarking

### Step 1 - read file into pandas dataframe

In [172]:
df = pd.read_csv('BigBasket_Products.csv')

### Step 2 - get Total rows and Columns - df.shape

In [173]:
df.shape
print(f'columns:{df.shape[0]}, rows:{df.shape[1]}')

columns:27555, rows:10


In [174]:
df.size

275550

In [175]:
df.index

RangeIndex(start=0, stop=27555, step=1)

### Step 3 - Initial Benchmarking
1. **%%timeit** - how long does it take to run the df <br>
2. **(memory_usage='deep')** - what is the total memory used <br>

In [176]:
#%%timeit
#df
## this is used to see total time to load dataset

In [177]:
## lets get total memory usage - based on initial df
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27555 entries, 0 to 27554
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   index         27555 non-null  int64  
 1   product       27554 non-null  object 
 2   category      27555 non-null  object 
 3   sub_category  27555 non-null  object 
 4   brand         27554 non-null  object 
 5   sale_price    27555 non-null  float64
 6   market_price  27555 non-null  float64
 7   type          27555 non-null  object 
 8   rating        18929 non-null  float64
 9   description   27440 non-null  object 
dtypes: float64(3), int64(1), object(6)
memory usage: 28.2 MB


In [178]:
## returns total memory usage - in kilobytes
rnd = (df.memory_usage(deep=True).sum()/1024).round()
rnd

28851.0

In [179]:
convert_to_kb = df.memory_usage().sum()/1024
convert_to_kb

2152.859375

In [180]:
## which columns has high memory usage
df.memory_usage().sum()/1024

2152.859375

In [181]:
## used in conjunction with above to see which columns takes up memory
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27555 entries, 0 to 27554
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   index         27555 non-null  int64  
 1   product       27554 non-null  object 
 2   category      27555 non-null  object 
 3   sub_category  27555 non-null  object 
 4   brand         27554 non-null  object 
 5   sale_price    27555 non-null  float64
 6   market_price  27555 non-null  float64
 7   type          27555 non-null  object 
 8   rating        18929 non-null  float64
 9   description   27440 non-null  object 
dtypes: float64(3), int64(1), object(6)
memory usage: 2.1+ MB


### Benchmark Summary

In [182]:
## timeit -  11.4 ns ± 0.0254 ns per loop (mean ± std. dev. of 7 runs, 100,000,000 loops each)
## memory -  memory usage: 28.2 MB

## SECTION B : - inspecting the dataset itself

### Showing the df at a glance

In [183]:
## see first 5 and last 5 rows
df

Unnamed: 0,index,product,category,sub_category,brand,sale_price,market_price,type,rating,description
0,1,Garlic Oil - Vegetarian Capsule 500 mg,Beauty & Hygiene,Hair Care,Sri Sri Ayurveda,220.00,220.0,Hair Oil & Serum,4.1,This Product contains Garlic Oil that is known...
1,2,Water Bottle - Orange,"Kitchen, Garden & Pets",Storage & Accessories,Mastercook,180.00,180.0,Water & Fridge Bottles,2.3,"Each product is microwave safe (without lid), ..."
2,3,"Brass Angle Deep - Plain, No.2",Cleaning & Household,Pooja Needs,Trm,119.00,250.0,Lamp & Lamp Oil,3.4,"A perfect gift for all occasions, be it your m..."
3,4,Cereal Flip Lid Container/Storage Jar - Assort...,Cleaning & Household,Bins & Bathroom Ware,Nakoda,149.00,176.0,"Laundry, Storage Baskets",3.7,Multipurpose container with an attractive desi...
4,5,Creme Soft Soap - For Hands & Body,Beauty & Hygiene,Bath & Hand Wash,Nivea,162.00,162.0,Bathing Bars & Soaps,4.4,Nivea Creme Soft Soap gives your skin the best...
...,...,...,...,...,...,...,...,...,...,...
27550,27551,"Wottagirl! Perfume Spray - Heaven, Classic",Beauty & Hygiene,Fragrances & Deos,Layerr,199.20,249.0,Perfume,3.9,Layerr brings you Wottagirl Classic fragrant b...
27551,27552,Rosemary,Gourmet & World Food,Cooking & Baking Needs,Puramate,67.50,75.0,"Herbs, Seasonings & Rubs",4.0,Puramate rosemary is enough to transform a dis...
27552,27553,Peri-Peri Sweet Potato Chips,Gourmet & World Food,"Snacks, Dry Fruits, Nuts",FabBox,200.00,200.0,Nachos & Chips,3.8,We have taken the richness of Sweet Potatoes (...
27553,27554,Green Tea - Pure Original,Beverages,Tea,Tetley,396.00,495.0,Tea Bags,4.2,"Tetley Green Tea with its refreshing pure, ori..."


In [184]:
## first 5
df.head()
## see specified number
## df.head(n=20)

Unnamed: 0,index,product,category,sub_category,brand,sale_price,market_price,type,rating,description
0,1,Garlic Oil - Vegetarian Capsule 500 mg,Beauty & Hygiene,Hair Care,Sri Sri Ayurveda,220.0,220.0,Hair Oil & Serum,4.1,This Product contains Garlic Oil that is known...
1,2,Water Bottle - Orange,"Kitchen, Garden & Pets",Storage & Accessories,Mastercook,180.0,180.0,Water & Fridge Bottles,2.3,"Each product is microwave safe (without lid), ..."
2,3,"Brass Angle Deep - Plain, No.2",Cleaning & Household,Pooja Needs,Trm,119.0,250.0,Lamp & Lamp Oil,3.4,"A perfect gift for all occasions, be it your m..."
3,4,Cereal Flip Lid Container/Storage Jar - Assort...,Cleaning & Household,Bins & Bathroom Ware,Nakoda,149.0,176.0,"Laundry, Storage Baskets",3.7,Multipurpose container with an attractive desi...
4,5,Creme Soft Soap - For Hands & Body,Beauty & Hygiene,Bath & Hand Wash,Nivea,162.0,162.0,Bathing Bars & Soaps,4.4,Nivea Creme Soft Soap gives your skin the best...


In [185]:
## last 5 rows
df.tail()
## see specified number
## df.tail(n=20)

Unnamed: 0,index,product,category,sub_category,brand,sale_price,market_price,type,rating,description
27550,27551,"Wottagirl! Perfume Spray - Heaven, Classic",Beauty & Hygiene,Fragrances & Deos,Layerr,199.2,249.0,Perfume,3.9,Layerr brings you Wottagirl Classic fragrant b...
27551,27552,Rosemary,Gourmet & World Food,Cooking & Baking Needs,Puramate,67.5,75.0,"Herbs, Seasonings & Rubs",4.0,Puramate rosemary is enough to transform a dis...
27552,27553,Peri-Peri Sweet Potato Chips,Gourmet & World Food,"Snacks, Dry Fruits, Nuts",FabBox,200.0,200.0,Nachos & Chips,3.8,We have taken the richness of Sweet Potatoes (...
27553,27554,Green Tea - Pure Original,Beverages,Tea,Tetley,396.0,495.0,Tea Bags,4.2,"Tetley Green Tea with its refreshing pure, ori..."
27554,27555,United Dreams Go Far Deodorant,Beauty & Hygiene,Men's Grooming,United Colors Of Benetton,214.53,390.0,Men's Deodorants,4.5,The new mens fragrance from the United Dreams ...


In [186]:
## random sample of rows and columns
df.sample(n=10)

Unnamed: 0,index,product,category,sub_category,brand,sale_price,market_price,type,rating,description
22562,22563,Ayurvedic Ayush Kwath,Beauty & Hygiene,Health & Medicine,Meghdoot,110.0,110.0,Ayurveda,,Meghdoot Ayurvedic Ayush Kwath is a powerful a...
3327,3328,Noodles - Long Life Egg,Gourmet & World Food,"Pasta, Soup & Noodles",Bali Kitchen,170.0,170.0,Imported Noodles,4.6,Quick to Prepare meal. Low in Fat Content. Enr...
9033,9034,"Blue Lagoon Iced Tea With Lemongrass, Butterfl...",Beverages,Energy & Soft Drinks,Karma Kettle,350.0,350.0,"Icetea, Non Aerated Drink",,Our best selling blend- Blue Lagoon tea is pac...
27401,27402,Cinnamon Filter Coffee - Arabica Roast & Ground,Gourmet & World Food,Drinks & Beverages,Colombian Brew Coffee,135.0,135.0,Coffee & Pre-Mix,,"This is an elegant pack of 100g, 3.5oz - 10 se..."
3088,3089,Milk Shake - Strawberry,"Bakery, Cakes & Dairy",Dairy,Milky Mist,20.1,30.0,"Flavoured, Soya Milk",3.8,The thick and smooth taste of this creamy Milk...
21618,21619,Total 10 Hand Wash,Beauty & Hygiene,Bath & Hand Wash,Lifebuoy,25.0,25.0,Hand Wash & Sanitizers,4.3,Perfect for keeping your hands clean and dirt-...
6967,6968,Lunch Box/Tiffin Set Stainless Steel with Stee...,"Kitchen, Garden & Pets",Steel Utensils,NanoNine,399.0,594.0,Steel Lunch Boxes,1.5,NanoNine Duo Stainless Steel Single wall lunch...
2066,2067,Adhesive - Shoe Fix Shoe Repair,Cleaning & Household,Stationery,Pidilite,76.0,80.0,"Scissor, Glue & Tape",4.0,Pidilite Adhesive Shoe Fix is a fast setting a...
20418,20419,2X French Perfume Fabric Conditioner Refill Pack,Cleaning & Household,Detergents & Dishwash,Softouch,760.32,880.0,"Fabric Pre, Post Wash",4.4,Throw away your fancy perfumes because New Sof...
5614,5615,Mug Chain Coffee Mugs - Super S360,"Kitchen, Garden & Pets",Crockery & Cutlery,Claycraft,379.0,499.0,"Cups, Mugs & Tumblers",3.8,Presenting to you the artistically designed an...


In [187]:
## random sample of columns
df.sample(n=5, axis=1)

Unnamed: 0,index,brand,market_price,sale_price,sub_category
0,1,Sri Sri Ayurveda,220.0,220.00,Hair Care
1,2,Mastercook,180.0,180.00,Storage & Accessories
2,3,Trm,250.0,119.00,Pooja Needs
3,4,Nakoda,176.0,149.00,Bins & Bathroom Ware
4,5,Nivea,162.0,162.00,Bath & Hand Wash
...,...,...,...,...,...
27550,27551,Layerr,249.0,199.20,Fragrances & Deos
27551,27552,Puramate,75.0,67.50,Cooking & Baking Needs
27552,27553,FabBox,200.0,200.00,"Snacks, Dry Fruits, Nuts"
27553,27554,Tetley,495.0,396.00,Tea


In [188]:
## random sample of rows
df.sample(n=5, axis=0)

Unnamed: 0,index,product,category,sub_category,brand,sale_price,market_price,type,rating,description
23429,23430,Pumpkin Seeds/Kumbalakayi Bija,"Foodgrains, Oil & Masala",Dry Fruits,Himalayan Natives,112.0,140.0,Other Dry Fruits,4.2,Himalayan Natives 100% Natural Pumpkin Seeds a...
3513,3514,Mach3 - Manual Shaving Razor,Beauty & Hygiene,Men's Grooming,Gillette,245.0,250.0,Shaving Care,4.4,The original MACH3 continues its legacy of clo...
19523,19524,Chikmagaluru's Aroma Platinum Filter Coffee De...,Beverages,Coffee,RISE UP,160.0,160.0,Instant Coffee,3.5,Rise up aroma platinum presents filter coffee ...
1579,1580,Idli - Sooji 2kg + Sugar 2kg + Peanuts/ Mungap...,"Foodgrains, Oil & Masala","Atta, Flours & Sooji",bb Royal,243.0,375.0,"Sooji, Maida & Besan",,BB Royal is our in house premium brand of the ...
25935,25936,Cherry,Snacks & Branded Foods,Ready To Cook & Eat,Ask Foods,50.0,50.0,Home Baking,4.0,Decorative items used in making cakes chocolat...


## Section C - data cleaning

In [189]:
##### NOTES......lets look at column headers first
## removing white spaces from column-headers
## cast to upper/lower string
## replace special characters
## dropping columns
## renaming columns
## moving columns into positions
## creating  and inserting new column headers

In [190]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27555 entries, 0 to 27554
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   index         27555 non-null  int64  
 1   product       27554 non-null  object 
 2   category      27555 non-null  object 
 3   sub_category  27555 non-null  object 
 4   brand         27554 non-null  object 
 5   sale_price    27555 non-null  float64
 6   market_price  27555 non-null  float64
 7   type          27555 non-null  object 
 8   rating        18929 non-null  float64
 9   description   27440 non-null  object 
dtypes: float64(3), int64(1), object(6)
memory usage: 28.2 MB


In [244]:
## show columns only - in list format
## df.columns
##df.columns.tolist()
df.columns.values.tolist() ##numpy method more memory effecient

['index',
 'product',
 'category',
 'sub_category',
 'brand',
 'sale_price',
 'market_price',
 'type',
 'rating',
 'description']

In [192]:
## df.columns.str.strip().str.upper().tolist()
## df.columns.str.strip().str.lower().tolist()

In [193]:
## replace characters......
## df.columns.str.replace ('_','&&').tolist()

## Section D - converting dtypes for optimal performance

In [194]:
df.head(n=3)

Unnamed: 0,index,product,category,sub_category,brand,sale_price,market_price,type,rating,description
0,1,Garlic Oil - Vegetarian Capsule 500 mg,Beauty & Hygiene,Hair Care,Sri Sri Ayurveda,220.0,220.0,Hair Oil & Serum,4.1,This Product contains Garlic Oil that is known...
1,2,Water Bottle - Orange,"Kitchen, Garden & Pets",Storage & Accessories,Mastercook,180.0,180.0,Water & Fridge Bottles,2.3,"Each product is microwave safe (without lid), ..."
2,3,"Brass Angle Deep - Plain, No.2",Cleaning & Household,Pooja Needs,Trm,119.0,250.0,Lamp & Lamp Oil,3.4,"A perfect gift for all occasions, be it your m..."


In [195]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27555 entries, 0 to 27554
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   index         27555 non-null  int64  
 1   product       27554 non-null  object 
 2   category      27555 non-null  object 
 3   sub_category  27555 non-null  object 
 4   brand         27554 non-null  object 
 5   sale_price    27555 non-null  float64
 6   market_price  27555 non-null  float64
 7   type          27555 non-null  object 
 8   rating        18929 non-null  float64
 9   description   27440 non-null  object 
dtypes: float64(3), int64(1), object(6)
memory usage: 28.2 MB


In [196]:
newdf = df.convert_dtypes()
newdf.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27555 entries, 0 to 27554
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   index         27555 non-null  Int64  
 1   product       27554 non-null  string 
 2   category      27555 non-null  string 
 3   sub_category  27555 non-null  string 
 4   brand         27554 non-null  string 
 5   sale_price    27555 non-null  Float64
 6   market_price  27555 non-null  Float64
 7   type          27555 non-null  string 
 8   rating        18929 non-null  Float64
 9   description   27440 non-null  string 
dtypes: Float64(3), Int64(1), string(6)
memory usage: 28.3 MB


In [197]:
newdf.memory_usage().sum()/1024

2260.49609375

In [245]:
## selcting columns based on dtypes
df.select_dtypes(include='object').head()

Unnamed: 0,product,category,sub_category,brand,type,description
0,Garlic Oil - Vegetarian Capsule 500 mg,Beauty & Hygiene,Hair Care,Sri Sri Ayurveda,Hair Oil & Serum,This Product contains Garlic Oil that is known...
1,Water Bottle - Orange,"Kitchen, Garden & Pets",Storage & Accessories,Mastercook,Water & Fridge Bottles,"Each product is microwave safe (without lid), ..."
2,"Brass Angle Deep - Plain, No.2",Cleaning & Household,Pooja Needs,Trm,Lamp & Lamp Oil,"A perfect gift for all occasions, be it your m..."
3,Cereal Flip Lid Container/Storage Jar - Assort...,Cleaning & Household,Bins & Bathroom Ware,Nakoda,"Laundry, Storage Baskets",Multipurpose container with an attractive desi...
4,Creme Soft Soap - For Hands & Body,Beauty & Hygiene,Bath & Hand Wash,Nivea,Bathing Bars & Soaps,Nivea Creme Soft Soap gives your skin the best...


In [251]:
df.select_dtypes(include='Float64').sample(n=15)

Unnamed: 0,sale_price,market_price,rating
26722,59.25,79.0,4.0
15251,175.45,319.0,4.2
25827,159.0,159.0,4.5
16996,27.3,30.0,4.3
15299,420.0,420.0,3.5
2785,1299.0,1299.0,
21811,649.0,649.0,3.8
8803,25.0,25.0,3.9
21593,619.0,1070.0,
3954,153.0,153.0,


## Section E - missing values

In [198]:
## Notes ---- missing values and counts

In [225]:
## first get all count of rows and columns
df.shape
print(f'columns:{df.shape[0]},\nrows:{df.shape[1]}')

columns:27555,
rows:10


In [226]:
## for all numeric data use: df.describe()
## only returns count for numeric type columns - excludes object/string type columns
## count means - count of total non-null values
df.describe()


Unnamed: 0,index,sale_price,market_price,rating
count,27555.0,27555.0,27555.0,18929.0
mean,13778.0,322.514808,382.056664,3.94341
std,7954.58767,486.263116,581.730717,0.739063
min,1.0,2.45,3.0,1.0
25%,6889.5,95.0,100.0,3.7
50%,13778.0,190.0,220.0,4.1
75%,20666.5,359.0,425.0,4.3
max,27555.0,12500.0,12500.0,5.0


In [236]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27555 entries, 0 to 27554
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   index         27555 non-null  int64  
 1   product       27554 non-null  object 
 2   category      27555 non-null  object 
 3   sub_category  27555 non-null  object 
 4   brand         27554 non-null  object 
 5   sale_price    27555 non-null  float64
 6   market_price  27555 non-null  float64
 7   type          27555 non-null  object 
 8   rating        18929 non-null  float64
 9   description   27440 non-null  object 
dtypes: float64(3), int64(1), object(6)
memory usage: 28.2 MB


In [202]:
df.isnull().sum()

index              0
product            1
category           0
sub_category       0
brand              1
sale_price         0
market_price       0
type               0
rating          8626
description      115
dtype: int64

In [203]:
df.isna().sum()

index              0
product            1
category           0
sub_category       0
brand              1
sale_price         0
market_price       0
type               0
rating          8626
description      115
dtype: int64

In [260]:
df['product'].value_counts()

Turmeric Powder/Arisina Pudi                                     26
Extra Virgin Olive Oil                                           15
Cow Ghee/Tuppa                                                   14
Soft Drink                                                       12
Colorsilk Hair Colour With Keratin                               12
                                                                 ..
Pepper & Herb Salami Chicken                                      1
Nutmeg Powder                                                     1
Disney Mickey Mouse Plastic Kids Sipper Bottle - Coolest Dude     1
Chocolates-Roasted Peanut Chocolate                               1
Green Tea - Pure Original                                         1
Name: product, Length: 23540, dtype: int64

In [265]:
df['product'].value_counts().unique()

array([26, 15, 14, 12, 11, 10,  9,  8,  7,  6,  5,  4,  3,  2,  1])