In [1]:
import numpy as np
import pandas as pd

#### Loading the dataset

In [3]:
# Read the raw product data from a CSV path relative to the working directory,
# then preview the first five rows.
data = pd.read_csv(filepath_or_buffer='Fashion Dataset.csv')
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,p_id,name,price,colour,brand,img,ratingCount,avg_rating,description,p_attributes
0,0,17048614.0,Khushal K Women Black Ethnic Motifs Printed Ku...,5099.0,Black,Khushal K,http://assets.myntassets.com/assets/images/170...,4522.0,4.418399,Black printed Kurta with Palazzos with dupatta...,"{'Add-Ons': 'NA', 'Body Shape ID': '443,333,32..."
1,1,16524740.0,InWeave Women Orange Solid Kurta with Palazzos...,5899.0,Orange,InWeave,http://assets.myntassets.com/assets/images/165...,1081.0,4.119334,Orange solid Kurta with Palazzos with dupatta<...,"{'Add-Ons': 'NA', 'Body Shape ID': '443,333,32..."
2,2,16331376.0,Anubhutee Women Navy Blue Ethnic Motifs Embroi...,4899.0,Navy Blue,Anubhutee,http://assets.myntassets.com/assets/images/163...,1752.0,4.16153,Navy blue embroidered Kurta with Trousers with...,"{'Add-Ons': 'NA', 'Body Shape ID': '333,424', ..."
3,3,14709966.0,Nayo Women Red Floral Printed Kurta With Trous...,3699.0,Red,Nayo,http://assets.myntassets.com/assets/images/147...,4113.0,4.088986,Red printed kurta with trouser and dupatta<br>...,"{'Add-Ons': 'NA', 'Body Shape ID': '333,424', ..."
4,4,11056154.0,AHIKA Women Black & Green Printed Straight Kurta,1350.0,Black,AHIKA,http://assets.myntassets.com/assets/images/110...,21274.0,3.978377,"Black and green printed straight kurta, has a ...","{'Body Shape ID': '424', 'Body or Garment Size..."


### Performing EDA on the data

In [5]:
# Number of rows in the loaded frame.
data.shape[0]

14330

In [6]:
# Inspect the dtype pandas inferred for each column on load.
data.dtypes

Unnamed: 0        int64
p_id            float64
name             object
price           float64
colour           object
brand            object
img              object
ratingCount     float64
avg_rating      float64
description      object
p_attributes     object
dtype: object

**Checking for the number of unique values**

In [8]:
# Count distinct non-null values per column (defaults spelled out).
data.nunique(axis=0, dropna=True)

Unnamed: 0        990
p_id            14223
name            13882
price            1209
colour             49
brand            1022
img             14223
ratingCount       829
avg_rating       2368
description     14235
p_attributes    13096
dtype: int64

Dropping the first column, `Unnamed: 0`. It is an unnamed leftover column and is not significant for the analysis.

In [10]:
# Drop the unnamed leftover column.
# Rebinding `data` (instead of inplace=True) together with errors='ignore'
# keeps this cell safe to re-run: a second execution no longer raises KeyError
# once the column is already gone. `axis=1` is omitted because `columns=`
# already identifies the axis.
data = data.drop(columns=['Unnamed: 0'], errors='ignore')
data.head(1)

Unnamed: 0,p_id,name,price,colour,brand,img,ratingCount,avg_rating,description,p_attributes
0,17048614.0,Khushal K Women Black Ethnic Motifs Printed Ku...,5099.0,Black,Khushal K,http://assets.myntassets.com/assets/images/170...,4522.0,4.418399,Black printed Kurta with Palazzos with dupatta...,"{'Add-Ons': 'NA', 'Body Shape ID': '443,333,32..."


**Checking data for null values**

In [12]:
# Missing-value count per column.
data.isnull().sum()

p_id              18
name              18
price             18
colour            21
brand             18
img               18
ratingCount     7749
avg_rating      7749
description       18
p_attributes      18
dtype: int64

Dropping rows where p_id (Product Id) is null.

In [14]:
# Keep only the rows that carry a product id, then re-count the remaining
# missing values per column.
data = data.dropna(subset=['p_id'])
data.isnull().sum()

p_id               0
name               0
price              0
colour             3
brand              0
img                0
ratingCount     7731
avg_rating      7731
description        0
p_attributes       0
dtype: int64

Fixing the 'ratingCount' and 'avg_rating' columns.

The ratings are probably null because these are new products that no one has rated yet, so both columns are filled with 0.

In [16]:
# Treat a missing rating as "not rated yet": both rating columns get 0.
# NOTE(review): an avg_rating of 0 here marks an unrated product rather than a
# genuine zero score; filter on ratingCount > 0 before averaging avg_rating.
rating_columns = ['ratingCount', 'avg_rating']
data[rating_columns] = data[rating_columns].fillna(0)
data.isnull().sum()

p_id            0
name            0
price           0
colour          3
brand           0
img             0
ratingCount     0
avg_rating      0
description     0
p_attributes    0
dtype: int64

In [17]:
# Show the rows that still have no colour recorded.
data.loc[data['colour'].isnull()]

Unnamed: 0,p_id,name,price,colour,brand,img,ratingCount,avg_rating,description,p_attributes
367,19145038.0,Baisacrafts Women Pure Cotton Kurta with Trous...,5450.0,,Baisacrafts,http://assets.myntassets.com/assets/images/191...,0.0,0.0,Solid Kurta with Trousers with dupatta<br><br>...,"{'Add-Ons': 'NA', 'Body Shape ID': '333,424', ..."
2458,19142060.0,LIVE OK Women Boyfriend Fit High-Rise Stretcha...,1999.0,,LIVE OK,http://assets.myntassets.com/assets/images/191...,0.0,0.0,"<ul> <li> Dark shade, no fade jeans </li> <li...","{'Add-Ons': 'NA', 'Body or Garment Size': 'To-..."
14129,16124786.0,MANGO Women Hooded Sweatshirt,2390.0,,MANGO,http://assets.myntassets.com/assets/images/161...,0.0,0.0,"Solid sweatshirt has a hooded, short drop shou...","{'Body Shape ID': '443,424,324', 'Body or Garm..."


Since only 3 rows are missing the colour, the approach is to examine the product images and enter the colour manually.

In [19]:
# Colours read off the product images by hand, keyed by product id.
MANUAL_COLOURS = {
    19145038.0: 'Olive',
    19142060.0: 'Grey',
    16124786.0: 'Olive',
}

# Only fill rows whose colour is actually missing: p_id is not unique in this
# dataset, so matching on p_id alone could overwrite a colour that is already
# recorded on another row sharing the same id.
colour_missing = data['colour'].isna()
for product_id, colour in MANUAL_COLOURS.items():
    data.loc[colour_missing & (data['p_id'] == product_id), 'colour'] = colour

data.isna().sum()

p_id            0
name            0
price           0
colour          0
brand           0
img             0
ratingCount     0
avg_rating      0
description     0
p_attributes    0
dtype: int64

##### Examining data types

In [21]:
# Re-check the column dtypes now that the missing values have been handled.
data.dtypes

p_id            float64
name             object
price           float64
colour           object
brand            object
img              object
ratingCount     float64
avg_rating      float64
description      object
p_attributes     object
dtype: object

In [22]:
# Let pandas pick the best nullable extension dtype for every column and
# rebind `data` to the converted frame.
data = data.convert_dtypes()

In [23]:
# Confirm the result of the dtype conversion.
data.dtypes

p_id                     Int64
name            string[python]
price                    Int64
colour          string[python]
brand           string[python]
img             string[python]
ratingCount              Int64
avg_rating             Float64
description     string[python]
p_attributes    string[python]
dtype: object

EDA is now complete