# 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import re

# 2. Load Dataset

In [None]:
# Read the raw women-dresses CSV into `df` and preview its first rows.
raw_csv_path = "/content/drive/MyDrive/Datahut Qa Assignment/women_dresses_raw.csv"
df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0,Product Name,Brand,Category Type,MRP (Maximum Retail Price),Discounted Price / Sale Price,Rating,Number of Reviews,Product URL
0,Women Layered Black Knee Length Dress,decoris,,1299.0,258.0,4.2,235.0,https://www.flipkart.com/neha-fashions-women-l...
1,Women Fit and Flare Red Midi/Calf Length Dress,Stylish Arrow,Midi,999.0,357.0,,,https://www.flipkart.com/stylish-arrow-women-f...
2,Women Gown Maroon Midi/Calf Length Dress,RANGRAIL,Midi,2499.0,369.0,3.6,12.0,https://www.flipkart.com/rangrail-women-gown-m...
3,Women Fit and Flare Blue Midi/Calf Length Dress,Fashion2wear,Midi,999.0,460.0,3.9,357.0,https://www.flipkart.com/fashion2wear-women-fi...
4,Women A-line Gold Midi/Calf Length Dress,ZWERLON,Midi,1999.0,414.0,4.1,5344.0,https://www.flipkart.com/zwerlon-women-a-line-...


# 3. Check & Remove Duplicates

In [None]:
# Fraction of rows flagged by DataFrame.duplicated(), expressed as a percentage.
duplicate_share = df.duplicated().mean() * 100
print(f"Duplicate rows (%): {duplicate_share}")

Duplicate rows (%): 30.561122244488974


In [None]:
# Keep one copy of each fully identical row; `df` is rebound to the de-duplicated frame.
df = df.drop_duplicates()

In [None]:
# Sanity check: count rows still flagged as duplicates after the drop.
leftover_duplicates = df.duplicated().sum()
print(f"Remaining duplicate rows: {leftover_duplicates}")

Remaining duplicate rows: 0


# 4. Handle Missing Values

In [None]:
# Per-column percentage of null cells in the de-duplicated frame.
null_counts = df.isnull().sum()
missing_pct = null_counts * 100 / len(df)
print(f"Missing values: {missing_pct}")

Missing values: Product Name                     0.000000
Brand                            0.000000
Category Type                    9.668110
MRP (Maximum Retail Price)       0.432900
Discounted Price / Sale Price    0.432900
Rating                           4.473304
Number of Reviews                4.473304
Product URL                      0.000000
dtype: float64


In [None]:
#  Drop rows where critical price info is missing
# Rows lacking either price column are removed. The explicit .copy() makes
# df_cleaned an independent frame: the notebook goes on to overwrite its
# columns with `df_cleaned.loc[:, col] = ...`, and writing into a frame that
# pandas still tracks as derived from `df` can raise SettingWithCopyWarning.
price_columns = ["MRP (Maximum Retail Price)", "Discounted Price / Sale Price"]
df_cleaned = df.dropna(subset=price_columns).copy()

In [None]:
# Placeholder written into each column wherever the value is missing.
# Note that 0 here stands for "no rating / no reviews recorded", not a real score.
fill_values = {
    "Rating": 0,
    "Number of Reviews": 0,
    "Category Type": "Unknown",
}
for column_name, placeholder in fill_values.items():
    df_cleaned.loc[:, column_name] = df_cleaned[column_name].fillna(placeholder)

In [None]:
# Re-check the per-column null percentage on the cleaned frame.
cleaned_null_counts = df_cleaned.isnull().sum()
cleaned_missing_pct = cleaned_null_counts * 100 / len(df_cleaned)
print(f"Missing values: {cleaned_missing_pct}")

Missing values: Product Name                     0.0
Brand                            0.0
Category Type                    0.0
MRP (Maximum Retail Price)       0.0
Discounted Price / Sale Price    0.0
Rating                           0.0
Number of Reviews                0.0
Product URL                      0.0
dtype: float64


# 5. Ensure Numeric Columns

In [None]:
# Show the dtypes of the price, rating and review-count columns.
numeric_columns = [
    "MRP (Maximum Retail Price)",
    "Discounted Price / Sale Price",
    "Rating",
    "Number of Reviews",
]
df_cleaned[numeric_columns].dtypes


Unnamed: 0,0
MRP (Maximum Retail Price),float64
Discounted Price / Sale Price,float64
Rating,float64
Number of Reviews,float64


**The columns MRP (Maximum Retail Price), Discounted Price / Sale Price, Rating, and Number of Reviews were already in numeric (float64) format.**

# 6. Brand Cleaning & Standardization

In [None]:
# List the distinct raw brand labels and how many there are.
brand_column = df_cleaned['Brand']
unique_brands = brand_column.unique()
print("✅ Unique Brands:", unique_brands)
print("\nTotal unique brands:", brand_column.nunique())

✅ Unique Brands: ['decoris' 'Stylish Arrow' 'RANGRAIL' 'Fashion2wear' 'ZWERLON' 'AAYU'
 'Sheetal Associates' 'Road Rider' 'Dream Tree' 'london belly' 'TANDUL'
 'LuxeStyle' 'TIARA WORLD' 'Attire Empire' 'Crown Creation' 'Honky Tonky'
 'IQRAAR' 'Daevish' 'PURSHOTTAM WALA' 'Tokyo Talkies' 'DRAPE AND DAZZLE'
 'Aahwan' 'Krishna Enterprises' 'NG CREATION' 'Jash Creation'
 'COTTON MULMUL STORE' 'Kirav Ventures' 'AASK' 'MANOJAVA' 'DIPANI'
 'GAZLA FASHION' 'Pant n lites' 'PURVAJA' 'Evergreen trend' 'Bhawanaprint'
 'Moda Rapido' 'DORESUZAA' 'Deklook' 'Uptownie Lite' 'ANOUK' 'Rudraaksha'
 'Gujari' 'PLUSS' 'Ethniclook' 'Asmi Myra' 'TULSATTVA' 'STREET9'
 'JAHU MART' 'Janasya' 'Vishudh' 'Miss Chase' 'VYMO' 'CHERIMODA' 'TIVANTE'
 'indibelle' 'VBUYZ' 'Stylum' 'Emeros' 'Demirner' 'Siddiqa Creation'
 'MUDRAS FASHION' 'Jaipuri Bunaai' 'KRULRIN' 'Get Glamr' 'Globus'
 'Be4Me.com' 'HARPA' 'TESSAVEGAS' 'woclo enterprise' 'God Bless'
 'Sidh Sawai' 'SASSAFRAS' 'DMPRSN' 'Dviera Trendz' 'Nayo'
 'Shakshi Creation

In [None]:
# Manual brand mapping
# Canonical brand name -> the spellings that are folded into it.
# Listed in the order the spellings should appear in `brand_mapping`.
# NOTE(review): many spellings are all-lowercase while clean_brand emits
# Title Case; confirm that the lookup step compares them in the same form.
# NOTE(review): "Modli  Fashion" (two spaces) maps to itself.
_BRAND_VARIANTS = {
    "Ng": ("ng fashion", "ng store", "Ng Creation"),
    "Rare": ("Rare Fashion", "rare"),
    "Sassafras": ("sassafras", "Sassafras Curve"),
    "Prettyplus": ("prettyplus", "Prettyplus By Desinoor"),
    "Berrylush": ("berrylush curve", "berrylush"),
    "As fashion": ("as fashion", "A S Fashion"),
    "Herway": ("herway", "herby invictus"),
    "Modli  Fashion": ("Modli  Fashion", "modli 20 fashion"),
    "Be4me": ("be4mecom", "be4me.com"),
    "U&F": ("Uf", "u & f"),
    "Zayn": ("Zaynfashion",),
    "Gulabi": ("gulabi jaipur",),
    "Life with pockets": ("life with pockets", "life w pockets"),
    "Oomph": ("oomph", "oomph!"),
    "French connection": ("french connection", "frenchconnectn"),
    "Dressberry": ("dressberry", "dress berry"),
    "Ketch": ("ketch", "ketch fashion"),
    "Vishudh": ("vishudh", "vishud"),
    "Stylestone": ("stylestone", "style stone"),
    "Instafab": ("instafab plus",),
    "Plus": ("Pluss", "plus"),
    # One-off fixes
    "Fashion wear": ("Fashionwear",),
    "Crown": ("Crown Creation",),
    "Krishna": ("Krishna Enterprises",),
    "Jash": ("Jash Creation",),
    "Style stone": ("Stylestone",),
    "Ethnic look": ("Ethniclook",),
    "Cotton mulmul": ("cotton mulmul store",),
    "Road rider": ("road rider",),
    "Woclo": ("woclo enterprise",),
    "House of common": ("Houseofmmon",),
    "Sheetal": ("Sheetal Associates",),
    "Bhawana print": ("Bhawanaprint",),
    "Jmd": ("jmdenterprises",),
    "Maruti nandan": ("maruti nandan impex",),
    "Manisukmi": ("manisukmi fashion",),
    "Labhanshi": ("labhanshi creation",),
    "Siddiqa": ("Siddiqa Creation",),
    "Shakshi": ("Shakshi Creations",),
    "Simran": ("Simran Llections",),
    "Sheffali": ("Sheffali Creation",),
    "Buy unique Fashion": ("Buyuniquefashion",),
}

# Flatten to the spelling -> canonical lookup used by the rest of the notebook.
brand_mapping = {
    variant: canonical
    for canonical, variants in _BRAND_VARIANTS.items()
    for variant in variants
}



In [None]:
#  Cleaning function for brands

def clean_brand(brand):
    """Normalise a raw brand label to a Title Case name of letters and spaces.

    Steps, in order: trim surrounding whitespace; drop a trailing ".com";
    remove stand-alone business-suffix words (inc, ltd, store, ag, co,
    corporation, each with an optional trailing dot); delete every character
    that is not an ASCII letter or whitespace; collapse whitespace runs to a
    single space; Title Case the result.

    Parameters
    ----------
    brand : object
        Raw brand value. Non-missing values are converted with ``str``.

    Returns
    -------
    str
        The cleaned name (may be empty if nothing but suffix words remained).
        Missing input (None / NaN) is returned unchanged instead of being
        turned into the literal text "Nan".
    """
    if pd.isna(brand):
        return brand

    brand = str(brand).strip()
    brand = re.sub(r'\.com$', '', brand, flags=re.I)
    # \b...\b restricts the removal to whole words. Without the boundaries the
    # pattern also ate letters inside names ("decoris" -> "deris",
    # "COTTON" -> "TTON"), and "co" shadowed the longer "corporation".
    brand = re.sub(r'\b(?:inc|ltd|store|ag|co|corporation)\b\.?', '', brand, flags=re.I)
    brand = re.sub(r'[^a-zA-Z\s]', '', brand)
    # Removed words/characters can leave gaps such as "Modli  Fashion".
    brand = re.sub(r'\s+', ' ', brand)
    return brand.title().strip()

# Apply cleaning function
df_cleaned.loc[:, 'Brand'] = df_cleaned['Brand'].apply(clean_brand)

# Apply manual mapping.
# Series.replace with a dict only matches whole values exactly, and every
# brand has just been rewritten by clean_brand (Title Case, punctuation and
# digits stripped). Mapping keys written in raw form such as "road rider" or
# "woclo enterprise" therefore never matched. Passing each key through the
# same clean_brand normalisation puts keys and data in the same form.
brand_lookup = {}
for raw_key, canonical_name in brand_mapping.items():
    normalized_key = clean_brand(raw_key)
    if normalized_key:  # skip keys that clean down to an empty string
        brand_lookup[normalized_key] = canonical_name
# The original keys are layered on top so every entry that already matched
# keeps exactly the result it produced before.
brand_lookup.update(brand_mapping)

df_cleaned.loc[:, 'Brand'] = df_cleaned['Brand'].replace(brand_lookup)

# 7. Check Brands & Counts

In [None]:
# Summarise the Brand column after cleaning: distinct labels, then frequency of each.
brand_column = df_cleaned['Brand']
unique_brands = brand_column.unique()
brand_counts = brand_column.value_counts()

print("✅ Unique brands after cleaning and mapping:")
print(unique_brands)

print("\n✅ Brand counts:")
print(brand_counts)

✅ Unique brands after cleaning and mapping:
['Deris' 'Stylish Arrow' 'Rangrail' 'Fashion wear' 'Zwerlon' 'Aayu'
 'Sheetal' 'Road Rider' 'Dream Tree' 'London Belly' 'Tandul' 'Luxestyle'
 'Tiara World' 'Attire Empire' 'Crown' 'Honky Tonky' 'Iqraar' 'Daevish'
 'Purshottam Wala' 'Tokyo Talkies' 'Drape And Dazzle' 'Aahwan' 'Krishna'
 'Ng' 'Jash' 'Tton Mulmul' 'Kirav Ventures' 'Aask' 'Manojava' 'Dipani'
 'Gazla Fashion' 'Pant N Lites' 'Purvaja' 'Evergreen Trend'
 'Bhawana print' 'Moda Rapido' 'Doresuzaa' 'Deklook' 'Uptownie Lite'
 'Anouk' 'Rudraaksha' 'Gujari' 'Plus' 'Ethnic look' 'Asmi Myra'
 'Tulsattva' 'Street' 'Jahu Mart' 'Janasya' 'Vishudh' 'Miss Chase' 'Vymo'
 'Cherimoda' 'Tivante' 'Indibelle' 'Vbuyz' 'Stylum' 'Emeros' 'Demirner'
 'Siddiqa' 'Mudras Fashion' 'Jaipuri Bunaai' 'Krulrin' 'Get Glamr'
 'Globus' 'Beme' 'Harpa' 'Tessavegas' 'Woclo Enterprise' 'God Bless'
 'Sidh Sawai' 'Sassafras' 'Dmprsn' 'Dviera Trendz' 'Nayo' 'Shakshi' 'Rare'
 'Mialo Fashion' 'Juniper' 'Nk Design' 'Leetos' '

# 8. Save Cleaned Dataset

In [None]:
# Write the cleaned frame to CSV (row index omitted) and confirm on stdout.
output_csv = "women_dresses_cleaned.csv"
df_cleaned.to_csv(output_csv, index=False)
print("Cleaned data saved to 'women_dresses_cleaned.csv' and available as df_cleaned DataFrame")

Cleaned data saved to 'women_dresses_cleaned.csv' and available as df_cleaned DataFrame
