# Overview

This notebook is used to analyze available attributes and their values, and potentially map the attributes with wider range of values to a smaller set of generic values so that there is enough sample available for each class.

In [47]:
from dotenv import load_dotenv

load_dotenv(override=True)

True

In [48]:
import os
import mysql.connector

conn = mysql.connector.connect(host=os.getenv("MYSQL_HOST") , user=os.getenv("MYSQL_USER"), password=os.getenv("MYSQL_PASSWORD"), database=os.getenv("MYSQL_DB"), auth_plugin='mysql_native_password')

cursor = conn.cursor()

In [49]:
import pandas as pd

def get_annotable_items():
	query = """ 
			SELECT p.id, p.product_name, p.category, p.sub_category, r.details 
			FROM products p 
			JOIN raw_details r ON p.id = r.product_id
			JOIN annotation_status a on p.id=a.product_id
			WHERE a.flagged = 0;
			"""
	
	cursor.execute(query)
	
	rows = cursor.fetchall()
	columns = [i[0] for i in cursor.description]

	df = pd.DataFrame(rows, columns=columns)
	df_normalized = pd.json_normalize(df['details'].apply(eval))
	df = pd.concat([df, df_normalized], axis=1)

	df = df[[
		'id',
		'category',
		'sub_category',
		'title',
		'description',
		'images',
		'specs.Colour',
		'specs.Fabric',
		'specs.Value Addition',
		'specs.Cut /Fit',
		'specs.Collar/Neck',
		'specs.Sleeve',
		'specs.Bottom Colour',
		'specs.Bottom Fabric',
		'specs.Dupatta Colour',
		'specs.Dupatta Fabric'
	]]

	return df

In [50]:
df = get_annotable_items()

df.sample(2)

Unnamed: 0,id,category,sub_category,title,description,images,specs.Colour,specs.Fabric,specs.Value Addition,specs.Cut /Fit,specs.Collar/Neck,specs.Sleeve,specs.Bottom Colour,specs.Bottom Fabric,specs.Dupatta Colour,specs.Dupatta Fabric
4293,7007,Panjabi,,Beige Textured Cotton Panjabi,Beige textured cotton panjabi. Whether it's a ...,[https://www.aarong.com/media/catalog/product/...,Brown,Cotton,Block Print,A-Line,Band Collar,3-Quarter Sleeve,,,,
2602,2604,Shalwar Kameez,Casual,Peach Printed and Embroidered Cotton Shalwar K...,Peach printed cotton kameez with shades of blu...,[https://www.aarong.com/media/catalog/product/...,Orange,Cotton,Hand Embroidery,A-Line,V Neck,3-Quarter Sleeve,,,,


In [51]:
df['specs.Colour'].value_counts()

specs.Colour
Green      821
Brown      569
Blue       559
Purple     458
Orange     440
White      422
Pink       366
Yellow     301
Red        283
Grey       257
Black      248
Cyan       155
Magenta    153
Olive      129
Maroon     100
Golden      25
Name: count, dtype: int64

In [52]:
for index, row in df.iterrows():
	print(row['specs.Value Addition'])
	break

Hand Embroidery


In [53]:
fabric_mapping = {
		"Cotton": "Cotton",
		"Handloom Cotton": "Cotton",
		"Vortex-Cotton": "Cotton",
		"Vortex Cotton": "Cotton",
		"Ramie Cotton": "Cotton",
		"Mixed Cotton": "Cotton Blend",
		"Cottray (Cotton & Rayon)": "Cotton Blend",
		"Single Jersey Knit Cotton": "Cotton Knit",
		"Muslin": "Muslin",
		"Endi Muslin": "Muslin",
		"Mixed Muslin (Muslin & Endi)": "Muslin Blend",
		"Blended Muslin (Muslin & Silk)": "Muslin Blend",
		"Mixed Muslin": "Muslin Blend",
		"Silk": "Silk",
		"Soft Silk": "Silk",
		"Dupion Silk": "Silk",
		"Joysree Silk": "Silk",
		"Katan": "Silk",
		"Endi Silk": "Silk",
		"Endi Silk-Muslin": "Silk Blend",
		"Half Silk (Silk & Cotton)": "Silk Blend",
		"Mixed Silk (Silk & Cotton)": "Silk Blend",
		"Jamdani Half Silk (Silk & Cotton)": "Silk Blend",
		"SICO (Silk & Cotton)": "Silk Blend",
		"Mixed Silk": "Silk Blend",
		"Viscose": "Viscose",
		"Jacquard Viscose": "Viscose",
		"Handloom Viscose": "Viscose",
		"Mixed Viscose": "Viscose Blend",
		"Viscott (Viscose & Cotton)": "Viscose Blend",
		"Linen": "Linen",
		"Satin": "Satin",
		"Satin Cotton": "Satin Blend",
		"Georgette": "Georgette",
		"Poplin": "Poplin",
		"Denim": "Denim",
		"Mixed Vortex": "Blended Fabric",
		"Addi Cotton": "Blended Fabric",
		"Cashmilon": "Synthetic",
		"Wool": "Wool",
	}

if 'Wool' in fabric_mapping:
	print("Wool is in the mapping")

Wool is in the mapping


In [54]:
df['specs.Fabric'].value_counts()

specs.Fabric
Cotton                               1999
Viscott (Viscose & Cotton)            720
Silk                                  705
Viscose                               618
Muslin                                395
Voile                                 227
Half Silk (Silk & Cotton)             129
Linen                                  78
Handloom Viscose                       73
Endi Silk                              66
Mixed Cotton                           39
Soft Silk                              34
Handloom Cotton                        29
Jamdani Half Silk (Silk & Cotton)      27
Mixed Viscose                          25
Mixed Silk (Silk & Cotton)             20
Cashmilon                              14
Poplin                                 11
Addi Cotton                            10
Jacquard Viscose                        8
Mixed Muslin (Muslin & Endi)            7
Ramie Cotton                            7
Wool                                    7
Blended Muslin (Musli

In [10]:
df['specs.Value Addition'].value_counts()

specs.Value Addition
Hand Embroidery                                         1380
Block Print                                             1055
Screen Print                                             522
Machine Embroidery                                       377
Hand Paint                                               208
Nakshi Kantha Embroidery                                 163
Tie & Dye                                                122
Machine Applique                                         118
Batik Print                                               40
Erri Embroidery                                           40
Hand Applique                                             33
Exclusive Hand Embroidery                                 26
Digital Print                                              6
Cylinder Dye                                               3
Brush Paint                                                3
Fountain Dye                                               3
Aci

In [14]:
fabric_mapping = {
    "Cotton": "Cotton",
	"Cotton Knit": "Cotton",
    "Handloom Cotton": "Cotton",
    "Vortex-Cotton": "Cotton",
    "Vortex Cotton": "Cotton",
    "Ramie Cotton": "Cotton",

    "Mixed Cotton": "Cotton Blend",
    "Cottray (Cotton & Rayon)": "Cotton Blend",

    "Single Jersey Knit Cotton": "Cotton",
    
	"Muslin": "Muslin",
    "Endi Muslin": "Muslin",
    "Mixed Muslin (Muslin & Endi)": "Muslin",
    "Blended Muslin (Muslin & Silk)": "Muslin",
    "Mixed Muslin": "Muslin",
	"Muslin Blend": "Muslin",

    "Silk": "Silk",
    "Soft Silk": "Silk",
    "Dupion Silk": "Silk",
    "Joysree Silk": "Silk",
    "Katan": "Silk",
    "Endi Silk": "Silk",

    "Endi Silk-Muslin": "Silk Blend",
    "Half Silk (Silk & Cotton)": "Silk Blend",
    "Mixed Silk (Silk & Cotton)": "Silk Blend",
    "Jamdani Half Silk (Silk & Cotton)": "Silk Blend",
    "SICO (Silk & Cotton)": "Silk Blend",
    "Mixed Silk": "Silk Blend",

    "Viscose": "Viscose",
    "Jacquard Viscose": "Viscose",
    "Handloom Viscose": "Viscose",

    "Mixed Viscose": "Viscose Blend",
    "Viscott (Viscose & Cotton)": "Viscose Blend",

    "Linen": "Linen",

    "Satin": "Satin",
    "Satin Cotton": "Satin",

    "Georgette": "Georgette",
    "Poplin": "Poplin",
    "Denim": "Denim",

    "Mixed Vortex": "Blended Fabric",
    "Addi Cotton": "Blended Fabric",
    
	"Cashmilon": "Synthetic",
    "Wool": "Wool"
}


lost = fabric_mapping.keys() - set(fabric_mapping.values())

lost

{'Addi Cotton',
 'Blended Muslin (Muslin & Silk)',
 'Cashmilon',
 'Cotton Knit',
 'Cottray (Cotton & Rayon)',
 'Dupion Silk',
 'Endi Muslin',
 'Endi Silk',
 'Endi Silk-Muslin',
 'Half Silk (Silk & Cotton)',
 'Handloom Cotton',
 'Handloom Viscose',
 'Jacquard Viscose',
 'Jamdani Half Silk (Silk & Cotton)',
 'Joysree Silk',
 'Katan',
 'Mixed Cotton',
 'Mixed Muslin',
 'Mixed Muslin (Muslin & Endi)',
 'Mixed Silk',
 'Mixed Silk (Silk & Cotton)',
 'Mixed Viscose',
 'Mixed Vortex',
 'Muslin Blend',
 'Ramie Cotton',
 'SICO (Silk & Cotton)',
 'Satin Cotton',
 'Single Jersey Knit Cotton',
 'Soft Silk',
 'Viscott (Viscose & Cotton)',
 'Vortex Cotton',
 'Vortex-Cotton'}

In [15]:
set(fabric_mapping.values())

{'Blended Fabric',
 'Cotton',
 'Cotton Blend',
 'Denim',
 'Georgette',
 'Linen',
 'Muslin',
 'Poplin',
 'Satin',
 'Silk',
 'Silk Blend',
 'Synthetic',
 'Viscose',
 'Viscose Blend',
 'Wool'}

In [16]:
fabric_raw_label_counts = df['specs.Fabric'].value_counts()

fabric_mapped_label_counts = {}

# for each raw fabric label, find the mapped label and add the count to the mapped label

for raw_label, count in fabric_raw_label_counts.items():
	mapped_label = fabric_mapping.get(raw_label, None)
	if mapped_label:
		fabric_mapped_label_counts[mapped_label] = fabric_mapped_label_counts.get(mapped_label, 0) + count


# print(f"Raw Fabric Label Counts:")
# print(fabric_raw_label_counts.to_frame())

print(f"\nMapped Fabric Label Counts:")
for k, v in fabric_mapped_label_counts.items():
	print(f"{k}: {v}")


Mapped Fabric Label Counts:
Cotton: 2040
Viscose Blend: 745
Silk: 814
Viscose: 699
Muslin: 410
Silk Blend: 181
Linen: 78
Cotton Blend: 43
Synthetic: 14
Poplin: 11
Blended Fabric: 11
Wool: 7
Georgette: 1
Satin: 2
Denim: 1


## Fabric Classes

Above cells show the

---

In [24]:
# All rows where title contains "floral"

df[df['description'].str.contains("floral", case=False, na=False)]

Unnamed: 0,id,category,sub_category,title,description,images,specs.Colour,specs.Fabric,specs.Value Addition,specs.Cut /Fit,specs.Collar/Neck,specs.Sleeve,specs.Bottom Colour,specs.Bottom Fabric,specs.Dupatta Colour,specs.Dupatta Fabric
0,1,Saree,Cotton,White Printed and Embroidered Cotton Saree,White printed cotton saree with floral embroid...,[https://www.aarong.com/media/catalog/product/...,White,Cotton,Hand Embroidery,,,,,,,
6,7,Saree,Cotton,Magenta Printed and Embroidered Cotton Saree,Magenta printed cotton saree with floral embro...,[https://www.aarong.com/media/catalog/product/...,Magenta,Cotton,Machine Embroidery,,,,,,,
49,50,Saree,Cotton,White/Orange Printed And Embroidered Cotton Saree,White cotton saree with multicolour floral pri...,[https://www.aarong.com/media/catalog/product/...,White,Cotton,Hand Embroidery,,,,,,,
50,51,Saree,Cotton,Green Printed Cotton Saree,Green cotton saree with multicolour floral pri...,[https://www.aarong.com/media/catalog/product/...,Green,Cotton,Screen Print,,,,,,,
52,53,Saree,Cotton,Light Cyan Printed Cotton Saree,Light cyan cotton saree with multicolour flora...,[https://www.aarong.com/media/catalog/product/...,Cyan,Cotton,Screen Print,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5213,7976,Nightwear,,Midnight Blue Printed Voile Nightwear,Midnight blue voile nightwear with multicolour...,[https://www.aarong.com/media/catalog/product/...,Blue,Voile,,Flared,Round Neck,Half Sleeve,,,,
5219,7982,Nightwear,,Dusty Pink Printed and Embroidered Voile Night...,Dusty pink printed voile nightwear with floral...,[https://www.aarong.com/media/catalog/product/...,Pink,Voile,Hand Embroidery,Flared,Round Neck,Short Sleeve,,,,
5230,7993,Nightwear,,Rust Printed Cotton Nightwear,Rust cotton nightwear with floral prints. Come...,[https://www.aarong.com/media/catalog/product/...,Brown,Voile,Screen Print,A-Line,Round Neck,Half Sleeve,,,,
5250,8013,Nightwear,,Peach Printed and Embroidered Voile Nightwear Set,Peach voile nightwear with embroidery. Comes w...,[https://www.aarong.com/media/catalog/product/...,Orange,Voile,Hand Embroidery,A-Line,Round Neck,3-Quarter Sleeve,,,,


In [25]:
# Save id and title as a csv

df[['id', 'title', 'description']].to_csv('../../data/id-title-description.csv', index=False)

In [None]:
# Get all the titles as list
titles = df['title'].tolist()[:20]

print(titles)

['White Printed and Embroidered Cotton Saree', 'Peach Printed Cotton Saree', 'Light Pink Printed Cotton Saree', 'Peach Appliqued and Embroidered Cotton Saree', 'Magenta Printed and Embroidered Cotton Saree', 'Maroon Wax Dyed and Printed Cotton Saree', 'Dark Red Printed and Embroidered Cotton Saree', 'Red Printed And Embroidered Cotton Saree', 'Dark Magenta Printed Cotton Saree', 'Pink Printed Cotton Saree', 'Pink Printed and Embroidered Cotton Saree', 'Multicolour Sand Splash Dyed and Printed Cotton Saree', 'Purple Printed And Embroidered Cotton Saree', 'Pastel Green Printed Cotton Saree', 'Brown Printed And Embroidered Cotton Saree', 'White/Orange Printed Cotton Saree', 'Peach/Mustard Wax Dyed and Brush Painted Cotton Saree', 'Grey Sand Splash Dyed and Brush Painted Cotton Saree', 'Hot Pink Appliqued and Embroidered Cotton Saree', 'Ivory Printed Cotton Saree', 'Yellow Printed Cotton Saree', 'Yellow Appliqued and Embroidered Cotton Saree', 'Yellow Printed Cotton Saree', 'Mustard Printe

In [None]:
# Get id and title as list of tuples
#id_title = df[['id', 'title', 'description', 'images']].values.tolist()[:2]

#id_title

[[1,
  'White Printed and Embroidered Cotton Saree',
  'White printed cotton saree with floral embroidery. Comes with matching unstitched blouse piece attached at the end of saree. Blouse shown in the photo is a styling suggestion, it is not a part of the actual product.',
  ['https://www.aarong.com/media/catalog/product/0/5/0560000072957.jpg?optimize=high&bg-color=255,255,255&fit=bounds&height=667&width=500&canvas=500:667',
   'https://www.aarong.com/media/catalog/product/0/5/0560000072957_2.jpg?optimize=high&bg-color=255,255,255&fit=bounds&height=667&width=500&canvas=500:667',
   'https://www.aarong.com/media/catalog/product/0/5/0560000072957_1.jpg?optimize=high&bg-color=255,255,255&fit=bounds&height=667&width=500&canvas=500:667']],
 [4,
  'Peach Printed Cotton Saree',
  'Peach cotton saree with multicolour prints. Comes with matching unstitched blouse piece attached at the end of saree. Blouse shown in the photo is a styling suggestion, it is not a part of the actual product.',
  ['

In [37]:
new_df = df[['id', 'title', 'description', 'images']]

# Explode images

#new_df['eval_images'] = new_df['images'].apply(eval)

new_df.explode('images', ignore_index=True)

Unnamed: 0,id,title,description,images
0,1,White Printed and Embroidered Cotton Saree,White printed cotton saree with floral embroid...,https://www.aarong.com/media/catalog/product/0...
1,1,White Printed and Embroidered Cotton Saree,White printed cotton saree with floral embroid...,https://www.aarong.com/media/catalog/product/0...
2,1,White Printed and Embroidered Cotton Saree,White printed cotton saree with floral embroid...,https://www.aarong.com/media/catalog/product/0...
3,4,Peach Printed Cotton Saree,Peach cotton saree with multicolour prints. Co...,https://www.aarong.com/media/catalog/product/0...
4,4,Peach Printed Cotton Saree,Peach cotton saree with multicolour prints. Co...,https://www.aarong.com/media/catalog/product/0...
...,...,...,...,...
13611,8046,Pink Tie-Dyed Voile Nightwear,Pink and purple tie-dyes voile nightwear with ...,https://www.aarong.com/media/catalog/product/0...
13612,8047,Light Purple Tie-Dyed and Embroidered Voile Ni...,Light purple and white tie-dyed voile nightwea...,https://www.aarong.com/media/catalog/product/0...
13613,8047,Light Purple Tie-Dyed and Embroidered Voile Ni...,Light purple and white tie-dyed voile nightwea...,https://www.aarong.com/media/catalog/product/0...
13614,8048,Blue Tie-Dyed and Printed Voile Nightwear,Blue printed and tie-dyed voile nightwear with...,https://www.aarong.com/media/catalog/product/0...


In [36]:
# type of images column

new_df['images'].dtype

dtype('O')

In [14]:
# get list of unique categories
a = df['specs.Bottom Colour'].unique().tolist()

b = df['specs.Dupatta Colour'].unique().tolist()

c = df['specs.Colour'].unique().tolist()

all = a + b + c

# remove duplicates
all = list(dict.fromkeys(all))

print(all)

[nan, 'Olive Green', 'Onion Pink', 'White', 'Blue', 'Mustard', 'Pink', 'Stone Blue', 'Multicolour', 'Black', 'Pastel Green', 'Brick Red', 'Fire Orange', 'Yellow Ochre', 'Grey', 'Green', 'Brown', 'Turquoise', 'Dark Olive', 'Orange', 'Peach', 'Sky Blue', 'Blue Grey', 'Seafoam Green', 'Light Green', 'Burgundy', 'Beige', 'Light Lavender', 'Yellow', 'Lavender', 'Lime Green', 'Plum', 'Navy Blue', 'Coral', 'Cider', 'Sage Green', 'Dark Grey', 'Pastel Yellow', 'Strawberry Red', 'Red', 'Maroon', 'Off White', 'Khaki', 'Ivory', 'Olive', 'Burnt Orange', 'Deep Pink', 'Teal', 'Deep Fuchsia', 'Fuchsia', 'Crimson Red', 'Moss Green', 'Lilac', 'Light Khaki', 'Bottle Green', 'Coffee', 'Purple', 'Cyan', 'Deep Peach', 'Light Brown', 'Sienna', 'Dark Teal', 'Pastel Orange', 'Moonlight Blue', 'Watermelon', 'Ecru', 'Hot Pink', 'Twilight Blue', 'Golden', 'Light Peach', 'Honey Mustard', 'Desert Dust', 'Pale Yellow', 'Lemon Yellow', 'Light Mauve', 'Mulberry', 'Sunny Lime', 'Mint Blue', 'Mauve', 'Magenta', 'Light P

In [21]:
# get list of unique categories
a = df['specs.Fabric'].unique().tolist()

b = df['specs.Bottom Fabric'].unique().tolist()

c = df['specs.Dupatta Fabric'].unique().tolist()

d = df['sub_category'].unique().tolist()

all = a + b + c + d

# remove duplicates
all = list(dict.fromkeys(all))

# print(len(a))
# print(len(b))
# print(len(c))
# print(len(d))

print(all)

['Cotton', 'Muslin', 'Silk', 'Endi Muslin', 'Mixed Muslin (Muslin & Endi)', 'Soft Silk', 'Half Silk (Silk & Cotton)', 'Endi Silk', 'Mixed Silk (Silk & Cotton)', 'Dupion Silk', 'Katan', 'Mixed Cotton', 'Jamdani Half Silk (Silk & Cotton)', 'Viscott (Viscose & Cotton)', 'Viscose', 'Voile', 'Vortex-Cotton', 'Handloom Cotton', 'Jacquard Viscose', 'Vortex Cotton', 'Mixed Viscose', 'Cottray (Cotton & Rayon)', 'Handloom Viscose', 'Ramie Cotton', 'Mixed Silk', 'Joysree Silk', 'Blended Muslin (Muslin & Silk)', 'Mixed Muslin', 'Endi Silk-Muslin', 'Linen', 'Satin', 'SICO (Silk & Cotton)', 'Cashmilon', 'Satin Cotton', 'Wool', nan, 'Georgette', 'Poplin', 'Mixed Vortex', 'Addi Cotton', 'Single Jersey Knit Cotton', 'Denim', 'Sico (Silk & Cotton)', 'Jacquard Cotton', 'Blended Muslin (Muslin+Silk+Endi)', 'Nakshi Kantha', 'Jamdani', 'Brac Silk', 'Casual', 'Semi-Dressy', 'Exclusive', 'Tops', 'Tunics', 'Dresses', 'Pants', 'Sleepwears', '3-Piece Sets', 'Blouse Pieces', 'Yokes', 'N/A']


In [45]:
import re

unique_prefix_counts = {}
unique_suffix_counts = {}

for line in lines:
	# Find all occurrences of "texture" or "textured" and capture the previous and next words
	matches = re.findall(r'(\w+)\s+(nakshi?)\s+(\w+)', line)
	for match in matches:
		previous_word = match[0]
		next_word = match[2]
		#print(f"{previous_word} {match[1]} {next_word} ")

		# Count unique prefixes and suffixes
		if previous_word not in unique_prefix_counts:
			unique_prefix_counts[previous_word] = 0
		unique_prefix_counts[previous_word] += 1
		if next_word not in unique_suffix_counts:
			unique_suffix_counts[next_word] = 0
		unique_suffix_counts[next_word] += 1

# Print the counts of unique prefixes and suffixes
print("Unique Prefix Counts:")

df_prefix_counts = pd.DataFrame(unique_prefix_counts.items(), columns=['Prefix', 'Count'])
df_suffix_counts = pd.DataFrame(unique_suffix_counts.items(), columns=['Suffix', 'Count'])

df_prefix_counts.sort_values(by='Count', ascending=False)[:20]

# for prefix, count in unique_prefix_counts.items():
# 	print(f"{prefix}: {count}")

# print("\nUnique Suffix Counts:")
# for suffix, count in unique_suffix_counts.items():
# 	print(f"{suffix}: {count}")

Unique Prefix Counts:


Unnamed: 0,Prefix,Count
5,of,6
0,brown,3
1,red,1
2,aqua,1
3,copper,1
4,olive,1
6,yellow,1
7,green,1
8,blue,1
9,pink,1


In [46]:
df_suffix_counts.sort_values(by='Count', ascending=False)[:20]

Unnamed: 0,Suffix,Count
0,kantha,18
