This notebook experiments with different ways of encoding the labels, to understand which encoding makes the most sense.

In [24]:
# Load dataset

import pandas as pd

# Read the cleaned dataset from a relative path; `df` is the working frame
# that the cells below keep extending.
df = pd.read_csv('../../data/aarong_dataset_clean.csv')

# Preview 5 random rows. No random_state is passed, so the rows shown change
# on every run.
df.sample(5)

Unnamed: 0,product_id,annotated_image_name,fabric_types_clean,wear_types_clean,colors_clean,textures_clean,value_additions_clean
51,11,0560000069371_2.jpg___dress.png,Cotton,Saree,red,"Screen Print,Printed",Screen Print
736,1627,0540000023673_2.jpg,Silk,Saree,"brown,red,pink","applique,embroidery",applique
186,8000,0460000028987_2.jpg,Linen,Nightwear,yellow,"embroidery,printed",
484,3116,0410000105598.jpg,Cotton Blend,Maternity,"black,purple,blue",embroidery,
428,861,0560000070030.jpg,Cotton,Saree,"golden,purple,pink",opera,


Try `MultiLabelBinarizer` (MLB) for the multi-label `textures_clean` column.

In [25]:
from sklearn.preprocessing import MultiLabelBinarizer

def ml_binarizer(df, column_name):
	"""
	Expand a comma-separated multi-label column into one-hot indicator columns.

	Each cell of ``df[column_name]`` is split on ``,``. Cells that are not
	strings (e.g. NaN for a missing value) are treated as having no labels.
	One 0/1 column named ``{column_name}__{label}`` is produced per distinct
	label found in the column.

	Args:
		df: Source DataFrame. It is not modified.
		column_name: Name of the column holding comma-separated labels.

	Returns:
		A new DataFrame containing the original columns followed by the
		indicator columns, aligned on ``df.index``.
	"""
	def _split_labels(value):
		# Anything that is not a string means "no labels" rather than an error.
		if not isinstance(value, str):
			return []
		# Trim whitespace and skip empty tokens so inputs such as "a, b" or
		# "a,,b" do not create spurious " b" or "" classes.
		return [label.strip() for label in value.split(',') if label.strip()]

	# Keep the label lists in a standalone Series instead of writing a
	# temporary column into `df`, which would mutate the caller's frame.
	label_lists = df[column_name].apply(_split_labels)

	mlb = MultiLabelBinarizer()
	one_hot = mlb.fit_transform(label_lists)

	one_hot_cols = [f"{column_name}__{c}" for c in mlb.classes_]

	# Reuse df.index so the concat below aligns row-for-row.
	one_hot_df = pd.DataFrame(one_hot, columns=one_hot_cols, index=df.index)

	return pd.concat([df, one_hot_df], axis=1)



In [26]:
# Rebind df to the frame returned by ml_binarizer, which has one indicator
# column per texture label appended.
# NOTE: re-running this cell without reloading df appends the same columns again.
df = ml_binarizer(df, 'textures_clean')

df.sample(5)

Unnamed: 0,product_id,annotated_image_name,fabric_types_clean,wear_types_clean,colors_clean,textures_clean,value_additions_clean,textures_clean__Block Print,textures_clean__Floral,textures_clean__Hand Embroidery,...,textures_clean__hand paint,textures_clean__mauve,textures_clean__nakshi,textures_clean__opera,textures_clean__printed,textures_clean__screen print,textures_clean__striped,textures_clean__textured,textures_clean__tie dye,textures_clean__wax dye
646,2134,0550000147859_1.jpg___dress.png,Muslin,Saree,"grey,purple,blue,black,cyan","embroidery,nakshi,printed",nakshi,0,0,0,...,0,0,1,0,1,0,0,0,0,0
383,2927,0430000116817_3.jpg___full.png,Silk,Shalwar Kameez,"yellow,white,green","embroidery,printed,striped",,0,0,0,...,0,0,0,0,1,0,1,0,0,0
70,13,0560000069851_2.jpg,Cotton,Saree,"orange,pink,blue","embroidery,printed",,0,0,0,...,0,0,0,0,1,0,0,0,0,0
64,2,0560000073100_2.jpg___dress.png,Cotton,Saree,"grey,white,red",screen print,,0,0,0,...,0,0,0,0,0,1,0,0,0,0
718,1737,0550000149616_2.jpg,Silk,Saree,"golden,green",opera,,0,0,0,...,0,0,0,1,0,0,0,0,0,0


Let's try one-hot encoding instead of MLB for the single-label feature columns.

In [27]:
from sklearn.preprocessing import OneHotEncoder

def onehot_encode(df, column_name, drop='first'):
	"""
	Append one-hot encoded columns for a single-label column.

	Args:
		df: Source DataFrame. It is not modified.
		column_name: Name of the column to encode.
		drop: Passed through to ``OneHotEncoder(drop=...)``. The default
			``'first'`` omits the first category so the generated columns are
			not perfectly collinear; pass ``None`` to keep every category.

	Returns:
		A new DataFrame containing the original columns followed by one
		``{column_name}__{category}`` column per retained category. Values are
		the encoder's dense float output (0.0 / 1.0).
	"""
	ohe = OneHotEncoder(sparse_output=False, drop=drop)

	# Double brackets: the encoder expects a 2-D (n_samples, n_features) input.
	one_hot = ohe.fit_transform(df[[column_name]])

	# Derive the retained categories from the fitted encoder rather than
	# hard-coding a slice, so the names stay correct for any `drop` strategy.
	# drop_idx_ is None when nothing is dropped; otherwise it holds, per
	# feature, the index of the dropped category (or None for that feature).
	dropped_idx = None if ohe.drop_idx_ is None else ohe.drop_idx_[0]
	oh_columns = [
		f"{column_name}__{oh_category}"
		for position, oh_category in enumerate(ohe.categories_[0])
		if dropped_idx is None or position != dropped_idx
	]

	# Reuse df.index so the concat below aligns row-for-row.
	one_hot_df = pd.DataFrame(one_hot, columns=oh_columns, index=df.index)

	return pd.concat([df, one_hot_df], axis=1)


In [28]:
# Rebind df to the frame returned by onehot_encode, which has the fabric-type
# indicator columns appended.
# NOTE: re-running this cell without reloading df appends the same columns again.
df = onehot_encode(df, 'fabric_types_clean')

df.sample(5)

Unnamed: 0,product_id,annotated_image_name,fabric_types_clean,wear_types_clean,colors_clean,textures_clean,value_additions_clean,textures_clean__Block Print,textures_clean__Floral,textures_clean__Hand Embroidery,...,fabric_types_clean__Linen,fabric_types_clean__Muslin,fabric_types_clean__Muslin Blend,fabric_types_clean__Poplin,fabric_types_clean__Silk,fabric_types_clean__Silk Blend,fabric_types_clean__Synthetic,fabric_types_clean__Viscose,fabric_types_clean__Viscose Blend,fabric_types_clean__Voile
637,6619,0570000128211.jpg,Viscose,Panjabi,"pink,purple,blue",block print,,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
786,1985,0540000024800.jpg,Silk,Saree,"brown,golden,pink",,,0,0,0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
75,14,0560000072297_2.jpg___dress.png,Cotton,Saree,"grey,blue","tie dye,screen print",,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
739,1165,0560000067816.jpg,Cotton,Saree,"golden,green,blue",block print,,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
183,6888,0570000125697.jpg___full.png,Viscose,Panjabi,"brown,yellow,grey",block print,,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [None]:
from my_utils.dataset_utils import get_multilabel_counts