# Overview

This notebook analyzes the raw data scraped from the website and then segments the related images.

In [None]:
# Load variables from a local .env file into the process environment so the
# MYSQL_* settings read through os.getenv in the connection cell are defined.
from dotenv import load_dotenv

load_dotenv()

In [None]:
import os
import mysql.connector

# Open the MySQL connection; every setting comes from the MYSQL_* environment
# variables.
conn = mysql.connector.connect(
    host=os.getenv("MYSQL_HOST"),
    user=os.getenv("MYSQL_USER"),
    password=os.getenv("MYSQL_PASSWORD"),
    database=os.getenv("MYSQL_DB"),
    auth_plugin='mysql_native_password',
)

# Notebook-level cursor reused by the query cell.
cursor = conn.cursor()

In [None]:
# Fetch every product in the listed clothing categories together with its raw
# scraped details (one row per products/raw_details match).
query = """

SELECT p.id, p.product_name, p.category, p.sub_category, r.details 
FROM products p 
JOIN raw_details r ON p.id = r.product_id

WHERE p.category in ("Saree", "Panjabi", "Shalwar Kameez", "Kurta", "Skirts", "Tops", "Scarves", "Coats & Jackets", "Shawls", "Nightwear", "Maternity", "Fabric (Metres)")

"""

cursor.execute(query)

rows = cursor.fetchall()

# The first field of each cursor.description entry is the column name.
columns = [column_meta[0] for column_meta in cursor.description]

print(f"{len(rows)} rows fetched")
print(f"Columns: {columns}")

In [None]:
import pandas as pd

# Wrap the fetched rows in a DataFrame, labelling the columns with the names
# taken from the cursor description; the bare `df` displays it as cell output.
df = pd.DataFrame(rows, columns=columns)
df

In [None]:
import ast

# Parse each stored `details` string into a Python object and flatten nested
# keys into dotted column names (e.g. "specs.Colour").
#
# SECURITY: the details text originates from a scraped website, so it must not
# be passed to eval(), which would execute arbitrary code embedded in it.
# ast.literal_eval only accepts plain literals (dicts, lists, strings, numbers,
# booleans, None) and raises ValueError/SyntaxError on anything else.
# NOTE(review): assumes every stored details value is a plain Python literal.
df_normalized = pd.json_normalize(df['details'].apply(ast.literal_eval))

df_normalized.head(n=3)

In [None]:
# Adds the normalized columns to the original DataFrame
# (column-wise concat: rows are matched on the index of both frames).
# NOTE: this rebinds `df`, so re-running the cell appends the normalized
# columns a second time.
df = pd.concat([df, df_normalized], axis=1)
df.head(n=2)

In [None]:
# Columns kept for the per-product lookups: identifiers, the descriptive text,
# the image list and the flattened "specs.*" attributes.
selected_columns = [
	'id',
	'category',
	'sub_category',
	'title',
	'description',
	'images',
	'specs.Colour',
	'specs.Fabric',
	'specs.Value Addition',
	'specs.Cut /Fit',
	'specs.Collar/Neck',
	'specs.Sleeve',
	'specs.Bottom Colour',
	'specs.Bottom Fabric',
	'specs.Dupatta Colour',
	'specs.Dupatta Fabric',
]
df_selected = df[selected_columns]

# Show a random handful of rows as a sanity check.
df_selected.sample(n=5)

In [None]:
def get_annotation_candidates(conn):
	"""Return the ids of products that still need annotation.

	Selects every ``product_id`` from ``annotation_status`` whose row has
	``annotated=0`` and ``flagged=0``.

	Args:
		conn: An open database connection exposing ``cursor()``.

	Returns:
		A list of product ids, in the order the database returned them.
	"""
	cursor = conn.cursor()
	try:
		cursor.execute("""SELECT product_id FROM annotation_status WHERE annotated=0 and flagged=0""")
		return [row[0] for row in cursor.fetchall()]
	finally:
		# Release the cursor even when the query fails, so repeated calls do
		# not accumulate open cursors on the connection.
		cursor.close()


def get_segmentation_candidates(conn):
	"""Return the ids of products that still need segmentation.

	Selects every ``product_id`` from ``annotation_status`` whose row has
	``segmented=0`` and ``flagged=0``.

	Args:
		conn: An open database connection exposing ``cursor()``.

	Returns:
		A list of product ids, in the order the database returned them.
	"""
	cursor = conn.cursor()
	try:
		cursor.execute("""SELECT product_id FROM annotation_status WHERE segmented=0 and flagged=0""")
		return [row[0] for row in cursor.fetchall()]
	finally:
		# Release the cursor even when the query fails, so repeated calls do
		# not accumulate open cursors on the connection.
		cursor.close()

def get_item(id):
	"""Look up one product in the notebook-level ``df_selected`` frame.

	Returns the first row whose ``id`` column equals *id* (as a Series), or
	None when no row matches.
	"""
	id_matches = df_selected['id'] == id
	matching_rows = df_selected[id_matches]

	# Guard clause: nothing found for this id.
	if matching_rows.empty:
		return None

	return matching_rows.iloc[0]


In [None]:
# Spot-check the lookup helper with product id 1; the cell output shows the
# row, or None when id 1 is not present in df_selected.
get_item(1)

In [None]:
# Ids of products that are neither segmented nor flagged yet.
candidates = get_segmentation_candidates(conn)

In [None]:
# Number of products still waiting for segmentation.
len(candidates)

In [None]:
def resolve_local_image_path(image_url, image_dir):
	"""Map a remote image URL to the path of its local copy in *image_dir*.

	Example: the url
	"https://www.aarong.com/media/catalog/product/0/5/0560000072957_1.jpg?optimize=high&bg-color=255,255,255&fit=bounds&height=667&width=500&canvas=500:667"
	is resolved as f"{image_dir}/0560000072957_1.jpg".

	Args:
		image_url: URL (or plain path) of the image.
		image_dir: Directory that holds the downloaded images.

	Returns:
		The string "<image_dir>/<file name>".
	"""
	from posixpath import basename
	from urllib.parse import urlsplit

	# Take the file name from the URL *path* only. Splitting the raw URL on
	# "/" first would pick up the wrong segment whenever the query string (or
	# fragment) itself contains a "/".
	image_name = basename(urlsplit(image_url).path)
	return f"{image_dir}/{image_name}"

In [None]:
# Preview the row of the first segmentation candidate only: the `break` ends
# the loop after a single iteration (nothing is printed when there are no
# candidates).
# NOTE(review): the loop variable shadows the builtin `id` in this notebook.
for id in candidates:
	item = get_item(id)

	print(item)
	break

In [None]:
from transformers import pipeline
from PIL import Image
import numpy as np
import os

class SegmentationHelper:
	"""Cut clothing regions out of images using a segmentation pipeline.

	The segmentation pipeline is run once per image; for every entry of
	``label_combinations`` the masks of the listed labels are merged and
	applied to a copy of the image as its alpha channel.
	"""

	def __init__(self, data_dir, segmented_dir, label_combinations):
		"""Store the configuration and load the segmentation pipeline.

		Args:
			data_dir: Directory of the source images (stored only; not read
				by this class).
			segmented_dir: Directory for the segmented output (stored only;
				not written by this class).
			label_combinations: Mapping of output key -> collection of
				segment labels whose masks are merged for that key.
		"""
		self.data_dir = data_dir
		self.segmented_dir = segmented_dir
		self.label_combinations = label_combinations

		self.segmenter = pipeline(model="mattmdjaga/segformer_b2_clothes")

	@staticmethod
	def _combine_masks(masks):
		"""Return the pixel-wise union of *masks* as a single array.

		A maximum is used instead of a sum so that overlapping masks cannot
		overflow the mask dtype (with uint8 masks, 255 + 255 wraps to 254).
		For non-overlapping masks the result equals their sum.
		"""
		return np.maximum.reduce([np.asarray(mask) for mask in masks])

	def get_masked_image(self, segments, image, labels):
		"""Return a copy of *image* whose alpha channel keeps only *labels*.

		Args:
			segments: Iterable of dicts with a 'label' and a 'mask' entry
				(presumably the pipeline output, with image-sized masks —
				confirm against the pipeline documentation).
			image: Image to copy and mask.
			labels: Labels whose masks are merged into the alpha channel.

		Returns:
			The masked copy, or None when no segment carries one of *labels*.
		"""
		selected_masks = [s['mask'] for s in segments if s['label'] in labels]

		if not selected_masks:
			return None

		alpha = Image.fromarray(self._combine_masks(selected_masks))

		# Work on a copy so the caller's image is left untouched.
		masked_image = image.copy()
		masked_image.putalpha(alpha)

		return masked_image

	def get_segmented_images(self, image_path):
		"""Segment the image at *image_path* for every label combination.

		Returns:
			A dict mapping each ``label_combinations`` key to its masked
			image; keys for which no matching segment exists are omitted.
		"""
		segmented_images = {}

		# The context manager closes the underlying file once segmentation is
		# done; the returned images are independent copies, so they stay
		# usable after the source file is closed.
		with Image.open(image_path) as image:
			# Perform segmentation
			segments = self.segmenter(image)

			for key, labels in self.label_combinations.items():
				masked_image = self.get_masked_image(segments, image, labels)

				if masked_image is not None:
					segmented_images[key] = masked_image

		return segmented_images


In [None]:
# Build the segmentation helper (constructing it also loads the segmentation
# pipeline). data_dir / segmented_dir are read back by the batch cell as the
# input and output directories.
seg_helper = SegmentationHelper(
    data_dir="../data/raw_images/resized",
    segmented_dir="../data/raw_images/segmented",
    # Output key -> segment labels whose masks are merged into one cut-out.
    label_combinations={
        "full": ["Upper-clothes", "Skirt", "Pants", "Dress", "Scarf"],
        "dress": ["Dress"],
        "upper": ["Upper-clothes"],
        "lower": ["Skirt", "Pants"],
    }
)

In [None]:
import os

image_dir = seg_helper.data_dir
segmented_dir = seg_helper.segmented_dir

candidates = get_segmentation_candidates(conn)

# Process one batch of at most 500 pending products per run.
for product_id in candidates[:500]:
	item = get_item(product_id)

	# get_item returns None for products that are not in df_selected (the
	# candidate list is not restricted to the categories loaded above).
	# Skip them without touching their status instead of crashing.
	if item is None:
		continue

	image_urls = item['images']

	# A missing "images" entry shows up as a non-list value (e.g. NaN) after
	# normalization; there is nothing to segment for such a product.
	if not isinstance(image_urls, (list, tuple)):
		continue

	# Generate segments for each image, store them in respective folder
	for image_url in image_urls:
		image_path = resolve_local_image_path(image_url, image_dir)

		# Images that were never downloaded are silently skipped.
		if not os.path.exists(image_path):
			continue

		segmented_images = seg_helper.get_segmented_images(image_path)

		# Use the original image file name as the output directory name.
		orig_image_file_name = os.path.basename(image_path)
		output_dir = os.path.join(segmented_dir, orig_image_file_name)

		for key, masked_image in segmented_images.items():
			# Created lazily so images without any segment leave no empty dir.
			os.makedirs(output_dir, exist_ok=True)

			segmented_file_path = os.path.join(output_dir, f"{orig_image_file_name}___{key}.png")
			masked_image.save(segmented_file_path)

	# Update db, mark segmented = 1. A dedicated cursor is used (and closed)
	# so the notebook-level `cursor` is not replaced, and the id is passed as
	# a bound parameter rather than formatted into the SQL text.
	update_cursor = conn.cursor()
	try:
		update_cursor.execute(
			"UPDATE annotation_status SET segmented=1 WHERE product_id=%s",
			(product_id,),
		)
		conn.commit()
	finally:
		update_cursor.close()
