In [2]:
from dotenv import load_dotenv

# Load settings from a .env file into the process environment.
# override=True lets values from the file replace variables that are already set.
load_dotenv(override=True)

True

In [3]:
import os
import mysql.connector

# Connection settings are read from the environment (MYSQL_HOST, MYSQL_USER,
# MYSQL_PASSWORD, MYSQL_DB) instead of being written into the notebook.
conn = mysql.connector.connect(
	host=os.getenv("MYSQL_HOST"),
	user=os.getenv("MYSQL_USER"),
	password=os.getenv("MYSQL_PASSWORD"),
	database=os.getenv("MYSQL_DB"),
	auth_plugin='mysql_native_password',
)

# Module-level cursor on that connection.
cursor = conn.cursor()

In [4]:
import pandas as pd

def get_annotable_items():
	"""Load every un-flagged product together with its flattened raw details.

	Runs a fixed query through the notebook-level ``cursor`` that joins
	``products``, ``raw_details`` and ``annotation_status`` and keeps the rows
	where ``annotation_status.flagged = 0``. Each row's ``details`` value is
	parsed into a Python object and flattened with ``pd.json_normalize`` (nested
	keys become dotted column names such as ``specs.Colour``).

	Returns:
		pandas.DataFrame: one row per query result, restricted to ``id``,
		``category``, ``sub_category``, ``title``, ``description``, ``images``
		and the ``specs.*`` columns selected at the end of the function.

	Raises:
		ValueError, SyntaxError: if a ``details`` value is neither valid JSON
			nor a Python literal.
		KeyError: if any of the selected columns is missing from the result.
	"""
	# Local imports keep this cell runnable without touching the import cells.
	import ast
	import json

	def _parse_details(raw):
		"""Parse one stored ``details`` value without executing it as code."""
		if isinstance(raw, dict):
			return raw
		if isinstance(raw, (bytes, bytearray)):
			raw = raw.decode('utf-8')
		try:
			# Handles JSON payloads, including null/true/false.
			return json.loads(raw)
		except ValueError:
			# Handles Python-literal payloads (single quotes, None/True/False).
			# Unlike eval(), literal_eval never runs code found in the data.
			return ast.literal_eval(raw)

	query = """ 
			SELECT p.id, p.product_name, p.category, p.sub_category, r.details 
			FROM products p 
			JOIN raw_details r ON p.id = r.product_id
			JOIN annotation_status a on p.id=a.product_id
			WHERE a.flagged = 0;
			"""

	cursor.execute(query)

	rows = cursor.fetchall()
	# cursor.description holds one entry per result column; element 0 is the column name.
	columns = [i[0] for i in cursor.description]

	df = pd.DataFrame(rows, columns=columns)
	df_normalized = pd.json_normalize(df['details'].apply(_parse_details))
	# Both frames share the same row index, so a column-wise concat lines the rows up.
	df = pd.concat([df, df_normalized], axis=1)

	df = df[[
		'id',
		'category',
		'sub_category',
		'title',
		'description',
		'images',
		'specs.Colour',
		'specs.Fabric',
		'specs.Value Addition',
		'specs.Cut /Fit',
		'specs.Collar/Neck',
		'specs.Sleeve',
		'specs.Bottom Colour',
		'specs.Bottom Fabric',
		'specs.Dupatta Colour',
		'specs.Dupatta Fabric'
	]]

	return df

In [331]:
# Pull the un-flagged products into the working frame used by the rest of the notebook.
df = get_annotable_items()

df

Unnamed: 0,id,category,sub_category,title,description,images,specs.Colour,specs.Fabric,specs.Value Addition,specs.Cut /Fit,specs.Collar/Neck,specs.Sleeve,specs.Bottom Colour,specs.Bottom Fabric,specs.Dupatta Colour,specs.Dupatta Fabric
0,1,Saree,Cotton,White Printed and Embroidered Cotton Saree,White printed cotton saree with floral embroid...,[https://www.aarong.com/media/catalog/product/...,White,Cotton,Hand Embroidery,,,,,,,
1,2,Saree,Cotton,Ivory Printed Cotton Saree,"Ivory cotton saree with golden, red, beige, ma...",[https://www.aarong.com/media/catalog/product/...,White,Cotton,Screen Print,,,,,,,
2,3,Saree,Cotton,White Printed And Embroidered Cotton Saree,White printed cotton saree with orange and red...,[https://www.aarong.com/media/catalog/product/...,White,Cotton,Hand Embroidery,,,,,,,
3,4,Saree,Cotton,Peach Printed Cotton Saree,Peach cotton saree with multicolour prints. Co...,[https://www.aarong.com/media/catalog/product/...,Orange,Cotton,Screen Print,,,,,,,
4,5,Saree,Cotton,Light Pink Printed Cotton Saree,"Light pink cotton saree with orange, pink, pur...",[https://www.aarong.com/media/catalog/product/...,Pink,Cotton,Screen Print,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5281,8044,Nightwear,,Stone Blue Printed Voile Nightwear,Stone blue printed voile nightwear with matchi...,[https://www.aarong.com/media/catalog/product/...,Blue,Voile,Block Print,Kaftan Style,Round Neck,Short Sleeve,,,,
5282,8045,Nightwear,,White Tie-Dyed Voile Nightwear,White and green tie-dyes voile nightwear with ...,[https://www.aarong.com/media/catalog/product/...,White,Voile,Screen Print,Kaftan Style,Round Neck,Kaftan Sleeve,,,,
5283,8046,Nightwear,,Pink Tie-Dyed Voile Nightwear,Pink and purple tie-dyes voile nightwear with ...,[https://www.aarong.com/media/catalog/product/...,Pink,Voile,Screen Print,Kaftan Style,Round Neck,Kaftan Sleeve,,,,
5284,8047,Nightwear,,Light Purple Tie-Dyed and Embroidered Voile Ni...,Light purple and white tie-dyed voile nightwea...,[https://www.aarong.com/media/catalog/product/...,Purple,Voile,Machine Embroidery,A-Line,Square Neck,Short Sleeve,,,,


In [6]:
# Rows whose title or description is missing (null) or an empty string
title_missing = df['title'].isnull() | (df['title'] == '')
description_missing = df['description'].isnull() | (df['description'] == '')

df[title_missing | description_missing]

Unnamed: 0,id,category,sub_category,title,description,images,specs.Colour,specs.Fabric,specs.Value Addition,specs.Cut /Fit,specs.Collar/Neck,specs.Sleeve,specs.Bottom Colour,specs.Bottom Fabric,specs.Dupatta Colour,specs.Dupatta Fabric


In [7]:
# Take id, title, description to a new df
# (.copy() gives df_clean its own data, so columns added to it later do not touch df)

df_clean = df[['id', 'title', 'description']].copy()

# Spot-check two random rows; no random_state is passed, so the rows differ between runs.
df_clean.sample(2)

Unnamed: 0,id,title,description
1378,1379,Pastel Green Printed And Embroidered Muslin Saree,"Pastel green printed muslin saree with green, ..."
2186,2187,Grey Printed and Nakshi Kantha Embroidered Hal...,"Grey printed half-silk saree with beige, brown..."


In [1]:
import nltk

# Show which NLTK version is installed in this kernel.
print(nltk.__version__)

3.9.1


In [10]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

from nltk.stem import WordNetLemmatizer
from nltk import wordnet
# Fetch every NLTK resource this notebook uses so the cell also works on a fresh
# environment; nltk.download() only reports "already up-to-date" when the
# package is present.
nltk.download('wordnet')		# needed by WordNetLemmatizer
nltk.download('punkt_tab')		# needed by word_tokenize
nltk.download('stopwords')		# needed by stopwords.words('english') below


# Module-level helpers read by cleanup_text().
stop_words = set(stopwords.words('english'))
wnl = WordNetLemmatizer()


def cleanup_text(text, return_as_tokens=False, lemmatize=False, remove_stop_words=True):
	"""Normalise a piece of free text for word-level analysis.

	The text is lower-cased, stripped of punctuation, tokenised with
	``word_tokenize`` and then optionally filtered and lemmatised.

	Args:
		text: The value to clean. Anything that is not a ``str`` (e.g. ``None``
			or NaN) is treated as empty.
		return_as_tokens: If true, return the list of tokens; otherwise return
			the tokens joined by single spaces.
		lemmatize: If true, pass every remaining token through the module-level
			``wnl`` lemmatizer.
		remove_stop_words: If true, drop tokens found in the module-level
			``stop_words`` set.

	Returns:
		list[str] when ``return_as_tokens`` is true, otherwise ``str``. Non-string
		input yields the empty value of the requested type (``[]`` or ``''``).
	"""
	if not isinstance(text, str):
		# Keep the return type consistent with return_as_tokens so callers can
		# always treat the result as a token list when they asked for one.
		return [] if return_as_tokens else ''

	text = text.lower()						# Lowercase the text

	# Punctuation is deleted, not replaced by a space, so hyphenated words are
	# fused: "viscose-cotton" becomes "viscosecotton".
	text = re.sub(r'[^\w\s]', '', text)			# Remove punctuation

	tokens = word_tokenize(text)				# Tokenize the text

	if remove_stop_words:
		filtered_words = [word for word in tokens if word not in stop_words]	# Remove stop words
	else:
		filtered_words = tokens

	if lemmatize:
		filtered_words = [wnl.lemmatize(word) for word in filtered_words]		# Lemmatize words

	if return_as_tokens:
		return filtered_words
	return ' '.join(filtered_words)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\amahmud\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\amahmud\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


In [11]:
# Add a cleaned (stop words removed, lemmatised) companion column for each text column.
for source_col in ('title', 'description'):
	df_clean[f'{source_col}_clean'] = df_clean[source_col].apply(
		lambda value: cleanup_text(value, lemmatize=True, remove_stop_words=True)
	)

In [12]:
# Compare raw and cleaned text side by side on five random rows.
df_clean.sample(5)

Unnamed: 0,id,title,description,title_clean,description_clean
4558,7272,Brown Printed Viscose-Cotton Panjabi,Brown viscose-cotton panjabi with black and go...,brown printed viscosecotton panjabi,brown viscosecotton panjabi black golden print
2876,2879,White Printed and Embroidered Viscose-Cotton S...,White textured printed viscose-cotton kameez w...,white printed embroidered viscosecotton shalwa...,white textured printed viscosecotton kameez mu...
2856,2859,Pink Printed and Embroidered Viscose Shalwar K...,Pink printed dobby textured viscose kameez wit...,pink printed embroidered viscose shalwar kameez,pink printed dobby textured viscose kameez tea...
1929,1930,Fuchsia Printed and Embroidered Silk Saree,Fuchsia printed silk saree with multicolour em...,fuchsia printed embroidered silk saree,fuchsia printed silk saree multicolour embroid...
2135,2136,Golden Printed and Nakshi Kantha Embroidered S...,"Golden printed silk saree with salmon pink, re...",golden printed nakshi kantha embroidered silk ...,golden printed silk saree salmon pink red wate...


In [36]:
from collections import Counter
import re


# Tokenise every description (stop words removed, no lemmatisation)
description_words = df['description'].apply(
	lambda text: cleanup_text(text, return_as_tokens=True, lemmatize=False, remove_stop_words=True)
)

# Flatten the per-row token lists into one list covering all descriptions
all_words = [token for words in description_words for token in words]

len(all_words)


92241

In [39]:
# Count how often each token occurs across all descriptions.
word_freq = Counter(all_words)

In [40]:
# The 20 most frequent tokens as (word, count) pairs.
word_freq.most_common(20)

[('blouse', 4593),
 ('saree', 4405),
 ('comes', 2989),
 ('matching', 2721),
 ('cotton', 2684),
 ('printed', 2376),
 ('piece', 2323),
 ('unstitched', 2304),
 ('photo', 2287),
 ('part', 2287),
 ('shown', 2286),
 ('styling', 2286),
 ('suggestion', 2286),
 ('actual', 2286),
 ('product', 2285),
 ('embroidery', 2132),
 ('green', 1956),
 ('prints', 1929),
 ('attached', 1573),
 ('end', 1573)]

In [44]:
# How many distinct words occur more than 10 times

sum(1 for count in word_freq.values() if count > 10)

247

In [None]:
# Review those word frequencies to identify potential labels
# (the 250 most frequent words as a two-column frame: word, frequency)

wf = pd.DataFrame(word_freq.most_common(250), columns=['word', 'frequency'])

In [None]:
# Raise the row display limit only inside this block so all of wf is rendered.
with pd.option_context('display.max_rows', 250):
    display(wf)

In [48]:
# Print the words as a plain Python list (easy to copy out of the notebook).
print(wf['word'].to_list())

['blouse', 'saree', 'comes', 'matching', 'cotton', 'printed', 'piece', 'unstitched', 'photo', 'part', 'shown', 'styling', 'suggestion', 'actual', 'product', 'embroidery', 'green', 'prints', 'attached', 'end', 'blue', 'white', 'panjabi', 'silk', 'black', 'viscose', 'pink', 'viscosecotton', 'golden', 'brown', 'red', 'shalwar', 'orange', 'kameez', 'dupatta', 'yellow', 'weaving', 'purple', 'grey', 'textured', 'multicolour', 'crafted', 'district', 'bangladesh', 'origin', 'tangail', 'muslin', 'beige', 'light', 'magenta', 'casual', 'teal', 'maternity', 'sirajganj', 'tunic', 'top', 'taaga', 'painted', 'voile', 'turquoise', 'peach', 'olive', 'tiedyed', 'details', 'scarf', 'maroon', 'whether', 'handloom', 'shades', 'pair', 'dyed', 'appliqued', 'elegant', 'nakshi', 'beautiful', 'day', 'outdoors', 'meetup', 'attire', 'presence', 'kurta', 'nightwear', 'mustard', 'accessories', 'hand', 'try', 'features', 'coral', 'pabna', 'half', 'silver', 'kantha', 'detailing', 'brush', 'katan', 'opera', 'baluchari

In [None]:
# Save wf to csv
# (relative path; index=False leaves the row index out of the file)
wf.to_csv('../../data/word_freq_new.csv', index=False)

In [261]:
import re

def show_common_prefix_suffix(lines, word, top=20):
	"""Print the words that most often appear right before and after ``word``.

	Every whitespace-delimited token that contains ``word`` (case-insensitive,
	hyphens included, e.g. "tie-dyed" for "dye") counts as one occurrence. For
	each occurrence the neighbouring word on each side is tallied; a neighbour
	is recorded as ``''`` when the token starts/ends the line or is separated
	from its neighbour by punctuation.

	Args:
		lines: Iterable of strings to scan.
		word: Text to look for; it is matched literally (regex-escaped).
		top: Maximum number of prefixes and of suffixes to print.

	Returns:
		None. Three sections are printed: the top prefixes, the top suffixes
		and the set of distinct matching tokens.
	"""
	# Whole token containing `word`. The look-behind pins the match to the start
	# of the token so a hyphenated token is not split into "prefix" + "-rest".
	token_re = re.compile(r'(?<![\w-])[\w-]*' + re.escape(word) + r'[\w-]*', re.IGNORECASE)
	# Word that ends the text before the token, separated only by whitespace.
	before_re = re.compile(r'(\w+)\s+\Z')
	# Word that follows the token; always matches, possibly with an empty group.
	after_re = re.compile(r'\s*(\w*)')

	unique_prefix_counts = {}
	unique_suffix_counts = {}
	unique_words = set()

	for line in lines:
		for hit in token_re.finditer(line):
			# Neighbours are looked up without consuming them, so a word can be
			# the suffix of one occurrence and the target/prefix of the next.
			before = before_re.search(line[:hit.start()])
			previous_word = before.group(1) if before else ''
			next_word = after_re.match(line, hit.end()).group(1)

			unique_prefix_counts[previous_word] = unique_prefix_counts.get(previous_word, 0) + 1
			unique_suffix_counts[next_word] = unique_suffix_counts.get(next_word, 0) + 1
			unique_words.add(hit.group(0))

	def _top_frame(counts, label):
		"""Build the `top` most frequent entries as a printable two-column frame."""
		# sorted() is stable, so ties keep first-seen order; this also works when
		# nothing matched and `counts` is empty.
		ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
		return pd.DataFrame(ranked[:max(top, 0)], columns=[label, 'Count'])

	print(f"Top {top} Prefixes:")
	print(_top_frame(unique_prefix_counts, 'Prefix').to_string(index=False))
	print(f"\nTop {top} Suffixes:")
	print(_top_frame(unique_suffix_counts, 'Suffix').to_string(index=False))

	print(f"\nUnique words: {unique_words}")

In [458]:
# Exploration cell: inspect how `target_word` is used in the product descriptions.
# Change `target_word` (and toggle the commented-out filters / column lists below) and re-run.
target_word = 'navy'

# Case-insensitive match on the description; rows with a null description are excluded (na=False).
# Note: str.contains interprets target_word as a regular expression by default.
samples = df[df['description'].str.contains(target_word, na=False, case=False) 
			 #& ~df['description'].str.contains('green', na=False, case=False)
			 #& ~df['specs.Colour'].str.contains('blue', na=False, case=False)
			 #& ~df['specs.Fabric'].str.contains('Cotton', na=False, case=False)
			 #& ~df['specs.Value Addition'].str.contains('sequin', na=False, case=False)
			 ][
				 #['id', 'description', 'specs.Colour', 'specs.Fabric', 'specs.Value Addition']
				 ['id', 'description', 'specs.Colour']
				 #['specs.Value Addition']
				 #['specs.Colour']
			]

#samples = df[df['description'].str.contains('dye', na=False, case=False)][['id', 'description', 'specs.Colour', 'specs.Fabric', 'specs.Value Addition']]

# Number of matching rows and the ids of the first 20 of them.
print(f"Total Match: {len(samples)}")
print(samples['id'][:20].to_list())

# Neighbouring-word statistics for the matched descriptions (top 40 on each side).
lines = samples['description'].to_list()
show_common_prefix_suffix(lines, target_word, 40)

# Show at most the first 200 matches as a list of dicts, one per row.
samples[:200].to_dict(orient='records')

Total Match: 100
[40, 121, 176, 310, 315, 392, 400, 410, 441, 462, 522, 539, 557, 570, 574, 615, 692, 786, 886, 1010]
Top 40 Prefixes:
Prefix  Count
           67
   and     18
  with     16
  this      1
  over      1

Top 40 Suffixes:
Suffix  Count
  blue    101
  Blue      2

Unique words: {'navy', 'Navy'}


[{'id': 40,
  'description': 'Navy blue Pabna cotton baluchari saree with weaving. Comes with matching unstitched blouse piece. Blouse shown in the photo is a styling suggestion, it is not a part of the actual product.\nOrigin:\nThis saree is crafted in the district of Pabna, Bangladesh.',
  'specs.Colour': 'Blue'},
 {'id': 121,
  'description': 'Blue striped Tangail cotton saree with silver, navy blue and sage green weaving. Comes with matching unstitched blouse piece. Blouse shown in the photo is a styling suggestion, it is not a part of the actual product.\nOrigin:\nThis saree is crafted in the district of Tangail, Bangladesh.',
  'specs.Colour': 'Blue'},
 {'id': 176,
  'description': 'Navy blue cotton saree with shades of green and yellow prints. Comes with matching unstitched blouse piece. Blouse shown in the photo is a styling suggestion, it is not a part of the actual product.',
  'specs.Colour': 'Blue'},
 {'id': 310,
  'description': 'Olive cotton saree with bottle green, magen

Top 40 Prefixes:
  Prefix  Count
             11
   Light      2
    with      1
     and      1
matching      1

Top 40 Suffixes:
     Suffix  Count
    printed      5
    viscose      2
     prints      1
        red      1
       silk      1
    Tangail      1
     purple      1
 embroidery      1
embroidered      1
     cotton      1
        and      1

Unique words: {'mulberry', 'Mulberry'}
