In [1]:
import numpy as np
import pandas as pd
import chromadb
import openai
import json

#### Loading the dataset

In [3]:
data = pd.read_csv('Fashion Dataset v2.csv')
data.head()

Unnamed: 0,p_id,name,products,price,colour,brand,img,ratingCount,avg_rating,description,p_attributes
0,17048614,Khushal K Women Black Ethnic Motifs Printed Ku...,"Kurta, Palazzos, Dupatta",5099,Black,Khushal K,http://assets.myntassets.com/assets/images/170...,4522.0,4.418399,Black printed Kurta with Palazzos with dupatta...,"{'Add-Ons': 'NA', 'Body Shape ID': '443,333,32..."
1,16524740,InWeave Women Orange Solid Kurta with Palazzos...,"Kurta, Palazzos, Floral Print Dupatta",5899,Orange,InWeave,http://assets.myntassets.com/assets/images/165...,1081.0,4.119334,Orange solid Kurta with Palazzos with dupatta<...,"{'Add-Ons': 'NA', 'Body Shape ID': '443,333,32..."
2,16331376,Anubhutee Women Navy Blue Ethnic Motifs Embroi...,"Kurta, Trousers, Dupatta",4899,Navy Blue,Anubhutee,http://assets.myntassets.com/assets/images/163...,1752.0,4.16153,Navy blue embroidered Kurta with Trousers with...,"{'Add-Ons': 'NA', 'Body Shape ID': '333,424', ..."
3,14709966,Nayo Women Red Floral Printed Kurta With Trous...,"Kurta, Trouser, Dupatta",3699,Red,Nayo,http://assets.myntassets.com/assets/images/147...,4113.0,4.088986,Red printed kurta with trouser and dupatta<br>...,"{'Add-Ons': 'NA', 'Body Shape ID': '333,424', ..."
4,11056154,AHIKA Women Black & Green Printed Straight Kurta,Kurta,1350,Black,AHIKA,http://assets.myntassets.com/assets/images/110...,21274.0,3.978377,"Black and green printed straight kurta, has a ...","{'Body Shape ID': '424', 'Body or Garment Size..."


### Performing EDA on the data

In [5]:
len(data)

14214

In [6]:
data.dtypes

p_id              int64
name             object
products         object
price             int64
colour           object
brand            object
img              object
ratingCount     float64
avg_rating      float64
description      object
p_attributes     object
dtype: object

**Checking for the number of unique values**

In [8]:
data.nunique()

p_id            14214
name            13873
products          910
price            1209
colour             50
brand            1022
img             14214
ratingCount       829
avg_rating       2367
description     14181
p_attributes    13089
dtype: int64

**Checking data for null values**

In [10]:
data.isna().sum()

p_id               0
name               0
products           0
price              0
colour             0
brand              0
img                0
ratingCount     7684
avg_rating      7684
description        0
p_attributes       0
dtype: int64

Fixing the 'ratingCount' and 'avg_rating' columns.

The rating is probably null because these could be new products and no one has rated them yet.

In [12]:
data.loc[data['ratingCount'].isna(), 'ratingCount'] = 0
data.loc[data['avg_rating'].isna(), 'avg_rating'] = 0
data.isna().sum()

p_id            0
name            0
products        0
price           0
colour          0
brand           0
img             0
ratingCount     0
avg_rating      0
description     0
p_attributes    0
dtype: int64

##### Examining data types

In [14]:
data.dtypes

p_id              int64
name             object
products         object
price             int64
colour           object
brand            object
img              object
ratingCount     float64
avg_rating      float64
description      object
p_attributes     object
dtype: object

In [15]:
data = data.convert_dtypes()

In [16]:
data.dtypes

p_id                     Int64
name            string[python]
products        string[python]
price                    Int64
colour          string[python]
brand           string[python]
img             string[python]
ratingCount              Int64
avg_rating             Float64
description     string[python]
p_attributes    string[python]
dtype: object

EDA is now complete

In [18]:
# Set the API key
filepath = "../"

with open(filepath + "OPENAI_API_Key.txt", "r") as f:
  openai.api_key = ''.join(f.readlines())

In [19]:
# Import the OpenAI Embedding Function into chroma

from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction

In [20]:
# Define the path where chroma collections will be stored

chroma_data_path = 'chromadb_data'

In [21]:
# Call PersistentClient()
client = chromadb.PersistentClient(path=chroma_data_path)

In [22]:
# Set up the embedding function using the OpenAI embedding model

#model = "text-embedding-ada-002"
#embedding_function = OpenAIEmbeddingFunction(api_key=openai.api_key, model_name=model)

from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')

In [23]:
# Initialise a collection in chroma and pass the embedding_function to it so that it used OpenAI embeddings to embed the documents

#fashion_coll = client.get_or_create_collection(name='myntra_fashion_data', embedding_function=embedding_function)
client.delete_collection(name='myntra_fashion_data')
fashion_coll = client.get_or_create_collection(name='myntra_fashion_data')

In [24]:
documents_list = data["description"].apply(lambda d:d.lower()).tolist()
ids = data['p_id'].apply(lambda id:str(id)).tolist()

In [25]:
import ast
#metadata_list = data['p_attributes'].apply(lambda attr:ast.literal_eval(attr)).tolist()
def combine_metadatas(row):
    metadata_dict = ast.literal_eval(row['p_attributes'].lower())
    metadata_dict.update({'color' : row['colour'].lower(), 'brand' : row['brand'].lower(), 'price' : row['price'], 'ratingCount' : row['ratingCount'], 'avg_rating' : row['avg_rating']})
    return metadata_dict


data['new_metadatas'] = data.apply(combine_metadatas, axis=1)
metadata_list = data['new_metadatas'].tolist()

#metadata_list

In [26]:
fashion_coll.upsert(
#    embeddings=[embedding_function],
    documents= documents_list,
    ids = ids,
    metadatas = metadata_list
)

#### Fetch Query Parameters from OpenAI


In [58]:
def parse_query_to_where_clause(user_query):
    prompt = f"""
You are a helpful assistant that converts user queries into filters for a product database.
Given the following user query, extract the conditions and format them as a JSON object for use as a "where" clause in ChromaDB.

The metadata fields are:
- color: string (e.g., "red", "blue")
- price: float (e.g., {{"$lt": 15}} for "under $15")
- brand: string (e.g., "BrandA")
- rating: float (e.g., {{"$gt": 4}} for "above 4 stars")


User Query: {user_query}

Strictly return the "content" as a JSON string without the special characters as prefix and suffix.
Don't add any attributes other than "color", "price", "brand", "rating".
None of the attributes are mandatory. If you cannot find a value then don't include the attribute in the response.
"""

    # Make the API call
    response = openai.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "system", "content": "You are a helpful assistant."},
                  {"role": "user", "content": prompt}]
    )

    # Extract and return the response
    return response.choices[0].message.content

# Example user query
user_query = 'Please show me black printed kurtas'
where_clause = parse_query_to_where_clause(user_query)
print(where_clause)

{
  "color": "black"
}


In [62]:
query='Please show me black printed kurtas'
query_embedding = model.encode(query.lower())
results = fashion_coll.query(query_embeddings=[query_embedding], n_results=2, where=json.loads(where_clause))
for r in results['documents']:
    print(r)
for r in results['metadatas']:
    print(r)

['black and golden printed straight kurta, has a round neck, three-quarter sleeves, straight hem, and side slits80% viscose rayon and 20% polyester<br>hand-washthe model (height 5\'8") is wearing a size s', 'black and white printed kurta with palazzos and dupatta<br>black and white ethnic print straight calf length kurta, has a round neck, three-quarter sleeves, side slits<br>white and black printed palazzos, has elasticated waistband, slip-on closure<br>black and white printed dupatta, has printed bordertop fabric: pure cotton<br>bottom fabric: pure cotton<br>dupatta fabric: pure cotton<br>hand-washthe model (height 5\'8") is wearing a size s']
[{'avg_rating': 4.15498155, 'body or garment size': 'to-fit denotes body measurements in', 'body shape id': '424', 'brand': 'w', 'color': 'black', 'colour family': 'monochrome', 'design styling': 'regular', 'fabric': 'viscose rayon', 'fabric 2': 'polyester', 'fabric purity': 'synthetic', 'hemline': 'straight', 'length': 'knee length', 'main tre