# Imports

In [1]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import gradio as gr

# Data Transformation

In [2]:
# Load the image catalogue: one row per product image (file name + image URL).
# A plain relative path is used instead of ".\images.csv": the backslash form
# contains an invalid escape sequence ("\i") and only resolves on Windows.
df_images = pd.read_csv("images.csv")
print(df_images.shape)
df_images.head()

(44446, 2)


Unnamed: 0,filename,link
0,15970.jpg,http://assets.myntassets.com/v1/images/style/p...
1,39386.jpg,http://assets.myntassets.com/v1/images/style/p...
2,59263.jpg,http://assets.myntassets.com/v1/images/style/p...
3,21379.jpg,http://assets.myntassets.com/v1/images/style/p...
4,53759.jpg,http://assets.myntassets.com/v1/images/style/p...


In [3]:
# Turn "15970.jpg" into the integer 15970 so it can be joined against the
# numeric style id.
# - regex=False: ".jpg" is a literal suffix, not a pattern (as a regex the "."
#   would match any character, and older pandas warns about the implicit default).
# - astype(str) first: makes the cell safe to re-run after the column has
#   already been converted to integers.
df_images['filename'] = (
    df_images['filename']
    .astype(str)
    .str.replace(".jpg", "", regex=False)
    .astype("int")
)

  df_images['filename'] = df_images['filename'].str.replace(".jpg", "").astype("int")


In [4]:
# DataFrame.info() writes its report itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
df_images.info()
df_images.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44446 entries, 0 to 44445
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  44446 non-null  int32 
 1   link      44446 non-null  object
dtypes: int32(1), object(1)
memory usage: 521.0+ KB
None


Unnamed: 0,filename,link
0,15970,http://assets.myntassets.com/v1/images/style/p...
1,39386,http://assets.myntassets.com/v1/images/style/p...
2,59263,http://assets.myntassets.com/v1/images/style/p...
3,21379,http://assets.myntassets.com/v1/images/style/p...
4,53759,http://assets.myntassets.com/v1/images/style/p...


In [5]:
# Load the product metadata (one row per style id).
# - Plain relative path instead of ".\styles.csv": the backslash form contains an
#   invalid escape sequence ("\s") and only resolves on Windows.
# - on_bad_lines='skip' drops rows that cannot be parsed into the expected
#   number of columns instead of aborting the load.
df_styles = pd.read_csv("styles.csv", sep=",", on_bad_lines='skip')
print(df_styles.shape)
# info() prints its own report and returns None; do not wrap it in print().
df_styles.info()
df_styles.head()

(44424, 10)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44424 entries, 0 to 44423
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  44424 non-null  int64  
 1   gender              44424 non-null  object 
 2   masterCategory      44424 non-null  object 
 3   subCategory         44424 non-null  object 
 4   articleType         44424 non-null  object 
 5   baseColour          44409 non-null  object 
 6   season              44403 non-null  object 
 7   year                44423 non-null  float64
 8   usage               44107 non-null  object 
 9   productDisplayName  44417 non-null  object 
dtypes: float64(1), int64(1), object(8)
memory usage: 3.4+ MB
None


Unnamed: 0,id,gender,masterCategory,subCategory,articleType,baseColour,season,year,usage,productDisplayName
0,15970,Men,Apparel,Topwear,Shirts,Navy Blue,Fall,2011.0,Casual,Turtle Check Men Navy Blue Shirt
1,39386,Men,Apparel,Bottomwear,Jeans,Blue,Summer,2012.0,Casual,Peter England Men Party Blue Jeans
2,59263,Women,Accessories,Watches,Watches,Silver,Winter,2016.0,Casual,Titan Women Silver Watch
3,21379,Men,Apparel,Bottomwear,Track Pants,Black,Fall,2011.0,Casual,Manchester United Men Solid Black Track Pants
4,53759,Men,Apparel,Topwear,Tshirts,Grey,Summer,2012.0,Casual,Puma Men Grey T-shirt


In [6]:
# Attach the image link to every style. A left join keeps all rows of
# df_styles; styles without a matching image file would get missing
# filename/link values rather than being dropped.
df = df_styles.merge(df_images, left_on="id", right_on="filename", how="left")
print(df.shape)
# info() prints its own report and returns None; do not wrap it in print().
df.info()
df.head()

(44424, 12)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 44424 entries, 0 to 44423
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  44424 non-null  int64  
 1   gender              44424 non-null  object 
 2   masterCategory      44424 non-null  object 
 3   subCategory         44424 non-null  object 
 4   articleType         44424 non-null  object 
 5   baseColour          44409 non-null  object 
 6   season              44403 non-null  object 
 7   year                44423 non-null  float64
 8   usage               44107 non-null  object 
 9   productDisplayName  44417 non-null  object 
 10  filename            44424 non-null  int32  
 11  link                44424 non-null  object 
dtypes: float64(1), int32(1), int64(1), object(9)
memory usage: 4.2+ MB
None


Unnamed: 0,id,gender,masterCategory,subCategory,articleType,baseColour,season,year,usage,productDisplayName,filename,link
0,15970,Men,Apparel,Topwear,Shirts,Navy Blue,Fall,2011.0,Casual,Turtle Check Men Navy Blue Shirt,15970,http://assets.myntassets.com/v1/images/style/p...
1,39386,Men,Apparel,Bottomwear,Jeans,Blue,Summer,2012.0,Casual,Peter England Men Party Blue Jeans,39386,http://assets.myntassets.com/v1/images/style/p...
2,59263,Women,Accessories,Watches,Watches,Silver,Winter,2016.0,Casual,Titan Women Silver Watch,59263,http://assets.myntassets.com/v1/images/style/p...
3,21379,Men,Apparel,Bottomwear,Track Pants,Black,Fall,2011.0,Casual,Manchester United Men Solid Black Track Pants,21379,http://assets.myntassets.com/v1/images/style/p...
4,53759,Men,Apparel,Topwear,Tshirts,Grey,Summer,2012.0,Casual,Puma Men Grey T-shirt,53759,http://assets.myntassets.com/v1/images/style/p...


In [None]:
# Columns concatenated (in this order) into the text that gets embedded.
feature_columns = [
    'gender', 'masterCategory', 'subCategory', 'articleType',
    'baseColour', 'season', 'year', 'usage', 'productDisplayName',
]


def _feature_text(value):
    """Render one cell for the embedding text.

    Missing values become an empty string (instead of the literal word "nan"),
    and whole-number floats such as 2011.0 are written as "2011".
    """
    if pd.isna(value):
        return ""
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value)


# Build one space-separated description per product. Empty pieces are dropped so
# missing attributes leave neither a "nan" token nor a double space in the text.
df['combined_features'] = df[feature_columns].apply(
    lambda row: " ".join(filter(None, map(_feature_text, row))),
    axis=1,
)

# Modelling

In [None]:
# Sentence-embedding model used for both the product texts and the search queries.
sentence_model = SentenceTransformer(
    model_name_or_path="all-MiniLM-L6-v2",
)

In [None]:
# Encode every product description; row i of `embeddings` corresponds to row i of `df`.
product_texts = df['combined_features'].tolist()
embeddings = sentence_model.encode(product_texts)

In [None]:
# Persist the embedding matrix to disk so it can be reused later.
np.save(file="ecommerce_embeddings.npy", arr=embeddings)

In [None]:
# Summarise the embedding matrix (shape and dtype) and show a small slice
# instead of printing the raw array.
print(embeddings.shape, embeddings.dtype)
embeddings[:3]

# Finding Similar Products

In [None]:
def vector_search(query, embeddings, data, top_n=5, similarity_threshold=0.5):
    """Return the rows of ``data`` whose embeddings best match a text query.

    The query is encoded with the module-level ``sentence_model`` and compared
    against every row of ``embeddings`` using cosine similarity. Rows scoring
    below ``similarity_threshold`` are discarded; the remainder are ordered from
    most to least similar and the first ``top_n`` are returned.

    Args:
        query: Text describing what to search for.
        embeddings: Embedding matrix; row ``i`` is looked up positionally in
            ``data`` (via ``iloc``), so the two must be aligned.
        data: Table to select the matching rows from.
        top_n: Maximum number of rows to return.
        similarity_threshold: Minimum cosine similarity a row needs to qualify.

    Returns:
        The selected rows of ``data``, best match first. Empty when no row
        reaches the threshold.
    """
    query_vector = sentence_model.encode(query)
    scores = cosine_similarity([query_vector], embeddings)[0]

    # Positions of all rows that pass the threshold.
    candidates = np.flatnonzero(scores >= similarity_threshold)

    # Order those candidates by descending similarity and keep the best top_n.
    ranking = scores[candidates].argsort()[::-1]
    best = candidates[ranking][:top_n]
    return data.iloc[best]

In [None]:
# Try the search once with a sample query before wiring it into the UI.
sample_query = "Casual black leather jacket for winter"
results = vector_search(sample_query, embeddings, df, top_n=5, similarity_threshold=0.5)
results

In [None]:
def get_recommendations(query):
    """Return an HTML snippet with the products that best match ``query``.

    Runs ``vector_search`` over the module-level ``embeddings`` / ``df`` with its
    default settings and renders the matches as an HTML table.

    Args:
        query: Free-text description typed by the user.

    Returns:
        An HTML string: a table of matching products, or a short message when
        the query is blank or nothing matches (instead of a header-only table).
    """
    # Nothing to search for: skip the model call and prompt the user instead.
    if query is None or not str(query).strip():
        return "<p>Please describe what you are looking for.</p>"

    results = vector_search(query, embeddings, df)
    if results.empty:
        return "<p>No matching products found. Try a different description.</p>"

    shown_columns = ['productDisplayName', 'masterCategory', 'subCategory', 'baseColour']
    # na_rep="" keeps missing attributes blank rather than showing "NaN" to the user.
    return results[shown_columns].to_html(index=False, na_rep="")

In [None]:
# Build the web UI: a single text box in, an HTML table of recommendations out.
demo = gr.Interface(
    fn=get_recommendations,
    inputs=gr.Textbox(label="Describe Your Fashion Preferences"),
    outputs=gr.HTML(label="Recommendations"),
    title="Fashion Recommendation System",
    allow_flagging='never',
)
demo.launch()