# Imports

In [1]:
import pandas as pd
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import TfidfVectorizer

# Data Transformation

In [2]:
df_images = pd.read_csv(".\images.csv")
print(df_images.shape)
df_images.head()

(44446, 2)


Unnamed: 0,filename,link
0,15970.jpg,http://assets.myntassets.com/v1/images/style/p...
1,39386.jpg,http://assets.myntassets.com/v1/images/style/p...
2,59263.jpg,http://assets.myntassets.com/v1/images/style/p...
3,21379.jpg,http://assets.myntassets.com/v1/images/style/p...
4,53759.jpg,http://assets.myntassets.com/v1/images/style/p...


In [3]:
df_images['filename'] = df_images['filename'].str.replace(".jpg", "").astype("int")

  df_images['filename'] = df_images['filename'].str.replace(".jpg", "").astype("int")


In [4]:
print(df_images.info())
df_images.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44446 entries, 0 to 44445
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  44446 non-null  int32 
 1   link      44446 non-null  object
dtypes: int32(1), object(1)
memory usage: 521.0+ KB
None


Unnamed: 0,filename,link
0,15970,http://assets.myntassets.com/v1/images/style/p...
1,39386,http://assets.myntassets.com/v1/images/style/p...
2,59263,http://assets.myntassets.com/v1/images/style/p...
3,21379,http://assets.myntassets.com/v1/images/style/p...
4,53759,http://assets.myntassets.com/v1/images/style/p...


In [5]:
df_styles = pd.read_csv(".\styles.csv", sep=",", on_bad_lines='skip')
print(df_styles.shape)
print(df_styles.info())
df_styles.head()

(44424, 10)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44424 entries, 0 to 44423
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  44424 non-null  int64  
 1   gender              44424 non-null  object 
 2   masterCategory      44424 non-null  object 
 3   subCategory         44424 non-null  object 
 4   articleType         44424 non-null  object 
 5   baseColour          44409 non-null  object 
 6   season              44403 non-null  object 
 7   year                44423 non-null  float64
 8   usage               44107 non-null  object 
 9   productDisplayName  44417 non-null  object 
dtypes: float64(1), int64(1), object(8)
memory usage: 3.4+ MB
None


Unnamed: 0,id,gender,masterCategory,subCategory,articleType,baseColour,season,year,usage,productDisplayName
0,15970,Men,Apparel,Topwear,Shirts,Navy Blue,Fall,2011.0,Casual,Turtle Check Men Navy Blue Shirt
1,39386,Men,Apparel,Bottomwear,Jeans,Blue,Summer,2012.0,Casual,Peter England Men Party Blue Jeans
2,59263,Women,Accessories,Watches,Watches,Silver,Winter,2016.0,Casual,Titan Women Silver Watch
3,21379,Men,Apparel,Bottomwear,Track Pants,Black,Fall,2011.0,Casual,Manchester United Men Solid Black Track Pants
4,53759,Men,Apparel,Topwear,Tshirts,Grey,Summer,2012.0,Casual,Puma Men Grey T-shirt


In [6]:
df = df_styles.merge(df_images, left_on="id", right_on="filename", how="left")
print(df.shape)
print(df.info())
df.head()

(44424, 12)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 44424 entries, 0 to 44423
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  44424 non-null  int64  
 1   gender              44424 non-null  object 
 2   masterCategory      44424 non-null  object 
 3   subCategory         44424 non-null  object 
 4   articleType         44424 non-null  object 
 5   baseColour          44409 non-null  object 
 6   season              44403 non-null  object 
 7   year                44423 non-null  float64
 8   usage               44107 non-null  object 
 9   productDisplayName  44417 non-null  object 
 10  filename            44424 non-null  int32  
 11  link                44424 non-null  object 
dtypes: float64(1), int32(1), int64(1), object(9)
memory usage: 4.2+ MB
None


Unnamed: 0,id,gender,masterCategory,subCategory,articleType,baseColour,season,year,usage,productDisplayName,filename,link
0,15970,Men,Apparel,Topwear,Shirts,Navy Blue,Fall,2011.0,Casual,Turtle Check Men Navy Blue Shirt,15970,http://assets.myntassets.com/v1/images/style/p...
1,39386,Men,Apparel,Bottomwear,Jeans,Blue,Summer,2012.0,Casual,Peter England Men Party Blue Jeans,39386,http://assets.myntassets.com/v1/images/style/p...
2,59263,Women,Accessories,Watches,Watches,Silver,Winter,2016.0,Casual,Titan Women Silver Watch,59263,http://assets.myntassets.com/v1/images/style/p...
3,21379,Men,Apparel,Bottomwear,Track Pants,Black,Fall,2011.0,Casual,Manchester United Men Solid Black Track Pants,21379,http://assets.myntassets.com/v1/images/style/p...
4,53759,Men,Apparel,Topwear,Tshirts,Grey,Summer,2012.0,Casual,Puma Men Grey T-shirt,53759,http://assets.myntassets.com/v1/images/style/p...


In [7]:
df['combined_features'] = df.apply(
    lambda row: f"{row['gender']} {row['masterCategory']} {row['subCategory']} {row['articleType']} {row['baseColour']} {row['season']} {row['year']} {row['usage']} {row['productDisplayName']}",
    axis=1
)

In [8]:
df_small = df.groupby(['gender', 'articleType', 'season'], group_keys=False).apply(lambda x: x.sample(frac=0.1, replace=False, random_state=42))
df_small['combined_features'] = df_small.apply(
    lambda row: f"{row['gender']} {row['masterCategory']} {row['subCategory']} {row['articleType']} {row['baseColour']} {row['season']} {row['year']} {row['usage']} {row['productDisplayName']}",
    axis=1
)

In [9]:
df_small.reset_index(drop=True, inplace=True)

# Modelling

In [10]:
#Define a TF-IDF Vectorizer Object. Remove all english stopwords
tfidf = TfidfVectorizer(stop_words='english')
#Construct the required TF-IDF matrix by applying the fit_transform method on the overview feature
tfidf_matrix = tfidf.fit_transform(df_small['combined_features'])

In [11]:
tfidf_matrix.shape

(4390, 2632)

In [12]:
# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
indices = pd.Series(df_small.index, index=df_small['id']).drop_duplicates()

In [13]:
# because I am using a small dataset, I use a filtered indices series to the length of the small dataset
indices = indices[indices < df_small.shape[0]]

In [14]:
indices.sort_values()

id
37539       0
31114       1
47125       2
47100       3
47113       4
         ... 
46530    4385
25619    4386
19291    4387
29505    4388
23272    4389
Length: 4390, dtype: int64

In [15]:
indices.index.tolist()[:5]

[37539, 31114, 47125, 47100, 47113]

In [16]:
df_small[df_small['id'].isin(indices.index.tolist())].head()

Unnamed: 0,id,gender,masterCategory,subCategory,articleType,baseColour,season,year,usage,productDisplayName,filename,link,combined_features
0,37539,Boys,Apparel,Socks,Booties,White,Summer,2012.0,Casual,Madagascar 3 Infants Boys White Booties,37539,http://assets.myntassets.com/v1/images/style/p...,Boys Apparel Socks Booties White Summer 2012.0...
1,31114,Boys,Apparel,Bottomwear,Capris,Blue,Summer,2012.0,Casual,Gini and Jony Boys Washed Blue 3/4 Length Pants,31114,http://assets.myntassets.com/v1/images/style/p...,Boys Apparel Bottomwear Capris Blue Summer 201...
2,47125,Boys,Footwear,Shoes,Casual Shoes,Blue,Fall,2012.0,Casual,Marvel Boys Blue Web Spider Shoes,47125,http://assets.myntassets.com/v1/images/style/p...,Boys Footwear Shoes Casual Shoes Blue Fall 201...
3,47100,Boys,Footwear,Shoes,Casual Shoes,Red,Summer,2012.0,Casual,Marvel Boys Red Light Spidey Shoes,47100,http://assets.myntassets.com/v1/images/style/p...,Boys Footwear Shoes Casual Shoes Red Summer 20...
4,47113,Boys,Footwear,Flip Flops,Flip Flops,Blue,Fall,2012.0,Casual,Marvel Boys Blue Slippers,47113,http://assets.myntassets.com/v1/images/style/p...,Boys Footwear Flip Flops Flip Flops Blue Fall ...


In [17]:
df_small[df_small['id'] == 47125]['productDisplayName'].values[0]

'Marvel Boys Blue Web Spider Shoes'

In [18]:

def content_recommender(title, cosine_sim=cosine_sim, df=df_small, indices=indices):
  print(f"Searcing for products similar to: {df[df['id'] == title]['productDisplayName'].values[0]}")
  idx = indices[title]
  sim_scores = list(enumerate(cosine_sim[idx]))
  sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
  sim_scores = sim_scores[1:11]
  movie_indices = [i[0] for i in sim_scores]
  return df['productDisplayName'].iloc[movie_indices].tolist()

In [19]:
# Get recommendations
# For simplicity, I am using the id of the product as the input. In a real-world scenario, you would use the product name
# TODO: Use the product name as input 
content_recommender(47125)

Searcing for products similar to: Marvel Boys Blue Web Spider Shoes


['Marvel Boys Red Light Spidey Shoes',
 'Marvel Boys Blue Slippers',
 'Madagascar3 Boys Blue Printed T-Shirt',
 'Timberland Men Casual Blue Casual Shoes',
 'Madagascar 3 Boys Blue Printed T-shirt',
 'Gini and Jony Boys Printed Blue T-shirt',
 'Gini and Jony Boys Printed Blue T-shirt',
 'Ganuchi Men Blue Shoes',
 'iD Men Casual Blue Shoes',
 'United Colors of Benetton Boys Boys Blue Washed Jeans']