In [None]:
import pandas as pd
import numpy as np

## Load dataset

In [None]:
# Read the raw product catalogue from the local data directory and preview
# its first rows.
df = pd.read_csv('./data/dataset.csv')
df.head()

Unnamed: 0,ProductID,ProductName,ProductBrand,Gender,Price (INR),NumImages,Description,PrimaryColor
0,10017413,DKNY Unisex Black & Grey Printed Medium Trolle...,DKNY,Unisex,11745,7,"Black and grey printed medium trolley bag, sec...",Black
1,10016283,EthnoVogue Women Beige & Grey Made to Measure ...,EthnoVogue,Women,5810,7,Beige & Grey made to measure kurta with churid...,Beige
2,10009781,SPYKAR Women Pink Alexa Super Skinny Fit High-...,SPYKAR,Women,899,7,Pink coloured wash 5-pocket high-rise cropped ...,Pink
3,10015921,Raymond Men Blue Self-Design Single-Breasted B...,Raymond,Men,5599,5,Blue self-design bandhgala suitBlue self-desig...,Blue
4,10017833,Parx Men Brown & Off-White Slim Fit Printed Ca...,Parx,Men,759,5,"Brown and off-white printed casual shirt, has ...",White


In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(12491, 8)

In [None]:
# Column dtypes and non-null counts, used to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12491 entries, 0 to 12490
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   ProductID     12491 non-null  int64 
 1   ProductName   12491 non-null  object
 2   ProductBrand  12491 non-null  object
 3   Gender        12491 non-null  object
 4   Price (INR)   12491 non-null  int64 
 5   NumImages     12491 non-null  int64 
 6   Description   12491 non-null  object
 7   PrimaryColor  11597 non-null  object
dtypes: int64(3), object(5)
memory usage: 780.8+ KB


In [None]:
# Select every object-dtype column and replace its missing values with the
# placeholder string 'unknown', then re-check the non-null counts.
cate_cols = df.select_dtypes(include=['object']).columns
df[cate_cols] = df[cate_cols].fillna('unknown')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12491 entries, 0 to 12490
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   ProductID     12491 non-null  int64 
 1   ProductName   12491 non-null  object
 2   ProductBrand  12491 non-null  object
 3   Gender        12491 non-null  object
 4   Price (INR)   12491 non-null  int64 
 5   NumImages     12491 non-null  int64 
 6   Description   12491 non-null  object
 7   PrimaryColor  12491 non-null  object
dtypes: int64(3), object(5)
memory usage: 780.8+ KB


In [None]:
# Text fields that are merged into a single descriptive string per product.
selected_cols = ["ProductName", "ProductBrand", "Gender", "Description", "PrimaryColor"]
# Join the selected fields of each row with single spaces, in column order.
overall_infos = [" ".join(fields) for fields in df[selected_cols].values]
df["overall_info"] = overall_infos

In [None]:
# Take an explicit copy: without it new_df is a slice of df, and adding
# columns to it later makes pandas emit SettingWithCopyWarning.
new_df = df[["overall_info"]].copy()
new_df.head()

Unnamed: 0,overall_info
0,DKNY Unisex Black & Grey Printed Medium Trolle...
1,EthnoVogue Women Beige & Grey Made to Measure ...
2,SPYKAR Women Pink Alexa Super Skinny Fit High-...
3,Raymond Men Blue Self-Design Single-Breasted B...
4,Parx Men Brown & Off-White Slim Fit Printed Ca...


## Embedding

In [None]:
import nltk
# Fetch the NLTK 'stopwords' corpus so it can be loaded from nltk.corpus.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
from nltk.corpus import stopwords
# English stop-word list; text_preprocessing reads this notebook-level name.
stop = stopwords.words('english')

In [None]:
def text_preprocessing(column, stop_words=None):
    """Lower-case a text column, strip URLs and a few symbols, and drop stop words.

    Parameters
    ----------
    column : pandas.Series
        Series of strings to clean.
    stop_words : iterable of str, optional
        Words to remove after splitting each string on whitespace. When
        omitted, the notebook-level ``stop`` list is used.

    Returns
    -------
    pandas.Series
        One cleaned, space-joined string per row, on the same index as
        ``column``.
    """
    if stop_words is None:
        stop_words = stop
    # Set membership is O(1); a list would be scanned once per token.
    stop_words = set(stop_words)

    # Make all words lower case.
    column = column.str.lower()
    # Remove URLs and the symbols @ % : , (other punctuation is kept).
    # regex=True is passed explicitly because pandas >= 2.0 defaults to
    # regex=False, which would treat the whole pattern as a literal string.
    column = column.str.replace(r'http\S+|www\.\S+|[@%:,]', '', regex=True)
    # Split each sentence into words, drop stop words, and re-assemble the
    # sentence. apply works row by row, so it does not depend on the Series
    # having a 0..n-1 index.
    word_tokens = column.str.split()
    return word_tokens.apply(
        lambda words: " ".join(word for word in words if word not in stop_words)
    )

In [None]:
# Store the cleaned version of the combined text next to the original.
new_df['cleaned_info'] = text_preprocessing(new_df['overall_info'])

In [None]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("bert-base-nli-mean-tokens")
# Encode all cleaned texts in one batched call instead of one model call per
# row; encode() batches a list of sentences internally and returns one vector
# per input, in input order.
embeddings = model.encode(new_df["cleaned_info"].tolist())
# Keep the column layout used downstream: one embedding vector per row.
new_df["embeddings"] = list(embeddings)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df["embeddings"] = new_df["cleaned_info"].apply(lambda x: model.encode(x))


In [None]:
# Expand the per-row embedding vectors into a plain table (one row per vector).
embedding_df = pd.DataFrame(new_df["embeddings"].tolist())
# Write the table without a header row and without the index column.
embedding_df.to_csv("./data/embedding.csv", header=False, index=False)

In [None]:
# Re-read the dataset from disk so the rows shown for a query are the original
# records, without the 'unknown' fill and the overall_info column added above.
df = pd.read_csv("./data/dataset.csv").reset_index(drop=True)
# Per-row embedding vectors that queries are compared against.
# NOTE(review): this is a Series of arrays rather than a single 2-D array;
# confirm the similarity helper handles that in the installed versions.
docs = new_df["embeddings"]

In [None]:
from sentence_transformers import util
# Free-text query and the number of matches to show.
text = "Men Black T-Shirt"
top_n = 5

# Embed the query with the same model that produced the document embeddings.
query_vector = model.encode(text)

# Cosine similarity between the query and every document vector.
results = util.pytorch_cos_sim(query_vector, docs)

# Order documents by descending similarity along dim 1, take the first (only)
# query row, and keep its top_n positions.
sort_idx = results.argsort(dim=1, descending=True)[0, :top_n]
print(sort_idx)
print(df.iloc[sort_idx])

tensor([2094, 3101, 6369,  879, 9738])
      ProductID                                        ProductName  \
2094   10062775        Bossini Men Black Printed Hooded Sweatshirt   
3101   10071371       IMYOUNG Men Black Printed Scoop Neck T-shirt   
6369   10176819          Ecko Unltd Men Black Printed Hood T-shirt   
879    10000369          Parx Men Black Printed Round Neck T-shirt   
9738   10222821  Ed Hardy Men Black Embellished  Round Neck T-s...   

     ProductBrand Gender  Price (INR)  NumImages  \
2094      Bossini    Men          849          5   
3101      IMYOUNG    Men          699          5   
6369   Ecko Unltd    Men          699          5   
879          Parx    Men          489          5   
9738     Ed Hardy    Men         1249          5   

                                            Description PrimaryColor  
2094  Black printed sweatshirt, long sleeves, straig...        Black  
3101  Black and grey printed T-shirt, has a scoop ne...        Black  
6369  Black pr