In [11]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re


In [2]:
# Load the clothing catalogue from a CSV file located relative to the working directory.
df = pd.read_csv('clothing_data.csv')

In [3]:
# Display the loaded frame to inspect its columns and a sample of its rows.
df

Unnamed: 0,ProductID,ProductName,ProductBrand,Gender,Price (INR),NumImages,Description,PrimaryColor
0,10017413,DKNY Unisex Black & Grey Printed Medium Trolle...,DKNY,Unisex,11745,7,"Black and grey printed medium trolley bag, sec...",Black
1,10016283,EthnoVogue Women Beige & Grey Made to Measure ...,EthnoVogue,Women,5810,7,Beige & Grey made to measure kurta with churid...,Beige
2,10009781,SPYKAR Women Pink Alexa Super Skinny Fit High-...,SPYKAR,Women,899,7,Pink coloured wash 5-pocket high-rise cropped ...,Pink
3,10015921,Raymond Men Blue Self-Design Single-Breasted B...,Raymond,Men,5599,5,Blue self-design bandhgala suitBlue self-desig...,Blue
4,10017833,Parx Men Brown & Off-White Slim Fit Printed Ca...,Parx,Men,759,5,"Brown and off-white printed casual shirt, has ...",White
...,...,...,...,...,...,...,...,...
12486,10262843,Pepe Jeans Men Black Hammock Slim Fit Low-Rise...,Pepe Jeans,Men,1299,7,"Black dark wash 5-pocket low-rise jeans, clean...",Black
12487,10261721,Mochi Women Gold-Toned Solid Heels,Mochi,Women,1990,5,"A pair of gold-toned open toe heels, has regul...",Gold
12488,10261607,612 league Girls Navy Blue & White Printed Reg...,612 league,Girls,602,4,Navy Blue and White printed mid-rise denim sho...,Blue
12489,10266621,Bvlgari Men Aqva Pour Homme Marine Eau de Toil...,Bvlgari,Men,8950,2,Bvlgari Men Aqva Pour Homme Marine Eau de Toil...,


In [5]:
# Element-wise missing-value mask: True wherever a cell is null.
df.isnull()

Unnamed: 0,ProductID,ProductName,ProductBrand,Gender,Price (INR),NumImages,Description,PrimaryColor
0,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...
12486,False,False,False,False,False,False,False,False
12487,False,False,False,False,False,False,False,False
12488,False,False,False,False,False,False,False,False
12489,False,False,False,False,False,False,False,True


In [6]:
# Count the missing values in each column.
df.isnull().sum()

ProductID         0
ProductName       0
ProductBrand      0
Gender            0
Price (INR)       0
NumImages         0
Description       0
PrimaryColor    894
dtype: int64

In [7]:
# Drop every row that has a missing value in any column, modifying df in place.
# dropna does not renumber the index, so the remaining rows keep their original labels.
df.dropna(inplace=True)

In [8]:
# Re-check the per-column null counts after the rows with missing values were dropped.
df.isnull().sum()

ProductID       0
ProductName     0
ProductBrand    0
Gender          0
Price (INR)     0
NumImages       0
Description     0
PrimaryColor    0
dtype: int64

In [12]:
def clean_text(text):
    """Normalise a free-text description before TF-IDF vectorisation.

    The text is lower-cased, every character that is not an ASCII letter,
    a digit or whitespace is removed (so "t-shirt" becomes "tshirt"), and
    runs of whitespace are collapsed to a single space with the ends trimmed.

    Parameters
    ----------
    text : str or None or float
        The raw text. ``None`` and float NaN (the way pandas represents a
        missing string) are treated as missing. Any other non-string value
        is converted with ``str()`` before cleaning.

    Returns
    -------
    str
        The cleaned text; an empty string for missing input.
    """
    # Missing values are not strings; NaN is the only float unequal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Lower-case first so the character class below only needs a-z.
    lowered = str(text).lower()

    # Remove special characters and punctuation (nothing is inserted in their place).
    stripped = re.sub(r'[^a-z0-9\s]', '', lowered)

    # Collapse whitespace runs and trim leading/trailing whitespace.
    return re.sub(r'\s+', ' ', stripped).strip()

In [13]:
# Normalise every description with clean_text, overwriting the Description column.
df['Description'] = df['Description'].apply(clean_text)

In [14]:
# TF-IDF vectorizer configured to filter scikit-learn's built-in English stop-word list.
vectorizer = TfidfVectorizer(stop_words='english')

In [16]:
# Fit the vectorizer on the cleaned descriptions and build the TF-IDF matrix
# (one row per row of df, in positional order).
tfidf_matrix = vectorizer.fit_transform(df['Description'])

In [20]:
def compute_similarity(input_text, top_n=5):
    """Return the catalogue items whose descriptions best match a text query.

    Relies on the notebook-level ``vectorizer`` (already fitted),
    ``tfidf_matrix`` (its output for ``df['Description']``) and ``df``.

    Parameters
    ----------
    input_text : str
        Free-text query, e.g. ``"male shoes"``.
    top_n : int, default 5
        Maximum number of items to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``ProductID``, ``Description`` and ``similarity_score``,
        ordered from most to least similar. It is a new frame; ``df`` is
        not modified.

    Raises
    ------
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    # The corpus descriptions were normalised with clean_text before fitting,
    # so the query must be normalised the same way; otherwise a query such as
    # "t-shirt" is tokenised differently from the stored "tshirt".
    query = clean_text(input_text)

    # Convert the query to a TF-IDF vector in the fitted vocabulary.
    input_vector = vectorizer.transform([query])

    # Cosine similarity between the query and every item (flattened to 1-D).
    similarity_scores = cosine_similarity(input_vector, tfidf_matrix)[0]

    # Positions of the best matches, highest score first.
    item_indices = similarity_scores.argsort()[::-1][:top_n]

    # tfidf_matrix rows line up with df by position, hence iloc. assign()
    # builds a new frame instead of writing a column onto a chained-indexing
    # result, which can emit SettingWithCopyWarning.
    recommendations = df.iloc[item_indices][['ProductID', 'Description']].assign(
        similarity_score=similarity_scores[item_indices]
    )

    return recommendations


In [36]:
# Free-text query used to demonstrate compute_similarity.
input_text = "male shoes"

In [37]:
# Look up the catalogue items most similar to the query.
similar_items = compute_similarity(input_text)

In [38]:
# Print the recommendations as plain text.
print(similar_items)

      ProductID                                        Description  \
8772   10201205                                beige driving shoes   
1841   10029797  a pair of black running sports shoes has regul...   
1190   10029731  a pair of black running sports shoes has regul...   
1944   10029869  a pair of grey running sports shoes has regula...   
1445   10029753  a pair of navy blue running sports shoes has r...   

      similarity_score  
8772          0.459095  
1841          0.349479  
1190          0.349479  
1944          0.345773  
1445          0.343101  


In [49]:
def get_similar_items(input_text, database_file, top_n=5):
    """Rank the items of a CSV catalogue against a text query.

    The file is loaded, a fresh TF-IDF model is fitted on its ``Description``
    column, and the items with the highest cosine similarity to
    ``input_text`` are returned. The query and the descriptions go through
    the same vectorizer, so both are tokenised identically.

    Parameters
    ----------
    input_text : str
        Free-text query, e.g. ``"Black cotton t-shirt"``.
    database_file : str or path-like
        CSV file with at least a ``Description`` column and either a ``Url``
        or a ``ProductName`` column.
    top_n : int, default 5
        Maximum number of results.

    Returns
    -------
    list
        Values of the ``Url`` column for the best matches, most similar
        first. When the file has no ``Url`` column the ``ProductName``
        values are returned instead.

    Raises
    ------
    ValueError
        If ``top_n`` is negative.
    KeyError
        If the file has neither a ``Url`` nor a ``ProductName`` column
        (or no ``Description`` column).
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    # Load the database
    df = pd.read_csv(database_file)

    # TfidfVectorizer rejects NaN documents, so rows without a description
    # are dropped; they could never match a query anyway.
    df = df.dropna(subset=['Description'])

    # Pick the column to report: URLs when available, otherwise product names.
    for candidate in ('Url', 'ProductName'):
        if candidate in df.columns:
            result_column = candidate
            break
    else:
        raise KeyError("database needs a 'Url' or 'ProductName' column")

    # Create a TF-IDF vectorizer and fit it on the item descriptions.
    # astype(str) guards against non-string cells (e.g. purely numeric text).
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(df['Description'].astype(str))

    # Convert the input text to a TF-IDF vector
    input_vector = vectorizer.transform([input_text])

    # Cosine similarity between the query and every item (flattened to 1-D).
    similarity_scores = cosine_similarity(input_vector, tfidf_matrix)[0]

    # Positions of the most similar items, best first. Rows of tfidf_matrix
    # line up with the filtered df by position, hence iloc below.
    top_indices = similarity_scores.argsort()[::-1][:top_n]

    return df.iloc[top_indices][result_column].tolist()

In [60]:
# Free-text query used to demonstrate get_similar_items.
input_text2 = "Black cotton t-shirt"

In [61]:
# CSV file passed to get_similar_items as its database (path relative to the working directory).
database_file = "clothing_data2.csv"

In [62]:
# Number of results to request from get_similar_items (top 3).
top_n = 3

In [63]:
# Rank the items in database_file against the query and keep the top_n best matches.
similar_items2 = get_similar_items(input_text2, database_file, top_n)

In [59]:
print(similar_items2)
# Author's note: the dataset has no URL column, so the product names are shown instead.
# NOTE(review): confirm that get_similar_items' column lookup matches this note.
# NOTE(review): this cell's execution count (59) is lower than that of the call cell (63),
# so the printed list may come from an earlier run -- re-run the notebook top to bottom to refresh it.

['Indian Terrain Men Navy Blue & Brown Striped Polo Collar Cotton T-shirt', 'Cherry Crumble Boys Black & Red Colourblocked Jacket', 'Indian Terrain Men Beige & Brown Checked Polo Collar Cotton T-shirt']
