In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import cosine  
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from wordcloud import WordCloud
import pickle
import warnings 
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('main1+cat+url.csv')
df.rename(columns = {'Product_Category' : 'Category'}, inplace =True)
df.head()

Unnamed: 0,UserId,Product ID,Rating,Category,URL
0,A10REFE1TW3ZVT,3227001381,5,Hair Spray,https://amazon.in/Set-Wet-Sulphate-Paraben-Hai...
1,A3M7ROZKQW6F69,3227001381,4,Hair Spray,https://amazon.in/Set-Wet-Sulphate-Paraben-Hai...
2,A271KYL82709TY,3227001381,5,Hair Spray,https://amazon.in/Set-Wet-Sulphate-Paraben-Hai...
3,A1OON4OPP58E26,3227001381,5,Hair Spray,https://amazon.in/Set-Wet-Sulphate-Paraben-Hai...
4,A3COLQK8Y9V9A5,3227001381,5,Hair Spray,https://amazon.in/Set-Wet-Sulphate-Paraben-Hai...


In [3]:
df.shape

(483365, 5)

In [4]:
print('There are {} rows and {} columns in the dataset.'.format(df.shape[0],df.shape[1]))
print('----------------------------------------\n')

for i in df.columns:
    print('Number of unique values in {} \n The unique values are {}'
         .format(i, len(df[i].unique()), df[i].unique()))
    print('----------------------------------------\n')

There are 483365 rows and 5 columns in the dataset.
----------------------------------------

Number of unique values in UserId 
 The unique values are 388717
----------------------------------------

Number of unique values in Product ID 
 The unique values are 6776
----------------------------------------

Number of unique values in Rating 
 The unique values are 5
----------------------------------------

Number of unique values in Category 
 The unique values are 57
----------------------------------------

Number of unique values in URL 
 The unique values are 6776
----------------------------------------



In [5]:
## Here we are calculating those user's who had rated more for product:

In [6]:
user_ratings = pd.DataFrame(df.groupby('UserId')['Rating'].count()).sort_values('Rating', ascending = False).reset_index().rename(columns = {'Rating': 'Total Ratings'})
print('Unique User ID\'s are: {}\nMaximum No. of Rating given by User:{}\nMinimum No. of Rating given by User:{}'\
      .format(user_ratings.shape[0],user_ratings['Total Ratings'][0],user_ratings['Total Ratings'][388716]))
user_ratings

Unique User ID's are: 388717
Maximum No. of Rating given by User:88
Minimum No. of Rating given by User:1


Unnamed: 0,UserId,Total Ratings
0,A2V5R832QCSOMX,88
1,A3M174IC0VXOS2,76
2,A1RRMZKOMZ2M7J,67
3,A281NPSIMI1C2R,66
4,AJGU56YG8G1DQ,59
...,...,...
388712,A2CY8KBQ5AZYBT,1
388713,A2CY797Z9YY2HM,1
388714,A2CY78WDXN5Z7D,1
388715,A2CY6K8AUCKGS3,1


In [7]:
print('User\'s rated only one product ID are {}'.format(user_ratings[user_ratings['Total Ratings'] <= 1].shape[0]))

User's rated only one product ID are 332133


In [8]:
## Here we are calculating those user's who had rated more than a single product:

In [9]:
top_users = user_ratings[user_ratings['Total Ratings'] > 1]

print('No. of Unique User\'s left: {}'.format(top_users.shape[0]))

x = top_users['UserId'].value_counts().index
filter_users = df[df['UserId'].isin(x)]
filter_users

No. of Unique User's left: 56584


Unnamed: 0,UserId,Product ID,Rating,Category,URL
1,A3M7ROZKQW6F69,3227001381,4,Hair Spray,https://amazon.in/Set-Wet-Sulphate-Paraben-Hai...
9,A30IURTQM5NNA4,3227001381,5,Hair Spray,https://amazon.in/Set-Wet-Sulphate-Paraben-Hai...
15,A3LO1A9N0F0TXE,3227001381,5,Hair Spray,https://amazon.in/Set-Wet-Sulphate-Paraben-Hai...
22,A445U3NN5XBYK,3227001381,3,Hair Spray,https://amazon.in/Set-Wet-Sulphate-Paraben-Hai...
34,A60XNB876KYML,7806397051,3,Nail polish remover,https://amazon.in/ARYADIT-Polish-Remover-remov...
...,...,...,...,...,...
483349,A1Y0VDBCRAG8VN,B001G7PNW6,4,Hair Gel,https://amazon.in/Parachute-Advansed-Cream-Ant...
483353,AACTF87OBQG3U,B001G7PNW6,5,Hair Gel,https://amazon.in/Parachute-Advansed-Cream-Ant...
483354,A3V1EI1Z1MWZJT,B001G7PNW6,5,Hair Gel,https://amazon.in/Parachute-Advansed-Cream-Ant...
483360,A2OE8MQ67X0JJB,B001G7PNW6,5,Hair Gel,https://amazon.in/Parachute-Advansed-Cream-Ant...


In [10]:
ids = pd.DataFrame(filter_users.groupby('Product ID')['Rating'].mean())
ids.columns = ['Avg Rating']
ids['No. of Ratings'] = pd.DataFrame(filter_users.groupby('Product ID')['Rating'].count())

ids['% Ratings'] = ((ids['No. of Ratings'] / ids['No. of Ratings'].sum()) * 1000).round(decimals = 2)
ids = ids.sort_values('No. of Ratings', ascending = False)

top_ids = ids.merge(filter_users, on = 'Product ID').drop_duplicates('Product ID').reset_index().drop(columns = ['index', 'UserId','Rating'])

print('Unique Product ID\'s are: {}'.format(top_ids.shape[0]))
print('Maximum No. of Rating given to an ID:', top_ids['No. of Ratings'][0])
top_ids

Unique Product ID's are: 6767
Maximum No. of Rating given to an ID: 817


Unnamed: 0,Product ID,Avg Rating,No. of Ratings,% Ratings,Category,URL
0,B000ZMBSPE,4.389229,817,5.40,Body lotion,https://amazon.in/Vaseline-Intensive-Restore-L...
1,B000142FVW,4.577815,604,3.99,Body Soap,https://amazon.in/Yardley-English-Lavender-Soa...
2,B00150LT40,4.508591,582,3.85,Body wash,https://amazon.in/Dove-Deeply-Nourishing-Body-...
3,B000UVZU1S,4.242489,466,3.08,Face Bronzers,https://amazon.in/Paradise-Eyeshadows-Highligh...
4,B000TKH6G2,4.292793,444,2.94,Face highlighter,https://amazon.in/MARS-Highlighter-Shade-1269-...
...,...,...,...,...,...,...
6762,B000YA27AW,5.000000,1,0.01,Eye Mascara,https://amazon.in/Beauty-Lift-Up-Mascara-Thick...
6763,B00186X7SM,5.000000,1,0.01,Face Primer,https://amazon.in/Lakme-Primer-Liquid-Conceale...
6764,B000JO2NWA,2.000000,1,0.01,Makeup Brush,https://amazon.in/Vega-Face-Brush-Facial-Piece...
6765,B000AR8W8K,5.000000,1,0.01,Eyeliner,https://amazon.in/MARS-Ultra-Smudge-Sketch-Eye...


In [11]:
## Finding Category to Category Similarity:

In [12]:
category = pd.pivot_table(filter_users, index ='Category',columns = 'UserId', values ='Rating')

category.fillna(0,inplace=True) #Replacing 'NaN' values by '0':

similar_category_score = cosine_similarity(category) #Similarity between Category:

np.fill_diagonal(similar_category_score,0) #Filling with '0' for diagonal elements:

simCategory = pd.DataFrame(similar_category_score) #Creating a DataFrame:

simCategory.index = category.index #Assign the values of category to the index and columns:
simCategory.columns = category.index

max_sim_category = pd.DataFrame(simCategory.idxmax(axis=1))
max_sim_category.columns = ['Similar to']
max_sim_category

Unnamed: 0_level_0,Similar to
Category,Unnamed: 1_level_1
Body Soap,Deodrant
Body hair removal,Nail paint
Body lotion,Hair colour
Body oil,Foot Scrub
Body scrub,Body Soap
Body wash,Deodrant
Brow gel,Foot Scrub
Brow pencil,Lip Stick
Colossal,Nail paint
Concealer,Nail paint


In [13]:
## Finding Url to Url Similarity:

In [14]:
URL = pd.pivot_table(filter_users, index ='URL',columns = 'UserId', values ='Rating')

URL.fillna(0,inplace=True) #Replacing 'NaN' values by '0':

similar_URL_score = cosine_similarity(URL) #Similarity between Category:

np.fill_diagonal(similar_URL_score,0) #Filling with '0' for diagonal elements:

simURL = pd.DataFrame(similar_URL_score) #Creating a DataFrame:

simURL.index = URL.index #Assign the values of category to the index and columns:
simURL.columns = URL.index

max_sim_URL = simURL.idxmax(axis=1) #Here checking Url with the maximum similarity:

In [15]:
y = []
for k in max_sim_URL.index:
    y.append([k, max_sim_URL[k],simURL.loc[k][max_sim_URL[k]]])
simDf = pd.DataFrame(y,columns=['URL1','URL2','Cosine'])    
simDf.sort_values(by=['Cosine'], ascending=False, inplace=True)
simDf

Unnamed: 0,URL1,URL2,Cosine
1706,https://amazon.in/FIXDERMA-Fixderma-Moisturisi...,https://amazon.in/Nivea-Fruity-Shine-Watermelo...,0.804030
4316,https://amazon.in/Nivea-Fruity-Shine-Watermelo...,https://amazon.in/FIXDERMA-Fixderma-Moisturisi...,0.804030
4438,https://amazon.in/OROMAC-Waterproof-Makeup-Fou...,https://amazon.in/SUGAR-Cosmetics-Eligiblur-Co...,0.790906
5429,https://amazon.in/SUGAR-Cosmetics-Eligiblur-Co...,https://amazon.in/OROMAC-Waterproof-Makeup-Fou...,0.790906
6041,https://amazon.in/Theraaderm-White-Sunscreen-S...,https://amazon.in/Random-Mac-Prep-Prime-Beauty...,0.725956
...,...,...,...
3115,https://amazon.in/Lakme-Insta-Eye-Liner-Black/...,https://amazon.in/AgriExpo-Agriculture-Exhibit...,0.049807
6687,https://amazon.in/mCaffeine-Circles-Vitamin-Hy...,https://amazon.in/beauty-Extension-Artificial-...,0.047356
4439,https://amazon.in/OURCARES-Ayurvedic-Increases...,https://amazon.in/Pilgrim-Squalane-Blueberry-e...,0.039284
5914,https://amazon.in/Synaty-120Pcs-Artificial-Ext...,https://amazon.in/Derma-Co-Niacinamide-Hyaluro...,0.037796


In [19]:
URL.index

Index(['https://amazon.in/10pcs-Colossal-Double-sided-Callus-Remover/dp/B08ZHNK8L7/ref=sr_1_182?keywords=foot+scrub&qid=1679830572&s=beauty&sr=1-182',
       'https://amazon.in/2-Oh-Point-Semi-Permanent-Color/dp/B09S3SVHP8/ref=sr_1_156?keywords=hair+dye&qid=1679841243&s=beauty&sr=1-156',
       'https://amazon.in/2-Oh-Point-Semi-Permanent-Color/dp/B09S3T3P2W/ref=sr_1_151?keywords=hair+dye&qid=1679841243&s=beauty&sr=1-151',
       'https://amazon.in/212-DEODORANT-NATURAL-SPRAY-150ML/dp/B09TYW9WF3/ref=sr_1_205?keywords=deodorant&qid=1679834700&s=beauty&sr=1-205',
       'https://amazon.in/24-Hours-Organic-Papaya-Removal/dp/B08DLMRQ8P/ref=sr_1_101?keywords=body+hair+removal+cream&qid=1679841659&s=hpc&sr=1-101',
       'https://amazon.in/24-Hours-Organic-Papaya-Removal/dp/B08DLMRQ8P/ref=sr_1_166?keywords=body+hair+removal+cream&qid=1679841663&s=hpc&sr=1-166',
       'https://amazon.in/24PCS-False-French-Cover-Acrylic/dp/B08CKC4QBR/ref=sr_1_103?keywords=false+nails&qid=1679840171&sr=8-103',

In [20]:
def recommend(url):
    
    index = np.where(URL.index == url)[0][0]  
    similar_url = sorted(list(enumerate(similar_URL_score[index])),key=lambda x:x[1],reverse=True)[1:5]
    
    data = []
    for i in similar_url:         
        item = []
        temp_df = df[df['URL'] == URL.index[i[0]]]
        item.extend(list(temp_df.drop_duplicates('URL')['Category'].values))   
        item.extend(list(temp_df.drop_duplicates('URL')['Product ID'].values))
        item.extend(list(temp_df.drop_duplicates('URL')['URL'].values))
        
        data.append(item)
    
    return data

In [21]:
recommend('https://amazon.in/yara-beauty-Premium-Scrubber-Pedicure/dp/B0BX4CVN15/ref=sr_1_290?keywords=foot+scrub&qid=1679830577&s=beauty&sr=1-290')

[['Body hair removal',
  'B000NP6F10',
  'https://amazon.in/VI-JOHN-Feather-Touch-Removal-Chandan/dp/B0B1J163RR/ref=sr_1_196?keywords=body+hair+removal+cream&qid=1679841666&s=hpc&sr=1-196'],
 ['Face Moisturizer',
  'B000BJUPHI',
  'https://amazon.in/Olay-Luminous-Niacinamide-Moisturizer-Reduce/dp/B09ND8TPB9/ref=sr_1_105?keywords=face+moisturizer&qid=1679843659&sr=8-105'],
 ['Shaving cream',
  'B000KFBAO0',
  'https://amazon.in/Park-Avenue-Cool-Shaving-Cream/dp/B094N3NMHT/ref=sr_1_61?keywords=shaving+cream&qid=1679833966&sr=8-61'],
 ['Hair Conditioner',
  'B001B1QMKO',
  'https://amazon.in/Love-Beauty-Planet-Lavender-Conditioner/dp/B09N3S7PQG/ref=sr_1_96?keywords=hair+conditioner&qid=1679828900&sr=8-96']]

In [22]:
!pip install bz2file 



In [23]:
import pickle
import bz2file as bz2
import os

In [None]:
pickle.dump(df, open('df.pkl','wb'))
pickle.dump(top_ids, open('top_ids.pkl','wb'))

In [None]:
# Compressing url as its size is huge 
 
ufile = bz2.BZ2File("URL.pkl",'wb')
pickle.dump(URL,ufile)
ufile.close()
 
print(os.path.getsize("URL.pkl"))

In [None]:
# Compressing similar score as its size is huge
 
sfile = bz2.BZ2File("similar_URL_score.pkl",'wb')
pickle.dump(similar_URL_score,sfile)
sfile.close()
 
print(os.path.getsize("similar_URL_score.pkl"))