In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity 
from sklearn.feature_extraction.text import TfidfVectorizer 

In [2]:
data = pd.read_csv("grocery_retail_dataset_cleaned_filtered.csv")

In [3]:
data.head(5)

Unnamed: 0,index,product_id,title,price,quantity,customer_id,rating
0,0,7169806908,Knife Plastic White,45.21,6,247389045,2.9
1,1,3843510261,Cafe Royale,41.83,18,764352451,2.2
2,2,4764590794,Flour Bran Red,3.25,17,656322986,3.4
3,3,8350173327,Oil Cooking Spray,29.1,19,611980927,4.7
4,4,1814054677,Apricots Halves,32.64,22,939300456,2.6


In [4]:
data.isnull().sum().any()

True

In [5]:
for item in data:
  data[item] = data[item].fillna('')

In [6]:
data['title'] = data['title'].str.lower()

In [7]:
data.head()

Unnamed: 0,index,product_id,title,price,quantity,customer_id,rating
0,0,7169806908,knife plastic white,45.21,6,247389045,2.9
1,1,3843510261,cafe royale,41.83,18,764352451,2.2
2,2,4764590794,flour bran red,3.25,17,656322986,3.4
3,3,8350173327,oil cooking spray,29.1,19,611980927,4.7
4,4,1814054677,apricots halves,32.64,22,939300456,2.6


In [8]:
columns = ['index', 'title', 'quantity', 'rating']

In [9]:
data[columns].head()

Unnamed: 0,index,title,quantity,rating
0,0,knife plastic white,6,2.9
1,1,cafe royale,18,2.2
2,2,flour bran red,17,3.4
3,3,oil cooking spray,19,4.7
4,4,apricots halves,22,2.6


In [10]:
#data['title'].lower()

In [11]:
#new = data[["index", "title", "quantity" ,"rating"]]

In [12]:
#new.head(20)

In [13]:
#new['title'].str.lower()
#data['title'].str.strip()

In [14]:
def get_important_features(data):
  important_features = []
  for i in range(0, data.shape[0]):
    important_features.append(data['quantity'][i].astype(str)+ ' ' +data['title'][i]+' '+data['rating'][i].astype(str))

  return important_features

In [15]:
data['important_features'] = get_important_features(data)
data.head()

Unnamed: 0,index,product_id,title,price,quantity,customer_id,rating,important_features
0,0,7169806908,knife plastic white,45.21,6,247389045,2.9,6 knife plastic white 2.9
1,1,3843510261,cafe royale,41.83,18,764352451,2.2,18 cafe royale 2.2
2,2,4764590794,flour bran red,3.25,17,656322986,3.4,17 flour bran red 3.4
3,3,8350173327,oil cooking spray,29.1,19,611980927,4.7,19 oil cooking spray 4.7
4,4,1814054677,apricots halves,32.64,22,939300456,2.6,22 apricots halves 2.6


In [16]:
tfv = TfidfVectorizer().fit_transform(data['important_features'])

In [17]:
cs = cosine_similarity(tfv)

In [18]:
print(cs)

[[1. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 0. 1.]]


In [19]:
cs.shape

(1000, 1000)

In [20]:
title = input("Type Product name : ")
item_id = data[data.title == title]['index'].values[0]

In [21]:
score = list(enumerate(cs[item_id]))

In [22]:
sorted_score = sorted(score, key = lambda x: x[1], reverse = True)
sorted_score = sorted_score[1:]

In [23]:
print(sorted_score)

[(556, 0.39961092347042415), (700, 0.39961092347042415), (802, 0.3568291265850652), (623, 0.17869391184406244), (500, 0.17832461427919938), (961, 0.17711602750745875), (668, 0.16761889223257095), (927, 0.16761889223257095), (469, 0.16445849813139807), (657, 0.16445849813139807), (774, 0.16437085050176198), (605, 0.1596888901568852), (573, 0.15814414396809456), (909, 0.15756857723915868), (366, 0.15667016560223568), (963, 0.15594856853829922), (852, 0.15493464978945853), (671, 0.14966174712246522), (326, 0.1474591326673051), (442, 0.14731829887051634), (85, 0.1459094128404635), (126, 0.14565571050488257), (622, 0.14303966607264146), (838, 0.14047302836592465), (381, 0.1373342698478507), (853, 0.1373342698478507), (378, 0.13558466160145524), (733, 0.13433488966487472), (836, 0.1336987652041246), (35, 0.13069198577496294), (66, 0.13054252638381855), (441, 0.12687957120023755), (746, 0.1194161082616403), (823, 0.1172120596949727), (932, 0.11155322096663726), (0, 0.0), (2, 0.0), (3, 0.0), (

In [24]:
j = 0
print("Products Recommendations for", title, 'are :\n')
for item in sorted_score:
  product_title = np.array(data[data.index == item[0]]['title'].values[0])
  #titles = set(grocery_title)
  print(j+1, product_title)
  j = j + 1
  if j > 10:
    break

Products Recommendations for cafe royale are :

1 
2 
3 coffee  cafe moreno
4 flounder  fresh
5 onion powder
6 salt  sea
7 wasabi powder
8 carroway seed
9 maple syrup
10 ginger  crystalized
11 longos  cheese tortellini
