In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from textblob import TextBlob
from sklearn.preprocessing import LabelEncoder
from textblob import TextBlob

from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the review/rating data from the CSV file in the working directory.
original = pd.read_csv('cloths-rating.csv')

# Show (rows, columns) of the freshly loaded frame.
original.shape

(634, 4)

In [3]:
# Keep one row per (ProductID, UserID) pair; when a pair occurs more than once,
# only its last occurrence in the file is retained.
original = original.drop_duplicates(subset=['ProductID','UserID'],keep='last')

# Row count after de-duplication.
original.shape

(385, 4)

In [4]:
original.head()

Unnamed: 0,ProductID,UserID,Rating,Text
0,777,AV1YnR7wglJLPUi8IJmi,4,Great taffy at a great price.
1,767,AVpfpK8KLJeJML43BCuD,4,Absolutely wonderful - silky and sexy and comf...
4,1049,AVpfpK8KLJeJML43BCuD,5,"I love, love, love this jumpsuit. it's fun, fl..."
6,1080,AVph0EeEilAPnD_x9myq,2,I used this product only once and found its no...
13,1095,AVpfpK8KLJeJML43BCuD,1,Average quality of fabric poor stitching and p...


In [5]:
# Encoder used below to replace the string user IDs with integer codes.
le = LabelEncoder()

In [6]:
# Overwrite UserID with the integer code the encoder assigns to each distinct
# user string; these codes become the column labels of the pivot table later on.
original['UserID'] = le.fit_transform(original['UserID'])

original.head()

Unnamed: 0,ProductID,UserID,Rating,Text
0,777,0,4,Great taffy at a great price.
1,767,3,4,Absolutely wonderful - silky and sexy and comf...
4,1049,3,5,"I love, love, love this jumpsuit. it's fun, fl..."
6,1080,7,2,I used this product only once and found its no...
13,1095,3,1,Average quality of fabric poor stitching and p...


### Finding Updated Score

Perform sentiment analysis on each review to obtain a sentiment (polarity) score.
<br>
Multiply the sentiment score by the original rating to get an updated score.
<br>
Define a function that takes the updated score as input and returns the corresponding new rating.

In [7]:
def calc_sentiment(text):
    """Return the TextBlob polarity score of a review.

    Parameters
    ----------
    text : any
        Review text. It is passed through ``str()`` before analysis, so a
        non-string value is analysed as its string form.

    Returns
    -------
    float or None
        ``TextBlob(str(text)).sentiment.polarity``, or ``None`` when the
        analysis raises an ordinary exception.
    """
    try:
        return TextBlob(str(text)).sentiment.polarity
    except Exception:
        # Best effort: a review that cannot be scored yields a missing value.
        # Only ordinary errors are swallowed, so KeyboardInterrupt/SystemExit
        # still stop a long-running .apply().
        return None

In [8]:
# Score every review text with calc_sentiment and store the result per row.
original['Sentiment'] = original['Text'].apply(calc_sentiment)

original.head()

Unnamed: 0,ProductID,UserID,Rating,Text,Sentiment
0,777,0,4,Great taffy at a great price.,0.8
1,767,3,4,Absolutely wonderful - silky and sexy and comf...,0.633333
4,1049,3,5,"I love, love, love this jumpsuit. it's fun, fl...",0.55
6,1080,7,2,I used this product only once and found its no...,-0.075
13,1095,3,1,Average quality of fabric poor stitching and p...,-0.35


In [9]:
# Weight each star rating by the polarity of its review text.
original['Updated_score'] = original['Sentiment']*original['Rating']

# Round to one decimal place. This is applied to the whole frame, so the stored
# Sentiment values are rounded as well; Updated_score was computed above from
# the unrounded sentiment.
original = original.round(decimals=1)

original.head()

Unnamed: 0,ProductID,UserID,Rating,Text,Sentiment,Updated_score
0,777,0,4,Great taffy at a great price.,0.8,3.2
1,767,3,4,Absolutely wonderful - silky and sexy and comf...,0.6,2.5
4,1049,3,5,"I love, love, love this jumpsuit. it's fun, fl...",0.5,2.7
6,1080,7,2,I used this product only once and found its no...,-0.1,-0.2
13,1095,3,1,Average quality of fabric poor stitching and p...,-0.4,-0.4


In [10]:
# Distinct updated scores in ascending order; they are inspected to decide
# how scores should be grouped into the new 1-5 ratings.
unique_scores = np.sort(original['Updated_score'].unique())

# First line: how many distinct scores exist; second line: the scores themselves.
print(len(unique_scores), unique_scores, sep="\n")

59
[-1.8 -1.7 -1.6 -1.5 -1.4 -1.  -0.9 -0.8 -0.7 -0.6 -0.5 -0.4 -0.3 -0.2
 -0.1  0.   0.1  0.2  0.3  0.4  0.5  0.6  0.7  0.8  0.9  1.   1.1  1.2
  1.3  1.4  1.5  1.6  1.7  1.8  1.9  2.   2.1  2.2  2.3  2.4  2.5  2.6
  2.7  2.8  2.9  3.   3.1  3.2  3.3  3.5  3.7  3.8  3.9  4.   4.1  4.2
  4.6  4.7  5. ]


In [11]:
# Lookup table from updated score to new rating. Keys are the new ratings as
# strings; each value lists the (one-decimal) scores assigned to that rating:
#   '1': -1.8 .. -0.3   '2': -0.2 .. 1.5   '3': 1.6 .. 3.0
#   '4':  3.1 ..  3.9   '5':  4.0 .. 5.0
# NOTE(review): the lists enumerate individual scores rather than full ranges;
# -1.3 to -1.1, 4.3 to 4.5, 4.8 and 4.9 are not listed.
dict1 = {'1' : [-1.8, -1.7, -1.6, -1.5, -1.4, -1.,  -0.9, -0.8, -0.7, -0.6,-0.5, -0.4, -0.3,],
        '2' : [ -0.2, -0.1,0.,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1., 1.1,1.2,1.3,1.4,1.5 ], 
        '3' : [1.6, 1.7, 1.8, 1.9, 2., 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3.],
        '4' : [3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9],
        '5' : [4., 4.1, 4.2, 4.6, 4.7, 5.]
        }

In [12]:
def update_rating(val):
    """Map an updated score to its new rating bucket.

    Each entry of the module-level ``dict1`` lists the scores that belong to
    one rating. The largest listed score of a bucket is used as that bucket's
    upper bound, so a score that is not literally listed (for example 4.4)
    still lands in the bucket whose range contains it instead of producing a
    missing rating. Scores that are listed map to the same key as an exact
    lookup would give.

    Parameters
    ----------
    val : float or None
        Updated score (sentiment multiplied by rating).

    Returns
    -------
    str or None
        The ``dict1`` key of the lowest bucket whose upper bound is >= ``val``.
        ``None`` when ``val`` is ``None``, NaN, or above every bucket.
    """
    if val is None:
        return None
    # Visit buckets from the lowest upper bound to the highest; empty buckets
    # have no bound and are skipped.
    bounds = sorted((max(scores), key) for key, scores in dict1.items() if scores)
    for upper, key in bounds:
        # NaN compares False against every bound and so falls through to None.
        if val <= upper:
            return key
    return None
        

In [13]:
# Translate every updated score into its new rating via update_rating.
original['New_rating'] = original['Updated_score'].apply(update_rating)

In [14]:
# Preview the frame with the newly added New_rating column.
original.head()

Unnamed: 0,ProductID,UserID,Rating,Text,Sentiment,Updated_score,New_rating
0,777,0,4,Great taffy at a great price.,0.8,3.2,4
1,767,3,4,Absolutely wonderful - silky and sexy and comf...,0.6,2.5,3
4,1049,3,5,"I love, love, love this jumpsuit. it's fun, fl...",0.5,2.7,3
6,1080,7,2,I used this product only once and found its no...,-0.1,-0.2,2
13,1095,3,1,Average quality of fabric poor stitching and p...,-0.4,-0.4,1


In [15]:
# Compare how the rows are distributed over the sentiment-adjusted ratings ...
print("New ratings: \n",original['New_rating'].value_counts())

# ... versus the star ratings the users originally gave.
print("Old ratings: \n", original['Rating'].value_counts())

New ratings: 
 2    177
3    111
1     33
4     32
5     32
Name: New_rating, dtype: int64
Old ratings: 
 5    204
1     74
4     53
3     33
2     21
Name: Rating, dtype: int64


In [16]:
# update_rating returns the dict1 keys, which are strings; convert them to
# numbers so the ratings can be aggregated in the pivot table below.
original['New_rating'] = pd.to_numeric(original['New_rating'])

### Pivoting + Recommendations

Next, build a pivot table with one row per product and one column per user. Cosine similarity and a nearest-neighbours model are then applied to these rows to find which products are similar to each other, which is the basis for the recommendations.

In [17]:
# Product x user matrix of new ratings: one row per ProductID, one column per
# encoded UserID. Duplicated (ProductID, UserID) pairs were dropped earlier, so
# each cell comes from at most one review. Pairs without a review are filled
# with 0.
pivoted = original.pivot_table( index='ProductID',columns='UserID',values='New_rating').fillna(0)

pivoted.head()

UserID,0,1,2,3,4,5,6,7,8,9,...,32,33,34,35,36,37,38,39,40,41
ProductID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
89,0.0,0.0,0.0,2.0,0.0,0.0,5.0,2.0,2.0,0.0,...,0.0,3.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,3.0
333,0.0,3.0,0.0,0.0,0.0,3.0,0.0,0.0,5.0,1.0,...,0.0,1.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
369,4.0,2.0,0.0,3.0,0.0,3.0,5.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,5.0,0.0
444,2.0,2.0,0.0,0.0,2.0,0.0,3.0,5.0,2.0,0.0,...,2.0,0.0,2.0,0.0,0.0,2.0,0.0,0.0,4.0,0.0
684,0.0,0.0,3.0,5.0,2.0,0.0,2.0,2.0,2.0,0.0,...,3.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0


### Sparse Matrix

Matrices that contain mostly zero values are called sparse, distinct from matrices where most of the values are non-zero, called dense.

In [18]:
from scipy.sparse import csr_matrix

# Compressed-sparse-row copy of the pivot table, used to fit the neighbours model.
sparse = csr_matrix(pivoted)

In [19]:
# Prints the stored entries as "(row, column)  value", i.e. the non-zero ratings.
print(sparse)

  (0, 3)	2.0
  (0, 6)	5.0
  (0, 7)	2.0
  (0, 8)	2.0
  (0, 11)	3.0
  (0, 16)	5.0
  (0, 18)	5.0
  (0, 20)	2.0
  (0, 22)	4.0
  (0, 28)	3.0
  (0, 33)	3.0
  (0, 37)	3.0
  (0, 41)	3.0
  (1, 1)	3.0
  (1, 5)	3.0
  (1, 8)	5.0
  (1, 9)	1.0
  (1, 10)	2.0
  (1, 13)	2.0
  (1, 17)	2.0
  (1, 22)	4.0
  (1, 25)	2.0
  (1, 29)	3.0
  (1, 33)	1.0
  (1, 34)	2.0
  :	:
  (25, 17)	5.0
  (25, 21)	2.0
  (25, 22)	2.0
  (25, 25)	2.0
  (25, 30)	3.0
  (25, 38)	4.0
  (25, 40)	1.0
  (26, 0)	2.0
  (26, 1)	2.0
  (26, 3)	2.0
  (26, 6)	2.0
  (26, 7)	2.0
  (26, 8)	2.0
  (26, 9)	2.0
  (26, 11)	3.0
  (26, 16)	3.0
  (26, 17)	2.0
  (26, 18)	1.0
  (26, 19)	1.0
  (26, 22)	2.0
  (26, 23)	2.0
  (26, 25)	4.0
  (26, 28)	2.0
  (26, 33)	2.0
  (26, 37)	2.0


In [20]:
# NOTE(review): NearestNeighbors is already imported in the first cell; this
# repeated import is redundant but harmless.
from sklearn.neighbors import NearestNeighbors

# Nearest-neighbours model over product rows using the cosine metric.
# n_neighbors=20 is only the default neighbour count; the query further down
# passes its own n_neighbors. NOTE(review): radius presumably only matters for
# radius-based queries, which this notebook does not appear to use - confirm.
model_knn = NearestNeighbors(metric='cosine',n_neighbors=20, radius = 1)
model_knn.fit(sparse)

NearestNeighbors(metric='cosine', n_neighbors=20, radius=1)

#### Pairwise cosine similarity
Compute cosine similarity between samples in X and Y.

Cosine similarity, or the cosine kernel, computes similarity as the normalized dot product of X and Y:

$$K(X, Y) = \frac{\langle X, Y \rangle}{\lVert X \rVert \, \lVert Y \rVert}$$

In [21]:
# NOTE(review): cosine_similarity is already imported in the first cell; this
# repeated import is redundant but harmless.
from sklearn.metrics.pairwise import cosine_similarity

# Product-by-product cosine similarity computed from the pivot rows; entry
# [i, j] compares the products at row positions i and j of `pivoted`.
similarity_matrix = cosine_similarity(pivoted)
similarity_matrix

array([[1.        , 0.27208681, 0.33120207, 0.4117735 , 0.58744485,
        0.42632907, 0.27243118, 0.39191855, 0.74695614, 0.35975608,
        0.22213083, 0.30779351, 0.52893072, 0.45394737, 0.29397865,
        0.10838875, 0.36148514, 0.06083303, 0.24590296, 0.11092108,
        0.4580397 , 0.35602582, 0.13114825, 0.        , 0.51357179,
        0.20232566, 0.66577517],
       [0.27208681, 1.        , 0.32322116, 0.27031424, 0.17656302,
        0.47628204, 0.1667811 , 0.03706247, 0.20751059, 0.32065738,
        0.11483385, 0.22099785, 0.37886076, 0.43363835, 0.33524262,
        0.22413273, 0.19466161, 0.11793204, 0.36865731, 0.        ,
        0.26116595, 0.20299949, 0.16949761, 0.        , 0.36348024,
        0.34553795, 0.46933966],
       [0.33120207, 0.32322116, 1.        , 0.4700634 , 0.23860971,
        0.32665263, 0.508233  , 0.3928371 , 0.37295395, 0.3441236 ,
        0.12171612, 0.23717082, 0.36664796, 0.19601755, 0.20431735,
        0.37862009, 0.15474612, 0.40625   , 0.2829

In [22]:
# Ask which product to find recommendations for (must be an integer ID).
product_id = int(input("Enter product ID to find recommendations of: "))

# Product IDs in pivot-row order: a product's position in this list is its
# row position in `pivoted`.
pids = list(pivoted.index)

print(pids)

# Fail here with a clear message instead of a bare "x is not in list" error
# when the position of the ID is looked up in the next step.
if product_id not in pids:
    raise ValueError(
        "Product ID {} is not present in the ratings data; "
        "choose one of the IDs printed above.".format(product_id)
    )

Enter product ID to find ecommendations of: 1077
[89, 333, 369, 444, 684, 697, 767, 777, 823, 847, 853, 858, 862, 910, 949, 1002, 1003, 1049, 1060, 1065, 1077, 1080, 1095, 1120, 6969, 8001, 9696]


In [23]:
q_index = pids.index(product_id)    # position of the product in pids == its row position in pivoted

print(q_index)



20


#### NearestNeighbors.kneighbors
Finds the K-neighbors of a point.
<br>
Returns indices of and distances to the neighbors of each point.

In [24]:
# Query the fitted model with the selected product's row (reshaped into a
# single 2-D sample) and ask for its 8 nearest product rows.
# NOTE: despite its name, `similarities` holds the *distances* returned by
# kneighbors (the model uses metric='cosine'), so smaller means more alike.
# The first neighbour is the query product itself at distance 0.
similarities, indices = model_knn.kneighbors(pivoted.iloc[q_index,:].values.reshape(1,-1), n_neighbors=8)

print(similarities)   # cosine distances to the neighbours, nearest first
print(indices)        # row positions of those neighbours in pivoted (map to product IDs via pids)

[[0.         0.24952123 0.29507823 0.33471407 0.35405776 0.38433642
  0.41626451 0.43603835]]
[[20 21  9 11  6  3 10 22]]


In [31]:
# Ratings row of the product at row position 21, the closest neighbour (other
# than the query itself) in the `indices` output above for product 1077.
# NOTE(review): the position is hard-coded, so it will not follow a different
# product ID entered at the prompt.
pivoted.iloc[21]

UserID
0     0.0
1     2.0
2     0.0
3     0.0
4     2.0
5     0.0
6     2.0
7     2.0
8     0.0
9     2.0
10    0.0
11    5.0
12    2.0
13    4.0
14    0.0
15    2.0
16    3.0
17    5.0
18    2.0
19    3.0
20    1.0
21    0.0
22    0.0
23    3.0
24    5.0
25    0.0
26    5.0
27    3.0
28    0.0
29    0.0
30    1.0
31    3.0
32    0.0
33    4.0
34    0.0
35    0.0
36    2.0
37    0.0
38    0.0
39    5.0
40    0.0
41    0.0
Name: 1080, dtype: float64