In [91]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors

In [53]:
orders = pd.read_csv('../data/raw/olist_orders_dataset.csv')
reviews = pd.read_csv('../data/raw/olist_order_reviews_dataset.csv')
customers = pd.read_csv('../data/raw/olist_customers_dataset.csv')
products = pd.read_csv('../data/raw/olist_order_items_dataset.csv')

print(f"There are {orders.order_id.nunique()} unique Order IDs")
print(f"There are {reviews.review_id.nunique()} unique Review ID's")
print(f"There are {customers.customer_unique_id.nunique()} unique Customers")
print(f"There are {products.product_id.nunique()} unique Products")

There are 99441 unique Order IDs
There are 98410 unique Review ID's
There are 96096 unique Customers
There are 32951 unique Products


In [36]:
orders.shape

(99441, 8)

In [156]:
df = orders.merge(reviews, on='order_id')
df = df.merge(customers, on='customer_id')
df = df.merge(products, on='order_id')
df.duplicated().sum()

0

In [135]:
# drop unwanted columns
cols = ['customer_unique_id','product_id','review_score']
df = df[cols].drop_duplicates()

0

In [157]:
df.isna().sum()

order_id                             0
customer_id                          0
order_status                         0
order_purchase_timestamp             0
order_approved_at                   15
order_delivered_carrier_date      1184
order_delivered_customer_date     2360
order_estimated_delivery_date        0
review_id                            0
review_score                         0
review_comment_title             98938
review_comment_message           64730
review_creation_date                 0
review_answer_timestamp              0
customer_unique_id                   0
customer_zip_code_prefix             0
customer_city                        0
customer_state                       0
order_item_id                        0
product_id                           0
seller_id                            0
shipping_limit_date                  0
price                                0
freight_value                        0
dtype: int64

In [169]:
# Need the amount of reviews each product has
ratings = df.groupby('product_id')['review_score'].count().to_frame()
ratings = ratings.rename(columns={'review_score':'num_reviews'}).reset_index()
ratings = ratings.loc[ratings['num_reviews'] >= 5]
ratings.sort_values('num_reviews',ascending=False)

Unnamed: 0,product_id,num_reviews
22000,aca2eb7d00ea1a7b8ebd4e68314663af,524
8562,422879e10f46682990de24d770e7f83d,486
19640,99a4788cb24856965c36a24e339b6058,482
7320,389d119b48cf3043d311335e499d9c6b,391
7038,368c6c730842d78016ad823897a372db,388
...,...,...
14146,6e263657e75994ff623356f9cff692db,5
14159,6e423c2d28bfb207e1fd1d6e1df84721,5
14236,6eda94acf0ad08a1bb12f10aef74767d,5
14243,6ee79a827c36202118501870afb17875,5


In [170]:
df_with_num_reviews = df.merge(ratings, on='product_id', how='right')
df_with_num_reviews.customer_unique_id.nunique()

58222

In [171]:
df_with_num_reviews.sort_values('num_reviews',ascending=False)

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,review_id,review_score,...,customer_zip_code_prefix,customer_city,customer_state,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value,num_reviews
46771,77bebd4675a69e173b4fc0b4cbc38ae4,212bf860ff1728f44420899993139ef9,delivered,2018-02-14 13:18:52,2018-02-14 13:30:20,2018-02-15 20:16:37,2018-02-16 20:43:17,2018-02-22 00:00:00,7bcf4ad8357025d506dee61cadef29f4,1,...,13426,piracicaba,SP,1,aca2eb7d00ea1a7b8ebd4e68314663af,955fee9216a65b617aa5c0531780ce60,2018-02-21 12:30:20,69.90,13.08,524
46885,ec72beff33dc025a12ecdb790a99f730,8a8dd60b4b54b878c9d5e2bf131222c3,delivered,2018-03-07 15:59:58,2018-03-07 20:15:21,2018-03-08 22:03:36,2018-03-15 14:52:08,2018-03-20 00:00:00,8dc5409505bb48555478b48f5e61f80f,4,...,8380,sao paulo,SP,1,aca2eb7d00ea1a7b8ebd4e68314663af,955fee9216a65b617aa5c0531780ce60,2018-03-14 20:15:21,69.90,14.13,524
46905,ae6b64ec83bef670aea816ed1fd2857b,10629d24a28802b8beb2f7f1c6626c5e,delivered,2018-01-23 14:17:56,2018-01-24 02:34:57,2018-01-24 22:05:30,2018-02-06 16:35:16,2018-02-23 00:00:00,5bbf6ef1473a6b7298faf3f71613f62b,5,...,78550,sinop,MT,1,aca2eb7d00ea1a7b8ebd4e68314663af,955fee9216a65b617aa5c0531780ce60,2018-01-31 02:34:57,69.90,23.36,524
46904,24ffe5543b23e12690af44dda7fd5ad2,fac53dc282acb0f2be3044ec7d861360,delivered,2018-02-15 13:03:08,2018-02-15 13:15:41,2018-02-20 17:57:17,2018-03-16 02:26:31,2018-03-12 00:00:00,c66e6da7d72eee6a37f0f1f8abe7a105,1,...,89150,presidente getulio,SC,1,aca2eb7d00ea1a7b8ebd4e68314663af,955fee9216a65b617aa5c0531780ce60,2018-02-22 12:15:41,69.90,20.98,524
46903,08921fc98ee1dc54454ae8af90493c44,73f6f07cecb177c52aafb9ddd4b4b448,delivered,2018-04-19 01:10:02,2018-04-19 01:30:15,2018-04-19 18:03:42,2018-04-20 18:52:35,2018-05-08 00:00:00,a94470bd9b6d68015ec36246cf57a887,5,...,6343,carapicuiba,SP,1,aca2eb7d00ea1a7b8ebd4e68314663af,955fee9216a65b617aa5c0531780ce60,2018-04-25 01:30:15,69.90,12.43,524
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26403,448c60334e2a53a5c678e4bd6b1e77a7,402ee7c789991c2390d968451ae17e0e,delivered,2017-12-09 17:04:08,2017-12-09 17:13:42,2017-12-11 23:32:12,2017-12-18 21:53:28,2018-01-04 00:00:00,a491b1aadbf0d30c334ff822927d76cc,5,...,31060,belo horizonte,MG,2,5cbd407f3315b628a89206fbc140f6c8,d12c926d74ceff0a90a21184466ce161,2017-12-14 17:13:42,19.90,14.10,5
26404,b4587793077c17563480c33e3afa4786,f115c9fb49045355c502f201e83f432c,delivered,2018-04-02 13:24:38,2018-04-02 13:35:21,2018-04-03 22:48:29,2018-04-04 15:38:37,2018-04-12 00:00:00,34345c2999f52889e3189a233c06e498,5,...,9390,maua,SP,1,5cbd407f3315b628a89206fbc140f6c8,d12c926d74ceff0a90a21184466ce161,2018-04-06 13:35:21,19.90,7.39,5
26405,9447715ad192d3c877e9dd56205c66dd,7d84bbeca5122dc5bde314dee413ac26,delivered,2017-12-19 23:28:46,2017-12-20 09:51:46,2017-12-21 18:08:38,2017-12-27 20:28:15,2018-01-12 00:00:00,cf4dfb83cfbafbc9626a3c21c2fd215f,5,...,14960,novo horizonte,SP,1,5cbd407f3315b628a89206fbc140f6c8,d12c926d74ceff0a90a21184466ce161,2017-12-27 09:51:46,19.90,11.85,5
55574,d65137e8832d68f38fd35e84cbacd4f5,4d653ad70929faa00ad785f1dcd26b74,delivered,2017-07-04 16:31:00,2017-07-05 17:39:27,2017-07-06 16:07:50,2017-07-10 19:52:30,2017-07-24 00:00:00,34d886e757b7e8d60ef183c9d4aa604c,5,...,97507,uruguaiana,RS,1,ce582a7aa032280b20a5f169195202e7,d98eec89afa3380e14463da2aabaea72,2017-07-11 17:13:25,34.99,12.48,5


In [172]:
mat = df_with_num_reviews.pivot_table(index='product_id',columns='customer_unique_id', values='review_score').fillna(0)

In [1]:
mat

NameError: name 'mat' is not defined

In [175]:
mat1 = mat.copy()

In [176]:
def recommend_products(user, num_recommended_products):
    
    user = str(user) # make sure the user id is a string
    
    print(f"The list of the Products User: {user} has bought/reviewed \n")
    
    for prod in mat[mat[user] > 0.0][user].index.tolist(): # loop through all products that have review
        print(prod)
    
    print('\n')
    
    recommended_products = [] # initialize list 
    
    for prod in mat[mat[user]==0.0].index.tolist(): #loop through all unreviewed products
        index_df = mat.index.tolist().index(prod)
        predicted_rating = mat1.iloc[index_df, mat1.columns.tolist().index(user)]
        recommended_products.append((prod, predicted_rating))
    
    sorted_rp = sorted(recommended_products, key=lambda x:x[1], reverse=True)
    
    print('The list of the Recommended Products \n')
    rank = 1 
    for recommended_product in sorted_rp[:num_recommended_products]:
        print(f'{rank}: {recommended_product[0]} - predicted rating: {recommended_product[1]}')
        rank += 1

In [177]:
def product_recommender(user, num_neighbors, num_recommendation):
    
    number_neighbors = num_neighbors
    
    knn = NearestNeighbors(metric='cosine', algorithm='brute')
    knn.fit(mat.values)
    distances, indices = knn.kneighbors(mat.values, n_neighbors=number_neighbors)
    
    user_index = mat.columns.tolist().index(user)
    
    for i,prod in list(enumerate(mat.index)):
        # if user hasnt given this product a rating:
        if mat.iloc[i, user_index] == 0.0: 
            sim_prods = indices[i].tolist() # add that product's similar products to similar product list
            prod_distances = distances[i].tolist() # add distances to sim. prod. to list
            
            if i in sim_prods: #removing itself from lists
                id_prod = sim_prods.index(i)
                sim_prods.remove(i)
                prod_distances.pop(id_prod)
                
            else:
                sim_prods = sim_prods[:num_neighbors-1]
                prod_distances = prod_distances[:num_neighbors-1]
            
            prod_similarity = [1-x for x in prod_distances] # inverse distance
            prod_similarity_copy = prod_similarity.copy() # create a copy
            nominator = 0
            
            for s in range(0, len(prod_similarity)):
                if mat.iloc[sim_prods[s], user_index] == 0.0: #check if the similar product has rating of 0 for user
                    if len(prod_similarity_copy) == (num_neighbors - 1):
                        prod_similarity_copy.pop(s)
                        
                    else:
                        prod_similarity_copy.pop(s-(len(prod_similarity)-len(prod_similarity_copy)))
                    
                else: # similar prod has rating
                    nominator = nominator + prod_similarity[s]*mat.iloc[sim_prods[s], user_index]
                    
            if len(prod_similarity_copy) > 0: # make sure there are similar prods
                if sum(prod_similarity_copy) > 0: # Don't want to divide by 0!
                    predicted_r = nominator/sum(prod_similarity_copy) # calculated the predicted rating
                
                else:
                    predicted_r = -1.
            else:
                predicted_r = -1.
            
        mat1.iloc[i, user_index] = predicted_r # enter the predicted rating into the copy matrix
        
    recommend_products(user, num_recommendation)

In [185]:
import random
user = random.choice(mat.columns.tolist())
product_recommender('c8460e4251689ba205045f3ea17884a1', 10, 10)

The list of the Products User: c8460e4251689ba205045f3ea17884a1 has bought/reviewed 

e7cc48a9daff5436f63d3aad9426f28b


The list of the Recommended Products 

1: 001795ec6f1b187d37335e1c4704762e - predicted rating: -1.0
2: 001b72dfd63e9833e8c02742adf472e3 - predicted rating: -1.0
3: 00210e41887c2a8ef9f791ebc780cc36 - predicted rating: -1.0
4: 002159fe700ed3521f46cfcf6e941c76 - predicted rating: -1.0
5: 00250175f79f584c14ab5cecd80553cd - predicted rating: -1.0
6: 002af88741ba70c7b5cf4e4a0ad7ef85 - predicted rating: -1.0
7: 005030ef108f58b46b78116f754d8d38 - predicted rating: -1.0
8: 007c63ae4b346920756b5adcad8095de - predicted rating: -1.0
9: 00878d953636afec00d3e85d55a12e7f - predicted rating: -1.0
10: 008cff0e5792219fae03e570f980b330 - predicted rating: -1.0


In [183]:
df_with_num_reviews.customer_unique_id.value_counts()

c8460e4251689ba205045f3ea17884a1    24
c402f431464c72e27330a67f7b94d4fb    20
4546caea018ad8c692964e3382debd19    20
d97b3cfb22b0d6b25ac9ed4e9c2d481b    20
0f5ac8d5c31de21d2f25e24be15bbffb    18
                                    ..
c37cd7abcdc247d7b35a95a1deededf7     1
839ec7c231c579ed10ddab6610f8f12b     1
89b59fe90c14d3e0cf8974983b933eff     1
e75ea98bb2cbafbf1d99019e43612439     1
c767dbe6f331a287a229243814f19e0a     1
Name: customer_unique_id, Length: 58222, dtype: int64

In [184]:
df.loc[df['customer_unique_id'] == 'c8460e4251689ba205045f3ea17884a1']

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,review_id,review_score,...,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value
41893,cb1f3a44e8b8527e16913306a4d3de2f,dbe9495069f4ddb6875dfc83462d616f,delivered,2018-08-07 09:03:02,2018-08-08 09:05:09,2018-08-08 15:01:00,2018-08-15 19:28:29,2018-08-24 00:00:00,9fbc1da26e7ba88631bea5de53108f0a,4,...,c8460e4251689ba205045f3ea17884a1,91170,porto alegre,RS,1,e7cc48a9daff5436f63d3aad9426f28b,53243585a1d6dc2643021fd1853d8905,2018-08-14 09:05:09,170.0,30.44
41894,cb1f3a44e8b8527e16913306a4d3de2f,dbe9495069f4ddb6875dfc83462d616f,delivered,2018-08-07 09:03:02,2018-08-08 09:05:09,2018-08-08 15:01:00,2018-08-15 19:28:29,2018-08-24 00:00:00,9fbc1da26e7ba88631bea5de53108f0a,4,...,c8460e4251689ba205045f3ea17884a1,91170,porto alegre,RS,2,e7cc48a9daff5436f63d3aad9426f28b,53243585a1d6dc2643021fd1853d8905,2018-08-14 09:05:09,170.0,30.44
41895,cb1f3a44e8b8527e16913306a4d3de2f,dbe9495069f4ddb6875dfc83462d616f,delivered,2018-08-07 09:03:02,2018-08-08 09:05:09,2018-08-08 15:01:00,2018-08-15 19:28:29,2018-08-24 00:00:00,9fbc1da26e7ba88631bea5de53108f0a,4,...,c8460e4251689ba205045f3ea17884a1,91170,porto alegre,RS,3,e7cc48a9daff5436f63d3aad9426f28b,53243585a1d6dc2643021fd1853d8905,2018-08-14 09:05:09,170.0,30.44
41896,cb1f3a44e8b8527e16913306a4d3de2f,dbe9495069f4ddb6875dfc83462d616f,delivered,2018-08-07 09:03:02,2018-08-08 09:05:09,2018-08-08 15:01:00,2018-08-15 19:28:29,2018-08-24 00:00:00,9fbc1da26e7ba88631bea5de53108f0a,4,...,c8460e4251689ba205045f3ea17884a1,91170,porto alegre,RS,4,e7cc48a9daff5436f63d3aad9426f28b,53243585a1d6dc2643021fd1853d8905,2018-08-14 09:05:09,170.0,30.44
41897,cb1f3a44e8b8527e16913306a4d3de2f,dbe9495069f4ddb6875dfc83462d616f,delivered,2018-08-07 09:03:02,2018-08-08 09:05:09,2018-08-08 15:01:00,2018-08-15 19:28:29,2018-08-24 00:00:00,9fbc1da26e7ba88631bea5de53108f0a,4,...,c8460e4251689ba205045f3ea17884a1,91170,porto alegre,RS,5,e7cc48a9daff5436f63d3aad9426f28b,53243585a1d6dc2643021fd1853d8905,2018-08-14 09:05:09,170.0,30.44
41898,cb1f3a44e8b8527e16913306a4d3de2f,dbe9495069f4ddb6875dfc83462d616f,delivered,2018-08-07 09:03:02,2018-08-08 09:05:09,2018-08-08 15:01:00,2018-08-15 19:28:29,2018-08-24 00:00:00,9fbc1da26e7ba88631bea5de53108f0a,4,...,c8460e4251689ba205045f3ea17884a1,91170,porto alegre,RS,6,e7cc48a9daff5436f63d3aad9426f28b,53243585a1d6dc2643021fd1853d8905,2018-08-14 09:05:09,170.0,30.44
49315,03aba68b07658f28f29612641f08d4ba,a7ab31829dc9a10f37e82b1e1afd26b6,delivered,2018-08-08 14:27:15,2018-08-09 08:50:17,2018-08-14 18:00:00,2018-08-21 15:33:32,2018-08-27 00:00:00,42bd18b4a9757a63d6bc9b4cc348c1dd,4,...,c8460e4251689ba205045f3ea17884a1,91170,porto alegre,RS,1,e7cc48a9daff5436f63d3aad9426f28b,53243585a1d6dc2643021fd1853d8905,2018-08-15 08:50:17,170.0,30.44
49316,03aba68b07658f28f29612641f08d4ba,a7ab31829dc9a10f37e82b1e1afd26b6,delivered,2018-08-08 14:27:15,2018-08-09 08:50:17,2018-08-14 18:00:00,2018-08-21 15:33:32,2018-08-27 00:00:00,42bd18b4a9757a63d6bc9b4cc348c1dd,4,...,c8460e4251689ba205045f3ea17884a1,91170,porto alegre,RS,2,e7cc48a9daff5436f63d3aad9426f28b,53243585a1d6dc2643021fd1853d8905,2018-08-15 08:50:17,170.0,30.44
49317,03aba68b07658f28f29612641f08d4ba,a7ab31829dc9a10f37e82b1e1afd26b6,delivered,2018-08-08 14:27:15,2018-08-09 08:50:17,2018-08-14 18:00:00,2018-08-21 15:33:32,2018-08-27 00:00:00,42bd18b4a9757a63d6bc9b4cc348c1dd,4,...,c8460e4251689ba205045f3ea17884a1,91170,porto alegre,RS,3,e7cc48a9daff5436f63d3aad9426f28b,53243585a1d6dc2643021fd1853d8905,2018-08-15 08:50:17,170.0,30.44
49318,03aba68b07658f28f29612641f08d4ba,a7ab31829dc9a10f37e82b1e1afd26b6,delivered,2018-08-08 14:27:15,2018-08-09 08:50:17,2018-08-14 18:00:00,2018-08-21 15:33:32,2018-08-27 00:00:00,42bd18b4a9757a63d6bc9b4cc348c1dd,4,...,c8460e4251689ba205045f3ea17884a1,91170,porto alegre,RS,4,e7cc48a9daff5436f63d3aad9426f28b,53243585a1d6dc2643021fd1853d8905,2018-08-15 08:50:17,170.0,30.44


In [167]:
for prod in mat[mat['8d50f5eadf50201ccdcedfb9e2ac8455'] > 0.0].index.tolist(): # loop through all products that have review
    print(prod)

94cc774056d3f2b0dc693486a589025e
d6354128c28cc56532ba7393d9373083


In [174]:
mat[mat['8d50f5eadf50201ccdcedfb9e2ac8455']>0]['8d50f5eadf50201ccdcedfb9e2ac8455'].index.tolist()

KeyError: '8d50f5eadf50201ccdcedfb9e2ac8455'

Something is wrong with the way I'm making my matrix. I am losing alot of value - lots of customers aren't appear for all the things they have watched