In [1]:
import pandas as pd
import numpy as np

In [2]:
reviews = pd.read_csv('../data/olist_order_reviews_dataset.csv')
customers = pd.read_csv('../data/olist_customers_dataset.csv')
orders_products = pd.read_csv('../data/olist_order_items_dataset.csv')
orders = pd.read_csv('../data/olist_orders_dataset.csv')
products = pd.read_csv('../data/olist_products_dataset.csv')

In [3]:
reviews.name ="reviews"
customers.name = "customers"
orders.name = "orders"
orders_products.name = "orders_products"
products.name = "products"

## 0. Cast

In [94]:
date_columns_orders = ["order_purchase_timestamp",
                "order_approved_at",
                "order_delivered_carrier_date",
                "order_delivered_customer_date",
                "order_estimated_delivery_date"]

date_columns_orders_products = ["shipping_limit_date"]
date_columns_reviews = ["review_answer_timestamp"]


orders[date_columns_orders] = orders[date_columns_orders].apply(pd.to_datetime)
orders_products[date_columns_orders_products] = orders_products[date_columns_orders_products].apply(pd.to_datetime)
reviews[date_columns_reviews] = reviews[date_columns_reviews].apply(pd.to_datetime)

In [95]:
orders['year'] = orders["order_purchase_timestamp"].dt.year

## 1. Split Train and Test

In [96]:
# Join datasets
def join_datasets(dataset1, dataset2, key):
    
    dataset1[key] = dataset1[key].astype('str')
    dataset2[key] = dataset2[key].astype('str')
    
    joined_datasets = dataset1.merge(dataset2,
                                 on=key,
                                 how='left')
    return joined_datasets


def join_all_datasets(data_orders, data_orders_items, data_clients, data_products, data_reviews):
    
    data_orders_customers = join_datasets(data_orders, data_clients, key='customer_id')
    
    data_orders_items_customers = join_datasets(data_orders_items, data_orders_customers, key='order_id')
    
    data_orders_items_customers_reviews = join_datasets(data_orders_items_customers, data_reviews, key='order_id')
    
    data_orders_items_customers_reviews_products = join_datasets(data_orders_items_customers_reviews, data_products, key='product_id')

    data_orders_items_customers_reviews = data_orders_items_customers_reviews[data_orders_items_customers_reviews[
        "product_id"]!="99a4788cb24856965c36a24e339b6058"].copy()
    
    return data_orders_items_customers_reviews    


In [97]:
data_joined = join_all_datasets(data_orders = orders,
                            data_orders_items = orders_products,
                            data_clients = customers ,
                            data_reviews= reviews)

In [99]:
def split(data):
    data_agg = data[data['customer_unique_id'].notnull() 
                      & data["product_id"].notnull()
                     & data["order_purchase_timestamp"].notnull()
                     ][["customer_unique_id",
                                                 "product_id",
                                                 "review_score" ]]
    
    bought_products_customer = data_agg.groupby(by = "customer_unique_id").agg({"product_id": "nunique"})
    customers2consider = bought_products_customer[bought_products_customer['product_id']>1].index.tolist()

    data_user_rec_train = data[data["customer_unique_id"].isin(customers2consider)].copy()
    
    return data_user_rec_train


In [100]:
data_for_svd = split(data_joined)

In [101]:
full_users = data_for_svd["customer_unique_id"].unique().tolist()

In [102]:
batch_size = int(len(full_users)/10)

In [103]:
train_users = full_users[:-batch_size]

In [104]:
test_users = full_users[-batch_size:]

In [105]:
expected_test = data_for_svd[data_for_svd["customer_unique_id"].isin(test_users)].copy()

In [106]:
expected_test['rank'] = expected_test.groupby(
        'customer_unique_id')['order_purchase_timestamp'].rank(ascending=False)

In [107]:
expected_test = expected_test[expected_test["rank"]==1].copy()

In [108]:
user_item_matrix = data_for_svd.pivot_table(index='customer_unique_id',
                                  columns='product_id',
                                  values='review_score')

In [109]:
user_item_matrix

product_id,0011c512eb256aa0dbbb544d8dffcf6e,001b72dfd63e9833e8c02742adf472e3,00210e41887c2a8ef9f791ebc780cc36,00250175f79f584c14ab5cecd80553cd,002959d7a0b0990fe2d69988affcbc80,0042f1a9a7e0edd1400c6cd0fda065f8,005030ef108f58b46b78116f754d8d38,0060b415594c5e1200324ef1a18493c4,007c63ae4b346920756b5adcad8095de,008cff0e5792219fae03e570f980b330,...,ffb2e8c1ddc7c3e590d2bc4c91de53e1,ffbb3c00e9687ad738ace3977e821da5,ffbbf6b9097237a1122f17e7341a3fb2,ffbc83054b3741a8d67fc59d9cf9d42d,ffc0b406806006602c5853b00ab5f7fd,ffcfaba393e8ef71937c6e8421bc2868,ffd4bf4306745865e5692f69bd237893,ffe8083298f95571b4a66bfbc1c05524,fff1059cd247279f3726b7696c66e44e,fff28f91211774864a1000f918ed00cc
customer_unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00053a61a98854899e70ed204dd4bafe,,,,,,,,,,,...,,,,,,,,,,
000de6019bb59f34c099a907c151d855,,,,,,,,,,,...,,,,,,,,,,
000fbf0473c10fc1ab6f8d2d286ce20c,,,,,,,,,,,...,,,,,,,,,,
001926cef41060fae572e2e7b30bd2a4,,,,,,,,,,,...,,,,,,,,,,
001928b561575b2821c92254a2327d06,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ffb973f2bb1c0cb807a99341a9b20dcd,,,,,,,,,,,...,,,,,,,,,,
ffd6f65402f2bc47238ecd2bdc93e0d4,,,,,,,,,,,...,,,,,,,,,,
ffe254cc039740e17dd15a5305035928,,,,,,,,,,,...,,,,,,,,,,
fff7219c86179ca6441b8f37823ba3d3,,,,,,,,,,,...,,,,,,,,,,


In [110]:
for row, column in zip(expected_test["customer_unique_id"].values.tolist(),expected_test["product_id"].values.tolist()):
    user_item_matrix.at[row, column] = np.nan

### SVD

In [111]:
mask=np.isnan(user_item_matrix)
masked_arr=np.ma.masked_array(user_item_matrix, mask)

item_means=np.mean(masked_arr, axis=0)
user_means=np.mean(masked_arr, axis=1)
item_means_tiled = np.tile(item_means, (user_item_matrix.shape[0],1))

In [116]:
from scipy.sparse.linalg import svds
latent_variables = 10
U, sigma, V = svds(user_item_matrix, k = latent_variables)

In [117]:
sigma

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [114]:
sigma = np.array([0,0,0,0,0,
       19.73859555, 22.43859566, 23.06530217, 23.41713833, 25.27767386,
       25.3888348 , 28.52267569, 34.76847675, 36.46910255, 40.05005962])

In [115]:
sigma = np.diag([np.sqrt(sigma[i]) for i in range(0,latent_variables)])

Usk=np.dot(U,sigma)
skV=np.dot(sigma,V)

UsV = np.dot(Usk, skV)

UsV = UsV + item_means_tiled


In [83]:
predicted_df = pd.DataFrame(UsV)

In [84]:
predicted_df.index = user_item_matrix.index

In [85]:
predicted_df.columns = user_item_matrix.columns

In [86]:
predicted_df["rec_product"] = predicted_df.idxmax(axis=1)

In [87]:
predicted_df["customer"] = predicted_df.index

In [88]:
recommendations = predicted_df[predicted_df["customer"].isin(test_users)][["customer", "rec_product"]].copy()

In [89]:
true_with_rec = expected_test[["customer_unique_id", "product_id"]].merge(recommendations,
                                                                               left_on="customer_unique_id", 
                                                                               right_on="customer" )

In [90]:
for produs_cump, produs_rec in zip(true_with_rec["product_id"].values.tolist(),true_with_rec["rec_product"].values.tolist()):
    listuta = [produs_cump, produs_rec ]
    print(products[products["product_id"].isin(listuta)])

                            product_id   product_category_name  \
6869  36f60d45225e60c7da4558b070ce4b60  informatica_acessorios   
6900  6ae38d49db7fc65ad1779828542aecba      ferramentas_jardim   

      product_name_lenght  product_description_lenght  product_photos_qty  \
6869                 33.0                       300.0                 1.0   
6900                 42.0                       317.0                 1.0   

      product_weight_g  product_length_cm  product_height_cm  product_width_cm  
6869             207.0               19.0               11.0              14.0  
6900            2300.0               23.0               22.0              19.0  
                            product_id   product_category_name  \
6451  6c04a068e5ab37749c980c42a036b9e3              automotivo   
6869  36f60d45225e60c7da4558b070ce4b60  informatica_acessorios   

      product_name_lenght  product_description_lenght  product_photos_qty  \
6451                 52.0                      136

In [119]:
data_joined.columns

Index(['order_id', 'order_item_id', 'product_id', 'seller_id',
       'shipping_limit_date', 'price', 'freight_value', 'customer_id',
       'order_status', 'order_purchase_timestamp', 'order_approved_at',
       'order_delivered_carrier_date', 'order_delivered_customer_date',
       'order_estimated_delivery_date', 'year', 'customer_unique_id',
       'customer_zip_code_prefix', 'customer_city', 'customer_state',
       'review_id', 'review_score', 'review_comment_title',
       'review_comment_message', 'review_creation_date',
       'review_answer_timestamp'],
      dtype='object')

In [120]:
data_joined.groupby(by = "product_category_name").agg({"price": "sum"})

KeyError: 'product_category_name'