In [1]:
import pandas as pd
import numpy as np

In [2]:
# Olist reference tables are read from ../data; the other two CSVs come from the working directory.
orders_products = pd.read_csv("../data/olist_order_items_dataset.csv")
products = pd.read_csv("../data/olist_products_dataset.csv")

# `data` is the frame that gets split into train/test further down;
# `similarities` is handed to the recommender and assessment helpers.
data = pd.read_csv("generated_data.csv")
similarities = pd.read_csv("similarity_matrix.csv")

## 0. Cast

In [3]:
# Columns read from the CSV without date parsing that must become datetimes.
date_columns_orders = ["order_purchase_timestamp"]

# Convert each listed column on its own; the frame is updated in place.
for date_column_name in date_columns_orders:
    data[date_column_name] = pd.to_datetime(data[date_column_name])

In [4]:
# Derive the calendar year of every purchase. The `.dt` accessor requires the
# timestamp column to be datetime-typed already (done in the cast cell).
data['year'] = data["order_purchase_timestamp"].dt.year

In [5]:
# Number of rows per purchase year, used to pick a sensible train/test cut-off.
data["year"].value_counts()

2017    22054
2016    21840
2018    20006
Name: year, dtype: int64

## 1. Split Train and Test

In [6]:
def split(data, date_column, date_split):
    """Partition ``data`` chronologically into a train and a test frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is not modified.
    date_column : str
        Name of the column compared against ``date_split``.
    date_split : str or datetime-like
        Cut-off value. Rows strictly before it go to train, rows on or
        after it go to test.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(data_train, data_test)``. Both are independent copies carrying an
        extra ``split`` column set to ``"train"`` / ``"test"``. Rows whose
        date satisfies neither comparison (e.g. missing dates) end up in
        neither frame.
    """
    dates = data[date_column]
    is_before_cutoff = dates < date_split
    is_on_or_after_cutoff = dates >= date_split

    # Boolean selection plus `assign` both produce new frames, so the caller's
    # `data` is left untouched.
    data_train = data[is_before_cutoff].assign(split="train")
    data_test = data[is_on_or_after_cutoff].assign(split="test")

    return data_train, data_test

In [7]:
# Purchases before 2018-01-01 form the training set; everything from that
# date onwards is held out for evaluation.
df_train, df_test = split(
    data=data,
    date_column="order_purchase_timestamp",
    date_split="2018-01-01",
)

In [8]:
# Number of rows in the training split.
len(df_train)

43894

In [9]:
# Number of rows in the held-out test split.
len(df_test)

20006

## 2. SVD

In [10]:
import svd.svd as svd
import testing.test as test
import content_recommender.similarity_recommendations as simrec

In [11]:
# Run the project's SVD helper on the training split only, so the test period
# stays unseen. The helper prints a shape tuple as a side effect (see the
# recorded output). NOTE(review): the exact layout of `predicted_df` is defined
# in svd/svd.py — confirm there.
predicted_df = svd.perform_svd(df_train)

(3445, 41)


In [12]:
# Combine the held-out test rows with the SVD output. Presumably this pairs
# each test purchase with the product recommended to that customer — TODO
# confirm against svd.extract_test_recommendations.
true_with_rec = svd.extract_test_recommendations(df_test, predicted_df)

In [13]:
# Assess the SVD recommendations against the similarity table, then remove
# duplicated recommendations. The result is expected to carry the `similarity`
# column that the success-rate cells threshold on.
# NOTE(review): the similarity pipeline further down deduplicates *before*
# calling assess_similarity, whereas here the order is assess -> deduplicate.
# Confirm the difference is intentional, since it may change which rows survive.
svd_test_assessed = test.assess_similarity(true_with_rec,similarities)
deduplicated_svd_test_assessed = simrec.delete_duplicated_recommendations(svd_test_assessed)

## 3. Similarity Recommendations

In [14]:
import content_recommender.similarity_recommendations as simrec

In [15]:
# Attach a content-based recommendation to the training purchases using the
# similarity table. The recorded output shows the helper emitting a pandas
# deprecation warning for `Series.argmax`; per that warning the helper should
# move to `idxmax` to keep returning a label rather than a position.
df_train_with_rec = simrec.recommend_similar(df_train, similarities)

The current behaviour of 'Series.argmax' is deprecated, use 'idxmax'
instead.
The behavior of 'argmax' will be corrected to return the positional
maximum in the future. For now, use 'series.values.argmax' or
'np.argmax(np.array(values))' to get the position of the maximum
row.
  rec_product = similarity_matrix[similarity_matrix[product1] != 1][product1].argmax()


In [16]:
# Join the training-time similarity recommendations with the held-out test
# purchases. NOTE(review): join keys are defined inside the helper — presumably
# the customer id; confirm in content_recommender/similarity_recommendations.py.
sim_rec_test = simrec.merge_sim_rec_with_test(df_train_with_rec, df_test)

In [17]:
# Remove duplicated recommendations before they are assessed (same helper the
# SVD pipeline uses, but applied at an earlier stage here).
deduplicated_sim_rec_test = simrec.delete_duplicated_recommendations(sim_rec_test)

In [18]:
# Assess the deduplicated similarity recommendations against the similarity
# table; the result is expected to carry the `similarity` column that is
# thresholded in the success-rate section.
deduplicated_sim_rec_test_assessed = test.assess_similarity(deduplicated_sim_rec_test,similarities)

### 3.1. Success Rate for each Component

In [19]:
# Tag both assessed frames the same way:
#   bought_flag -> 1 when the similarity exceeds 0.9999, otherwise 0
#                  (presumably "the recommended product is the one actually
#                  bought" — TODO confirm what a similarity of ~1 means here)
#   component   -> which recommender produced the row
for assessed_frame, component_label in (
    (deduplicated_svd_test_assessed, "svd"),
    (deduplicated_sim_rec_test_assessed, "similarity"),
):
    assessed_frame["bought_flag"] = np.where(assessed_frame["similarity"] > 0.9999, 1, 0)
    assessed_frame["component"] = component_label

In [20]:
# One row per distinct recommendation: a (customer, recommended product) pair
# within a component and customer segment. The `component` column is already
# set in the previous cell, so it is not re-assigned here.
recommendation_keys = ["customer_unique_id", "rec_product", "component", "segment"]

# A recommendation counts as successful if at least one of its rows was
# flagged as bought, hence the "max" aggregation.
success_rate_svd = (
    deduplicated_svd_test_assessed
    .groupby(by=recommendation_keys)
    .agg({"bought_flag": "max"})
)

success_rate_sim = (
    deduplicated_sim_rec_test_assessed
    .groupby(by=recommendation_keys)
    .agg({"bought_flag": "max"})
)

# Share of successful recommendations, and their absolute count, for SVD.
success_rate_svd_value = success_rate_svd["bought_flag"].mean()
success_rate_svd_sum_bought_products = success_rate_svd["bought_flag"].sum()

print(f"Based on SVD Component there were bought {success_rate_svd_sum_bought_products} products with a {success_rate_svd_value} success rate")


Based on SVD Component there were bought 1903 products with a 0.5552961774146484 success rate


In [21]:
# Same two figures as for SVD, computed for the similarity component:
# count of successful recommendations and their share of all recommendations.
sim_bought_flags = success_rate_sim["bought_flag"]
success_rate_sim_sum_bought_products = sim_bought_flags.sum()
success_rate_sim_value = sim_bought_flags.mean()

print(f"Based on SIM Component there were bought {success_rate_sim_sum_bought_products} products with a {success_rate_sim_value} success rate")

Based on SIM Component there were bought 4583 products with a 0.5145391265296957 success rate


In [22]:
# A number of X products were recommended to Y customers by the
# customer-based (SVD) component.
# NOTE(review): the first figure counts distinct (rec_product,
# customer_unique_id) pairs, i.e. recommendations, not distinct products.
svd_recommended_pairs = deduplicated_svd_test_assessed[["rec_product", "customer_unique_id"]].drop_duplicates()
svd_recommended_clients = deduplicated_svd_test_assessed["customer_unique_id"].unique().tolist()

print("SVD: There were recommended a number of ", len(svd_recommended_pairs),
      " products to ", len(svd_recommended_clients), " clients")

SVD: There were recommended a number of  3427  products to  3427  clients


In [23]:
# Distinct (rec_product, customer_unique_id) pairs and distinct customers
# covered by the similarity component.
# NOTE(review): the first figure counts recommendations (pairs), not distinct
# products, despite the wording of the message.
sim_recommended_pairs = deduplicated_sim_rec_test_assessed[["rec_product", "customer_unique_id"]].drop_duplicates()
sim_recommended_clients = deduplicated_sim_rec_test_assessed["customer_unique_id"].unique().tolist()

print("SIM: There were recommended a number of ", len(sim_recommended_pairs),
      " products to ", len(sim_recommended_clients), " clients")

SIM: There were recommended a number of  8907  products to  3426  clients


## 4. RL Component 

In [24]:
# Stack the per-recommendation outcomes of both components, then turn the
# group keys (customer, product, component, segment) back into ordinary columns.
stacked_success_rates = pd.concat([success_rate_svd, success_rate_sim])
all_recommendations = stacked_success_rates.reset_index()

In [25]:
# "Q-value" of each action (component) in each state (segment): the mean of
# bought_flag, i.e. the observed success rate of that component for that segment.
Q_table = (
    all_recommendations
    .groupby(["component", "segment"])[["bought_flag"]]
    .mean()
)
Q_table

Unnamed: 0_level_0,Unnamed: 1_level_0,bought_flag
component,segment,Unnamed: 2_level_1
similarity,1,0.472635
similarity,2,0.602089
similarity,3,0.505762
svd,1,0.628469
svd,2,0.489487
svd,3,0.552186


In [26]:
# Extract Policy: for every segment, the (component, segment) index entry with
# the highest success rate in Q_table.
Q_table.groupby(level="segment").idxmax()

Unnamed: 0_level_0,bought_flag
segment,Unnamed: 1_level_1
1,"(svd, 1)"
2,"(similarity, 2)"
3,"(svd, 3)"


In [27]:
# apply policy
#
# The greedy policy is derived from Q_table instead of being typed in by hand,
# so it stays correct when the data (and therefore the best component per
# segment) changes. On the recorded Q_table this selects svd for segments 1
# and 3 and similarity for segment 2.
best_component_by_segment = (
    Q_table["bought_flag"]
    .groupby(level="segment")
    .idxmax()                                # segment -> (component, segment)
    .map(lambda index_key: index_key[0])     # keep only the component name
)

# Keep a recommendation only if it comes from the component the policy picks
# for that customer's segment.
follows_policy = (
    all_recommendations["component"]
    == all_recommendations["segment"].map(best_component_by_segment)
)
all_recommendations_policy = all_recommendations[follows_policy]

# NOTE(review): the policy is chosen and evaluated on the same recommendations,
# so this success rate is an optimistic estimate.
success_rate_policy_value = all_recommendations_policy["bought_flag"].mean()
success_rate_policy_sum_bought_products = all_recommendations_policy["bought_flag"].sum()

print(f"Based on policy RL combination there were bought {success_rate_policy_sum_bought_products} products with a {success_rate_policy_value} synergy success rate")


Based on policy RL combination there were bought 2589 products with a 0.5959944751381215 synergy success rate


In [28]:
# Baseline for comparison with the policy: success rate over ALL
# recommendations from both components, with no per-segment selection.
all_recommendations["bought_flag"].mean()

0.5258634668396303

In [29]:
# Distinct (rec_product, customer_unique_id) pairs and distinct customers that
# remain once the policy has been applied.
# NOTE(review): the first figure counts recommendations (pairs), not distinct
# products, despite the wording of the message.
policy_recommended_pairs = all_recommendations_policy[["rec_product", "customer_unique_id"]].drop_duplicates()
policy_recommended_clients = all_recommendations_policy["customer_unique_id"].unique().tolist()

print("Policy: There were recommended a number of ", len(policy_recommended_pairs),
      " products to ", len(policy_recommended_clients), " clients")

Policy: There were recommended a number of  4344  products to  3426  clients


## 5. Extract Business Value

In [30]:
# Keep only the first order-item row seen for each product, giving one row
# (and therefore one price) per product_id.
# NOTE(review): if a product's price varies between orders, the price kept here
# is simply the first one in file order — confirm that is acceptable.
is_first_row_for_product = ~orders_products.duplicated(subset="product_id", keep="first")
products_deduplicated = orders_products[is_first_row_for_product]

In [31]:
# Attach a price to every recommendation by looking up the recommended product.
# A left join keeps recommendations whose product has no price row (price = NaN).
price_lookup = products_deduplicated[["product_id", "price"]]
all_recommendations_price = pd.merge(
    all_recommendations,
    price_lookup,
    how="left",
    left_on="rec_product",
    right_on="product_id",
)


In [32]:
# Total price of the recommendations that were actually bought, scaled by 0.21.
# NOTE(review): 0.21 is an unexplained constant — presumably a margin or
# commission rate; confirm and move it to a named constant.
# NOTE(review): this sums over recommendations from BOTH components, not only
# the ones selected by the policy.
bought_rows = all_recommendations_price["bought_flag"] == 1
all_recommendations_price.loc[bought_rows, "price"].sum() * 0.21

117516.82319999998