# Turicreate baseline
Более подробно [почитать](https://apple.github.io/turicreate/docs/userguide/recommender/) про библиотеку

In [1]:
import os
import pandas as pd
import numpy as np
import turicreate as tc

In [2]:
# precision at k
def precision_at_k(r_true_arr, k):
    return np.sum(r_true_arr[:k]) / k


# average precision at k
def average_precision_at_k(r_true_arr, k):
    apk = 0
    for n in range(0, k):
        apk += precision_at_k(r_true_arr, n + 1) * r_true_arr[n]
    if np.sum(r_true_arr[:k]) != 0:
        return (apk) / k
    else:
        return 0


# average normed precision at k
def average_normed_precision_at_k(r_true_arr, k, n_true):
    apk = 0
    apk_ideal = n_true / k
    
    for n in range(0, k):
        apk += precision_at_k(r_true_arr, n + 1) * r_true_arr[n]
    if np.sum(r_true_arr[:k]) != 0:
        return ((apk) / k) / apk_ideal
    else:
        return 0

In [3]:
df_clients = tc.SFrame.read_csv('../retailhero-uplift/data/clients.csv')
df_products = tc.SFrame.read_csv("../retailhero-uplift/data/products.csv")
df_purchases = tc.SFrame.read_csv("../retailhero-uplift/data/purchases.csv")

------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,str,str,int,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,str,str,str,str,float,str,str,float,int,int]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,str,str,float,float,float,float,float,str,str,float,float,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


In [4]:
df_purchases[:2]

client_id,transaction_id,transaction_datetime,regular_points_received,express_points_received,regular_points_spent
000012768d,7e3e2e3984,2018-12-01 07:12:45,10.0,0.0,0.0
000012768d,7e3e2e3984,2018-12-01 07:12:45,10.0,0.0,0.0

express_points_spent,purchase_sum,store_id,product_id,product_quantity,trn_sum_from_iss,trn_sum_from_red
0.0,1007.0,54a4a11a29,9a80204f78,2.0,80.0,
0.0,1007.0,54a4a11a29,da89ebd374,1.0,65.0,


In [5]:
df_clients[:2]

client_id,first_issue_date,first_redeem_date,age,gender
000012768d,2017-08-05 15:40:48,2018-01-04 19:30:07,45,U
000036f903,2017-04-10 13:54:23,2017-04-23 12:37:56,72,F


In [6]:
df_products[:2]

product_id,level_1,level_2,level_3,level_4,segment_id,brand_id,vendor_id,netto,is_own_trademark
0003020d3c,c3d3a8e8c6,c2a3ea8d5e,b7cda0ec0c,6376f2a852,123.0,394a54a7c1,9eaff48661,0.4,0
0003870676,e344ab2e71,52f13dac0c,d3cfe81323,6dc544533f,105.0,acd3dd483f,10486c3cf0,0.68,0

is_alcohol
0
0


In [7]:
# сделаем значения текстовыми, чтобы потом Turicreate сам сделал OHE
df_products["segment_id"] = df_products["segment_id"].astype(str)
df_products["is_alcohol"] = df_products["is_alcohol"].astype(str)
df_products["is_own_trademark"] = df_products["is_own_trademark"].astype(str)

In [8]:
# разбиваем на трейн и тест
training_data, validation_data = \
    tc.recommender.util.random_split_by_user(df_purchases, "client_id", "product_id", 
                                            max_num_users=10000, random_seed=42)

In [9]:
training_data.shape, validation_data.shape

((45556268, 13), (230300, 13))

In [10]:
columns = ['client_id',
'product_id']

item_columns = ['product_id',
 'level_1',
 'level_2',
 'level_3',
 'level_4',
 'segment_id',
 'brand_id',
 'vendor_id',
 'is_own_trademark',
 'is_alcohol']

In [11]:
# обучим модель с дополнительными данными о продуктах
model = tc.recommender.ranking_factorization_recommender.create(training_data[columns], 
                                                        user_id="client_id", 
                                                        item_id="product_id",
                                                        max_iterations=20,
                                                        item_data=df_products[item_columns])

In [12]:
# рекомендации для пользователей из вадидационной выборки
recommended = model.recommend(validation_data["client_id"].unique(), k=30, 
                              exclude_known=False)

In [13]:
# датафрейм с покупками в реальности
reality = validation_data["client_id", "product_id"].to_dataframe()
reality.loc[:, "is_buyed"] = 1

# Metrics (precision@30, avg_precision@30, average_normed_precision@30)

In [14]:
rec_df = recommended.to_dataframe().merge(reality, on=["client_id", "product_id"], 
                                          how="left", sort=False).fillna(0)

In [15]:
# словарь с количеством покупок на валидации
real_dict = reality.groupby(by="client_id")["is_buyed"].sum().to_dict()

In [16]:
np.mean([precision_at_k(i, 30) for i in 
         rec_df.groupby(by="client_id", sort=False)["is_buyed"].apply(list)])

0.15021443285043734

In [17]:
np.mean([average_precision_at_k(i, 30) for i in 
         rec_df.groupby(by="client_id")["is_buyed"].apply(list)])

0.08843787115556434

In [18]:
np.mean([average_normed_precision_at_k(i, 30, real_dict.get(client, 0)) for client, i in 
         rec_df.groupby(by="client_id")["is_buyed"].apply(list).reset_index().values])

0.09879876704365677

# Сохраняем нашу модель

In [19]:
model.save("x5.model")

# Загрузка модели быстрая

In [20]:
%%time
model = tc.load_model("x5.model")

CPU times: user 770 ms, sys: 149 ms, total: 919 ms
Wall time: 480 ms


# Итого:
1. Мы обучили factorization_recommender из библиотеки turicreate;
2. Сделали валидационное и обучающее множества, получив на валидации результаты:
    - average_precision_at_k: 0.088
    - average_normed_precision_at_k: 0.099
3. Сохранили модель и использовали ее в решении;
4. Получили следующие результаты при загрузке нашего ответа:
    - check: 0.0907
    - public: 0.0754