# Implicit baseline
Более подробно [почитать](https://github.com/benfred/implicit) про библиотеку

In [225]:
import os
import pickle
import scipy
import pandas as pd
import numpy as np
import implicit

from tqdm import tqdm_notebook

In [220]:
# add zeros to k items length
def add_to_k(lst, k):
    return lst + [0] * max(k - len(lst), 0)

# precision at k
def precision_at_k(r_true_arr, k):
    return np.sum(r_true_arr[:k]) / k


# average precision at k
def average_precision_at_k(r_true_arr, k):
    apk = 0
    for n in range(0, k):
        apk += precision_at_k(r_true_arr, n + 1) * r_true_arr[n]
    if np.sum(r_true_arr[:k]) != 0:
        return (apk) / k
    else:
        return 0


# average normed precision at k
def average_normed_precision_at_k(r_true_arr, k, n_true):
    apk = 0
    apk_ideal = n_true / k
    
    for n in range(0, k):
        apk += precision_at_k(r_true_arr, n + 1) * r_true_arr[n]
    if np.sum(r_true_arr[:k]) != 0:
        return ((apk) / k) / apk_ideal
    else:
        return 0

In [None]:
def enumerated_dict(values):
    enum_dict = {}
    reverse_dict = {}
    
    for n, value in enumerate(values):
        enum_dict[value] = n
        reverse_dict[n] = value
        
    return enum_dict, reverse_dict


def predict_user(model, user_id, products, product_dict, reverse_product_dict, matrix_shape):
    enum_clients = np.zeros(len(products))
    enum_products = np.array([product_dict[product] for product in products])

    sparse_matrix = scipy.sparse.csr_matrix((np.ones(shape=(len(enum_clients))), 
                                             (enum_clients, enum_products)), 
                                            shape=matrix_shape)
    
    rec = model.recommend(0, sparse_matrix, N=30, recalculate_user=True,
                     filter_already_liked_items=False)
    
    return [[user_id, reverse_product_dict[r[0]]] for r in rec]

In [3]:
df_purchases = pd.read_csv("../retailhero-uplift/data/purchases.csv")

In [132]:
# выберем только пользователей с более, чем одной транзакцией
transactions_cnt = df_purchases\
                    .groupby(by=["client_id"])["transaction_id"]\
                    .count()\
                    .reset_index()

multi_trans_users = transactions_cnt[transactions_cnt["transaction_id"] > 1]["client_id"]

In [173]:
test_users = np.random.choice(multi_trans_users, 1000)

In [174]:
train, test = df_purchases[~df_purchases["client_id"].isin(test_users)], \
              df_purchases[df_purchases["client_id"].isin(test_users)]

In [175]:
last_transactions = test.drop_duplicates(subset="client_id", keep="last")["transaction_id"]
test_data = test[~test["transaction_id"].isin(last_transactions)]
test_validation = test[test["transaction_id"].isin(last_transactions)]

In [176]:
# клиенты только из train, а продукты из всего набора данных
client_dict, reverse_client_dict = enumerated_dict(df_purchases["client_id"].unique())
product_dict, reverse_product_dict = enumerated_dict(df_purchases["product_id"].unique())

In [177]:
# Определим размер матрицы
matrix_shape = (max(reverse_client_dict.keys()) + 1, max(reverse_product_dict.keys()) + 1)

In [264]:
enum_clients = np.array([client_dict[client] for client in train["client_id"]])
enum_products = np.array([product_dict[product] for product in train["product_id"]])

sparse_matrix = scipy.sparse.coo_matrix((np.ones(shape=(len(enum_clients))), 
                                         (enum_clients, enum_products)), 
                                        shape=matrix_shape)
print("Sparticity: ", 100 - df_purchases.shape[0] / \
        (sparse_matrix.shape[0] * sparse_matrix.shape[1]))

Sparticity:  99.99730966188883


In [265]:
# Initialize model
model = implicit.nearest_neighbours.TFIDFRecommender(K=50)

# Fit model
model.fit((sparse_matrix.T))

HBox(children=(IntProgress(value=0, max=42530), HTML(value='')))




In [287]:
# Рекомендации для отсутствующих пользователей
recommendations = []

for test_client in tqdm_notebook(test_data["client_id"].unique()):
    products = test_data[test_data["client_id"]==test_client]["product_id"]
    rec = predict_user(model, test_client, products, product_dict, reverse_product_dict,
                       (1, matrix_shape[1]))
    recommendations.extend(rec)

HBox(children=(IntProgress(value=0, max=983), HTML(value='')))




In [289]:
# датафрейм с покупками в реальности
reality = test_validation[["client_id", "product_id"]].copy()
reality.loc[:, "is_buyed"] = 1

# Metrics (precision@30, avg_precision@30, average_normed_precision@30)

In [290]:
rec_df = pd.DataFrame(recommendations, columns=["client_id", "product_id"])\
            .merge(reality, 
                   on=["client_id", "product_id"], 
                   how="left", 
                   sort=False)\
            .fillna(0)

In [291]:
# словарь с количеством покупок на валидации
real_dict = reality.groupby(by="client_id")["is_buyed"].sum().to_dict()

In [292]:
np.mean([precision_at_k(i, 30) for i in 
         rec_df.groupby(by="client_id", sort=False)["is_buyed"].apply(list)])

0.04706680230586639

In [293]:
np.mean([average_precision_at_k(add_to_k(i, 30), 30) for client, i in 
         rec_df.groupby(by="client_id")["is_buyed"].apply(list).reset_index().values])

0.021161332606648455

In [272]:
np.mean([average_normed_precision_at_k(add_to_k(i, 30), 30, real_dict.get(client, 0)) for client, i in 
         rec_df.groupby(by="client_id")["is_buyed"].apply(list).reset_index().values])

0.09907394880846197

# Сохраняем нашу модель

In [297]:
# сохраняем помимо модели еще и словари, чтобы была возможность создать матрицу
with open("x5_implicit.pkl", "wb") as f:
    pickle.dump((model, client_dict, reverse_client_dict, 
                 product_dict, reverse_product_dict), f)

# Итого:
1. Мы обучили  из библиотеки implicit;
2. Сделали валидационное и обучающее множества, получив на валидации результаты:
    - average_precision_at_k: 0.021
    - average_normed_precision_at_k: 0.099
3. Сохранили модель и использовали ее в решении;
4. Получили следующие результаты при загрузке нашего ответа:
    - check: 0.0938
    - public: 0.0908