In [1]:
!wget https://storage.yandexcloud.net/datasouls-competitions/x5-retailhero/retailhero-uplift.zip

--2020-01-03 14:53:13--  https://storage.yandexcloud.net/datasouls-competitions/x5-retailhero/retailhero-uplift.zip
Resolving storage.yandexcloud.net (storage.yandexcloud.net)... 213.180.193.243, 2a02:6b8::1d9
Connecting to storage.yandexcloud.net (storage.yandexcloud.net)|213.180.193.243|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 625190754 (596M) [application/zip]
Saving to: ‘retailhero-uplift.zip’


2020-01-03 14:53:43 (20.9 MB/s) - ‘retailhero-uplift.zip’ saved [625190754/625190754]



In [2]:
!unzip retailhero-uplift.zip

Archive:  retailhero-uplift.zip
   creating: data/
  inflating: data/clients.csv        
  inflating: data/uplift_train.csv   
  inflating: data/purchases.csv      
  inflating: data/uplift_test.csv    
  inflating: data/uplift_sample_submission.csv  
  inflating: data/products.csv       
  inflating: uplift_solution.py      
  inflating: requirements.txt        
  inflating: README                  


In [3]:
!pip install implicit

Collecting implicit
[?25l  Downloading https://files.pythonhosted.org/packages/5a/d8/6b4f1374ffa2647b72ac76960c71b984c6f3238090359fb419d03827d87a/implicit-0.4.2.tar.gz (1.1MB)
[K     |████████████████████████████████| 1.1MB 2.8MB/s 
Building wheels for collected packages: implicit
  Building wheel for implicit (setup.py) ... [?25l[?25hdone
  Created wheel for implicit: filename=implicit-0.4.2-cp36-cp36m-linux_x86_64.whl size=3471294 sha256=d288903b78c1c095ee3d880d7a4311b60a6147d8447951992a2bb07358e51aa1
  Stored in directory: /root/.cache/pip/wheels/1b/48/b1/1aebe3acc3afb5589e72d3e7c3ffc3f637dc4721c1a974dff7
Successfully built implicit
Installing collected packages: implicit
Successfully installed implicit-0.4.2


In [0]:
import os
import pickle
import scipy
import pandas as pd
import numpy as np
import implicit

from tqdm import tqdm_notebook

In [0]:
# add zeros to k items length
def add_to_k(lst, k):
    return lst + [0] * max(k - len(lst), 0)

# precision at k
def precision_at_k(r_true_arr, k):
    return np.sum(r_true_arr[:k]) / k


# average precision at k
def average_precision_at_k(r_true_arr, k):
    apk = 0
    for n in range(0, k):
        apk += precision_at_k(r_true_arr, n + 1) * r_true_arr[n]
    if np.sum(r_true_arr[:k]) != 0:
        return (apk) / k
    else:
        return 0


# average normed precision at k
def average_normed_precision_at_k(r_true_arr, k, n_true):
    apk = 0
    apk_ideal = n_true / k
    
    for n in range(0, k):
        apk += precision_at_k(r_true_arr, n + 1) * r_true_arr[n]
    if np.sum(r_true_arr[:k]) != 0:
        return ((apk) / k) / apk_ideal
    else:
        return 0

In [0]:
def enumerated_dict(values):
    enum_dict = {}
    reverse_dict = {}
    
    for n, value in enumerate(values):
        enum_dict[value] = n
        reverse_dict[n] = value
        
    return enum_dict, reverse_dict


def predict_user(model, user_id, products, product_dict, reverse_product_dict, matrix_shape):
    enum_clients = np.zeros(len(products))
    enum_products = np.array([product_dict[product] for product in products])

    sparse_matrix = scipy.sparse.csr_matrix((np.ones(shape=(len(enum_clients))), 
                                             (enum_clients, enum_products)), 
                                            shape=matrix_shape)
    
    rec = model.recommend(0, sparse_matrix, N=30, recalculate_user=True,
                     filter_already_liked_items=False)
    
    return [[user_id, reverse_product_dict[r[0]]] for r in rec]

In [0]:
df_purchases = pd.read_csv("data/purchases.csv")

In [0]:
transactions_cnt = df_purchases\
                    .groupby(by=["client_id"])["transaction_id"]\
                    .count()\
                    .reset_index()

multi_trans_users = transactions_cnt[transactions_cnt["transaction_id"] > 1]["client_id"]

In [0]:
test_users = np.random.choice(multi_trans_users, 1000)

In [0]:
train, test = df_purchases[~df_purchases["client_id"].isin(test_users)], \
              df_purchases[df_purchases["client_id"].isin(test_users)]

In [0]:
last_transactions = test.drop_duplicates(subset="client_id", keep="last")["transaction_id"]
test_data = test[~test["transaction_id"].isin(last_transactions)]
test_validation = test[test["transaction_id"].isin(last_transactions)]

In [0]:
# клиенты только из train, а продукты из всего набора данных
client_dict, reverse_client_dict = enumerated_dict(df_purchases["client_id"].unique())
product_dict, reverse_product_dict = enumerated_dict(df_purchases["product_id"].unique())

In [0]:
# Определим размер матрицы
matrix_shape = (max(reverse_client_dict.keys()) + 1, max(reverse_product_dict.keys()) + 1)

In [14]:
enum_clients = np.array([client_dict[client] for client in train["client_id"]])
enum_products = np.array([product_dict[product] for product in train["product_id"]])

sparse_matrix = scipy.sparse.coo_matrix((np.ones(shape=(len(enum_clients))), 
                                         (enum_clients, enum_products)), 
                                        shape=matrix_shape)
print("Sparticity: ", 100 - df_purchases.shape[0] / \
        (sparse_matrix.shape[0] * sparse_matrix.shape[1]))

Sparticity:  99.99730966188883


In [32]:
# Initialize model
model = implicit.nearest_neighbours.TFIDFRecommender(K=1000)

# Fit model
model.fit((sparse_matrix.T))

HBox(children=(IntProgress(value=0, max=42530), HTML(value='')))




In [33]:
# Рекомендации для отсутствующих пользователей
recommendations = []

for test_client in tqdm_notebook(test_data["client_id"].unique()):
    products = test_data[test_data["client_id"]==test_client]["product_id"]
    rec = predict_user(model, test_client, products, product_dict, reverse_product_dict,
                       (1, matrix_shape[1]))
    recommendations.extend(rec)

HBox(children=(IntProgress(value=0, max=983), HTML(value='')))




In [0]:
# датафрейм с покупками в реальности
reality = test_validation[["client_id", "product_id"]].copy()
reality.loc[:, "is_buyed"] = 1

In [0]:
rec_df = pd.DataFrame(recommendations, columns=["client_id", "product_id"])\
            .merge(reality, 
                   on=["client_id", "product_id"], 
                   how="left", 
                   sort=False)\
            .fillna(0)

In [0]:
# словарь с количеством покупок на валидации
real_dict = reality.groupby(by="client_id")["is_buyed"].sum().to_dict()

In [37]:
np.mean([precision_at_k(i, 30) for i in 
         rec_df.groupby(by="client_id", sort=False)["is_buyed"].apply(list)])

0.04499830451000339

In [38]:
np.mean([average_precision_at_k(add_to_k(i, 30), 30) for client, i in 
         rec_df.groupby(by="client_id")["is_buyed"].apply(list).reset_index().values])

0.020145799815628522

In [39]:
np.mean([average_normed_precision_at_k(add_to_k(i, 30), 30, real_dict.get(client, 0)) for client, i in 
         rec_df.groupby(by="client_id")["is_buyed"].apply(list).reset_index().values])

0.09209015985174972

In [0]:
# сохраняем помимо модели еще и словари, чтобы была возможность создать матрицу
with open("x5_implicit.pkl", "wb") as f:
    pickle.dump((model, client_dict, reverse_client_dict, 
                 product_dict, reverse_product_dict), f)