In [1]:
from pathlib import Path

import pandas as pd
import numpy as np

## Read & prepare data

In [2]:
# Folder that holds the raw CSV exports (relative path).
data = Path('../data')

# Load the four source tables in one pass.
df_sales, df_orders, df_customers, df_products = (
    pd.read_csv(data / csv_name)
    for csv_name in ('sales.csv', 'orders.csv', 'customers.csv', 'products.csv')
)

# Product column headers are lower-cased after loading.
df_products.columns = df_products.columns.str.lower()

# Attach customer_id to every sales row. No key is passed, so the join uses
# whichever columns the two frames have in common.
df_sales = pd.merge(df_sales, df_orders[['order_id', 'customer_id']])

In [3]:
# Index orders by order_id so later cells can look a row up with
# df_orders.loc[order_id]. The guard keeps this cell safe to re-run: once
# order_id is the index it is no longer a column, and calling set_index
# again would raise KeyError.
if df_orders.index.name != 'order_id':
    df_orders = df_orders.set_index('order_id')


## Validation

Let's set aside some data for testing 

In [4]:
# Orders with an id up to 900 form the training set; the rest are held out.
df_train = df_sales.loc[df_sales['order_id'] <= 900]
df_val = df_sales.loc[df_sales['order_id'] > 900]

# For each held-out order, the set of product ids it contains.
val_ground_truth = df_val.groupby('order_id')['product_id'].apply(set)

We need to have a good baseline. Let's use the most frequent items

We will have the following recommendation scenario: we suggest 10 products and see how many
of them the user will click on.

We'll use a simple evaluation technique: we'll calculate how many suggested items the user actually bought. 

It's called "Average precision" ([link](https://sdsawtelle.github.io/blog/output/mean-average-precision-MAP-for-recommender-systems.html))

In [5]:
n_rec = 10

# value_counts() orders products by descending count, so the first n_rec
# index entries are the most frequently sold products in the training data.
product_counts = df_train['product_id'].value_counts()
freq = product_counts.head(n_rec).index.values
freq

array([ 740,  579,   78,  843, 1188,  968, 1177, 1038,  182,  757],
      dtype=int64)

In [6]:
def average_precision(recommendations, ground_truth):
    """Mean fraction of recommended items that appear in the ground truth.

    For every row, counts how many of the recommended item ids are members
    of the matching ground-truth collection, divides by the number of
    recommendations per row, and averages that ratio over all rows.

    Parameters
    ----------
    recommendations : 2-D array, shape (n_rows, n_rec)
        One row of recommended item ids per evaluated order.
    ground_truth : sized iterable of collections
        One collection of item ids per row, in the same order as
        ``recommendations``. Iterating a pandas Series yields its values,
        so a Series of sets works as well as a list of sets.

    Returns
    -------
    float
        The averaged precision; ``0.0`` when there are no rows or no
        recommendation slots.

    Raises
    ------
    AssertionError
        If the number of rows differs from ``len(ground_truth)``.
    """
    n_rows, n_rec = recommendations.shape
    assert n_rows == len(ground_truth)

    # Nothing to score: avoid dividing by zero.
    if n_rows == 0 or n_rec == 0:
        return 0.0

    total = 0.0
    for rec_row, gt in zip(recommendations, ground_truth):
        hits = sum(1 for item_id in rec_row if item_id in gt)
        total += hits / n_rec

    # Average over the rows actually passed in, not over a notebook-level
    # global that may be undefined or stale.
    return total / n_rows

In [7]:
n_val = len(val_ground_truth)

# One row per validation order, every row holding the same top products.
baseline = np.tile(freq, (n_val, 1))
baseline[:10]

array([[ 740,  579,   78,  843, 1188,  968, 1177, 1038,  182,  757],
       [ 740,  579,   78,  843, 1188,  968, 1177, 1038,  182,  757],
       [ 740,  579,   78,  843, 1188,  968, 1177, 1038,  182,  757],
       [ 740,  579,   78,  843, 1188,  968, 1177, 1038,  182,  757],
       [ 740,  579,   78,  843, 1188,  968, 1177, 1038,  182,  757],
       [ 740,  579,   78,  843, 1188,  968, 1177, 1038,  182,  757],
       [ 740,  579,   78,  843, 1188,  968, 1177, 1038,  182,  757],
       [ 740,  579,   78,  843, 1188,  968, 1177, 1038,  182,  757],
       [ 740,  579,   78,  843, 1188,  968, 1177, 1038,  182,  757],
       [ 740,  579,   78,  843, 1188,  968, 1177, 1038,  182,  757]],
      dtype=int64)

In [8]:
# Score the popularity baseline against the held-out orders.
average_precision(recommendations=baseline, ground_truth=val_ground_truth)

0.005050505050505051

That's our baseline. Let's see if we can improve it with content-based recommendations

## Collaborative filtering


Let's build the interaction matrix X. Users will be the rows and items will be the columns

In [144]:
from scipy.sparse import csr_matrix

In [145]:
# Coordinates and values for the sparse interaction matrix: one entry of
# 1.0 per training sales row, placed at (customer_id, product_id).
# Note: this rebinds `data`, which earlier held the data directory Path.
rows = df_train['customer_id'].values
columns = df_train['product_id'].values
data = np.ones(df_train.shape[0])

In [146]:
# Size the matrix so the largest customer id and product id are valid indices.
n_rows, n_cols = rows.max() + 1, columns.max() + 1

interactions = (data, (rows, columns))
X = csr_matrix(interactions, shape=(n_rows, n_cols))
X

<1001x1260 sparse matrix of type '<class 'numpy.float64'>'
	with 4504 stored elements in Compressed Sparse Row format>

In [147]:
from sklearn.decomposition import randomized_svd
from sklearn.preprocessing import normalize

In [148]:
# Truncated SVD of the interaction matrix: 16 components, fixed seed.
U, S, VT = randomized_svd(
    X,
    n_components=16,
    random_state=1,
)

In [149]:
# Transpose so that each row of V corresponds to one column (product) of X.
V = VT.transpose()

In [150]:
# Rescale both factor matrices with sklearn's normalize (default settings)
# before they are compared with dot products below.
U, V = normalize(U), normalize(V)

In [151]:
# Spot check: dot product between row 1 of U and row 179 of V.
np.dot(U[1], V[179])

0.13674924590068577

Now we need to find the customers who appear in both the training and validation sets

In [152]:
# Customers present in both splits: only they have a learned factor vector
# from training and an order to evaluate against.
target_customers = set(df_val.customer_id).intersection(df_train.customer_id)

is_target = df_train.customer_id.isin(target_customers)
df_train_subset = df_train[is_target]

In [153]:
user_recommendations = {}

# Every customer in df_train_subset has a row in U (their factor vector from
# the SVD of X), so only the distinct customer ids are needed here.
for user_id in df_train_subset.customer_id.unique():
    X_user = U[user_id]

    # similarity between this user and every product row of V
    scores = V.dot(X_user)

    # Indices of the n_rec highest scores, best first. Using n_rec rather
    # than a literal keeps each row the same width as the prediction matrix
    # built from `freq`, which is also n_rec wide.
    argidx = (-scores).argsort()[:n_rec]

    user_recommendations[user_id] = argidx

Now we'll create a matrix with predictions

In [154]:
n_val = len(val_ground_truth)

# Start every validation order with the most frequent products; rows for
# customers that have personalised recommendations are overwritten later.
cf_predictions = np.tile(freq, (n_val, 1))

In [155]:
# Replace the fallback row with personalised recommendations whenever the
# customer who placed the order has an entry in user_recommendations.
for i, order_id in enumerate(val_ground_truth.index):
    user_id = df_orders.loc[order_id, 'customer_id']
    personalised = user_recommendations.get(user_id)
    if personalised is not None:
        cf_predictions[i] = personalised

In [156]:
# Score the collaborative-filtering predictions against the held-out orders.
average_precision(recommendations=cf_predictions, ground_truth=val_ground_truth)

0.005050505050505051

That didn't work either.