In [1]:
from pathlib import Path

import pandas as pd
import numpy as np

## Read & prepare data

In [2]:
data = Path('../data')

df_sales = pd.read_csv(data / 'sales.csv')
df_orders = pd.read_csv(data / 'orders.csv')
df_customers = pd.read_csv(data / 'customers.csv')

df_products = pd.read_csv(data / 'products.csv')
df_products.columns = df_products.columns.str.lower()

df_sales = df_sales.merge(df_orders[['order_id', 'customer_id']])

In [3]:
df_orders = df_orders.set_index('order_id')


## Validation

Let's set aside some data for testing 

In [4]:
df_train = df_sales[df_sales.order_id <= 900]
df_val = df_sales[df_sales.order_id > 900]

val_ground_truth = df_val.groupby('order_id').product_id.apply(set)

We need to have a good baseline. Let's use the most frequest items

We will have the following recommendation scenario: we suggest 10 products and see how many
of them the user will click on.

We'll use a simple evaluation technique: we'll calculate how many suggested items the user actually bought. 

It's called "Average presicion" ([link](https://sdsawtelle.github.io/blog/output/mean-average-precision-MAP-for-recommender-systems.html))

In [5]:
n_rec = 10
freq = df_train.product_id.value_counts().iloc[:n_rec].index.values
freq

array([ 740,  579,   78,  843, 1188,  968, 1177, 1038,  182,  757],
      dtype=int64)

In [6]:
def average_precision(recommendations, ground_truth):
    res = 0

    n_rows, n_rec = recommendations.shape
    assert n_rows == len(ground_truth)

    for i, gt in enumerate(ground_truth):
        cnt = 0
        for item_id in recommendations[i]:
            if item_id in gt:
                cnt = cnt + 1
        precision = cnt / n_rec
        res = res + precision

    res = res / n_val
    return res

In [7]:
n_val = len(val_ground_truth)
baseline = np.repeat([freq], n_val, axis=0)
baseline[:10]

array([[ 740,  579,   78,  843, 1188,  968, 1177, 1038,  182,  757],
       [ 740,  579,   78,  843, 1188,  968, 1177, 1038,  182,  757],
       [ 740,  579,   78,  843, 1188,  968, 1177, 1038,  182,  757],
       [ 740,  579,   78,  843, 1188,  968, 1177, 1038,  182,  757],
       [ 740,  579,   78,  843, 1188,  968, 1177, 1038,  182,  757],
       [ 740,  579,   78,  843, 1188,  968, 1177, 1038,  182,  757],
       [ 740,  579,   78,  843, 1188,  968, 1177, 1038,  182,  757],
       [ 740,  579,   78,  843, 1188,  968, 1177, 1038,  182,  757],
       [ 740,  579,   78,  843, 1188,  968, 1177, 1038,  182,  757],
       [ 740,  579,   78,  843, 1188,  968, 1177, 1038,  182,  757]],
      dtype=int64)

In [8]:
average_precision(baseline, val_ground_truth)

0.005050505050505051

That's our baseline. Let's see if we can improve it with content-based recommendations

## Content-based recommendations

To make content-based recommendations, we need to have content to recommend 
from. So we'll take the features from the product data frame

In [9]:
product_columns = ['product_id', 'product_type', 'product_name', 'size', 'colour', 'description']

df_train = df_train.merge(df_products[product_columns]) \
    .sort_values(by=['order_id', 'sales_id'])

Let's see how many users are in both train & validation set

In [10]:
target_customers = set(df_val.customer_id) & set(df_train.customer_id)
len(target_customers)

52

We will need to make predictions only for them. For the rest
we'll make the default recommendation (most frequent items)

In [11]:
df_train_subset = df_train[df_train.customer_id.isin(target_customers)]

Now let's create vectorizers for turning products (and users) to vectors

In [12]:
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer

from sklearn.base import TransformerMixin, BaseEstimator

from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [13]:
from sklearn.preprocessing import normalize

In [14]:
class ToDictTranformer(TransformerMixin, BaseEstimator):
    def __init__(self):
        self.columns = None
    
    def fit(self, df, y=None):
        self.columns = df.columns
        return self

    def transform(self, df):
        return df.to_dict(orient='records')

    def get_feature_names_out(self, *args, **kwargs):
        return self.columns

In [15]:
ohe_columns = ['product_type', 'product_name', 'size', 'colour']

categorical_pipeline = make_pipeline(ToDictTranformer(), DictVectorizer())

vectorizer = ColumnTransformer([
    ('categorical', categorical_pipeline, ohe_columns),
    ('description', TfidfVectorizer(min_df=50), 'description'),
])

In [16]:
vectorizer.fit(df_train)

First, we'll represent each product as a unit vector

In [18]:
X_products = vectorizer.transform(df_products)
X_products = normalize(X_products)

Now fo reach user we will:

- Represent each users and average vector of the products they ordered
- Next, compute the similarity between this user vector and each product vector
- Finally, get top 10 most similar products

We will use that as the recommendation for each user

In [19]:
user_recommendations = {}

for user_id, df in df_train_subset.groupby('customer_id'):
    # represent each users as a average vector of all their
    # purchases

    X_user = vectorizer.transform(df)
    
    X_user = X_user.sum(axis=0)
    X_user = np.asarray(X_user)
    X_user = normalize(X_user).reshape(-1)
    
    # compute the similarities between the products and the user
    
    scores = X_products.dot(X_user)

    argidx = scores.argpartition(np.arange(10))[:10]

    user_recommendations[user_id] = argidx

Now we'll create a matrix with predictions

In [20]:
n_val = len(val_ground_truth)
c1_predictions = np.repeat([freq], n_val, axis=0)

In [21]:
for i, order_id in enumerate(val_ground_truth.index):
    user_id = df_orders.loc[order_id].customer_id
    if user_id in user_recommendations:
        c1_predictions[i] = user_recommendations[user_id]

In [22]:
average_precision(c1_predictions, val_ground_truth)

0.00404040404040404

Oops - it didn't work, the result is worse than the baseline