In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
import datetime
import pickle
from sklearn.preprocessing import MinMaxScaler

## Load data

In [None]:
# Pooled article feature table. Column '0' is read as text because it is later
# matched against the string-typed article ids.
pooledfeatures = pd.read_csv(
    'pooled_features.csv',
    dtype={'0': str},
)
pooledfeatures.head()

In [None]:
# Precomputed user feature table, with column '0' read as text.
# NOTE: `userfeatures` is rebuilt from the transactions further down, so this
# load only serves the preview below.
userfeatures = pd.read_csv(
    'user_features.csv',
    dtype={'0': str},
)
userfeatures.head()

In [None]:
# Article -> image mapping; only articles whose image file exists are kept.
article_map_dtypes = {
    'article_id': str,
    'product_type_name': str,
    'directory': str,
    'filename': str,
    'image_exists': bool,
}
article_map = pd.read_csv('article_image_mapping.csv', dtype=article_map_dtypes)

article_map = article_map.loc[article_map['image_exists']]

In [None]:
# Raw transaction log. Dates are read as text here and parsed in the next cell.
transaction_dtypes = {
    't_dat': str,
    'customer_id': str,
    'article_id': str,
    'price': float,
    'sales_channel_id': int,
}
data = pd.read_csv('data/transactions_train.csv', dtype=transaction_dtypes)

data.head()

In [None]:
# Parse the transaction date strings into datetime64 values.
data = data.assign(t_dat=pd.to_datetime(data['t_dat']))

In [None]:
# The inner join drops transactions whose article has no image entry; only the
# date, customer and article columns are carried forward.
data = data.merge(article_map, how='inner', on='article_id')[
    ['t_dat', 'customer_id', 'article_id']
]

In [None]:
# Order transactions chronologically so `tail(1)` per customer is the latest one.
data = data.sort_values('t_dat')

In [None]:
# Customers with at least one transaction dated strictly after 2020-09-20.
after_cutoff = data['t_dat'].dt.normalize() > pd.Timestamp(2020, 9, 20)
cust_list = set(data.loc[after_cutoff, 'customer_id'].unique())

In [None]:
# Min-max rescale every column from the third one onward; the first two columns
# are left untouched.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(pooledfeatures.iloc[:, 2:])
pooledfeatures.iloc[:, 2:] = scaled_values

In [None]:
# Represent each customer in `cust_list` by the pooled features of the last
# article in their transaction rows (rows were sorted by date above).
last_purchase = (
    data[data.customer_id.isin(cust_list)]
    .groupby('customer_id')
    .tail(1)
    .drop('t_dat', axis=1)
)
userfeatures = pd.merge(
    last_purchase,
    pooledfeatures,
    left_on='article_id',
    right_on='0',
).drop(['article_id', '0', '1'], axis=1)

# Rename columns positionally ('0', '1', ...); column '0' is now the customer id.
userfeatures.columns = pd.Series(range(userfeatures.columns.size)).astype(str)
userfeatures.head()

In [None]:
# Keep only transactions dated strictly before 2020-09-20. This retains all
# earlier history, and rows dated exactly 2020-09-20 are dropped.
before_cutoff = data['t_dat'].dt.normalize() < pd.Timestamp(2020, 9, 20)
data = data.loc[before_cutoff]
data.shape

In [None]:
# Keep only feature rows for customers in `cust_list` (those with a transaction
# after 2020-09-20). Column '0' holds the customer id after the positional
# renaming above. The result is assigned back to `userfeatures`; the previous
# `usersfeatures` spelling created an unused variable and left the frame
# unfiltered.
userfeatures = userfeatures[userfeatures['0'].isin(cust_list)]
len(cust_list)

In [None]:
# Column '1' is not used as a feature; remove it before computing distances.
pooledfeatures = pooledfeatures.drop(columns='1')

In [None]:
# Index both feature tables by their id column ('0') so that only numeric
# feature columns remain as data.
pooledfeatures = pooledfeatures.set_index('0')
userfeatures = userfeatures.set_index('0')

## Customer purchase hashmap

In [None]:
# Group each customer's transactions together.
data = data.sort_values('customer_id')

In [None]:
# Collapse the transactions to one row per customer, with all purchased article
# ids joined by spaces in a 'purchases' column. Aggregating once per group
# avoids copying the full joined string onto every transaction row before
# de-duplicating, which is what `transform` + `drop_duplicates` did.
data = (
    data.groupby('customer_id', as_index=False)['article_id']
    .agg(lambda article_ids: ' '.join(article_ids))
    .rename(columns={'article_id': 'purchases'})
)
data.shape

In [None]:
# Turn each space-separated purchase string into a set of article ids.
data['purchases'] = [set(joined.split()) for joined in data['purchases']]
data.head()

In [None]:
# customer id -> set of purchased article ids
cust_hashmap = {
    customer: purchased
    for customer, purchased in zip(data['customer_id'], data['purchases'])
}

## Calculate pairwise distances

In [None]:
# Euclidean distance between every user feature vector (rows) and every article
# feature vector (columns), labelled by the two frames' indexes.
pairwise = euclidean_distances(userfeatures, pooledfeatures)
dist_matrix = pd.DataFrame(
    pairwise,
    index=userfeatures.index,
    columns=pooledfeatures.index,
)
dist_matrix.shape

In [None]:
# Fixed seed so the preview shows the same rows on every Restart & Run All.
dist_matrix.sample(10, random_state=0)

## Example Results

In [None]:
# Recommend the k nearest articles for one customer, restricted to article ids
# that start with '080'.
k = 5
customer_id = 'c02de3411b7abf30b748fc07531db4ef6973c0ec533d29bcac2f53aedb51b277'
cols = dist_matrix.columns[dist_matrix.columns.str.startswith('080')]
row = dist_matrix.loc[customer_id, cols]

# Ascending argsort puts the smallest distances first; keep the first k.
indices = np.argsort(row)[:k]
predictions = set(cols[indices])
predictions

In [None]:
# Last row per customer in `data`, without the date column.
# NOTE(review): on a top-to-bottom run `data` has already been reduced to the
# ['customer_id', 'purchases'] columns by the purchase-hashmap cells above, so
# dropping 't_dat' raises KeyError here and there is no 'article_id' column for
# the cells below to read. This line only works against an earlier state of
# `data` (with 't_dat' and 'article_id' still present) - confirm which frame
# was intended and build `temp` before `data` is collapsed.
temp = data.groupby('customer_id').tail(1).drop('t_dat',axis=1)

In [None]:
# Article id stored in `temp` for the example customer (first matching row).
temp.loc[
    temp.customer_id == 'c02de3411b7abf30b748fc07531db4ef6973c0ec533d29bcac2f53aedb51b277',
    'article_id',
].iloc[0]

## Evaluate

In [None]:
def evaluate(k, purchases_by_customer=None, last_purchases=None, distances=None):
    """Print mean precision and recall of the top-``k`` nearest-article predictions.

    For each customer, candidate articles are limited to those whose id shares
    its first three characters with the customer's reference article (the first
    row for that customer in ``last_purchases``). The ``k`` candidates with the
    smallest distance are the predictions; they are compared with the
    customer's purchase set to get per-customer precision and recall, which are
    then averaged over all evaluated customers.

    Parameters
    ----------
    k : int
        Number of nearest articles to predict per customer.
    purchases_by_customer : dict, optional
        Maps customer id to a set of purchased article ids. Defaults to the
        notebook-level ``cust_hashmap``.
    last_purchases : pandas.DataFrame, optional
        Frame with ``customer_id`` and ``article_id`` columns. Defaults to the
        notebook-level ``temp``.
    distances : pandas.DataFrame, optional
        Distance matrix with customer ids as index and article ids (strings) as
        columns. Defaults to the notebook-level ``dist_matrix``.
        Assumes customer ids in the index are unique - TODO confirm.

    Returns
    -------
    None
        Results are printed.

    Notes
    -----
    Customers missing from ``last_purchases`` or from ``distances`` are skipped
    and their count is printed. Any other error (for example a missing
    ``article_id`` column) now propagates instead of being silently swallowed.
    """
    if purchases_by_customer is None:
        purchases_by_customer = cust_hashmap
    if last_purchases is None:
        last_purchases = temp
    if distances is None:
        distances = dist_matrix

    # Build one lookup table instead of filtering the whole frame per customer.
    # drop_duplicates keeps the first row per customer, matching `.iloc[0]`.
    reference = last_purchases.drop_duplicates('customer_id')
    reference_article = dict(zip(reference['customer_id'], reference['article_id']))

    article_ids = distances.columns
    known_customers = distances.index
    candidates_by_prefix = {}  # id prefix -> candidate columns, computed once

    precision = []
    recall = []
    skipped = 0

    for customer_id, purchases in purchases_by_customer.items():
        if customer_id not in reference_article or customer_id not in known_customers:
            skipped += 1
            continue

        cat = reference_article[customer_id][:3]
        cols = candidates_by_prefix.get(cat)
        if cols is None:
            cols = article_ids[article_ids.str.startswith(cat)]
            candidates_by_prefix[cat] = cols

        # Ascending argsort: smallest distance (nearest article) first.
        row = distances.loc[customer_id, cols]
        nearest = np.argsort(row.to_numpy())[:k]
        predictions = set(cols[nearest])

        tp = len(purchases.intersection(predictions))
        fp = len(predictions) - tp
        fn = len(purchases) - tp

        precision.append(tp / (tp + fp) if (tp + fp) > 0 else 0)
        recall.append(tp / (tp + fn) if (tp + fn) > 0 else 0)

    if skipped:
        print("Skipped customers without features = ", skipped)

    # The mean of an empty list warns and yields nan; report nan explicitly.
    print("Precision = ", np.mean(precision) if precision else float('nan'))
    print("Recall = ", np.mean(recall) if recall else float('nan'))

## Top K=2

In [None]:
# Precision / recall when predicting the 2 nearest articles.
evaluate(k=2)

## Top K=4

In [None]:
# Precision / recall when predicting the 4 nearest articles.
evaluate(k=4)

## Top K=6

In [None]:
# Precision / recall when predicting the 6 nearest articles.
evaluate(k=6)

## Top K=8

In [None]:
# Precision / recall when predicting the 8 nearest articles.
evaluate(k=8)

## Top K=10

In [None]:
# Precision / recall when predicting the 10 nearest articles.
evaluate(k=10)

## Top K=12

In [None]:
# Precision / recall when predicting the 12 nearest articles.
evaluate(k=12)

## Top K=14

In [None]:
# Precision / recall when predicting the 14 nearest articles.
evaluate(k=14)

## Top K=16

In [None]:
# Precision / recall when predicting the 16 nearest articles.
evaluate(k=16)

## Top K=18

In [None]:
# Precision / recall when predicting the 18 nearest articles.
evaluate(k=18)

## Top K=20

In [None]:
# Precision / recall when predicting the 20 nearest articles.
evaluate(k=20)

## Top K=30

In [None]:
# Precision / recall when predicting the 30 nearest articles.
evaluate(k=30)

## Top K=40

In [None]:
# Precision / recall when predicting the 40 nearest articles.
evaluate(k=40)

## Top K=50

In [None]:
# Precision / recall when predicting the 50 nearest articles.
evaluate(k=50)