In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
def user_collaborative_filter(X, target_user, similar_user_number = 1, 
                              max_recommend_items = None, similar_user_thresh = None):
    """
    X: a dataframe containing two columns, first column is users, second is purchased item
    target_user: a string specified the user we plan to give recommendations
    similar_user_number: select the number of top similar users
    similar_user_thresh (WORK IN PROGRESS): a floating number between 0 and 1 to set
                                            a minimum threshold for consine similarity
    """
    
    #add a value=1 column to data
    X['value'] = np.ones(len(X))
    
    #drop duplicate rows, if one purchased more than one items
    X.drop_duplicates(inplace=True)
    
    #convert dataframe to user-by-item matrix and fill nan with 0
    user_item_df = X.pivot(index=X.columns[0], columns=X.columns[1] )['value'].fillna(0)
    
    #convert user_item_df to numpy matrix
    matrix = np.array(user_item_df)
    
    #print("start cosine")
    #calculate cosine similarity
    similarity = cosine_similarity(matrix)
    #print("cosine end")
    
    #get target user index
    target_user_index = int(np.where(user_item_df.index == target_user)[0])
    
    #get the row of target user similarity
    target_user_similarity = similarity[target_user_index,:]
    
    #sort the index of user similarity in descending order
    #use [::-1] to reverse the order
    #[1:] filter out the target user [0] itself
    similar_user_index = np.argsort(target_user_similarity)[::-1][1:]
    
    recommended_items = []
    item_list = list(user_item_df.columns)
    #get unpurchased item index by substrating target user from most similar user
    for i in range(similar_user_number):
        not_purchased = matrix[similar_user_index[i]] - matrix[target_user_index]
        
        #check out item names
        for j in np.where(not_purchased == 1)[0]:
            if item_list[j] not in recommended_items:
                recommended_items.append(item_list[j])
                if max_recommend_items is not None:
                    if len(recommended_items) == max_recommend_items:
                        return recommended_items
    
    return recommended_items
    

In [3]:
#TEST DRIVE
X = pd.DataFrame({"customer":["A","A","A","B","B","B","C","C","C","D","D"],
                  "article":["a","a","b","a","b","c","c","d","e","d","e"]})
user_collaborative_filter(X, target_user = "B",similar_user_number = 2)

['d', 'e']

## A simple example

In [11]:
#create user-article matrix of training data
df_t = pd.DataFrame({"customer":["A","A","A","B","B","B","C","C","C","D","D"],
                     "article":["a","a","b","a","b","c","c","d","e","d","e"],
                     "value":np.ones(11)})
df_t

Unnamed: 0,customer,article,value
0,A,a,1.0
1,A,a,1.0
2,A,b,1.0
3,B,a,1.0
4,B,b,1.0
5,B,c,1.0
6,C,c,1.0
7,C,d,1.0
8,C,e,1.0
9,D,d,1.0


In [13]:
#remove duplicate purchase history
df_t.drop_duplicates(inplace=True)

In [14]:
#fill missing values with 0
df_p = df_t.pivot(index='customer', columns="article")['value'].fillna(0)
df_p

article,a,b,c,d,e
customer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A,1.0,1.0,0.0,0.0,0.0
B,1.0,1.0,1.0,0.0,0.0
C,0.0,0.0,1.0,1.0,1.0
D,0.0,0.0,0.0,1.0,1.0


In [51]:
user = 'B'
int(np.where(df_p.index == user)[0])

1

In [6]:
#conver dataframe to numpy matrix
df_m = np.array(df_p)
df_m

array([[1., 1., 0., 0., 0.],
       [1., 1., 1., 0., 0.],
       [0., 0., 1., 1., 1.],
       [0., 0., 0., 1., 1.]])

In [7]:
#calculating pairwise cosine similarity
similarity = cosine_similarity(df_m)
similarity

array([[1.        , 0.81649658, 0.        , 0.        ],
       [0.81649658, 1.        , 0.33333333, 0.        ],
       [0.        , 0.33333333, 1.        , 0.81649658],
       [0.        , 0.        , 0.81649658, 1.        ]])

In [8]:
#Identify similar customers to customer A (index=0)
similarity[0,:]

array([1.        , 0.81649658, 0.        , 0.        ])

In [9]:
np.where(similarity[0,:] > 0.8)[0]

array([0, 1])

In [10]:
np.argsort(similarity[0,:])[::-1][1:]

array([1, 3, 2])

In [11]:
#Get index of articles which were not yet purchased by the customer A, but purchased by cutomer B
not_purchased = df_m[1] - df_m[0]
np.where(not_purchased == 1)

(array([2]),)

In [12]:
#check non-purchased items in df_p columns
recommended_items = [ list(df_p.columns)[i] for i in np.where(not_purchased == 1)[0]]
recommended_items

['c']

In [13]:
#If we recommend to customer B [1]
similarity[1,:]

array([0.81649658, 1.        , 0.33333333, 0.        ])

In [14]:
np.argsort(similarity[1,:])[::-1][1:]

array([0, 2, 3])

In [15]:
not_purchased = df_m[0] - df_m[1]
np.where(not_purchased == 1)

(array([], dtype=int64),)

In [16]:
recommended_items = [ list(df_p.columns)[i] for i in np.where(not_purchased == 1)[0]]
recommended_items

[]

## Dive in H&M dataset

In [4]:
%%time
transaction_raw = pd.read_csv("../data/transactions_train.csv") 

CPU times: user 29.3 s, sys: 9.82 s, total: 39.1 s
Wall time: 43.7 s


In [5]:
#Get training data from 2019-5-1 to 2019-7-25
mask_train = (transaction_raw.t_dat >= '2019-05-01') & (transaction_raw.t_dat <= '2019-07-25')
train_raw = transaction_raw.loc[mask_train].reset_index(drop=True)
train_raw

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2019-05-01,00019d6c20e0fbb551af18c57149af4707ec016bb0decd...,524061003,0.050831,2
1,2019-05-01,00019d6c20e0fbb551af18c57149af4707ec016bb0decd...,735404001,0.050831,2
2,2019-05-01,00019d6c20e0fbb551af18c57149af4707ec016bb0decd...,700370004,0.016932,2
3,2019-05-01,00019d6c20e0fbb551af18c57149af4707ec016bb0decd...,618800001,0.033881,2
4,2019-05-01,00019d6c20e0fbb551af18c57149af4707ec016bb0decd...,731407001,0.016932,2
...,...,...,...,...,...
4953538,2019-07-25,fff3e75605ec575be9b95eda1e6557299e81bba12668d7...,751530001,0.011847,1
4953539,2019-07-25,fff3e75605ec575be9b95eda1e6557299e81bba12668d7...,800988001,0.028458,1
4953540,2019-07-25,fff4d3a8b1f3b60af93e78c30a7cb4cf75edaf2590d3e5...,749699001,0.025407,1
4953541,2019-07-25,fff4d3a8b1f3b60af93e78c30a7cb4cf75edaf2590d3e5...,507883009,0.013542,1


In [6]:
#Get test data from 2019-7-26 to 2019-8-1
mask_test = (transaction_raw.t_dat >= '2019-07-26') & (transaction_raw.t_dat <= '2019-08-01')
test_raw = transaction_raw.loc[mask_test].reset_index(drop=True)
test_raw

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2019-07-26,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,740943002,0.013542,2
1,2019-07-26,0002db27a1651998a3de4463437b580b45dfa7d8107afa...,773980002,0.010153,1
2,2019-07-26,0006d3ff0caf0cb4d4e0615ee5cb7d268622364d483335...,752020001,0.022017,1
3,2019-07-26,000c6b4be3802318b68fe21ac36b119ef6568f42a82f18...,650672001,0.007610,2
4,2019-07-26,000c6b4be3802318b68fe21ac36b119ef6568f42a82f18...,740720002,0.007610,2
...,...,...,...,...,...
372838,2019-08-01,fff613e0adf409bc8acf40c7eabb40f377c7d4f7b12f61...,719209001,0.016932,2
372839,2019-08-01,fff613e0adf409bc8acf40c7eabb40f377c7d4f7b12f61...,720426001,0.025407,2
372840,2019-08-01,fff613e0adf409bc8acf40c7eabb40f377c7d4f7b12f61...,713199001,0.033881,2
372841,2019-08-01,fff613e0adf409bc8acf40c7eabb40f377c7d4f7b12f61...,739590002,0.016932,2


In [7]:
#Get X_train for user_collaborative_filter
X_train = train_raw[["customer_id","article_id"]].head(120000)

In [None]:
%%time
customer = X_train.customer_id[0]
user_collaborative_filter(X_train, target_user = customer, similar_user_number = 1)

In [45]:
user_collaborative_filter(X_train, target_user = customer, similar_user_number = 2)

[769434001, 742083002]

In [47]:
%%time

user_collaborative_filter(X_train, target_user = customer, similar_user_number = 3, max_recommend_items=12)

CPU times: user 55.8 s, sys: 3.32 s, total: 59.2 s
Wall time: 36.1 s


[769434001, 742083002, 661333002]

In [51]:
%%time

user_collaborative_filter(X_train, target_user = customer, similar_user_number = 100, max_recommend_items=12)

start cosine
cosine end
CPU times: user 57.2 s, sys: 3.6 s, total: 1min
Wall time: 38.1 s


[769434001,
 742083002,
 661333002,
 669882007,
 735550003,
 746329003,
 735404002,
 775629001,
 600886001,
 628917001,
 643642001,
 733419005]

In [41]:
user_collaborative_filter(X_train, target_user = customer, similar_user_number = 7)

[735550003,
 746329003,
 696209005,
 727347005,
 749699007,
 699075009,
 723370002,
 733267001,
 735428002,
 629420001,
 699598008,
 674606006,
 702623002,
 788107002]