In [61]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

In [2]:
def user_collaborative_filter(X, target_user, similar_user_number = 1, 
                              max_recommend_items = None, similar_user_thresh = None):
    """
    user-based collaborative filter for a specific user in X
    
    X: a dataframe containing two columns, first column is users, second is purchased item
    target_user: a string specified the user we plan to give recommendations
    similar_user_number: select the number of top similar users
    max_recommend_items: an integer to specify the max. number of recommended items
    similar_user_thresh (WORK IN PROGRESS): a floating number between 0 and 1 to set
                                            a minimum threshold for consine similarity
    """
    
    #add a value=1 column to data
    X = pd.DataFrame({"user":X.iloc[:,0],
                    "item":X.iloc[:,1],
                    "value":[1]*len(X)})
    
    #drop duplicate rows, if one purchased more than one items
    X.drop_duplicates(inplace=True)
    
    #convert dataframe to user-by-item matrix and fill nan with 0
    user_item_df = X.pivot(index=X.columns[0], columns=X.columns[1] )['value'].fillna(0)
    
    #convert user_item_df to numpy matrix
    matrix = np.array(user_item_df)
    
    #print("start cosine")
    #calculate cosine similarity
    similarity = cosine_similarity(matrix)
    #print("cosine end")
    
    #get target user index
    target_user_index = int(np.where(user_item_df.index == target_user)[0])
    
    #get the row of target user similarity
    target_user_similarity = similarity[target_user_index,:]
    
    #sort the index of user similarity in descending order
    #use [::-1] to reverse the order
    #[1:] filter out the target user [0] itself
    similar_user_index = np.argsort(target_user_similarity)[::-1][1:]
    
    recommended_items = []
    item_list = list(user_item_df.columns)
    #get unpurchased item index by substrating target user from most similar user
    for i in range(similar_user_number):
        not_purchased = matrix[similar_user_index[i]] - matrix[target_user_index]
        
        #check out item names
        for j in np.where(not_purchased == 1)[0]:
            if item_list[j] not in recommended_items:
                recommended_items.append(item_list[j])
                if max_recommend_items is not None:
                    if len(recommended_items) == max_recommend_items:
                        return recommended_items
    
    return recommended_items
    

In [3]:
def all_user_collaborative_filter(X, similar_user_number = 1, max_recommend_items = None):
    
    """
    user-based collaborative filtering for every user in X
    
    X: a dataframe containing two columns, first column is users, second is purchased item
    similar_user_number: select the number of top similar users
    max_recommend_items: an integer to specify the max. number of recommended items
    
    """
    
    #add a value=1 column to data
    X = pd.DataFrame({"user":X.iloc[:,0],
                    "item":X.iloc[:,1],
                    "value":[1]*len(X)})
    
    #drop duplicate rows, if one purchased more than one items
    X.drop_duplicates(inplace=True)
    
    #convert dataframe to user-by-item matrix and fill nan with 0
    user_item_df = X.pivot(index=X.columns[0], columns=X.columns[1] )['value'].fillna(0)
    
    #convert user_item_df to numpy matrix
    matrix = np.array(user_item_df)
    
    #print("start cosine")
    #calculate cosine similarity
    similarity = cosine_similarity(matrix)
    #print("cosine end")
    
    #sort user index by similarity (left-high/right-low) for each row(user)
    sorted_sim_index = np.fliplr(np.argsort(similarity))
    
    #number of total users
    num_total_users = matrix.shape[0]
    #item id/name list
    item_list = list(user_item_df.columns)
    
    #empty dict to store final results
    recommended_dict = {}
    
    #for each target user
    for i in range(num_total_users):
        user_id = user_item_df.index[i]
        
        recommended_items = []
        
        #Look at similar users 
        for j in range(similar_user_number):
            not_purchased = matrix[sorted_sim_index[i,j+1]] - matrix[i]
        
            #check out item names
            for k in np.where(not_purchased == 1)[0]:
                if item_list[k] not in recommended_items:
                    recommended_items.append(item_list[k])
                    if max_recommend_items == None:
                        continue
                    elif len(recommended_items) == max_recommend_items:
                        break
            if max_recommend_items == None:
                continue
            elif len(recommended_items) == max_recommend_items:
                break
        
        #add target user_id and recommended items to dict
        recommended_dict[user_id] = recommended_items
        
    return recommended_dict
                    
    

In [67]:
def average_precision(x):
    '''
    Calculate average precision for each user
    x: a binary vector
    '''
    x = np.array(x)
    
    #number of items
    k = len(x)
    
    precision = 0
    numerator = 0
    
    for i in range(k):
        
        numerator += x[i]
        precision += numerator / (i+1)
    
    return precision/k

def MAP(X, y):
    '''
    Calculate Mean Average Precision for the recommendation
    X: A dictionary containing user id as key and recommended items as values
    y: two column data frame of validation data (1st column is user id; 2nd is item id)
    
    '''
    number_users = 0
    
    unique_user_in_test = y.iloc[:,0].unique()
    
    #store average precision for each target user
    AP_res = []
    
    #for each target user
    for user, recommend_items in tqdm(X.items()):
        
        #when found in test data
        if user in unique_user_in_test:
            number_users += 1
            
            #get actual purchased items from y
            subset = y.loc[y.iloc[:,0] == user]
            purchased_items = set(subset.iloc[:,1])
            
            #empty array to store binary data
            purchased_vector = []
            
            #for each recommended item
            for item in recommend_items:
                
                #when it is actually purchased
                if item in purchased_items:
                    purchased_vector.append(1)
                #if not purchased
                else:
                    purchased_vector.append(0)
            
            #calculate Average Precision
            AP = average_precision(purchased_vector)
            #append to AP_res
            AP_res.append(AP)
    
    
    print("Number of users:", number_users)
    return sum(AP_res)/len(AP_res)
            
    

In [4]:
#Get training data 2019-05-01 - 2019-05-23
X_train = pd.read_csv("../data/transactions_train_20190501-0523.csv")
X_train

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2019-05-01,00019d6c20e0fbb551af18c57149af4707ec016bb0decd...,524061003,0.050831,2
1,2019-05-01,00019d6c20e0fbb551af18c57149af4707ec016bb0decd...,735404001,0.050831,2
2,2019-05-01,00019d6c20e0fbb551af18c57149af4707ec016bb0decd...,700370004,0.016932,2
3,2019-05-01,00019d6c20e0fbb551af18c57149af4707ec016bb0decd...,618800001,0.033881,2
4,2019-05-01,00019d6c20e0fbb551af18c57149af4707ec016bb0decd...,731407001,0.016932,2
...,...,...,...,...,...
1106456,2019-05-23,fff871bf24b40fd1290215414d760afaa69bb164d2b970...,573716012,0.022864,2
1106457,2019-05-23,fffb0ca7aca4f24df1b00a578d5b692ed7f83ffec7c37d...,649690002,0.022017,2
1106458,2019-05-23,fffb0ca7aca4f24df1b00a578d5b692ed7f83ffec7c37d...,706016002,0.033881,2
1106459,2019-05-23,fffb0ca7aca4f24df1b00a578d5b692ed7f83ffec7c37d...,399256023,0.016932,2


In [5]:
X_train = X_train[["customer_id", "article_id"]]
X_train

Unnamed: 0,customer_id,article_id
0,00019d6c20e0fbb551af18c57149af4707ec016bb0decd...,524061003
1,00019d6c20e0fbb551af18c57149af4707ec016bb0decd...,735404001
2,00019d6c20e0fbb551af18c57149af4707ec016bb0decd...,700370004
3,00019d6c20e0fbb551af18c57149af4707ec016bb0decd...,618800001
4,00019d6c20e0fbb551af18c57149af4707ec016bb0decd...,731407001
...,...,...
1106456,fff871bf24b40fd1290215414d760afaa69bb164d2b970...,573716012
1106457,fffb0ca7aca4f24df1b00a578d5b692ed7f83ffec7c37d...,649690002
1106458,fffb0ca7aca4f24df1b00a578d5b692ed7f83ffec7c37d...,706016002
1106459,fffb0ca7aca4f24df1b00a578d5b692ed7f83ffec7c37d...,399256023


In [71]:
%%time
#Run all user collaborative filtering (for first 100000 rows)
recommend_dict = all_user_collaborative_filter(X_train.head(100000),
                                               similar_user_number = 100, max_recommend_items = 12)
print("Number of customers:", len(recommend_dict))

Number of customers: 24031
Wall time: 2min 42s


In [68]:
#Get training data 2019-05-24 - 2019-09-01
X_test = pd.read_csv("../data/transactions_train_20190524-0601.csv")
X_test = X_test[["customer_id","article_id"]]
X_test

Unnamed: 0,customer_id,article_id
0,00015c1a121e08bbd2552c15fbbb6e6b19d3bf8f7b6a3d...,735121003
1,00015c1a121e08bbd2552c15fbbb6e6b19d3bf8f7b6a3d...,589722005
2,0006d3ff0caf0cb4d4e0615ee5cb7d268622364d483335...,659302005
3,0006d3ff0caf0cb4d4e0615ee5cb7d268622364d483335...,631458004
4,0006d3ff0caf0cb4d4e0615ee5cb7d268622364d483335...,668647001
...,...,...
516472,fffdaab7d9d79ceee4ffd2ab98bb68411438ea90e46a67...,738881002
516473,fffdaab7d9d79ceee4ffd2ab98bb68411438ea90e46a67...,220094010
516474,ffff4c4e8b57b633c1ddf8fbd53db16b962cf831baf9ed...,487800001
516475,ffff4c4e8b57b633c1ddf8fbd53db16b962cf831baf9ed...,754013001


In [72]:
#MAP@12
MAP(recommend_dict, X_test)

100%|███████████████████████████████████████████████████████████████████████████| 24031/24031 [02:25<00:00, 164.63it/s]

Number of users: 5732





0.0018896534379786295

In [10]:
#TEST DRIVE
X = pd.DataFrame({"customer":["A","A","A","B","B","B","C","C","C","D","D"],
                  "article":["a","a","b","a","b","c","c","d","e","d","e"]})
user_collaborative_filter(X, target_user = "A",similar_user_number = 2)

['c', 'd', 'e']

In [52]:
#Test drive
all_user_collaborative_filter(X, similar_user_number = 2)

{'A': ['c', 'd', 'e'], 'B': ['d', 'e'], 'C': ['a', 'b'], 'D': ['c', 'a', 'b']}

## A simple example

In [11]:
#create user-article matrix of training data
df_t = pd.DataFrame({"customer":["A","A","A","B","B","B","C","C","C","D","D"],
                     "article":["a","a","b","a","b","c","c","d","e","d","e"],
                     "value":np.ones(11)})
df_t

Unnamed: 0,customer,article,value
0,A,a,1.0
1,A,a,1.0
2,A,b,1.0
3,B,a,1.0
4,B,b,1.0
5,B,c,1.0
6,C,c,1.0
7,C,d,1.0
8,C,e,1.0
9,D,d,1.0


In [12]:
#remove duplicate purchase history
df_t.drop_duplicates(inplace=True)

In [13]:
#fill missing values with 0
df_p = df_t.pivot(index='customer', columns="article")['value'].fillna(0)
df_p

article,a,b,c,d,e
customer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A,1.0,1.0,0.0,0.0,0.0
B,1.0,1.0,1.0,0.0,0.0
C,0.0,0.0,1.0,1.0,1.0
D,0.0,0.0,0.0,1.0,1.0


In [34]:
df_p.index[0]

'A'

In [15]:
#conver dataframe to numpy matrix
df_m = np.array(df_p)
df_m

array([[1., 1., 0., 0., 0.],
       [1., 1., 1., 0., 0.],
       [0., 0., 1., 1., 1.],
       [0., 0., 0., 1., 1.]])

In [17]:
#calculating pairwise cosine similarity
similarity = cosine_similarity(df_m)
similarity

array([[1.        , 0.81649658, 0.        , 0.        ],
       [0.81649658, 1.        , 0.33333333, 0.        ],
       [0.        , 0.33333333, 1.        , 0.81649658],
       [0.        , 0.        , 0.81649658, 1.        ]])

In [21]:
np.sort(similarity)

array([[0.        , 0.        , 0.81649658, 1.        ],
       [0.        , 0.33333333, 0.81649658, 1.        ],
       [0.        , 0.33333333, 0.81649658, 1.        ],
       [0.        , 0.        , 0.81649658, 1.        ]])

In [22]:
np.argsort(similarity)

array([[2, 3, 1, 0],
       [3, 2, 0, 1],
       [0, 1, 3, 2],
       [0, 1, 2, 3]])

In [33]:
np.fliplr(np.argsort(similarity))

array([[0, 1, 3, 2],
       [1, 0, 2, 3],
       [2, 3, 1, 0],
       [3, 2, 1, 0]])

In [8]:
#Identify similar customers to customer A (index=0)
similarity[0,:]

array([1.        , 0.81649658, 0.        , 0.        ])

In [9]:
np.where(similarity[0,:] > 0.8)[0]

array([0, 1])

In [10]:
np.argsort(similarity[0,:])[::-1][1:]

array([1, 3, 2])

In [11]:
#Get index of articles which were not yet purchased by the customer A, but purchased by cutomer B
not_purchased = df_m[1] - df_m[0]
np.where(not_purchased == 1)

(array([2]),)

In [12]:
#check non-purchased items in df_p columns
recommended_items = [ list(df_p.columns)[i] for i in np.where(not_purchased == 1)[0]]
recommended_items

['c']

In [13]:
#If we recommend to customer B [1]
similarity[1,:]

array([0.81649658, 1.        , 0.33333333, 0.        ])

In [14]:
np.argsort(similarity[1,:])[::-1][1:]

array([0, 2, 3])

In [15]:
not_purchased = df_m[0] - df_m[1]
np.where(not_purchased == 1)

(array([], dtype=int64),)

In [16]:
recommended_items = [ list(df_p.columns)[i] for i in np.where(not_purchased == 1)[0]]
recommended_items

[]

## Dive in H&M dataset

In [2]:
%%time
transaction_raw = pd.read_csv("../data/transactions_train.csv") 

Wall time: 22.1 s


In [4]:
#Get training data from 2019-5-1 to 2019-5-23
mask_train = (transaction_raw.t_dat >= '2019-05-01') & (transaction_raw.t_dat <= '2019-05-23')
train_raw = transaction_raw.loc[mask_train].reset_index(drop=True)
train_raw

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2019-05-01,00019d6c20e0fbb551af18c57149af4707ec016bb0decd...,524061003,0.050831,2
1,2019-05-01,00019d6c20e0fbb551af18c57149af4707ec016bb0decd...,735404001,0.050831,2
2,2019-05-01,00019d6c20e0fbb551af18c57149af4707ec016bb0decd...,700370004,0.016932,2
3,2019-05-01,00019d6c20e0fbb551af18c57149af4707ec016bb0decd...,618800001,0.033881,2
4,2019-05-01,00019d6c20e0fbb551af18c57149af4707ec016bb0decd...,731407001,0.016932,2
...,...,...,...,...,...
1106456,2019-05-23,fff871bf24b40fd1290215414d760afaa69bb164d2b970...,573716012,0.022864,2
1106457,2019-05-23,fffb0ca7aca4f24df1b00a578d5b692ed7f83ffec7c37d...,649690002,0.022017,2
1106458,2019-05-23,fffb0ca7aca4f24df1b00a578d5b692ed7f83ffec7c37d...,706016002,0.033881,2
1106459,2019-05-23,fffb0ca7aca4f24df1b00a578d5b692ed7f83ffec7c37d...,399256023,0.016932,2


In [5]:
#Get test data from 2019-5-24 to 2019-6-1
mask_test = (transaction_raw.t_dat >= '2019-05-24') & (transaction_raw.t_dat <= '2019-06-01')
test_raw = transaction_raw.loc[mask_test].reset_index(drop=True)
test_raw

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2019-05-24,00015c1a121e08bbd2552c15fbbb6e6b19d3bf8f7b6a3d...,735121003,0.050831,1
1,2019-05-24,00015c1a121e08bbd2552c15fbbb6e6b19d3bf8f7b6a3d...,589722005,0.016932,1
2,2019-05-24,0006d3ff0caf0cb4d4e0615ee5cb7d268622364d483335...,659302005,0.013559,1
3,2019-05-24,0006d3ff0caf0cb4d4e0615ee5cb7d268622364d483335...,631458004,0.001678,1
4,2019-05-24,0006d3ff0caf0cb4d4e0615ee5cb7d268622364d483335...,668647001,0.001678,1
...,...,...,...,...,...
516472,2019-06-01,fffdaab7d9d79ceee4ffd2ab98bb68411438ea90e46a67...,738881002,0.016932,2
516473,2019-06-01,fffdaab7d9d79ceee4ffd2ab98bb68411438ea90e46a67...,220094010,0.025407,2
516474,2019-06-01,ffff4c4e8b57b633c1ddf8fbd53db16b962cf831baf9ed...,487800001,0.011847,2
516475,2019-06-01,ffff4c4e8b57b633c1ddf8fbd53db16b962cf831baf9ed...,754013001,0.022017,2


In [11]:
#save train_raw and test_raw to csv
train_raw.to_csv("../data/transactions_train_20190501-0523.csv",index=False)
test_raw.to_csv("../data/transactions_train_20190524-0601.csv",index=False)

In [6]:
#Get X_train for user_collaborative_filter
X_train = train_raw[["customer_id","article_id"]].head(100000)

In [None]:
%%time
customer = X_train.customer_id[0]
user_collaborative_filter(X_train, target_user = customer, similar_user_number = 1)

In [45]:
user_collaborative_filter(X_train, target_user = customer, similar_user_number = 2)

[769434001, 742083002]

In [47]:
%%time

user_collaborative_filter(X_train, target_user = customer, similar_user_number = 3, max_recommend_items=12)

CPU times: user 55.8 s, sys: 3.32 s, total: 59.2 s
Wall time: 36.1 s


[769434001, 742083002, 661333002]

In [51]:
%%time

user_collaborative_filter(X_train, target_user = customer, similar_user_number = 100, max_recommend_items=12)

start cosine
cosine end
CPU times: user 57.2 s, sys: 3.6 s, total: 1min
Wall time: 38.1 s


[769434001,
 742083002,
 661333002,
 669882007,
 735550003,
 746329003,
 735404002,
 775629001,
 600886001,
 628917001,
 643642001,
 733419005]

In [41]:
user_collaborative_filter(X_train, target_user = customer, similar_user_number = 7)

[735550003,
 746329003,
 696209005,
 727347005,
 749699007,
 699075009,
 723370002,
 733267001,
 735428002,
 629420001,
 699598008,
 674606006,
 702623002,
 788107002]