In [4]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
def user_collaborative_filter(X, target_user,  similar_user_number = 1, similar_user_thresh = None):
    """
    X: a dataframe containing two columns, first column is users, second is purchased item
    target_user: a string specified the user we plan to give recommendations
    similar_user_number: select the number of top similar users
    similar_user_thresh: a floating number between 0 and 1 to set a minimum threshold for consine similarity
    """
    
    #add a value=1 column to data
    X['value'] = np.ones(len(X))
    
    #drop duplicate rows, if one purchased more than one items
    X.drop_duplicates(inplace=True)
    
    #convert dataframe to user-by-item matrix and fill nan with 0
    user_item_df = X.pivot(index=X.columns[0], columns=X.columns[1] )['value'].fillna(0)
    
    #convert user_item_df to numpy matrix
    matrix = np.array(user_item_df)
    
    #calculate cosine similarity
    similarity = cosine_similarity(matrix)
    
    #get target user index
    target_user_index = int(np.where(user_item_df.index == target_user)[0])
    
    #get the row of target user similarity
    target_user_similarity = similarity[target_user_index,:]
    
    #sort the index of user similarity in descending order
    #use [::-1] to reverse the order
    #[1:] filter out the target user [0] itself
    similar_user_index = np.argsort(target_user_similarity)[::-1][1:]
    
    #select the top similar uses
    
          
    
    
    

## A simple example

In [5]:
#create user-article matrix of training data
df_t = pd.DataFrame({"customer":["A","A","A","B","B","B","C","C","C","D","D"],
                     "article":["a","a","b","a","b","c","c","d","e","d","e"],
                     "value":np.ones(11)})
df_t

Unnamed: 0,customer,article,value
0,A,a,1.0
1,A,a,1.0
2,A,b,1.0
3,B,a,1.0
4,B,b,1.0
5,B,c,1.0
6,C,c,1.0
7,C,d,1.0
8,C,e,1.0
9,D,d,1.0


In [6]:
#remove duplicate purchase history
df_t.drop_duplicates(inplace=True)

In [7]:
#fill missing values with 0
df_p = df_t.pivot(index='customer', columns="article")['value'].fillna(0)
df_p

article,a,b,c,d,e
customer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A,1.0,1.0,0.0,0.0,0.0
B,1.0,1.0,1.0,0.0,0.0
C,0.0,0.0,1.0,1.0,1.0
D,0.0,0.0,0.0,1.0,1.0


In [51]:
user = 'B'
int(np.where(df_p.index == user)[0])

1

In [8]:
#conver dataframe to numpy matrix
df_m = np.array(df_p)
df_m

array([[1., 1., 0., 0., 0.],
       [1., 1., 1., 0., 0.],
       [0., 0., 1., 1., 1.],
       [0., 0., 0., 1., 1.]])

In [10]:
#calculating pairwise cosine similarity
similarity = cosine_similarity(df_m)
similarity

array([[1.        , 0.81649658, 0.        , 0.        ],
       [0.81649658, 1.        , 0.33333333, 0.        ],
       [0.        , 0.33333333, 1.        , 0.81649658],
       [0.        , 0.        , 0.81649658, 1.        ]])

In [11]:
#Identify similar customers to customer A (index=0)
similarity[0,:]

array([1.        , 0.81649658, 0.        , 0.        ])

In [55]:
np.where(similarity[0,:] > 0.8)[0]

array([0, 1], dtype=int64)

In [57]:
np.argsort(similarity[0,:])[::-1][1:]

array([1, 3, 2], dtype=int64)

In [35]:
#Get index of articles which were not yet purchased by the customer A, but purchased by cutomer B
not_purchased = df_m[2] - df_m[0]
np.where(not_purchased == 1)

(array([2, 3, 4], dtype=int64),)

In [36]:
list(df_p.columns)

['a', 'b', 'c', 'd', 'e']

In [37]:
list(np.where(not_purchased == 1)[0])

[2, 3, 4]

In [38]:
recommended_items = [ list(df_p.columns)[i] for i in np.where(not_purchased == 1)[0]]
recommended_items

['c', 'd', 'e']