In [1]:
# Use matrix factorization

import pandas as pd
import numpy as np
from sklearn.decomposition import NMF

In [2]:
%%time
transaction_raw = pd.read_csv("../data/transactions_train.csv")

CPU times: user 29 s, sys: 8.94 s, total: 37.9 s
Wall time: 42.1 s


In [12]:
mask_train = (transaction_raw.t_dat >= '2019-05-01') & (transaction_raw.t_dat <= '2019-05-10')
X_train = transaction_raw.loc[mask_train].reset_index(drop=True)[['customer_id','article_id']]

mask_test = (transaction_raw.t_dat >= '2019-05-11') & (transaction_raw.t_dat <= '2019-05-17')
X_test = transaction_raw.loc[mask_test].reset_index(drop=True)[['customer_id','article_id']]
        

In [13]:
X = X_train.head(500)
X

Unnamed: 0,customer_id,article_id
0,00019d6c20e0fbb551af18c57149af4707ec016bb0decd...,524061003
1,00019d6c20e0fbb551af18c57149af4707ec016bb0decd...,735404001
2,00019d6c20e0fbb551af18c57149af4707ec016bb0decd...,700370004
3,00019d6c20e0fbb551af18c57149af4707ec016bb0decd...,618800001
4,00019d6c20e0fbb551af18c57149af4707ec016bb0decd...,731407001
...,...,...
495,01d106a0b45ae7f169b3233d332bf325c80c24d4d160ed...,735550002
496,01d106a0b45ae7f169b3233d332bf325c80c24d4d160ed...,701843002
497,01d106a0b45ae7f169b3233d332bf325c80c24d4d160ed...,736581001
498,01d106a0b45ae7f169b3233d332bf325c80c24d4d160ed...,780852001


In [14]:
X = pd.DataFrame({"user":X.iloc[:,0],
                    "item":X.iloc[:,1],
                    "value":[1]*len(X)})
    
#drop duplicate rows, if one purchased more than one items
X.drop_duplicates(inplace=True)
    
#convert dataframe to user-by-item matrix and fill nan with 0
user_item_df = X.pivot(index=X.columns[0], columns=X.columns[1] )['value'].fillna(0)
    
#convert user_item_df to numpy matrix
matrix = np.array(user_item_df)

#build MF model
model = NMF(init='random', random_state=0)
#calculate low dimensional component matrices W, H
W = model.fit_transform(matrix)
H = model.components_

#calculate dot product of W and H
matrix_product = np.matmul(W, H)

#matrix substraction to get recommended items
matrix_recommend = matrix_product - matrix

#number of total users
num_total_users = matrix.shape[0]
#item id/name list
item_list = list(user_item_df.columns)

#empty dict to store final results
recommended_dict = {}

#for each target users
for i in range(num_total_users):
    user_id = user_item_df.index[i]
    
    recommended_items = []


In [16]:
matrix.shape

(108, 419)

In [17]:
model = NMF(init='random', random_state=0)
W = model.fit_transform(matrix)
H = model.components_

In [19]:
%%time
np.dot(W,H).shape

CPU times: user 5.26 ms, sys: 3.82 ms, total: 9.08 ms
Wall time: 26.9 ms


(108, 419)

In [20]:
%%time
np.matmul(W,H).shape

CPU times: user 3.9 ms, sys: 330 µs, total: 4.23 ms
Wall time: 2.38 ms


(108, 419)

In [62]:
matrix_mf = np.matmul(W,H)

In [63]:
matrix_re = matrix_mf - matrix

In [64]:
matrix_re

array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 3.32313145e-07, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 1.30997150e-06]])

In [65]:
print(matrix.max())
print(matrix.min())
print(matrix_mf.max())
print(matrix_mf.min())

1.0
0.0
1.0000002894786595
0.0


In [66]:
np.any(matrix_re > 0)

True

In [67]:
matrix_re_0greater = matrix_re > 0
matrix_re_0greater.sum()

1016

In [None]:
#number of total users
num_total_users = matrix.shape[0]
#item id/name list
item_list = list(user_item_df.columns)

#empty dict to store final results
recommended_dict = {}

In [3]:
X = np.array([[1, 1], [2, 1], [3, 1.2], [4, 1], [5, 0.8], [6, 1]])
X

array([[1. , 1. ],
       [2. , 1. ],
       [3. , 1.2],
       [4. , 1. ],
       [5. , 0.8],
       [6. , 1. ]])

In [4]:
model = NMF(n_components=2, init='random', random_state=0)
W = model.fit_transform(X)
H = model.components_

In [5]:
W

array([[0.        , 0.46880684],
       [0.55699523, 0.3894146 ],
       [1.00331638, 0.41925352],
       [1.6733999 , 0.22926926],
       [2.34349311, 0.03927954],
       [2.78981512, 0.06911798]])

In [6]:
H

array([[2.09783018, 0.30560234],
       [2.13443044, 2.13171694]])

In [7]:
X_new = model.transform(X)
X_new

array([[0.        , 0.46880687],
       [0.55762104, 0.38906185],
       [1.0039665 , 0.41888706],
       [1.6736919 , 0.22910467],
       [2.3434173 , 0.03932227],
       [2.78976277, 0.06914748]])

In [10]:
np.dot(W,H)

array([[1.00063558, 0.99936347],
       [1.99965977, 1.00034074],
       [2.99965485, 1.20034566],
       [3.9998681 , 1.0001321 ],
       [5.00009002, 0.79990984],
       [6.00008587, 0.999914  ]])

In [11]:
np.matmul(W,H)

array([[1.00063558, 0.99936347],
       [1.99965977, 1.00034074],
       [2.99965485, 1.20034566],
       [3.9998681 , 1.0001321 ],
       [5.00009002, 0.79990984],
       [6.00008587, 0.999914  ]])