In [1]:
from sklearn.neighbors import NearestNeighbors
import pandas as pd, numpy as np
from IPython.display import display, HTML, Markdown, Latex

In [2]:
# df = pd.read_excel("c:/recsys/materials/alice.xlsx", index_col='Name').astype(float)
ex ={'Item1': {'Alice': 5.0,    'User1': 3.0, 'User2': 4.0, 'User3': 3.0, 'User4': 1.0},
     'Item2': {'Alice': 3.0,    'User1': 1.0, 'User2': 3.0, 'User3': 3.0, 'User4': 5.0},
     'Item3': {'Alice': 4.0,    'User1': 2.0, 'User2': 4.0, 'User3': 1.0, 'User4': 5.0},
     'Item4': {'Alice': 4.0,    'User1': 3.0, 'User2': 3.0, 'User3': 5.0, 'User4': 2.0},
     'Item5': {'Alice': np.nan, 'User1': 3.0, 'User2': 5.0, 'User3': 4.0, 'User4': 1.0},
}
df = pd.DataFrame(ex)
df.index.name, df.columns.name = 'userID', 'itemID'

#df.loc['Alice','Item1'] = np.nan

means = df.mean(axis=1)
cent  = df.sub( means, axis=0 )

display( df )


itemID,Item1,Item2,Item3,Item4,Item5
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Alice,5.0,3.0,4.0,4.0,
User1,3.0,1.0,2.0,3.0,3.0
User2,4.0,3.0,4.0,3.0,5.0
User3,3.0,3.0,1.0,5.0,4.0
User4,1.0,5.0,5.0,2.0,1.0


### Sparse matrix factorization with *randomized SVD*

1. Input: *centered* matrix, number of components, (optional) number of iterations and random seed (advisable)
2. Input must be converted to sparse matrix format: `cent.to_sparse().to_coo().tocsc()`

In [4]:
from sklearn.utils.extmath import randomized_svd

U_, Sigma, VT_ = randomized_svd(cent.to_sparse().to_coo().tocsc(), n_components=3, n_iter=5, random_state=1234)


U = pd.DataFrame(U_, index=cent.index)
VT = pd.DataFrame(VT_, columns=cent.columns)

display(U, Sigma, VT)

full1 = U.mul(Sigma).dot(VT).add(means, axis=0)
full2 = (U * Sigma @ VT).add(means, axis=0)
display( full1, full2 )

Unnamed: 0_level_0,0,1,2
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Alice,-0.174861,-0.34117,-0.555705
User1,-0.322239,-0.177994,-0.239617
User2,-0.129507,-0.46878,0.777465
User3,-0.436334,0.770703,0.171239
User4,0.811434,0.195407,0.001257


array([4.9988044 , 2.59007561, 1.21341448])

itemID,Item1,Item2,Item3,Item4,Item5
0,-0.353569,0.520529,0.569753,-0.30493,-0.431784
1,-0.404465,0.47919,-0.497364,0.578812,-0.156174
2,-0.478396,0.195905,-0.101055,-0.377873,0.761419


TypeError: unsupported operand type(s) for @: 'DataFrame' and 'DataFrame'

### Sparse matrix factorization with *sparse SVD* (add-on module)

1. Input: *centered* matrix, number of components
2. Input must be converted to sparse matrix format: `cent.to_sparse().to_coo().tocsc()`

In [268]:
from sparsesvd import sparsesvd
U_, Sigma, VT_ = sparsesvd( cent.to_sparse().to_coo().tocsc(), 2 )

U = pd.DataFrame(U_.T, index=cent.index)
VT = pd.DataFrame(VT_, columns=cent.columns)

display(U, Sigma, VT)

full1 = U.mul(Sigma).dot(VT).add(means, axis=0)
full2 = (U * Sigma @ VT).add(means, axis=0)
display( full1, full2 )

Unnamed: 0_level_0,0,1
userID,Unnamed: 1_level_1,Unnamed: 2_level_1
Alice,-0.174861,0.34117
User1,-0.322239,0.177994
User2,-0.129507,0.46878
User3,-0.436334,-0.770703
User4,0.811434,-0.195407


array([4.9988044 , 2.59007561])

itemID,Item1,Item2,Item3,Item4,Item5
0,-0.353569,0.520529,0.569753,-0.30493,-0.431784
1,0.404465,-0.47919,0.497364,-0.578812,0.156174


itemID,Item1,Item2,Item3,Item4,Item5
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Alice,4.666461,3.121567,3.941478,3.755069,4.515425
User1,3.155998,1.34061,1.711528,2.624343,3.167521
User2,4.519986,2.881197,4.035039,3.294626,4.269152
User3,3.163803,3.021196,0.964455,5.020512,3.830034
User4,1.161147,5.153898,4.859308,1.856089,0.969557


itemID,Item1,Item2,Item3,Item4,Item5
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Alice,4.666461,3.121567,3.941478,3.755069,4.515425
User1,3.155998,1.34061,1.711528,2.624343,3.167521
User2,4.519986,2.881197,4.035039,3.294626,4.269152
User3,3.163803,3.021196,0.964455,5.020512,3.830034
User4,1.161147,5.153898,4.859308,1.856089,0.969557


#### Regardless of randomized SVM or sparse SVM, we use the factorized matrices as follows

In [284]:
# Predict all ratings for all items
display( (U * Sigma @ VT).add(means, axis=0) )

# Predict all ratings for Alice
display( U.loc['Alice'] * Sigma @ VT + means['Alice'] )

# Predict the rating of Alice for Item5
display( U.loc['Alice'] * Sigma @ VT['Item5'] + means['Alice'] )

itemID,Item1,Item2,Item3,Item4,Item5
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Alice,4.666461,3.121567,3.941478,3.755069,4.515425
User1,3.155998,1.34061,1.711528,2.624343,3.167521
User2,4.519986,2.881197,4.035039,3.294626,4.269152
User3,3.163803,3.021196,0.964455,5.020512,3.830034
User4,1.161147,5.153898,4.859308,1.856089,0.969557


itemID
Item1    4.666461
Item2    3.121567
Item3    3.941478
Item4    3.755069
Item5    4.515425
Name: Alice, dtype: float64

4.51542515155701

--------------
--------------
--------------
--------------
# Experimental

--------------
--------------
--------------
--------------


In [8]:
ex ={'Item1': {'Alice': 5.0, 'User1': 3.0, 'User2': 4.0, 'User3': 3.0, 'User4': 1.0},
     'Item2': {'Alice': 3.0, 'User1': 1.0, 'User2': 3.0, 'User3': 3.0, 'User4': 5.0},
     'Item3': {'Alice': 4.0, 'User1': 2.0, 'User2': 4.0, 'User3': 1.0, 'User4': 5.0},
     'Item4': {'Alice': 4.0, 'User1': 3.0, 'User2': 3.0, 'User3': 5.0, 'User4': 2.0},
     'Item5': {              'User1': 3.0, 'User2': 5.0, 'User3': 4.0, 'User4': 1.0},
}
# Dense data frame
adf = pd.DataFrame(ex); adf.index.name = 'User'
# Centered data frame
cdf = adf.sub( adf.mean(axis=1), axis=0 )
# Sparse data frame
sdf = cdf.to_sparse()
# Sparse scipy matrix
ratings = sdf.to_sparse().to_coo().tocsr()
ratings

<5x5 sparse matrix of type '<class 'numpy.float64'>'
	with 24 stored elements in Compressed Sparse Row format>

In [54]:
svx={'Item1': { 'User1': 3.0, 'User2': 4.0, 'User3': 3.0, 'User4': 1.0},
     'Item2': { 'User1': 1.0, 'User2': 3.0, 'User3': 2.0, 'User4': 6.0},
     'Item3': { 'User1': 2.0, 'User2': 4.0, 'User3': 1.0, 'User4': 5.0},
     'Item4': { 'User1': 3.0, 'User2': 3.0, 'User3': 5.0, 'User4': 2.0},
}
# Dense data frame
dfx = pd.DataFrame(svx).T; dfx.columns.name = 'User'; dfx.index.name= 'Item'
# Centered data frame
#cdf = adf.sub( adf.mean(axis=1), axis=0 )
# Sparse data frame
#sdf = cdf.to_sparse()
# Sparse scipy matrix
#ratings = sdf.to_sparse().to_coo().tocsr()
dfx

User,User1,User2,User3,User4
Item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Item1,3.0,4.0,3.0,1.0
Item2,1.0,3.0,2.0,6.0
Item3,2.0,4.0,1.0,5.0
Item4,3.0,3.0,5.0,2.0


# Matrix factorization

In [57]:
ratings = dfx.to_sparse().to_coo()

In [110]:
from sklearn.utils.extmath import randomized_svd

U, Sigma, VT = randomized_svd(ratings, n_components=2, n_iter=5, random_state=1234)

display(U, Sigma, VT)

# @ = matrix multiplication
U @ np.diag(Sigma) @ VT

array([[ 0.43124523,  0.49315012],
       [ 0.53273754, -0.53052572],
       [ 0.52374556, -0.40520071],
       [ 0.50587435,  0.5578152 ]])

array([ 12.22151125,   4.92815942])

array([[ 0.3593326 ,  0.56750746,  0.4428526 ,  0.59388293],
       [ 0.36767659,  0.08799758,  0.56862492, -0.73057242]])

array([[ 2.78742375,  3.20489262,  3.71598251,  1.3545147 ],
       [ 1.37826736,  3.46488934,  1.39667175,  5.77678012],
       [ 1.56586336,  3.45687205,  1.69919932,  5.26029771],
       [ 3.23233518,  3.75054826,  4.30110911,  1.66336515]])

array([[ 3.,  4.,  3.,  1.],
       [ 1.,  3.,  2.,  6.],
       [ 2.,  4.,  1.,  5.],
       [ 3.,  3.,  5.,  2.]])

In [9]:
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=2, n_iter=5, random_state=1234)
svd.fit(ratings)  

TruncatedSVD(algorithm='randomized', n_components=2, n_iter=5,
       random_state=1234, tol=0.0)

In [10]:
svd.components_

array([[-0.35356875,  0.52052937,  0.56975326, -0.30493031, -0.43178356],
       [-0.40446482,  0.4791903 , -0.49736385,  0.57881237, -0.156174  ]])

In [11]:
svd.singular_values_

array([4.9988044 , 2.59007561])

In [19]:
svd.fit(cdf.to_sparse().to_coo())
svd.singular_values_

array([4.9988044 , 2.59007561])