In [1]:
import pandas as pd
import numpy as np
import scipy.sparse
import wmf
import scipy.sparse.linalg as sla
import math

In [18]:
def itemize(data):
    """Map arbitrary labels to contiguous integer ids in order of first appearance.

    Parameters
    ----------
    data : array_like
        Labels to encode (for example user ids or business ids). Anything
        ``np.asarray`` can convert and ``np.unique`` can sort is accepted.

    Returns
    -------
    data_indx : numpy.ndarray of int32
        Same shape as ``data``; ``data_indx[i]`` is the id of ``data[i]``.
        The first distinct label encountered gets id 0, the next new label
        gets id 1, and so on.
    n_data : int
        Number of distinct labels.
    """
    data = np.asarray(data)
    # first_pos[j]: position where the j-th (sorted) unique label first occurs.
    # inverse[i]:   slot of data[i] inside the sorted unique labels.
    _, first_pos, inverse = np.unique(data, return_index=True, return_inverse=True)
    n_data = first_pos.shape[0]

    # Re-number the sorted unique labels by order of first appearance.
    rank = np.empty(n_data, dtype=np.int32)
    rank[np.argsort(first_pos)] = np.arange(n_data, dtype=np.int32)

    # One vectorised lookup replaces the per-element Python loop over a dict.
    data_indx = rank[inverse.reshape(data.shape)]
    return data_indx, n_data

def load_data(path='../../dataset/review.csv'):
    """Load the review CSV and return a sparse user x business star matrix.

    Parameters
    ----------
    path : str, optional
        Location of the review CSV. It must contain the columns
        ``user_id``, ``business_id`` and ``stars``. Defaults to the path the
        notebook has always used.

    Returns
    -------
    scipy.sparse.csr_matrix
        uint8 matrix of shape ``(n_users, n_items)``. Row and column ids are
        assigned by ``itemize`` in order of first appearance.

    Notes
    -----
    ``csr_matrix`` sums values that share the same (row, col) position, so a
    user who rated one business several times would otherwise end up with the
    sum of those ratings (which can also wrap around in uint8). Only one rating
    per (user, business) pair is kept: the last one in file order. The review
    date is not consulted.
    """
    # Only the three columns used below are read; the free-text columns are skipped.
    data = pd.read_csv(path, usecols=['user_id', 'business_id', 'stars'])
    data = data.drop_duplicates(subset=['user_id', 'business_id'], keep='last')
    rows = np.array(data['user_id'])
    cols = np.array(data['business_id'])
    stars = np.array(data['stars'], dtype=np.uint8)
    # itemize users and items
    row_indx, n_users = itemize(rows)
    col_indx, n_items = itemize(cols)
    return scipy.sparse.csr_matrix((stars, (row_indx, col_indx)), dtype=np.uint8, shape=(n_users, n_items))

In [32]:
# Same loading steps as load_data(), run at top level so the intermediate
# arrays (rows, cols, stars) stay available in the kernel namespace.
data = pd.read_csv('../../dataset/review.csv')
# Drop the listed columns. NOTE(review): presumably this leaves only user_id,
# business_id and stars -- confirm against the CSV header.
data = data.drop(['funny', 'review_id', 'text', 'date', 'useful', 'cool'], axis=1)
# Remove rows that are exact duplicates across all remaining columns.
data.drop_duplicates(inplace=True)
# Extract the three columns as NumPy arrays; stars is stored as uint8.
rows, cols, stars = np.array(data['user_id']), np.array(data['business_id']), np.array(data['stars'],dtype=np.uint8)

In [39]:
# Encode the raw user / business labels as contiguous integer indices and
# record how many distinct labels each axis has.
row_indx, n_users = itemize(rows)
col_indx, n_items = itemize(cols)

In [44]:
# Assemble the sparse (n_users x n_items) matrix from the (row, col, stars)
# triplets, stored as uint8.
R = scipy.sparse.csr_matrix((stars,(row_indx, col_indx)), dtype=np.uint8, shape=(n_users,n_items))

In [49]:
np.min(R)

0

In [43]:
stars.shape

(4092144,)

In [42]:
# Keep only the ratings that are >= 3.
# NOTE(review): execution counts show this cell ran out of source order, and
# `rows` is never filtered with the same mask in the visible cells -- confirm
# the arrays still line up before building a matrix from them.
stars = stars[stars>=3]

In [35]:
# Apply the ">= 3 stars" mask to the business ids.
# NOTE(review): the mask is derived from `stars`, so it only has the right
# length while `stars` is still unfiltered; if `stars` has already been
# shrunk, the mask presumably no longer matches `cols` -- verify run order.
cols = cols[stars>=3]

In [36]:
cols.shape

(4092144,)

In [46]:
# Binarise R: every explicitly stored value becomes 1 (same dtype as before);
# the sparsity pattern itself is not changed.
R.data = np.ones_like(R.data)

In [29]:
R.count_nonzero()

5261667

In [15]:
# Zero out, in place, every rating below 3; the array length is unchanged.
stars[stars<3]=0

In [16]:
np.sum(stars>0)

4092144

In [12]:
5261667-4092144

1169523

In [4]:
R = load_data()

In [5]:
R.shape

(1326101, 174567)

In [24]:
# Random float32 factor matrices with 50 columns, scaled by 0.01.
# NOTE(review): the hard-coded row counts equal the R_ml shape shown elsewhere
# in this notebook, (138493, 26744); consider deriving them from the matrix
# instead. No random seed is set, so the values differ on every run.
V = np.random.randn(26744,50).astype('float32')*0.01
U = np.random.randn(138493,50).astype('float32')*0.01

In [9]:
V

array([[-0.00932499, -0.00336239, -0.00701297, ..., -0.00855676,
        -0.00414298, -0.00803222],
       [-0.0023182 , -0.00637991, -0.00130385, ..., -0.00871569,
         0.0162383 , -0.01480562],
       [ 0.00972943,  0.00264119,  0.02311968, ...,  0.01302849,
        -0.00845836, -0.01661776],
       ...,
       [ 0.00699472,  0.00051567,  0.01317331, ...,  0.00277903,
         0.0146654 ,  0.00650031],
       [-0.00245316,  0.01889137, -0.00448005, ..., -0.00132854,
        -0.02452814,  0.01086835],
       [-0.00424517, -0.0111349 ,  0.01505396, ...,  0.00240241,
         0.00481974, -0.00955946]], dtype=float32)

In [10]:
# Binarise R: every explicitly stored value becomes 1 (same dtype as before);
# the sparsity pattern itself is not changed.
R.data = np.ones_like(R.data)

In [13]:
R.min()

0

In [5]:
# Build S from R with the external `wmf` module.
# NOTE(review): `wmf` is not shown in this notebook; the exact meaning of
# alpha and epsilon should be confirmed against that module.
S = wmf.log_surplus_confidence_matrix(R, alpha=2.0, epsilon=1e-6)

In [6]:
# Settings passed to wmf.factorize: number of iterations and number of
# latent factors.
num_iters = 2
num_factors = 100

In [7]:
# Factorise S into U and V (the log below calls them user factors and item
# factors); verbose=True is what produces that progress log.
U,V = wmf.factorize(S,num_factors,num_iterations=num_iters, verbose=True)

precompute transpose
  took 0.151 seconds
run ALS algorithm
  iteration 0
    recompute user factors U
parallel
    time since start: 469.852 seconds
    recompute item factors V
parallel
    time since start: 519.650 seconds
  iteration 1
    recompute user factors U
parallel
    time since start: 985.633 seconds
    recompute item factors V
parallel
    time since start: 1037.617 seconds


In [10]:
# Persist both factor matrices in the working directory. np.save appends
# ".npy" when the name has no extension, so the files are U_wmf.npy / V_wmf.npy.
np.save('U_wmf',U)
np.save('V_wmf',V)

In [25]:
def rmse(R,V,W):
    """Root-mean-square error of a low-rank model on the observed entries of R.

    The prediction for an observed entry ``(r, c)`` is ``dot(V[r, :], W[c, :])``.
    Only the non-zero entries of ``R`` count as observed.

    Parameters
    ----------
    R : scipy.sparse matrix, shape (n_rows, n_cols)
        Observed values.
    V : array_like, shape (n_rows, k)
        Factors for the rows of ``R``.
    W : array_like, shape (n_cols, k)
        Factors for the columns of ``R``.

    Returns
    -------
    float
        ``sqrt(sum((R[r, c] - V[r] . W[c]) ** 2) / n_observed)``, or ``nan``
        when ``R`` has no non-zero entries.
    """
    # Work on a private COO copy so merging duplicates never touches the caller's matrix.
    observed = R.tocoo(copy=True)
    observed.sum_duplicates()

    # Explicitly stored zeros are not observations.
    keep = observed.data != 0
    rows = observed.row[keep]
    cols = observed.col[keep]
    actual = observed.data[keep].astype(np.float64)

    num = actual.shape[0]
    if num == 0:
        return float('nan')

    V = np.asarray(V)
    W = np.asarray(W)

    # Each prediction is addressed by its real (row, col) pair, so empty rows
    # and a trailing single-entry row are handled like any other. Chunking
    # keeps the gathered factor rows small no matter how many entries exist.
    chunk = 1 << 16
    sq_err = 0.0
    for start in range(0, num, chunk):
        stop = start + chunk
        pred = np.einsum('ij,ij->i', V[rows[start:stop]], W[cols[start:stop]])
        diff = actual[start:stop] - pred
        sq_err += float(np.dot(diff, diff))

    return math.sqrt(sq_err / num)

In [26]:
# Evaluate the factors U, V against R_ml.
# NOTE(review): R_ml is only assigned further down in the notebook (from
# get_data), so this cell relies on out-of-order execution; U and V must have
# row counts matching R_ml's two dimensions.
rmse(R_ml,U,V)

5.081027938038781

In [14]:
R.shape

(1326101, 174567)

In [6]:
# Shrink R by dropping its last 1,000,000 rows and last 100,000 columns.
# NOTE(review): not idempotent -- re-running this cell trims R again.
R = R[:-1000000,:-100000]

In [7]:
R.shape

(326101, 74567)

In [17]:
# NOTE(review): R1 is not assigned in any visible cell of this notebook; this
# only works with leftover kernel state -- confirm where R1 comes from.
R1.count_nonzero()

1107133

In [18]:
# Function for loading and pre-processing data
def get_data(path, return_test=False):
    """Load a ratings CSV and randomly split each user's ratings in half.

    The first three CSV columns are read as (user id, item id, rating). User
    ids are taken as 1-based and shifted to 0-based row indices; item ids are
    re-numbered to contiguous column indices following their sorted order.
    For a user with ``l`` ratings, a random ``l // 2`` of them go to the test
    split and the remaining ones to the training split.

    Parameters
    ----------
    path : str
        Location of the ratings CSV.
    return_test : bool, optional
        When True, also return the test split. Defaults to False, which
        returns the training matrix only.

    Returns
    -------
    scipy.sparse.csr_matrix
        Training matrix of shape ``(max user id, number of distinct items)``.
        With ``return_test=True`` a ``(R_train, R_test)`` tuple is returned.

    Raises
    ------
    ValueError
        If the file has no ratings, or a user id is smaller than 1.

    Notes
    -----
    The split uses the global NumPy random state; call ``np.random.seed``
    beforehand for a reproducible split.
    """
    # `.values` works on both old and current pandas (`as_matrix` was removed).
    np_frame = pd.read_csv(path, usecols=[0, 1, 2]).values

    users = np_frame[:, 0].astype('int64') - 1
    items = np_frame[:, 1].astype('int64')
    ratings = np_frame[:, 2].astype('float64')

    if users.shape[0] == 0:
        raise ValueError('no ratings found in %s' % path)
    if users.min() < 0:
        raise ValueError('user ids must be positive (1-based) integers')

    items_unique, items_cleaned = np.unique(items, return_inverse=True)

    num_users = int(np.max(users)) + 1
    num_items = items_unique.shape[0]

    # Stable sort groups the ratings by user while keeping file order inside
    # each group; bounds[u]:bounds[u + 1] delimits user u's block in `order`.
    order = np.argsort(users, kind='mergesort')
    bounds = np.searchsorted(users[order], np.arange(num_users + 1))

    R_train = scipy.sparse.lil_matrix((num_users, num_items))
    R_test = scipy.sparse.lil_matrix((num_users, num_items))

    for u in range(num_users):
        member = order[bounds[u]:bounds[u + 1]]
        l = member.shape[0]
        if l == 0:
            # Gap in the user ids: nothing to split for this row.
            continue
        indx = np.arange(l)
        np.random.shuffle(indx)
        half = l // 2  # integer division, so this is a valid slice bound on Python 3
        train_pick = member[indx[half:]]
        R_train[u, items_cleaned[train_pick]] = ratings[train_pick]
        if return_test and half:
            test_pick = member[indx[:half]]
            R_test[u, items_cleaned[test_pick]] = ratings[test_pick]

    if return_test:
        return R_train.tocsr(), R_test.tocsr()
    return R_train.tocsr()

In [19]:
# Ratings CSV location, relative to the notebook's working directory.
path = '../../../HW1/ml-20m/ratings.csv'
# get_data returns only the training half of its random per-user split.
R_ml = get_data(path)

In [20]:
R_ml.shape

(138493, 26744)

In [21]:
R_ml.count_nonzero()

10033855

In [27]:
# NOTE(review): the name suggests a binarised matrix, but as the output below
# shows, boolean-mask indexing returns a single-row matrix holding just the
# selected values (those >= 3.0); the (user, item) positions are lost.
R_ml_bin=R_ml[R_ml>=3.0]

matrix([[3.5, 3.5, 3.5, ..., 4.5, 3. , 5. ]])

In [None]:
# Intended to zero every rating below 3.0 in place; this cell has no
# execution count, i.e. it was not run in the saved notebook.
# NOTE(review): `R_ml < 3.0` is presumably also True for every unstored
# (implicit zero) entry of the sparse matrix, which would make the mask
# almost dense -- confirm cost before running on the full matrix.
R_ml[R_ml<3.0]=0.0



In [None]:
R_ml.shape