In [2]:
import numpy as np
import pandas as pd
import math
import copy

In [3]:
rating_df_cols = ["UserId", "MovieId", "Rating", "Timestamp"]
rating_df = pd.read_table("data_set/ratings.dat", sep = "::", engine = "python", names = rating_df_cols)
#print(rating_df.shape)
#print(rating_df.head())

In [4]:
rating_df.drop(rating_df.index[1000000:1000209], inplace = True)
#print(rating_df.shape)

In [5]:
rating_matrix = np.asarray(rating_df.pivot(index = "UserId", columns = "MovieId", values = "Rating").fillna(0))
#print(rating_matrix.shape)

In [6]:
A = rating_matrix

In [7]:
def svd(A):
    transpose_flag = 0
    if A.shape[0] > A.shape[1]:  
        transpose_flag = 1
        A = A.T
    
    #Convert matrix A into square matrix to be able to perform eigen value decomp
    AAt = np.matmul(A, A.T)  #get the value of U(EE.T)U.T.
    
    AtA = np.matmul(A.T, A) #get the value of value of V(EE.T)V.T
    
    #getting the eigen values and vectors for both the square matrices
    eval_AAt, evec_AAt = np.linalg.eig(AAt) #use eigen value decomposition to get the the eigen value and the eigen vector. In this case it is V
    eval_AtA, evec_AtA = np.linalg.eig(AtA) # in this case it is U.
    
    #print(eval_AAt)
    evec_AAt_transpose = evec_AAt.T #get the transpose of eigen vectors. In this case it is U.T
    evec_AtA_transpose = evec_AtA.T #in this case it is V.T
    
    #ignore the vectors corresponding to negative eigen values
    eval_AAt[eval_AAt < 10e-2] = 0
    eval_AtA[eval_AtA < 10e-2] = 0
    #print(eval_AAt)
    #print(eval_AAt.shape)
    
    eval_AAt = np.sqrt(eval_AAt.real) #get the sqrt of all elements of the eigen value matrix as sigma^2 = lambda.
    eval_AtA = np.sqrt(eval_AtA.real)
    
    evec_AAt = evec_AAt.real #take only the real values of eigen vectors
    evec_AtA = evec_AtA.real
    
    argsort_eval_AAt = np.argsort(-eval_AAt) #Sort the values in the descending order but keep the value of their original positions
    argsort_eval_AtA = np.argsort(-eval_AtA)
    
    U = np.zeros(evec_AAt.shape)
    sigma1 = np.sort(eval_AAt)[::-1] #sort the eigen values in descending order.
    
    V = np.zeros(evec_AtA.shape)
    sigma2 = (np.sort(eval_AtA))[::-1] #sort the eigen values in descending order.
    
    for i,j in enumerate(argsort_eval_AtA):
        V[:,i] = evec_AtA[:,j]  #every element of the eigen value corresponds to a specific evector. Use that cor
        
    #sigma = np.diag(sigma1)
    #print(sigma)
    sigma = np.zeros(A.shape)
    sigma[:, :A.shape[0]] = np.diag(sigma1) #set the eigen value matrix as a diagonal matrix
       
    for i in range(U.shape[1]):
        if sigma1[i] != 0:
            U[:,i] = (A.dot(V[:,i]))/sigma1[i]
        else:
            U[:,i] = 0
        
    if transpose_flag == 0:
        return U, sigma, V.T
    else:
        return V, sigma.T, U.T

In [8]:
#a = np.array([1,2,3,4])
#d = np.zeros((4,4))
#d[:, :4] = np.diag(a)
# or simpler: d = np.diag([1,2,3,4])

#print(d)

In [9]:
U, sigma, Vt = svd(A)
A_pred  = U.dot(sigma.dot(Vt)) #get the predicted matrix

In [10]:
rmse = np.sqrt((((A_pred - A) ** 2).sum()/(A.shape[0]* A.shape[1])))
print(rmse)

0.000216688356404


In [11]:
total_energy = (sigma ** 2).sum()  #total energy is the total sum of the squares of the eigen values
req_energy = 0.9 * total_energy #with dimensionality reduction we need to consider only 90% of the total energy

current_energy = 0
component = 0
while current_energy < req_energy:
    current_energy += sigma[component][component] ** 2
    component += 1
    
components = component + 1
print("Number of components =", components) #number of eigen values considered
print(req_energy)
print(current_energy)
print("Energy percent = ", current_energy/total_energy * 100)

Number of components = 884
12667930.1541
12668401.6763
Energy percent =  90.0033499551


In [13]:
U_reduced = U[:, :components] #consider the eigen vectors corresponding to the selected eigen values
sigma_reduced = sigma[:components, :components]
Vt_reduced = Vt[:components, :] #consider the eigen vectors corresponding to the selected eigen values

In [14]:
A_pred_red = U_reduced.dot(sigma_reduced.dot(Vt_reduced)) #get the new predicted matrix.

In [15]:
rmse_90 = np.sqrt(((A_pred_red - A)**2).sum() / (A.shape[0] * A.shape[1]))
print(rmse_90)

0.250486587444
