In [1]:
import pandas as pd
import numpy as np
import math
import copy

In [2]:
def createMatrix(path="data_set/ratings.dat"):
    """Load the ratings file and return a dense user x movie rating matrix.

    The file is expected to hold one rating per line in the form
    ``UserId::MovieId::Rating::Timestamp``.

    Parameters
    ----------
    path : str, optional
        Location of the ratings file. Defaults to ``"data_set/ratings.dat"``.

    Returns
    -------
    numpy.ndarray
        2-D array with one row per user present in the data (ordered by
        UserId) and one column per MovieId in ``1..max(MovieId)``, so that
        column ``j`` always corresponds to MovieId ``j + 1``. Cells without a
        rating hold 0.
    """
    ratings_df_columns = ["UserId", "MovieId", "Rating", "Timestamp"]
    ratings_df = pd.read_table(path, sep="::", engine='python', names=ratings_df_columns)

    # Discard rows 1000000..1000208 so that at most the first 1,000,000
    # ratings are used (a no-op on shorter files).
    ratings_df = ratings_df.drop(ratings_df.index[1000000:1000209])

    movies = int(ratings_df["MovieId"].max())

    # Pivot to user x movie, then reindex the columns onto the full, ordered
    # MovieId range. Movies nobody rated become all-zero columns *in place*,
    # instead of being appended after the last rated movie, so the column
    # position stays aligned with the MovieId.
    ratings_df = (
        ratings_df
        .pivot(index='UserId', columns='MovieId', values='Rating')
        .reindex(columns=range(1, movies + 1))
        .fillna(0)
    )

    return np.asarray(ratings_df)
    

In [3]:
# Build the dense user x movie ratings matrix; a 0 entry means "not rated".
ratings_matrix = createMatrix()

3952

In [4]:
def mean_centre(ratings_matrix):
    """Subtract each user's mean rating from the ratings that user gave.

    Parameters
    ----------
    ratings_matrix : array-like, shape (n_users, n_movies)
        Rating matrix in which 0 marks an unrated cell.

    Returns
    -------
    numpy.ndarray
        Float array of the same shape. Every non-zero input cell holds
        ``rating - row_mean``, where ``row_mean`` is the row sum divided by
        the number of positive entries in that row; zero cells stay 0.
        Rows without any positive entry use a mean of 0, which avoids a
        0/0 division (and the RuntimeWarning it would raise).
    """
    ratings = np.asarray(ratings_matrix, dtype=float)

    rated = ratings != 0
    movies_watched = (ratings > 0).sum(axis=1)
    ratings_sum = ratings.sum(axis=1)

    # Guarded division: rows with nothing rated keep a mean of 0.
    means = np.divide(
        ratings_sum,
        movies_watched,
        out=np.zeros_like(ratings_sum),
        where=movies_watched > 0,
    )

    # Only rated cells are centred; unrated cells must remain exactly 0
    # because 0 is used as the "no rating" marker downstream.
    return np.where(rated, ratings - means[:, np.newaxis], 0.0)

In [5]:
# Centre every user's ratings on that user's own mean; unrated cells stay 0.
ratings_matrix_centred = mean_centre(ratings_matrix)
#print(ratings_matrix_centred)

In [6]:
#### Collaborative filtering
#### Find the k most similar users who have rated that movie.

### Build the similarity matrix from the rows
# The similarity matrix will be a user x user matrix
def cosine_similarity(A):
    """Return the pairwise cosine similarity between the rows of ``A``.

    Parameters
    ----------
    A : array-like, shape (n, d)
        One vector per row.

    Returns
    -------
    numpy.ndarray
        Symmetric ``(n, n)`` float matrix whose entry ``[i, j]`` is the cosine
        of the angle between rows ``i`` and ``j``. The diagonal is left at 0
        (a row is never reported as similar to itself), and any pair in which
        either row has zero magnitude gets similarity 0 rather than NaN.
    """
    A = np.asarray(A, dtype=float)

    # Row magnitudes are computed once instead of once per pair.
    norms = np.linalg.norm(A, axis=1)

    similarity = np.dot(A, A.T)          # all pairwise dot products
    denom = np.outer(norms, norms)       # product of magnitudes per pair
    valid = denom != 0

    # Divide in place only where both magnitudes are non-zero; everything
    # else (a zero vector on either side) is defined as similarity 0.
    np.divide(similarity, denom, out=similarity, where=valid)
    similarity[~valid] = 0

    np.fill_diagonal(similarity, 0)
    return similarity

In [7]:
### User-user collaborative filtering with k = 20

# Number of most-similar users combined into each prediction.
k = 20
# Predictions are stored in mean-centred space; 0 means "no prediction made".
predicted_rating_matrix = np.zeros(ratings_matrix_centred.shape)
# Pairwise cosine similarity between the users' mean-centred rating rows.
user_user_similarity = cosine_similarity(ratings_matrix_centred)

6039

In [8]:
# Predict, for every (user, movie) pair the user actually rated, the
# mean-centred rating as the similarity-weighted average of the centred
# ratings given to that movie by the k most similar users who rated it.
for user in range(ratings_matrix_centred.shape[0]):
    print(user, end = '\r')
    similarity = user_user_similarity[user] # similarity of this user to every user
    sorted_similarity = np.argsort(-similarity) # user indices, most similar first

    # Only users with strictly positive similarity may contribute. Filtering
    # once per user keeps the descending order and also discards NaN entries,
    # so no per-movie scan ever walks through useless candidates.
    neighbours = sorted_similarity[similarity[sorted_similarity] > 0]
    if neighbours.size == 0:
        continue

    # Visit only the movies this user has rated (non-zero cells).
    for movie in np.flatnonzero(ratings_matrix[user]):
        # The k most similar neighbours who have rated this movie.
        raters = neighbours[ratings_matrix[neighbours, movie] != 0][:k]
        if raters.size == 0:
            continue  # nobody eligible: the prediction is left untouched

        weights = similarity[raters]
        weighted_sum = np.dot(weights, ratings_matrix_centred[raters, movie])
        sum_of_weights = weights.sum()
        predicted_rating_matrix[user][movie] = weighted_sum/sum_of_weights

        
               

6039

In [9]:
# Evaluate the predictions against the mean-centred ratings.
# A cell takes part only if the user rated the movie AND a prediction was
# produced for it (0 in predicted_rating_matrix means "no prediction").
evaluated = (ratings_matrix != 0) & (predicted_rating_matrix != 0)
residuals = (predicted_rating_matrix - ratings_matrix_centred)[evaluated]

error = float(np.sum(residuals ** 2))  # sum of squared errors
cnt = int(evaluated.sum())             # number of evaluated cells

# NOTE: despite its name, `mse` holds the ROOT mean squared error (the square
# root is applied); the name is kept so later cells that read it still work.
# With no evaluated cell the metric is undefined and reported as NaN.
mse = np.sqrt(error/cnt) if cnt else float("nan")
print(error)
print(mse)

595690.077212
0.771870978206
