In [1]:
import numpy as np
import pandas as pd
import copy
import math

In [2]:
# Train-Test-Split to separate the test values from the values to be trained
def split(ratio):
    rating_df_columns = ["UserId", "MovieId", "Rating", "TimeStamp"]
    rating_df = pd.read_table("data_set/ratings.dat", sep="::", engine = "python", names=rating_df_columns)
    rating_df.drop('TimeStamp', axis=1, inplace=True)
    #print(rating_df.shape)

    #rating_df.drop(index= range(100000,1000209), inplace=True)
    
    rating_df.drop(rating_df.index[1000000:1000209], inplace = True) #droping the rows from 1000000 to 1000209
    #print("rating_df", rating_df.shape)
    
    train_rating_df = rating_df.sample(frac = ratio, random_state=201) #sampling the training df according to the fraction
    test_rating_df = rating_df.drop(train_rating_df.index) #using the remaining elements to make the test df.

    #print("train_rating_df", train_rating_df.shape)
    #print("test_rating_df", test_rating_df.shape)
    
    train_users = max(train_rating_df["UserId"]) #getting the max value of userID
    train_movies = max(train_rating_df["MovieId"]) #getting the max value of MovieId

    test_users = max(test_rating_df["UserId"])
    test_movies = max(test_rating_df["MovieId"])

    assert train_users == test_users
    users = train_users
    movies = max(train_movies, test_movies)

    #pivoting the df to have UserID as the index and all the movieIDs to have individual columns
    train_rating_df = train_rating_df.pivot(index = 'UserId', columns='MovieId', values='Rating').fillna(0)
    test_rating_df = test_rating_df.pivot(index = 'UserId', columns='MovieId', values='Rating').fillna(0)
    
    #print(train_rating_df.head())
    
    #print("train_rating_df", train_rating_df.shape)
    #print("test_rating_df", test_rating_df.shape)
  
    zero_mat = np.zeros(users)
    #adding the columns to both the dfs if a particular movie is not found.
    for i in range(1, movies + 1):
        print(i, end = "\r")
        if i not in train_rating_df.columns:   
            zero_df = pd.DataFrame(zero_mat,columns=[i],index=range(1, users + 1))
            train_rating_df = train_rating_df.join(zero_df)
        if i not in test_rating_df.columns:
            zero_df = pd.DataFrame(zero_mat,columns=[i],index=range(1, users + 1))
            #print("zero_df",zero_df.shape)
            test_rating_df = test_rating_df.join(zero_df)
            #print("df shape", test_rating_df[i].shape)
            
    zero_mat = np.zeros(movies)
    #print("zero_mat",zero_mat.shape)
    
    #adding rows for both the dfs if any userId is not present.
    for i in range(1,users+1):
        print(i,end = "\r")
        if i not in train_rating_df.index:
            #print("train", i)
            zero_df = pd.DataFrame(zero_mat, index = [i], columns = range(1,movies+1))
            print(zero_df.shape)
            train_rating_df = train_rating_df.join(zero_df)
        if i not in test_rating_df.index:
            #print("test",i)
            #zero_df = pd.DataFrame(zero_mat, index = [i], columns = range(1,movies+1))
            #print(zero_df.shape)
            test_rating_df.loc[i] = zero_mat
            print(test_rating_df.shape)
        
    
    #print("train_rating_df", train_rating_df.shape)
    #print("test_rating_df", test_rating_df.shape)
    
    train_rating_df = train_rating_df[:]
    test_rating_df = test_rating_df[:]
    #print("train_rating_df", train_rating_df.shape)
    #print("test_rating_df", test_rating_df.shape)
    
    train_rating_matrix = np.asarray(train_rating_df) #Convert dataframe to numpy array
    test_rating_matrix = np.asarray(test_rating_df) #Convert dataframe to numpy array.
    #print("train_rating_matrix", train_rating_matrix.shape)
    #print("test_rating_matrix", test_rating_matrix.shape)
    assert train_rating_matrix.shape == test_rating_matrix.shape

    return train_rating_matrix, test_rating_matrix

In [3]:
train_rating_matrix, test_rating_matrix = split(0.8)

(6039, 3952)
(6040, 3952)
6040

In [6]:
### Mean center
#A is the training matrix, B is the test matrix
def mean_center(A, B):
    assert A.shape == B.shape
    S = A + B # S is the complete matrix
    
    A_centered = np.zeros(A.shape)
    B_centered = np.zeros(A.shape)
    
    for row in range(S.shape[0]):
        ratings_sum = S[row].sum() #total sum of rating of a particular user
        movies_rated = (S[row] > 0).sum() #number of mvies rated by a user
        mean = ratings_sum/movies_rated #mean movie rating by a user
        for i in A[row].nonzero():
            A_centered[row][i] = S[row][i] - mean #calculating the central value of the A and B
        for j in B[row].nonzero():
            B_centered[row][j] = S[row][j] - mean
    return A_centered, B_centered

In [7]:
train_rating_matrix_centered, test_rating_matrix_centered = mean_center(train_rating_matrix, test_rating_matrix)

In [8]:
#### Collabarative filtering
#### Find k most similar users who have rated that movie.

### Find similarity matrix based on row
#The similarity matrix will be a user x user matrix
def cosine_similarity(A):
    similarity = np.zeros((A.shape[0], A.shape[0])) 
    for i in range(A.shape[0]):
        mod1 = np.sqrt(np.dot(A[i],A[i])) #magnitude of the row vector
        for j in range(i+1, A.shape[0]):
            similarity_ij = np.dot(A[i], A[j]) #dot product of any pair of row vectors
            mod2 = np.sqrt(np.dot(A[j], A[j])) #magnitude of the row vector
            if mod2 == 0:
                similarity[i][j] = similarity[j][i] = 0
            else:
                similarity_ij /= (mod1 * mod2) #cosine similarity
                similarity[i][j] = similarity[j][i] = similarity_ij 
        print(i, end = "\r")
    return similarity

In [9]:
### User User collabrative filtering with k = 20

k = 20
predicted_rating_matrix = np.zeros(train_rating_matrix_centered.shape)
user_user_similarity = cosine_similarity(train_rating_matrix_centered)


6039

In [10]:

for user in range(train_rating_matrix_centered.shape[0]):
    print(user, end = '\r')
    similarity = user_user_similarity[user] #similarity  array of a every user
    
    sorted_similarity = np.argsort(-similarity) #sorted in the descending order according to position
    
    for movie in range(train_rating_matrix_centered.shape[1]):
        if test_rating_matrix[user][movie] != 0:  #check if the test matrix has a value in that position
            cnt = 0
            weighted_sum = 0
            sum_of_weights = 0 
            #select k most similar users
            for similar_user in sorted_similarity:
                if train_rating_matrix[similar_user][movie] != 0 and similarity[similar_user] > 0:
                    cnt += 1
                    weighted_sum += similarity[similar_user] * train_rating_matrix_centered[similar_user][movie]
                    #add the weighter sum for each similar user
                    sum_of_weights += similarity[similar_user]
                    #add the similarity for every user
                    if cnt == k:
                        predicted_rating_matrix[user][movie] = weighted_sum/sum_of_weights #calculate the predicted ratings
                        break
            
                
        
                

6039

In [11]:
error = 0
cnt = 0
for user in range(test_rating_matrix.shape[0]):
    for movie in range(test_rating_matrix_centered.shape[1]):
        ## Both conditions are necessary as both are required.
        if test_rating_matrix[user][movie] != 0 and predicted_rating_matrix[user][movie] != 0:
            error += ((predicted_rating_matrix[user][movie] - test_rating_matrix_centered[user][movie]) ** 2)
            cnt += 1
mse = np.sqrt(error/cnt)
#print(error)
print(mse)

1.39933395789
