# Movie Recommendation System
## Recommender System using SVD


#### Imports

In [17]:
import pandas as pd
import numpy as np
import scipy
import scipy.linalg as linalg
from scipy.linalg import sqrtm
from numpy import *

#### Dataset

The ml-1m, or the [MovieLens dataset 100 k](https://grouplens.org/datasets/movielens/100k).

##### 1. Data Preprocessing
We will begin by loading the dataset file present in the `.csv` file into pandas dataframes and visualizing the entries.

In [18]:
data = pd.read_csv('movielens100k.csv')
data['userId'] = data['userId'].astype('str')
data['movieId'] = data['movieId'].astype('str')

users = data['userId'].unique() #list of all users
movies = data['movieId'].unique() #list of all movies

print("Number of users", len(users))
print("Number of movies", len(movies))

data.head()

Number of users 718
Number of movies 8915


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,5.0,847117005
1,1,2,3.0,847642142
2,1,10,3.0,847641896
3,1,32,4.0,847642008
4,1,34,4.0,847641956


##### 2. Split the data into a train and test set

In [19]:
test = pd.DataFrame(columns=data.columns)
train = pd.DataFrame(columns=data.columns)

test_ratio = 0.2 #fraction of data to be used as test set.

for u in users:
    temp = data[data['userId'] == u]
    n = len(temp)
    test_size = int(test_ratio*n)
    
    temp = temp.sort_values('timestamp').reset_index()
    temp.drop('index', axis=1, inplace=True)


    dummy_test = temp.iloc[n-1-test_size :]
    dummy_train = temp.iloc[: n-2-test_size]

    test = pd.concat([test, dummy_test])
    train = pd.concat([train, dummy_train])

##### 3. Create the utility matrix

The input data will now be converted to the utility matrix $(n\times m)$ where the rows of the matrix are users $n$ and the columns are the ratings for the $m$-th movie.

In [20]:
# Create_utility_matrix
def create_utility_matrix(data, formatizer = {'user':0, 'item': 1, 'value': 2}):

        """
        :param data:   			Array-like, 2D, nx3
        :param formatizer:      pass the formatizer
        :return:                the utility matrix. 2D, n x m, n=users, m=items
        """
        
        itemField = formatizer['item']
        userField = formatizer['user']
        valueField = formatizer['value']

        userList = data.iloc[:,userField].tolist()
        itemList = data.iloc[:,itemField].tolist()
        valueList = data.iloc[:,valueField].tolist()

        users = list(set(data.iloc[:,userField]))
        items = list(set(data.iloc[:,itemField]))

        users_index = {users[i]: i for i in range(len(users))}

        pd_dict = {item: [np.nan for i in range(len(users))] for item in items}

        for i in range(0,len(data)):
            item = itemList[i]
            user = userList[i]
            value = valueList[i]

            pd_dict[item][users_index[user]] = value
            

        X = pd.DataFrame(pd_dict)
        X.index = users
        
        itemcols = list(X.columns)
        items_index = {itemcols[i]: i for i in range(len(itemcols))}
        

        return X, users_index, items_index

utilMat, users_index, items_index = create_utility_matrix(train)

#### Metric computation

The function rmse computes the root mean square error (RMSE) for the true and the predicted movie ratings.

In [21]:
def rmse(true, pred):
    # this will be used to compute the root mean square error for the true and the predicted movie rating
    x = true - pred
    return sum([xi*xi for xi in x])/len(x)

#### Code for computing SVD for the utility matrix 

In [24]:
def svd(train,k):

    utilMat = np.array(train)
    #the utilMat is masked to hide the nan values
    mask = isnan(utilMat)
    masked_array = ma.masked_array(utilMat, mask)
    
    #the mean of the masked array is computed and the utilMat is filled with the mean values
    mean = np.mean(masked_array, axis =0)
    utilMat = np.full((718,7731), mean)

    #an array of repeat elements of mean is created
    sub = np.tile(mean, (718,1))
    
    #the utilmat is subtracted from the mean 
    utilMat = utilMat - sub
    
    #svd is carried out
    U, d, V= linalg.svd(utilMat, full_matrices = False)
    d = np.diag(d)

    #only k most significant features are selected
    U = U[:,0:k]    #print("shape of U", U.shape)
    d = d[0:k,0:k]  #print("shape of d", d.shape)
    V = V[0:k,:]    #print("shape of V", V.shape)
    
    d = sqrtm(d)
    
    Ud = np.dot(U,d)
    dV = np.dot(d,V)
    UdV = np.dot(Ud,dV)
    UdV = UdV + sub
    
    return UdV


#### Code for the test set 

Write the code that computes the RMSE for the predicted ratings for the test data present in the `test` matrix.

In [35]:
features = [6,8,10,13,17,20]

for i in features: 
    output = svd(utilMat, k=i)
    pred = [] 

    for _,row in test.iterrows():
        users = row['userId']
        items = row['movieId']

        user_index = users_index[users]
        if items in items_index:
            item_index = items_index[items]
            predicted_rating = output[user_index, item_index]
        else:
            predicted_rating = np.mean(output[user_index, :])
        pred.append(predicted_rating)

    print("Root Mean Square Error of test set for the range of features is: ",rmse(test['rating'], pred))

Root Mean Square Error of test set for the range of features is:  0.9944775688254684
Root Mean Square Error of test set for the range of features is:  0.9944775688254684
Root Mean Square Error of test set for the range of features is:  0.9944775688254684
Root Mean Square Error of test set for the range of features is:  0.9944775688254684
Root Mean Square Error of test set for the range of features is:  0.9944775688254684
Root Mean Square Error of test set for the range of features is:  0.9944775688254684
