## Importing Libraries

In [30]:
import numpy as np
import pandas as pd

## Importing Dataset

In [2]:
ratings= pd.read_csv('rating.csv', usecols=['userId', 'movieId', 'rating','timestamp'],
    dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32','timestamp':'int32'})
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [3]:
movies = pd.read_csv('movies.csv',usecols=['movieId','title','genres'],dtype={'movieId': 'int32', 'title': 'str','genres':'str'})
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
n_users = ratings.userId.unique().shape[0]
n_movies = ratings.movieId.unique().shape[0]
print('Number of users = ' + str(n_users) + ' | Number of movies = ' + str(n_movies))

Number of users = 610 | Number of movies = 9724


In [7]:
Ratings = ratings.pivot(index = 'userId', columns ='movieId', values = 'rating').fillna(0)
Ratings.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Denormalizing Data

In [8]:
R = Ratings.as_matrix()
user_ratings_mean = np.mean(R, axis = 1)
Ratings_demeaned = R - user_ratings_mean.reshape(-1, 1)

  """Entry point for launching an IPython kernel.


In [11]:
Ratings_demeaned

array([[ 3.8958247 , -0.10417524,  3.8958247 , ..., -0.10417524,
        -0.10417524, -0.10417524],
       [-0.01177499, -0.01177499, -0.01177499, ..., -0.01177499,
        -0.01177499, -0.01177499],
       [-0.00976964, -0.00976964, -0.00976964, ..., -0.00976964,
        -0.00976964, -0.00976964],
       ...,
       [ 2.2321575 ,  1.7321576 ,  1.7321576 , ..., -0.26784244,
        -0.26784244, -0.26784244],
       [ 2.9875565 , -0.01244344, -0.01244344, ..., -0.01244344,
        -0.01244344, -0.01244344],
       [ 4.506119  , -0.4938811 , -0.4938811 , ..., -0.4938811 ,
        -0.4938811 , -0.4938811 ]], dtype=float32)

In [12]:
sparsity = round(1.0 - len(ratings) / float(n_users * n_movies), 3)
print('The sparsity level of MovieLens1M dataset is ' +  str(sparsity * 100) + '%')

The sparsity level of MovieLens1M dataset is 98.3%


## Using SVD

In [13]:
from scipy.sparse.linalg import svds
U, sigma, Vt = svds(Ratings_demeaned, k = 60)

In [14]:
sigma = np.diag(sigma)
sigma

array([[ 63.860077,   0.      ,   0.      , ...,   0.      ,   0.      ,
          0.      ],
       [  0.      ,  64.27604 ,   0.      , ...,   0.      ,   0.      ,
          0.      ],
       [  0.      ,   0.      ,  64.71876 , ...,   0.      ,   0.      ,
          0.      ],
       ...,
       [  0.      ,   0.      ,   0.      , ..., 184.86171 ,   0.      ,
          0.      ],
       [  0.      ,   0.      ,   0.      , ...,   0.      , 231.22456 ,
          0.      ],
       [  0.      ,   0.      ,   0.      , ...,   0.      ,   0.      ,
        474.20602 ]], dtype=float32)

## Predictions from Decomposed Matrices

In [15]:
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1, 1)

In [16]:
preds = pd.DataFrame(all_user_predicted_ratings, columns = Ratings.columns)
preds.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
0,0.945883,0.594404,1.359682,0.179632,-0.205582,4.174883,-0.573908,0.209205,0.131954,1.701394,...,-0.039842,-0.032567,-0.047116,-0.047116,-0.039842,-0.047116,-0.039842,-0.039842,-0.039842,-0.0299
1,0.20982,-0.079494,0.069298,0.017304,0.153519,0.024531,0.069663,0.017386,0.055092,-0.228387,...,0.020516,0.017644,0.023387,0.023387,0.020516,0.023387,0.020516,0.020516,0.020516,0.028867
2,0.019901,-0.041338,0.045096,0.010763,-0.006962,0.187223,-0.012368,0.003243,-0.004163,-0.050898,...,0.007694,0.007877,0.007511,0.007511,0.007694,0.007511,0.007694,0.007694,0.007694,0.012149
3,2.100811,-0.590882,-0.18201,0.126127,0.353362,0.845935,0.62295,0.090081,-0.001539,0.252572,...,0.012189,0.011114,0.013264,0.013264,0.012189,0.013264,0.012189,0.012189,0.012189,0.026958
4,1.602985,0.645545,-0.030871,0.094529,0.278333,0.63261,0.231293,0.103077,-0.10343,0.950407,...,-0.004744,-0.004505,-0.004984,-0.004984,-0.004744,-0.004984,-0.004744,-0.004744,-0.004744,-0.00219


In [24]:
def recommend_movies(predictions, userID, movies, original_ratings, num_recommendations):
    
    # Get and sort the user's predictions
    user_row_number = userID - 1 # User ID starts at 1, not 0
    sorted_user_predictions = preds.iloc[user_row_number].sort_values(ascending=False) # User ID starts at 1
    
    # Get the user's data and merge in the movie information.
    user_data = original_ratings[original_ratings.userId == (userID)]
    user_full = (user_data.merge(movies, how = 'left', left_on = 'movieId', right_on = 'movieId').
                     sort_values(['rating'], ascending=False)
                 )

    print('User {0} has already rated {1} movies.'.format(userID, user_full.shape[0]))
    print('Recommending highest {0} predicted ratings movies not already rated.'.format(num_recommendations))
    
    # Recommend the highest predicted rating movies that the user hasn't seen yet.
    recommendations = (movies[~movies['movieId'].isin(user_full['movieId'])].
         merge(pd.DataFrame(sorted_user_predictions).reset_index(), how = 'left',
               left_on = 'movieId',
               right_on = 'movieId').
         rename(columns = {user_row_number: 'Predictions'}).
         sort_values('Predictions', ascending = False).
                       iloc[:num_recommendations, :-1]
                      )

    return user_full, recommendations

In [25]:
already_rated, predictions = recommend_movies(preds, 510, movies, ratings, 20)

User 510 has already rated 108 movies.
Recommending highest 20 predicted ratings movies not already rated.


In [26]:
# Top 20 movies that User 1310 has rated 
already_rated.head(20)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
50,510,1997,4.5,1141158973,"Exorcist, The (1973)",Horror|Mystery
25,510,926,4.5,1141159343,All About Eve (1950),Drama
24,510,858,4.5,1141160565,"Godfather, The (1972)",Crime|Drama
105,510,35836,4.5,1141159422,"40-Year-Old Virgin, The (2005)",Comedy|Romance
23,510,750,4.5,1141158725,Dr. Strangelove or: How I Learned to Stop Worr...,Comedy|War
92,510,6787,4.5,1141159450,All the President's Men (1976),Drama|Thriller
36,510,1248,4.5,1141159316,Touch of Evil (1958),Crime|Film-Noir|Thriller
14,510,457,4.5,1141158931,"Fugitive, The (1993)",Thriller
55,510,2395,4.5,1141158970,Rushmore (1998),Comedy|Drama
73,510,3198,4.0,1141159217,Papillon (1973),Crime|Drama


## Recommending Movies

In [27]:
# Top 20 movies that User 1310 hopefully will enjoy
predictions

Unnamed: 0,movieId,title,genres
215,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi
1450,2028,Saving Private Ryan (1998),Action|Drama|War
889,1221,"Godfather: Part II, The (1974)",Crime|Drama
799,1089,Reservoir Dogs (1992),Crime|Mystery|Thriller
266,318,"Shawshank Redemption, The (1994)",Crime|Drama
1025,1387,Jaws (1975),Action|Horror
248,296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
869,1197,"Princess Bride, The (1987)",Action|Adventure|Comedy|Fantasy|Romance
879,1210,Star Wars: Episode VI - Return of the Jedi (1983),Action|Adventure|Sci-Fi
117,150,Apollo 13 (1995),Adventure|Drama|IMAX


## Model Evaluation

In [34]:
# Import libraries from Surprise package
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate

# Load Reader library
reader = Reader()

# Load ratings dataset with Dataset library
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)


In [35]:
# Use the SVD algorithm.
svd = SVD()

# Compute the RMSE of the SVD algorithm.
cross_validate(svd, data, measures=['RMSE'], cv=5, verbose=True)

Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8713  0.8680  0.8756  0.8726  0.8762  0.8727  0.0030  
Fit time          22.45   15.14   14.42   14.90   15.09   16.40   3.03    
Test time         1.56    0.73    0.66    0.71    0.87    0.91    0.34    


{'test_rmse': array([0.87125285, 0.86801527, 0.87564593, 0.87257517, 0.87615708]),
 'fit_time': (22.446306467056274,
  15.135005474090576,
  14.415002822875977,
  14.898981094360352,
  15.094993352890015),
 'test_time': (1.5634486675262451,
  0.7309892177581787,
  0.6610105037689209,
  0.7050056457519531,
  0.8730030059814453)}